From 01f3a2f17d1a40150637dd87b9e8388faa3708d5 Mon Sep 17 00:00:00 2001 From: James Chua Date: Thu, 23 Mar 2023 21:41:02 +0800 Subject: [PATCH 01/10] add smoke tests --- elk/extraction/extraction.py | 31 +++++++++++++++++-------------- tests/test_smoke_elicit.py | 23 +++++++++++++++++++++++ 2 files changed, 40 insertions(+), 14 deletions(-) create mode 100644 tests/test_smoke_elicit.py diff --git a/elk/extraction/extraction.py b/elk/extraction/extraction.py index b68a9ed4..21c3c9f8 100644 --- a/elk/extraction/extraction.py +++ b/elk/extraction/extraction.py @@ -1,15 +1,10 @@ """Functions for extracting the hidden states of a model.""" -from .prompt_dataset import Prompt, PromptDataset, PromptConfig -from ..utils import ( - assert_type, - float32_to_int16, - infer_label_column, - select_train_val_splits, - select_usable_devices, -) -from .generator import _GeneratorBuilder +import logging from dataclasses import dataclass, InitVar +from typing import Iterable, Literal, Union + +import torch from datasets import ( Array3D, DatasetDict, @@ -20,6 +15,7 @@ SplitDict, SplitInfo, Value, + Dataset, ) from simple_parsing.helpers import field, Serializable from transformers import ( @@ -29,9 +25,16 @@ BatchEncoding, PreTrainedModel, ) -from typing import Iterable, Literal, Union -import logging -import torch + +from .generator import _GeneratorBuilder +from .prompt_dataset import Prompt, PromptDataset, PromptConfig +from ..utils import ( + assert_type, + float32_to_int16, + infer_label_column, + select_train_val_splits, + select_usable_devices, +) @dataclass @@ -272,7 +275,7 @@ def get_splits() -> SplitDict: ), } devices = select_usable_devices(max_gpus) - builders = { + builders: dict[Split, _GeneratorBuilder] = { split_name: _GeneratorBuilder( cache_dir=None, features=Features({**layer_cols, **other_cols}), @@ -290,7 +293,7 @@ def get_splits() -> SplitDict: for (split_name, split_info) in splits.items() } - ds = dict() + ds: dict[Split, Union[Dataset, DatasetDict]] = dict() for split, builder in builders.items(): builder.download_and_prepare(num_proc=len(devices)) ds[split] = builder.as_dataset(split=split) diff --git a/tests/test_smoke_elicit.py b/tests/test_smoke_elicit.py new file mode 100644 index 00000000..cc027c07 --- /dev/null +++ b/tests/test_smoke_elicit.py @@ -0,0 +1,23 @@ +from pathlib import Path + +from elk import ExtractionConfig +from elk.extraction import extract, PromptConfig +from elk.training.train import train, RunConfig + + +def test_smoke_elicit_run(tmp_path: Path): + # We'll use the tiny DeBERTa model for this test + model_path = "hf-internal-testing/tiny-deberta" + # todo: support tiny-imdb. But somnehow we need to convince promptsource + dataset_name = "imdb" + config = RunConfig( + data=ExtractionConfig( + model=model_path, + prompts=PromptConfig(dataset=dataset_name, max_examples=[10]), + layers=(1,), + ), + ) + dataset = train(config) + dataset.save_to_disk(tmp_path) + # get the files in the tmp_path + files = list(tmp_path.iterdir()) From 73bd2b0457cf8ce7607be7081bfd46ab114b1b13 Mon Sep 17 00:00:00 2001 From: James Chua Date: Thu, 23 Mar 2023 22:49:10 +0800 Subject: [PATCH 02/10] fix test --- elk/extraction/extraction.py | 1 + elk/training/ccs_reporter.py | 4 +++- tests/test_smoke_elicit.py | 15 ++++++++++----- 3 files changed, 14 insertions(+), 6 deletions(-) diff --git a/elk/extraction/extraction.py b/elk/extraction/extraction.py index 21c3c9f8..995b508d 100644 --- a/elk/extraction/extraction.py +++ b/elk/extraction/extraction.py @@ -277,6 +277,7 @@ def get_splits() -> SplitDict: devices = select_usable_devices(max_gpus) builders: dict[Split, _GeneratorBuilder] = { split_name: _GeneratorBuilder( + # disable cacheing cache_dir=None, features=Features({**layer_cols, **other_cols}), generator=_extraction_worker, diff --git a/elk/training/ccs_reporter.py b/elk/training/ccs_reporter.py index 54fc2292..d130aa44 100644 --- a/elk/training/ccs_reporter.py +++ b/elk/training/ccs_reporter.py @@ -195,7 +195,9 @@ def loss( alpha = self.config.supervised_weight preds = p0.add(1 - p1).mul(0.5).squeeze(-1) - bce_loss = bce(preds, labels.type_as(preds)) + # TODO: wrong dims on CCS? + sum_preds = preds.sum(-1).sigmoid() + bce_loss = bce(sum_preds, labels.type_as(sum_preds)) loss = alpha * bce_loss + (1 - alpha) * loss elif self.config.supervised_weight > 0: diff --git a/tests/test_smoke_elicit.py b/tests/test_smoke_elicit.py index cc027c07..4b82b7ac 100644 --- a/tests/test_smoke_elicit.py +++ b/tests/test_smoke_elicit.py @@ -2,12 +2,13 @@ from elk import ExtractionConfig from elk.extraction import extract, PromptConfig +from elk.training import CcsReporterConfig from elk.training.train import train, RunConfig def test_smoke_elicit_run(tmp_path: Path): - # We'll use the tiny DeBERTa model for this test - model_path = "hf-internal-testing/tiny-deberta" + # We'll use the tiny gpt2 model for this test + model_path = "sshleifer/tiny-gpt2" # todo: support tiny-imdb. But somnehow we need to convince promptsource dataset_name = "imdb" config = RunConfig( @@ -16,8 +17,12 @@ def test_smoke_elicit_run(tmp_path: Path): prompts=PromptConfig(dataset=dataset_name, max_examples=[10]), layers=(1,), ), + net=CcsReporterConfig(), ) - dataset = train(config) - dataset.save_to_disk(tmp_path) + dataset = train(config, tmp_path) # get the files in the tmp_path - files = list(tmp_path.iterdir()) + files: Path = list(tmp_path.iterdir()) + created_file_names = {file.name for file in files} + expected_files = ["cfg.yaml", "metadata.yaml", "lr_models", "reporters", "eval.csv"] + for file in expected_files: + assert file in created_file_names From 9a824bcaef9682c5f540250d232257daa2267e1f Mon Sep 17 00:00:00 2001 From: James Chua Date: Thu, 23 Mar 2023 23:02:27 +0800 Subject: [PATCH 03/10] patch CCS --- elk/training/ccs_reporter.py | 7 ++++--- tests/test_smoke_elicit.py | 5 +++-- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/elk/training/ccs_reporter.py b/elk/training/ccs_reporter.py index d130aa44..fd6f756f 100644 --- a/elk/training/ccs_reporter.py +++ b/elk/training/ccs_reporter.py @@ -195,9 +195,10 @@ def loss( alpha = self.config.supervised_weight preds = p0.add(1 - p1).mul(0.5).squeeze(-1) - # TODO: wrong dims on CCS? - sum_preds = preds.sum(-1).sigmoid() - bce_loss = bce(sum_preds, labels.type_as(sum_preds)) + # unsqueeze and broadcast to match the shape of preds + # TODO: not sure what to do here actually + labels_unsqueezed = labels.unsqueeze(-1).expand_as(preds) + bce_loss = bce(preds, labels_unsqueezed.type_as(preds)) loss = alpha * bce_loss + (1 - alpha) * loss elif self.config.supervised_weight > 0: diff --git a/tests/test_smoke_elicit.py b/tests/test_smoke_elicit.py index 4b82b7ac..66b402ce 100644 --- a/tests/test_smoke_elicit.py +++ b/tests/test_smoke_elicit.py @@ -8,8 +8,9 @@ def test_smoke_elicit_run(tmp_path: Path): # We'll use the tiny gpt2 model for this test + # TODO: work with deberta but that doesn't support cpus model_path = "sshleifer/tiny-gpt2" - # todo: support tiny-imdb. But somnehow we need to convince promptsource + # TODO: support tiny-imdb. But somnehow we need to convince promptsource dataset_name = "imdb" config = RunConfig( data=ExtractionConfig( @@ -19,7 +20,7 @@ def test_smoke_elicit_run(tmp_path: Path): ), net=CcsReporterConfig(), ) - dataset = train(config, tmp_path) + train(config, tmp_path) # get the files in the tmp_path files: Path = list(tmp_path.iterdir()) created_file_names = {file.name for file in files} From d46bf2a9f9c38eadae3afad9226fddc6f41ab23c Mon Sep 17 00:00:00 2001 From: James Chua Date: Thu, 23 Mar 2023 23:04:15 +0800 Subject: [PATCH 04/10] remove comments --- elk/extraction/extraction.py | 1 - 1 file changed, 1 deletion(-) diff --git a/elk/extraction/extraction.py b/elk/extraction/extraction.py index 995b508d..21c3c9f8 100644 --- a/elk/extraction/extraction.py +++ b/elk/extraction/extraction.py @@ -277,7 +277,6 @@ def get_splits() -> SplitDict: devices = select_usable_devices(max_gpus) builders: dict[Split, _GeneratorBuilder] = { split_name: _GeneratorBuilder( - # disable cacheing cache_dir=None, features=Features({**layer_cols, **other_cols}), generator=_extraction_worker, From c124d40da1ec2b1b4ea0a444eed75c12dca29bd1 Mon Sep 17 00:00:00 2001 From: James Chua Date: Fri, 24 Mar 2023 12:35:03 +0800 Subject: [PATCH 05/10] undo changes --- elk/extraction/extraction.py | 31 ++++++++++++++----------------- 1 file changed, 14 insertions(+), 17 deletions(-) diff --git a/elk/extraction/extraction.py b/elk/extraction/extraction.py index 21c3c9f8..b68a9ed4 100644 --- a/elk/extraction/extraction.py +++ b/elk/extraction/extraction.py @@ -1,10 +1,15 @@ """Functions for extracting the hidden states of a model.""" -import logging +from .prompt_dataset import Prompt, PromptDataset, PromptConfig +from ..utils import ( + assert_type, + float32_to_int16, + infer_label_column, + select_train_val_splits, + select_usable_devices, +) +from .generator import _GeneratorBuilder from dataclasses import dataclass, InitVar -from typing import Iterable, Literal, Union - -import torch from datasets import ( Array3D, DatasetDict, @@ -15,7 +20,6 @@ SplitDict, SplitInfo, Value, - Dataset, ) from simple_parsing.helpers import field, Serializable from transformers import ( @@ -25,16 +29,9 @@ BatchEncoding, PreTrainedModel, ) - -from .generator import _GeneratorBuilder -from .prompt_dataset import Prompt, PromptDataset, PromptConfig -from ..utils import ( - assert_type, - float32_to_int16, - infer_label_column, - select_train_val_splits, - select_usable_devices, -) +from typing import Iterable, Literal, Union +import logging +import torch @dataclass @@ -275,7 +272,7 @@ def get_splits() -> SplitDict: ), } devices = select_usable_devices(max_gpus) - builders: dict[Split, _GeneratorBuilder] = { + builders = { split_name: _GeneratorBuilder( cache_dir=None, features=Features({**layer_cols, **other_cols}), @@ -293,7 +290,7 @@ def get_splits() -> SplitDict: for (split_name, split_info) in splits.items() } - ds: dict[Split, Union[Dataset, DatasetDict]] = dict() + ds = dict() for split, builder in builders.items(): builder.download_and_prepare(num_proc=len(devices)) ds[split] = builder.as_dataset(split=split) From b2335175b8b312b453a36170518acb3743f57185 Mon Sep 17 00:00:00 2001 From: James Chua Date: Fri, 24 Mar 2023 13:45:03 +0800 Subject: [PATCH 06/10] add eigenreporter test for tiny-gpt2 --- tests/test_smoke_elicit.py | 50 ++++++++++++++++++++++++++++++++------ 1 file changed, 43 insertions(+), 7 deletions(-) diff --git a/tests/test_smoke_elicit.py b/tests/test_smoke_elicit.py index 66b402ce..f5162c70 100644 --- a/tests/test_smoke_elicit.py +++ b/tests/test_smoke_elicit.py @@ -1,22 +1,27 @@ from pathlib import Path +import pytest + from elk import ExtractionConfig -from elk.extraction import extract, PromptConfig -from elk.training import CcsReporterConfig +from elk.extraction import PromptConfig +from elk.training import CcsReporterConfig, EigenReporterConfig from elk.training.train import train, RunConfig +""" +TODO: These tests should +work with deberta but you'll need to make deberta fp32 instead of fp16 +because cpu doesn't support fp16 +""" + -def test_smoke_elicit_run(tmp_path: Path): - # We'll use the tiny gpt2 model for this test - # TODO: work with deberta but that doesn't support cpus +def test_smoke_elicit_run_tiny_gpt2_ccs(tmp_path: Path): model_path = "sshleifer/tiny-gpt2" - # TODO: support tiny-imdb. But somnehow we need to convince promptsource dataset_name = "imdb" config = RunConfig( data=ExtractionConfig( model=model_path, prompts=PromptConfig(dataset=dataset_name, max_examples=[10]), - layers=(1,), + # run on all layers, tiny-gpt only has 2 layers ), net=CcsReporterConfig(), ) @@ -27,3 +32,34 @@ def test_smoke_elicit_run(tmp_path: Path): expected_files = ["cfg.yaml", "metadata.yaml", "lr_models", "reporters", "eval.csv"] for file in expected_files: assert file in created_file_names + + +@pytest.mark.skip(reason="Fix me: EigenReporter crashes with tiny gpt2") +def test_smoke_elicit_run_tiny_gpt2_eigen(tmp_path: Path): + """ + Currently this test fails with + u -= torch.einsum("...ij,...i->...j", V[..., :k, :], proj) + V[..., k, :] = F.normalize(u, dim=-1) + ~~~~~~~~~ <--- HERE + + u[:] = torch.einsum("...ij,...j->...i", A, V[..., k, :]) + + RuntimeError: select(): index 1 out of range for tensor of size [1, 2] at dimension 0 + """ + model_path = "sshleifer/tiny-gpt2" + dataset_name = "imdb" + config = RunConfig( + data=ExtractionConfig( + model=model_path, + prompts=PromptConfig(dataset=dataset_name, max_examples=[10]), + # run on all layers, tiny-gpt only has 2 layers + ), + net=EigenReporterConfig(), + ) + train(config, tmp_path) + # get the files in the tmp_path + files: Path = list(tmp_path.iterdir()) + created_file_names = {file.name for file in files} + expected_files = ["cfg.yaml", "metadata.yaml", "lr_models", "reporters", "eval.csv"] + for file in expected_files: + assert file in created_file_names From c1cf9a0fa3e76779ac0c57a66dcc07c48ea7fc3a Mon Sep 17 00:00:00 2001 From: James Chua Date: Fri, 24 Mar 2023 13:50:33 +0800 Subject: [PATCH 07/10] use repeat_interleave like auc calculation --- elk/training/ccs_reporter.py | 9 +++++---- tests/test_smoke_elicit.py | 6 +++--- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/elk/training/ccs_reporter.py b/elk/training/ccs_reporter.py index fd6f756f..c922ac4a 100644 --- a/elk/training/ccs_reporter.py +++ b/elk/training/ccs_reporter.py @@ -195,10 +195,11 @@ def loss( alpha = self.config.supervised_weight preds = p0.add(1 - p1).mul(0.5).squeeze(-1) - # unsqueeze and broadcast to match the shape of preds - # TODO: not sure what to do here actually - labels_unsqueezed = labels.unsqueeze(-1).expand_as(preds) - bce_loss = bce(preds, labels_unsqueezed.type_as(preds)) + # broadcast the labels, and flatten the predictions + # so that both are 1D tensors + broadcast_labels = labels.repeat_interleave(preds.shape[1]).float() + flattened_preds = preds.cpu().flatten() + bce_loss = bce(flattened_preds, broadcast_labels.type_as(flattened_preds)) loss = alpha * bce_loss + (1 - alpha) * loss elif self.config.supervised_weight > 0: diff --git a/tests/test_smoke_elicit.py b/tests/test_smoke_elicit.py index f5162c70..5dd9d468 100644 --- a/tests/test_smoke_elicit.py +++ b/tests/test_smoke_elicit.py @@ -8,9 +8,9 @@ from elk.training.train import train, RunConfig """ -TODO: These tests should -work with deberta but you'll need to make deberta fp32 instead of fp16 -because cpu doesn't support fp16 +TODO: These tests should work with deberta +but you'll need to make deberta fp32 instead of fp16 +because pytorch cpu doesn't support fp16 """ From e43d2c5f4b894bef2cb6fa9ccf4b8e4242ac991c Mon Sep 17 00:00:00 2001 From: James Chua Date: Fri, 24 Mar 2023 13:59:52 +0800 Subject: [PATCH 08/10] make flake happy --- tests/test_smoke_elicit.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/test_smoke_elicit.py b/tests/test_smoke_elicit.py index 5dd9d468..a744cb9d 100644 --- a/tests/test_smoke_elicit.py +++ b/tests/test_smoke_elicit.py @@ -44,7 +44,8 @@ def test_smoke_elicit_run_tiny_gpt2_eigen(tmp_path: Path): u[:] = torch.einsum("...ij,...j->...i", A, V[..., k, :]) - RuntimeError: select(): index 1 out of range for tensor of size [1, 2] at dimension 0 + RuntimeError: select(): index 1 out of range for tensor of size [1, 2] + at dimension 0 """ model_path = "sshleifer/tiny-gpt2" dataset_name = "imdb" From 89fbee5f27bd03d3959e470b52338edc83538f7e Mon Sep 17 00:00:00 2001 From: Nora Belrose Date: Fri, 24 Mar 2023 10:41:01 +0000 Subject: [PATCH 09/10] Fix lanczos_eigsh for small matrices --- elk/eigsh.py | 20 +++++++++++++++++++- tests/test_eigsh.py | 16 +++++++--------- tests/test_smoke_elicit.py | 3 --- 3 files changed, 26 insertions(+), 13 deletions(-) diff --git a/elk/eigsh.py b/elk/eigsh.py index 93cc6675..10c1de60 100644 --- a/elk/eigsh.py +++ b/elk/eigsh.py @@ -13,7 +13,7 @@ def lanczos_eigsh( tol: Optional[float] = None, seed: Optional[int] = None, v0: Optional[Tensor] = None, - which: Literal["LA", "LM", "SA"] = "LM", + which: Literal["LA", "LM", "SA"] = "LA", ) -> tuple[Tensor, Tensor]: """Lanczos method for computing the top k eigenpairs of a symmetric matrix. @@ -21,6 +21,10 @@ def lanczos_eigsh( based on `scipy.sparse.linalg.eigsh`. Unlike the CuPy and SciPy functions, this function supports batched inputs with arbitrary leading dimensions. + Unlike the above implementations, we use which='LA' as the default instead of + which='LM' because we are interested in algebraic eigenvalues, not magnitude. + Largest magnitude is also harder to implement in TorchScript. + Args: A (Tensor): The matrix or batch of matrices of shape `[..., n, n]` for which to compute eigenpairs. Must be symmetric, but need not be positive definite. @@ -43,6 +47,20 @@ def lanczos_eigsh( *leading, n, m = A.shape assert n == m, "A must be a square matrix or a batch of square matrices." + # Short circuit if the matrix is too small; we can't outcompete the naive method. + if n <= 32: + L, Q = torch.linalg.eigh(A) + if which == "LA": + return L[..., -k:], Q[..., :, -k:] + elif which == "LM": + # Resort the eigenvalues and eigenvectors. + idx = L.abs().argsort(dim=-1, descending=True) + L = L.gather(-1, idx) + Q = Q.gather(-1, idx.unsqueeze(-1).expand(*idx.shape, n)) + return L[..., :k], Q[..., :, :k] + elif which == "SA": + return L[..., :k], Q[..., :, :k] + if ncv is None: ncv = min(max(2 * k, k + 32), n - 1) else: diff --git a/tests/test_eigsh.py b/tests/test_eigsh.py index 603c7e07..dc206d90 100644 --- a/tests/test_eigsh.py +++ b/tests/test_eigsh.py @@ -5,31 +5,29 @@ import torch +@pytest.mark.parametrize("n", [20, 40]) @pytest.mark.parametrize("which", ["LA", "SA"]) -def test_lanczos_eigsh(which): +def test_lanczos_eigsh(n, which): torch.manual_seed(42) - # Generate a random symmetric matrix - n = 10 A = torch.randn(n, n) A = A + A.T # Compute the top k eigenpairs using our implementation - k = 3 - w, v = lanczos_eigsh(A, k=k, which=which) + w, v = lanczos_eigsh(A, which=which) # Compute the top k eigenpairs using scipy - w_scipy, v_scipy = eigsh(A.numpy(), k=k, which=which) + w_scipy, v_scipy = eigsh(A.numpy(), which=which) # Check that the eigenvalues match to within the tolerance - assert np.allclose(w, w_scipy, rtol=1e-3) + torch.testing.assert_allclose(w, torch.from_numpy(w_scipy), atol=1e-3, rtol=1e-3) # Normalize the sign of the eigenvectors - for i in range(k): + for i in range(v.shape[-1]): if v[torch.argmax(torch.abs(v[:, i])), i] < 0: v[:, i] *= -1 if v_scipy[np.argmax(np.abs(v_scipy[:, i])), i] < 0: v_scipy[:, i] *= -1 # Check that the eigenvectors match to within the tolerance - assert np.allclose(v.numpy(), v_scipy, rtol=1e-3) + torch.testing.assert_allclose(v, torch.from_numpy(v_scipy), atol=1e-3, rtol=1e-3) diff --git a/tests/test_smoke_elicit.py b/tests/test_smoke_elicit.py index a744cb9d..3211271c 100644 --- a/tests/test_smoke_elicit.py +++ b/tests/test_smoke_elicit.py @@ -1,7 +1,5 @@ from pathlib import Path -import pytest - from elk import ExtractionConfig from elk.extraction import PromptConfig from elk.training import CcsReporterConfig, EigenReporterConfig @@ -34,7 +32,6 @@ def test_smoke_elicit_run_tiny_gpt2_ccs(tmp_path: Path): assert file in created_file_names -@pytest.mark.skip(reason="Fix me: EigenReporter crashes with tiny gpt2") def test_smoke_elicit_run_tiny_gpt2_eigen(tmp_path: Path): """ Currently this test fails with From d876f88c720d353d83b271845a7c4aca40c61615 Mon Sep 17 00:00:00 2001 From: Nora Belrose Date: Fri, 24 Mar 2023 10:44:40 +0000 Subject: [PATCH 10/10] Always use float32 on CPU --- elk/extraction/extraction.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/elk/extraction/extraction.py b/elk/extraction/extraction.py index b68a9ed4..f9dc35dc 100644 --- a/elk/extraction/extraction.py +++ b/elk/extraction/extraction.py @@ -101,7 +101,9 @@ def extract_hiddens( # AutoModel should do the right thing here in nearly all cases. We don't actually # care what head the model has, since we are just extracting hidden states. - model = AutoModel.from_pretrained(cfg.model, torch_dtype="auto").to(device) + model = AutoModel.from_pretrained( + cfg.model, torch_dtype="auto" if device != "cpu" else torch.float32 + ).to(device) # TODO: Maybe also make this configurable? # We want to make sure the answer is never truncated tokenizer = AutoTokenizer.from_pretrained(cfg.model, truncation_side="left")