From 01f3a2f17d1a40150637dd87b9e8388faa3708d5 Mon Sep 17 00:00:00 2001
From: James Chua <james@leadiq.com>
Date: Thu, 23 Mar 2023 21:41:02 +0800
Subject: [PATCH 01/10] add smoke tests

---
 elk/extraction/extraction.py | 31 +++++++++++++++++--------------
 tests/test_smoke_elicit.py   | 23 +++++++++++++++++++++++
 2 files changed, 40 insertions(+), 14 deletions(-)
 create mode 100644 tests/test_smoke_elicit.py

diff --git a/elk/extraction/extraction.py b/elk/extraction/extraction.py
index b68a9ed4..21c3c9f8 100644
--- a/elk/extraction/extraction.py
+++ b/elk/extraction/extraction.py
@@ -1,15 +1,10 @@
 """Functions for extracting the hidden states of a model."""
 
-from .prompt_dataset import Prompt, PromptDataset, PromptConfig
-from ..utils import (
-    assert_type,
-    float32_to_int16,
-    infer_label_column,
-    select_train_val_splits,
-    select_usable_devices,
-)
-from .generator import _GeneratorBuilder
+import logging
 from dataclasses import dataclass, InitVar
+from typing import Iterable, Literal, Union
+
+import torch
 from datasets import (
     Array3D,
     DatasetDict,
@@ -20,6 +15,7 @@
     SplitDict,
     SplitInfo,
     Value,
+    Dataset,
 )
 from simple_parsing.helpers import field, Serializable
 from transformers import (
@@ -29,9 +25,16 @@
     BatchEncoding,
     PreTrainedModel,
 )
-from typing import Iterable, Literal, Union
-import logging
-import torch
+
+from .generator import _GeneratorBuilder
+from .prompt_dataset import Prompt, PromptDataset, PromptConfig
+from ..utils import (
+    assert_type,
+    float32_to_int16,
+    infer_label_column,
+    select_train_val_splits,
+    select_usable_devices,
+)
 
 
 @dataclass
@@ -272,7 +275,7 @@ def get_splits() -> SplitDict:
         ),
     }
     devices = select_usable_devices(max_gpus)
-    builders = {
+    builders: dict[Split, _GeneratorBuilder] = {
         split_name: _GeneratorBuilder(
             cache_dir=None,
             features=Features({**layer_cols, **other_cols}),
@@ -290,7 +293,7 @@ def get_splits() -> SplitDict:
         for (split_name, split_info) in splits.items()
     }
 
-    ds = dict()
+    ds: dict[Split, Union[Dataset, DatasetDict]] = dict()
     for split, builder in builders.items():
         builder.download_and_prepare(num_proc=len(devices))
         ds[split] = builder.as_dataset(split=split)
diff --git a/tests/test_smoke_elicit.py b/tests/test_smoke_elicit.py
new file mode 100644
index 00000000..cc027c07
--- /dev/null
+++ b/tests/test_smoke_elicit.py
@@ -0,0 +1,23 @@
+from pathlib import Path
+
+from elk import ExtractionConfig
+from elk.extraction import extract, PromptConfig
+from elk.training.train import train, RunConfig
+
+
+def test_smoke_elicit_run(tmp_path: Path):
+    # We'll use the tiny DeBERTa model for this test
+    model_path = "hf-internal-testing/tiny-deberta"
+    # todo: support tiny-imdb. But somnehow we need to convince promptsource
+    dataset_name = "imdb"
+    config = RunConfig(
+        data=ExtractionConfig(
+            model=model_path,
+            prompts=PromptConfig(dataset=dataset_name, max_examples=[10]),
+            layers=(1,),
+        ),
+    )
+    dataset = train(config)
+    dataset.save_to_disk(tmp_path)
+    # get the files in the tmp_path
+    files = list(tmp_path.iterdir())

From 73bd2b0457cf8ce7607be7081bfd46ab114b1b13 Mon Sep 17 00:00:00 2001
From: James Chua <james@leadiq.com>
Date: Thu, 23 Mar 2023 22:49:10 +0800
Subject: [PATCH 02/10] fix test

---
 elk/extraction/extraction.py |  1 +
 elk/training/ccs_reporter.py |  4 +++-
 tests/test_smoke_elicit.py   | 15 ++++++++++-----
 3 files changed, 14 insertions(+), 6 deletions(-)

diff --git a/elk/extraction/extraction.py b/elk/extraction/extraction.py
index 21c3c9f8..995b508d 100644
--- a/elk/extraction/extraction.py
+++ b/elk/extraction/extraction.py
@@ -277,6 +277,7 @@ def get_splits() -> SplitDict:
     devices = select_usable_devices(max_gpus)
     builders: dict[Split, _GeneratorBuilder] = {
         split_name: _GeneratorBuilder(
+            # disable cacheing
             cache_dir=None,
             features=Features({**layer_cols, **other_cols}),
             generator=_extraction_worker,
diff --git a/elk/training/ccs_reporter.py b/elk/training/ccs_reporter.py
index 54fc2292..d130aa44 100644
--- a/elk/training/ccs_reporter.py
+++ b/elk/training/ccs_reporter.py
@@ -195,7 +195,9 @@ def loss(
 
             alpha = self.config.supervised_weight
             preds = p0.add(1 - p1).mul(0.5).squeeze(-1)
-            bce_loss = bce(preds, labels.type_as(preds))
+            # TODO: wrong dims on CCS?
+            sum_preds = preds.sum(-1).sigmoid()
+            bce_loss = bce(sum_preds, labels.type_as(sum_preds))
             loss = alpha * bce_loss + (1 - alpha) * loss
 
         elif self.config.supervised_weight > 0:
diff --git a/tests/test_smoke_elicit.py b/tests/test_smoke_elicit.py
index cc027c07..4b82b7ac 100644
--- a/tests/test_smoke_elicit.py
+++ b/tests/test_smoke_elicit.py
@@ -2,12 +2,13 @@
 
 from elk import ExtractionConfig
 from elk.extraction import extract, PromptConfig
+from elk.training import CcsReporterConfig
 from elk.training.train import train, RunConfig
 
 
 def test_smoke_elicit_run(tmp_path: Path):
-    # We'll use the tiny DeBERTa model for this test
-    model_path = "hf-internal-testing/tiny-deberta"
+    # We'll use the tiny gpt2 model for this test
+    model_path = "sshleifer/tiny-gpt2"
     # todo: support tiny-imdb. But somnehow we need to convince promptsource
     dataset_name = "imdb"
     config = RunConfig(
@@ -16,8 +17,12 @@ def test_smoke_elicit_run(tmp_path: Path):
             prompts=PromptConfig(dataset=dataset_name, max_examples=[10]),
             layers=(1,),
         ),
+        net=CcsReporterConfig(),
     )
-    dataset = train(config)
-    dataset.save_to_disk(tmp_path)
+    dataset = train(config, tmp_path)
     # get the files in the tmp_path
-    files = list(tmp_path.iterdir())
+    files: Path = list(tmp_path.iterdir())
+    created_file_names = {file.name for file in files}
+    expected_files = ["cfg.yaml", "metadata.yaml", "lr_models", "reporters", "eval.csv"]
+    for file in expected_files:
+        assert file in created_file_names

From 9a824bcaef9682c5f540250d232257daa2267e1f Mon Sep 17 00:00:00 2001
From: James Chua <james@leadiq.com>
Date: Thu, 23 Mar 2023 23:02:27 +0800
Subject: [PATCH 03/10] patch CCS

---
 elk/training/ccs_reporter.py | 7 ++++---
 tests/test_smoke_elicit.py   | 5 +++--
 2 files changed, 7 insertions(+), 5 deletions(-)

diff --git a/elk/training/ccs_reporter.py b/elk/training/ccs_reporter.py
index d130aa44..fd6f756f 100644
--- a/elk/training/ccs_reporter.py
+++ b/elk/training/ccs_reporter.py
@@ -195,9 +195,10 @@ def loss(
 
             alpha = self.config.supervised_weight
             preds = p0.add(1 - p1).mul(0.5).squeeze(-1)
-            # TODO: wrong dims on CCS?
-            sum_preds = preds.sum(-1).sigmoid()
-            bce_loss = bce(sum_preds, labels.type_as(sum_preds))
+            # unsqueeze and broadcast to match the shape of preds
+            # TODO: not sure what to do here actually
+            labels_unsqueezed = labels.unsqueeze(-1).expand_as(preds)
+            bce_loss = bce(preds, labels_unsqueezed.type_as(preds))
             loss = alpha * bce_loss + (1 - alpha) * loss
 
         elif self.config.supervised_weight > 0:
diff --git a/tests/test_smoke_elicit.py b/tests/test_smoke_elicit.py
index 4b82b7ac..66b402ce 100644
--- a/tests/test_smoke_elicit.py
+++ b/tests/test_smoke_elicit.py
@@ -8,8 +8,9 @@
 
 def test_smoke_elicit_run(tmp_path: Path):
     # We'll use the tiny gpt2 model for this test
+    # TODO: work with deberta but that doesn't support cpus
     model_path = "sshleifer/tiny-gpt2"
-    # todo: support tiny-imdb. But somnehow we need to convince promptsource
+    # TODO: support tiny-imdb. But somnehow we need to convince promptsource
     dataset_name = "imdb"
     config = RunConfig(
         data=ExtractionConfig(
@@ -19,7 +20,7 @@ def test_smoke_elicit_run(tmp_path: Path):
         ),
         net=CcsReporterConfig(),
     )
-    dataset = train(config, tmp_path)
+    train(config, tmp_path)
     # get the files in the tmp_path
     files: Path = list(tmp_path.iterdir())
     created_file_names = {file.name for file in files}

From d46bf2a9f9c38eadae3afad9226fddc6f41ab23c Mon Sep 17 00:00:00 2001
From: James Chua <james@leadiq.com>
Date: Thu, 23 Mar 2023 23:04:15 +0800
Subject: [PATCH 04/10] remove comments

---
 elk/extraction/extraction.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/elk/extraction/extraction.py b/elk/extraction/extraction.py
index 995b508d..21c3c9f8 100644
--- a/elk/extraction/extraction.py
+++ b/elk/extraction/extraction.py
@@ -277,7 +277,6 @@ def get_splits() -> SplitDict:
     devices = select_usable_devices(max_gpus)
     builders: dict[Split, _GeneratorBuilder] = {
         split_name: _GeneratorBuilder(
-            # disable cacheing
             cache_dir=None,
             features=Features({**layer_cols, **other_cols}),
             generator=_extraction_worker,

From c124d40da1ec2b1b4ea0a444eed75c12dca29bd1 Mon Sep 17 00:00:00 2001
From: James Chua <james@leadiq.com>
Date: Fri, 24 Mar 2023 12:35:03 +0800
Subject: [PATCH 05/10] undo changes

---
 elk/extraction/extraction.py | 31 ++++++++++++++-----------------
 1 file changed, 14 insertions(+), 17 deletions(-)

diff --git a/elk/extraction/extraction.py b/elk/extraction/extraction.py
index 21c3c9f8..b68a9ed4 100644
--- a/elk/extraction/extraction.py
+++ b/elk/extraction/extraction.py
@@ -1,10 +1,15 @@
 """Functions for extracting the hidden states of a model."""
 
-import logging
+from .prompt_dataset import Prompt, PromptDataset, PromptConfig
+from ..utils import (
+    assert_type,
+    float32_to_int16,
+    infer_label_column,
+    select_train_val_splits,
+    select_usable_devices,
+)
+from .generator import _GeneratorBuilder
 from dataclasses import dataclass, InitVar
-from typing import Iterable, Literal, Union
-
-import torch
 from datasets import (
     Array3D,
     DatasetDict,
@@ -15,7 +20,6 @@
     SplitDict,
     SplitInfo,
     Value,
-    Dataset,
 )
 from simple_parsing.helpers import field, Serializable
 from transformers import (
@@ -25,16 +29,9 @@
     BatchEncoding,
     PreTrainedModel,
 )
-
-from .generator import _GeneratorBuilder
-from .prompt_dataset import Prompt, PromptDataset, PromptConfig
-from ..utils import (
-    assert_type,
-    float32_to_int16,
-    infer_label_column,
-    select_train_val_splits,
-    select_usable_devices,
-)
+from typing import Iterable, Literal, Union
+import logging
+import torch
 
 
 @dataclass
@@ -275,7 +272,7 @@ def get_splits() -> SplitDict:
         ),
     }
     devices = select_usable_devices(max_gpus)
-    builders: dict[Split, _GeneratorBuilder] = {
+    builders = {
         split_name: _GeneratorBuilder(
             cache_dir=None,
             features=Features({**layer_cols, **other_cols}),
@@ -293,7 +290,7 @@ def get_splits() -> SplitDict:
         for (split_name, split_info) in splits.items()
     }
 
-    ds: dict[Split, Union[Dataset, DatasetDict]] = dict()
+    ds = dict()
     for split, builder in builders.items():
         builder.download_and_prepare(num_proc=len(devices))
         ds[split] = builder.as_dataset(split=split)

From b2335175b8b312b453a36170518acb3743f57185 Mon Sep 17 00:00:00 2001
From: James Chua <james@leadiq.com>
Date: Fri, 24 Mar 2023 13:45:03 +0800
Subject: [PATCH 06/10] add eigenreporter test for tiny-gpt2

---
 tests/test_smoke_elicit.py | 50 ++++++++++++++++++++++++++++++++------
 1 file changed, 43 insertions(+), 7 deletions(-)

diff --git a/tests/test_smoke_elicit.py b/tests/test_smoke_elicit.py
index 66b402ce..f5162c70 100644
--- a/tests/test_smoke_elicit.py
+++ b/tests/test_smoke_elicit.py
@@ -1,22 +1,27 @@
 from pathlib import Path
 
+import pytest
+
 from elk import ExtractionConfig
-from elk.extraction import extract, PromptConfig
-from elk.training import CcsReporterConfig
+from elk.extraction import PromptConfig
+from elk.training import CcsReporterConfig, EigenReporterConfig
 from elk.training.train import train, RunConfig
 
+"""
+TODO: These tests should
+work with deberta but you'll need to make deberta fp32 instead of fp16
+because cpu doesn't support fp16
+"""
+
 
-def test_smoke_elicit_run(tmp_path: Path):
-    # We'll use the tiny gpt2 model for this test
-    # TODO: work with deberta but that doesn't support cpus
+def test_smoke_elicit_run_tiny_gpt2_ccs(tmp_path: Path):
     model_path = "sshleifer/tiny-gpt2"
-    # TODO: support tiny-imdb. But somnehow we need to convince promptsource
     dataset_name = "imdb"
     config = RunConfig(
         data=ExtractionConfig(
             model=model_path,
             prompts=PromptConfig(dataset=dataset_name, max_examples=[10]),
-            layers=(1,),
+            # run on all layers, tiny-gpt only has 2 layers
         ),
         net=CcsReporterConfig(),
     )
@@ -27,3 +32,34 @@ def test_smoke_elicit_run(tmp_path: Path):
     expected_files = ["cfg.yaml", "metadata.yaml", "lr_models", "reporters", "eval.csv"]
     for file in expected_files:
         assert file in created_file_names
+
+
+@pytest.mark.skip(reason="Fix me: EigenReporter crashes with tiny gpt2")
+def test_smoke_elicit_run_tiny_gpt2_eigen(tmp_path: Path):
+    """
+    Currently this test fails with
+    u -= torch.einsum("...ij,...i->...j", V[..., :k, :], proj)
+    V[..., k, :] = F.normalize(u, dim=-1)
+    ~~~~~~~~~ <--- HERE
+
+    u[:] = torch.einsum("...ij,...j->...i", A, V[..., k, :])
+
+    RuntimeError: select(): index 1 out of range for tensor of size [1, 2] at dimension 0
+    """
+    model_path = "sshleifer/tiny-gpt2"
+    dataset_name = "imdb"
+    config = RunConfig(
+        data=ExtractionConfig(
+            model=model_path,
+            prompts=PromptConfig(dataset=dataset_name, max_examples=[10]),
+            # run on all layers, tiny-gpt only has 2 layers
+        ),
+        net=EigenReporterConfig(),
+    )
+    train(config, tmp_path)
+    # get the files in the tmp_path
+    files: list[Path] = list(tmp_path.iterdir())
+    created_file_names = {file.name for file in files}
+    expected_files = ["cfg.yaml", "metadata.yaml", "lr_models", "reporters", "eval.csv"]
+    for file in expected_files:
+        assert file in created_file_names

From c1cf9a0fa3e76779ac0c57a66dcc07c48ea7fc3a Mon Sep 17 00:00:00 2001
From: James Chua <james@leadiq.com>
Date: Fri, 24 Mar 2023 13:50:33 +0800
Subject: [PATCH 07/10] use repeat_interleave like auc calculation

---
 elk/training/ccs_reporter.py | 9 +++++----
 tests/test_smoke_elicit.py   | 6 +++---
 2 files changed, 8 insertions(+), 7 deletions(-)

diff --git a/elk/training/ccs_reporter.py b/elk/training/ccs_reporter.py
index fd6f756f..c922ac4a 100644
--- a/elk/training/ccs_reporter.py
+++ b/elk/training/ccs_reporter.py
@@ -195,10 +195,11 @@ def loss(
 
             alpha = self.config.supervised_weight
             preds = p0.add(1 - p1).mul(0.5).squeeze(-1)
-            # unsqueeze and broadcast to match the shape of preds
-            # TODO: not sure what to do here actually
-            labels_unsqueezed = labels.unsqueeze(-1).expand_as(preds)
-            bce_loss = bce(preds, labels_unsqueezed.type_as(preds))
+            # repeat each label once per column of preds, then flatten preds,
+            # so that both are 1D tensors in matching order on preds' device
+            flattened_preds = preds.flatten()
+            broadcast_labels = labels.repeat_interleave(preds.shape[1])
+            bce_loss = bce(flattened_preds, broadcast_labels.type_as(flattened_preds))
             loss = alpha * bce_loss + (1 - alpha) * loss
 
         elif self.config.supervised_weight > 0:
diff --git a/tests/test_smoke_elicit.py b/tests/test_smoke_elicit.py
index f5162c70..5dd9d468 100644
--- a/tests/test_smoke_elicit.py
+++ b/tests/test_smoke_elicit.py
@@ -8,9 +8,9 @@
 from elk.training.train import train, RunConfig
 
 """
-TODO: These tests should
-work with deberta but you'll need to make deberta fp32 instead of fp16
-because cpu doesn't support fp16
+TODO: These tests should work with deberta
+but you'll need to make deberta fp32 instead of fp16
+because pytorch cpu doesn't support fp16
 """
 
 

From e43d2c5f4b894bef2cb6fa9ccf4b8e4242ac991c Mon Sep 17 00:00:00 2001
From: James Chua <james@leadiq.com>
Date: Fri, 24 Mar 2023 13:59:52 +0800
Subject: [PATCH 08/10] make flake happy

---
 tests/test_smoke_elicit.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tests/test_smoke_elicit.py b/tests/test_smoke_elicit.py
index 5dd9d468..a744cb9d 100644
--- a/tests/test_smoke_elicit.py
+++ b/tests/test_smoke_elicit.py
@@ -44,7 +44,8 @@ def test_smoke_elicit_run_tiny_gpt2_eigen(tmp_path: Path):
 
     u[:] = torch.einsum("...ij,...j->...i", A, V[..., k, :])
 
-    RuntimeError: select(): index 1 out of range for tensor of size [1, 2] at dimension 0
+    RuntimeError: select(): index 1 out of range for tensor of size [1, 2]
+    at dimension 0
     """
     model_path = "sshleifer/tiny-gpt2"
     dataset_name = "imdb"

From 89fbee5f27bd03d3959e470b52338edc83538f7e Mon Sep 17 00:00:00 2001
From: Nora Belrose <belrose.nora@gmail.com>
Date: Fri, 24 Mar 2023 10:41:01 +0000
Subject: [PATCH 09/10] Fix lanczos_eigsh for small matrices

---
 elk/eigsh.py               | 20 +++++++++++++++++++-
 tests/test_eigsh.py        | 16 +++++++---------
 tests/test_smoke_elicit.py |  3 ---
 3 files changed, 26 insertions(+), 13 deletions(-)

diff --git a/elk/eigsh.py b/elk/eigsh.py
index 93cc6675..10c1de60 100644
--- a/elk/eigsh.py
+++ b/elk/eigsh.py
@@ -13,7 +13,7 @@ def lanczos_eigsh(
     tol: Optional[float] = None,
     seed: Optional[int] = None,
     v0: Optional[Tensor] = None,
-    which: Literal["LA", "LM", "SA"] = "LM",
+    which: Literal["LA", "LM", "SA"] = "LA",
 ) -> tuple[Tensor, Tensor]:
     """Lanczos method for computing the top k eigenpairs of a symmetric matrix.
 
@@ -21,6 +21,10 @@ def lanczos_eigsh(
     based on `scipy.sparse.linalg.eigsh`. Unlike the CuPy and SciPy functions, this
     function supports batched inputs with arbitrary leading dimensions.
 
+    Unlike the above implementations, we use which='LA' as the default instead of
+    which='LM' because we are interested in algebraic eigenvalues, not magnitude.
+    Largest magnitude is also harder to implement in TorchScript.
+
     Args:
         A (Tensor): The matrix or batch of matrices of shape `[..., n, n]` for which to
             compute eigenpairs. Must be symmetric, but need not be positive definite.
@@ -43,6 +47,20 @@ def lanczos_eigsh(
     *leading, n, m = A.shape
     assert n == m, "A must be a square matrix or a batch of square matrices."
 
+    # Short circuit if the matrix is too small; we can't outcompete the naive method.
+    if n <= 32:
+        L, Q = torch.linalg.eigh(A)
+        if which == "LA":
+            return L[..., -k:], Q[..., :, -k:]
+        elif which == "LM":
+            # Re-sort by |eigenvalue|; eigenvectors are columns, so permute columns.
+            idx = L.abs().argsort(dim=-1, descending=True)
+            L = L.gather(-1, idx)
+            Q = Q.gather(-1, idx.unsqueeze(-2).expand_as(Q))
+            return L[..., :k], Q[..., :, :k]
+        elif which == "SA":
+            return L[..., :k], Q[..., :, :k]
+
     if ncv is None:
         ncv = min(max(2 * k, k + 32), n - 1)
     else:
diff --git a/tests/test_eigsh.py b/tests/test_eigsh.py
index 603c7e07..dc206d90 100644
--- a/tests/test_eigsh.py
+++ b/tests/test_eigsh.py
@@ -5,31 +5,29 @@
 import torch
 
 
+@pytest.mark.parametrize("n", [20, 40])
 @pytest.mark.parametrize("which", ["LA", "SA"])
-def test_lanczos_eigsh(which):
+def test_lanczos_eigsh(n, which):
     torch.manual_seed(42)
 
-    # Generate a random symmetric matrix
-    n = 10
     A = torch.randn(n, n)
     A = A + A.T
 
     # Compute the top k eigenpairs using our implementation
-    k = 3
-    w, v = lanczos_eigsh(A, k=k, which=which)
+    w, v = lanczos_eigsh(A, which=which)
 
     # Compute the top k eigenpairs using scipy
-    w_scipy, v_scipy = eigsh(A.numpy(), k=k, which=which)
+    w_scipy, v_scipy = eigsh(A.numpy(), which=which)
 
     # Check that the eigenvalues match to within the tolerance
-    assert np.allclose(w, w_scipy, rtol=1e-3)
+    torch.testing.assert_allclose(w, torch.from_numpy(w_scipy), atol=1e-3, rtol=1e-3)
 
     # Normalize the sign of the eigenvectors
-    for i in range(k):
+    for i in range(v.shape[-1]):
         if v[torch.argmax(torch.abs(v[:, i])), i] < 0:
             v[:, i] *= -1
         if v_scipy[np.argmax(np.abs(v_scipy[:, i])), i] < 0:
             v_scipy[:, i] *= -1
 
     # Check that the eigenvectors match to within the tolerance
-    assert np.allclose(v.numpy(), v_scipy, rtol=1e-3)
+    torch.testing.assert_allclose(v, torch.from_numpy(v_scipy), atol=1e-3, rtol=1e-3)
diff --git a/tests/test_smoke_elicit.py b/tests/test_smoke_elicit.py
index a744cb9d..3211271c 100644
--- a/tests/test_smoke_elicit.py
+++ b/tests/test_smoke_elicit.py
@@ -1,7 +1,5 @@
 from pathlib import Path
 
-import pytest
-
 from elk import ExtractionConfig
 from elk.extraction import PromptConfig
 from elk.training import CcsReporterConfig, EigenReporterConfig
@@ -34,7 +32,6 @@ def test_smoke_elicit_run_tiny_gpt2_ccs(tmp_path: Path):
         assert file in created_file_names
 
 
-@pytest.mark.skip(reason="Fix me: EigenReporter crashes with tiny gpt2")
 def test_smoke_elicit_run_tiny_gpt2_eigen(tmp_path: Path):
     """
     Currently this test fails with

From d876f88c720d353d83b271845a7c4aca40c61615 Mon Sep 17 00:00:00 2001
From: Nora Belrose <belrose.nora@gmail.com>
Date: Fri, 24 Mar 2023 10:44:40 +0000
Subject: [PATCH 10/10] Always use float32 on CPU

---
 elk/extraction/extraction.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/elk/extraction/extraction.py b/elk/extraction/extraction.py
index b68a9ed4..f9dc35dc 100644
--- a/elk/extraction/extraction.py
+++ b/elk/extraction/extraction.py
@@ -101,7 +101,9 @@ def extract_hiddens(
 
     # AutoModel should do the right thing here in nearly all cases. We don't actually
     # care what head the model has, since we are just extracting hidden states.
-    model = AutoModel.from_pretrained(cfg.model, torch_dtype="auto").to(device)
+    model = AutoModel.from_pretrained(
+        cfg.model, torch_dtype="auto" if device != "cpu" else torch.float32
+    ).to(device)
     # TODO: Maybe also make this configurable?
     # We want to make sure the answer is never truncated
     tokenizer = AutoTokenizer.from_pretrained(cfg.model, truncation_side="left")