first commit

openai · Dec 15, 2023 · 8f74a1c · 8f74a1c
commit 8f74a1c
Show file tree

Hide file tree

Showing 9 changed files with 402 additions and 0 deletions.
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,129 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+pip-wheel-metadata/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+# Usually these files are written by a python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+.python-version
+
+# pipenv
+# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+# However, in case of collaboration, if having platform-specific dependencies or dependencies
+# having no cross-platform support, pipenv may install dependencies that don't work, or not
+# install all needed dependencies.
+#Pipfile.lock
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -0,0 +1,9 @@
+repos:
+- repo: local
+ hooks:
+ - id: trufflehog
+ name: TruffleHog
+ description: Detect secrets in your data.
+ entry: bash -c 'trufflehog git file://. --since-commit HEAD --fail --no-update'
+ language: system
+ stages: ["commit", "push"]
diff --git a/LICENSE b/LICENSE
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2023 OpenAI
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/README.md b/README.md
@@ -0,0 +1,69 @@
+# Sparse autoencoder for GPT2 small
+
+This repository hosts a sparse autoencoder trained on the GPT2-small model's activations.
+The autoencoder's purpose is to expand the MLP layer activations into a larger number of dimensions,
+providing an overcomplete basis of the MLP activation space. The learned dimensions have been
+shown to be more interpretable than the original MLP dimensions.
+
+### Install
+
+```sh
+pip install git+https://github.com/openai/sparse_autoencoder.git
+```
+
+### Example usage
+
+```py
+import torch
+import blobfile as bf
+import transformer_lens
+import sparse_autoencoder
+
+# Load the autoencoder
+layer_index = 0 # in range(12)
+autoencoder_input = ["mlp_post_act", "resid_delta_mlp"][0]
+filename = f"az://openaipublic/sparse-autoencoder/gpt2-small/{autoencoder_input}/autoencoders/{layer_index}.pt"
+with bf.BlobFile(filename, mode="rb") as f:
+ state_dict = torch.load(f)
+ autoencoder = sparse_autoencoder.Autoencoder.from_state_dict(state_dict)
+
+# Extract neuron activations with transformer_lens
+model = transformer_lens.HookedTransformer.from_pretrained("gpt2", center_writing_weights=False)
+prompt = "This is an example of a prompt that"
+tokens = model.to_tokens(prompt) # (1, n_tokens)
+print(model.to_str_tokens(tokens))
+with torch.no_grad():
+ logits, activation_cache = model.run_with_cache(tokens, remove_batch_dim=True)
+if autoencoder_input == "mlp_post_act":
+ input_tensor = activation_cache[f"blocks.{layer_index}.mlp.hook_post"] # (n_tokens, n_neurons)
+elif autoencoder_input == "resid_delta_mlp":
+ input_tensor = activation_cache[f"blocks.{layer_index}.hook_mlp_out"] # (n_tokens, n_residual_channels)
+
+# Encode neuron activations with the autoencoder
+device = next(model.parameters()).device
+autoencoder.to(device)
+with torch.no_grad():
+ latent_activations = autoencoder.encode(input_tensor) # (n_tokens, n_latents)
+```
+
+### Autoencoder settings
+
+- Model used: "gpt2-small", 12 layers
+- Autoencoder architecture: see `model.py`
+- Autoencoder input: "mlp_post_act" (3072 dimensions) or "resid_delta_mlp" (768 dimensions)
+- Number of autoencoder latents: 32768
+- Loss function: see `loss.py`
+- Number of training tokens: ~64M
+- L1 regularization strength: 0.01
+
+### Data files
+
+- `autoencoder_input` is in ["mlp_post_act", "resid_delta_mlp"]
+- `layer_index` is in range(12) (GPT2-small)
+- `latent_index` is in range(32768)
+
+Autoencoder files:
+`az://openaipublic/sparse-autoencoder/gpt2-small/{autoencoder_input}/autoencoders/{layer_index}.pt`
+
+NeuronRecord files:
+`az://openaipublic/sparse-autoencoder/gpt2-small/{autoencoder_input}/collated_activations/{layer_index}/{latent_index}.json`
diff --git a/SECURITY.md b/SECURITY.md
@@ -0,0 +1,5 @@
+# Security Policy
+For a more in-depth look at our security policy, please check out our
+[Coordinated Vulnerability Disclosure Policy](https://openai.com/security/disclosure/#:~:text=Disclosure%20Policy,-Security%20is%20essential&text=OpenAI%27s%20coordinated%20vulnerability%20disclosure%20policy,expect%20from%20us%20in%20return.).
+
+Our PGP key can be located [at this address](https://cdn.openai.com/security.txt).
diff --git a/pyproject.toml b/pyproject.toml
@@ -0,0 +1,18 @@
+[project]
+name = "sparse_autoencoder"
+description="Sparse autoencoder for GPT2"
+version = "0.1"
+authors = [{name = "OpenAI"}]
+dependencies = [
+ "blobfile == 2.0.2",
+ "torch == 2.1.0",
+ "transformer_lens == 1.9.1",
+]
+readme = "README.md"
+
+[build-system]
+requires = ["setuptools>=64.0"]
+build-backend = "setuptools.build_meta"
+
+[tool.setuptools.packages.find]
+include = ["sparse_autoencoder*"]
diff --git a/sparse_autoencoder/__init__.py b/sparse_autoencoder/__init__.py
@@ -0,0 +1,3 @@
+from .model import Autoencoder
+
+__all__ = ["Autoencoder"]
diff --git a/sparse_autoencoder/loss.py b/sparse_autoencoder/loss.py
@@ -0,0 +1,46 @@
+import torch
+
+
def autoencoder_loss(
    reconstruction: torch.Tensor,
    original_input: torch.Tensor,
    latent_activations: torch.Tensor,
    l1_weight: float,
) -> torch.Tensor:
    """
    Total autoencoder loss: normalized reconstruction error plus a weighted
    normalized L1 penalty on the latent activations.

    :param reconstruction: output of Autoencoder.decode (shape: [batch, n_inputs])
    :param original_input: input of Autoencoder.encode (shape: [batch, n_inputs])
    :param latent_activations: output of Autoencoder.encode (shape: [batch, n_latents])
    :param l1_weight: multiplier applied to the normalized L1 term
    :return: normalized MSE + l1_weight * normalized L1 loss
    """
    reconstruction_term = normalized_mean_squared_error(reconstruction, original_input)
    sparsity_term = normalized_L1_loss(latent_activations, original_input)
    # Only the sparsity term is scaled; the reconstruction term has unit weight.
    return reconstruction_term + sparsity_term * l1_weight
+
+
def normalized_mean_squared_error(
    reconstruction: torch.Tensor,
    original_input: torch.Tensor,
) -> torch.Tensor:
    """
    Mean squared reconstruction error of each example, divided by that example's
    mean squared input, then averaged over the batch.

    :param reconstruction: output of Autoencoder.decode (shape: [batch, n_inputs])
    :param original_input: input of Autoencoder.encode (shape: [batch, n_inputs])
    :return: normalized mean squared error (scalar tensor)
    """
    # Per-example mean of the squared residuals (reduces dim 1).
    squared_error = (reconstruction - original_input).pow(2).mean(dim=1)
    # Per-example mean squared input, used as the normalizer.
    input_power = original_input.pow(2).mean(dim=1)
    # Average the per-example ratios over the batch.
    return (squared_error / input_power).mean()
+
+
def normalized_L1_loss(
    latent_activations: torch.Tensor,
    original_input: torch.Tensor,
) -> torch.Tensor:
    """
    L1 norm of each example's latent activations, divided by the norm of that
    example's input, then averaged over the batch.

    :param latent_activations: output of Autoencoder.encode (shape: [batch, n_latents])
    :param original_input: input of Autoencoder.encode (shape: [batch, n_inputs])
    :return: normalized L1 loss (scalar tensor)
    """
    # Per-example sum of absolute latent activations (reduces dim 1).
    latent_l1 = latent_activations.abs().sum(dim=1)
    # Per-example input norm, used as the normalizer.
    input_norm = original_input.norm(dim=1)
    # Average the per-example ratios over the batch.
    return (latent_l1 / input_norm).mean()