Commit: Black
Helw150 committed May 10, 2024
1 parent 542532a commit 06efeee
Showing 11 changed files with 517 additions and 309 deletions.
13 changes: 13 additions & 0 deletions LICENSE~
@@ -0,0 +1,13 @@
Copyright 2019 William Held, Caleb Ziems, Diyi Yang

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
106 changes: 106 additions & 0 deletions pyproject.toml~
@@ -0,0 +1,106 @@
[project]
name = "value"
version = "1.1"
authors = [
    { name="David Hall", email="[email protected]" },
    { name="Ivan Zhou", email="[email protected]" }
]
description = "Scalable Training for Foundation Models with Named Tensors and JAX"
readme = "README.md"
requires-python = ">=3.10"
classifiers = [
    "Programming Language :: Python :: 3",
    "License :: OSI Approved :: Apache Software License",
    "Operating System :: POSIX :: Linux",
    "Operating System :: MacOS :: MacOS X",
    "Development Status :: 4 - Beta",
    "Intended Audience :: Science/Research",
]
dependencies = [
    # we require that you install jax yourself, since the extras vary by system.
    # jax = {version = ">=0.4.10,<0.5.0"}
    # "haliax>=1.3,<2.0",
    # Haliax changes in step with levanter, so we'll just use the git version except for releases.
    # "haliax @ git+https://github.com/stanford-crfm/haliax.git@main",
    "haliax>=1.4.dev291",
    "equinox>=0.11.4",
    "jaxtyping>=0.2.20",
    "transformers>=4.39.3",
    "optax>=0.1.9",
    "wandb~=0.16.6",
    # We don't actually directly depend on scipy, but recent JAX had an issue
    "scipy<=1.12.0",
    "draccus>=0.7.2",
    "pyarrow>=11.0.0",
    "zstandard>=0.20.0",
    "datasets~=2.18",
    "gcsfs>=2024.2,<2024.4",
    "braceexpand>=0.1.7",
    "jmp>=0.0.3",
    "fsspec[http]>=2024.2,<2024.4",
    "tensorstore==0.1.56",
    "pytimeparse>=1.1.8",
    "humanfriendly==10.0",
    "safetensors[numpy]~=0.4.2",
    "matplotlib>=3.7.0",
    "tblib>=1.7.0,<4.0.0",
    "dataclasses-json~=0.6.4",
    "ray[default]~=2.10",
    "pydantic<3",  # temporary pin until Ray supports pydantic 2.0
    "rich~=13.0",
    "filelock~=3.13",
]

[tool.hatch.build]
include = ["config/*.yaml", "config/*/*.yaml", "*.py"]
dev-mode-dirs = [".", "src"]

[tool.hatch.build.sources]
"src/levanter" = "levanter"
"config" = "levanter/config"

[tool.hatch.metadata]
allow-direct-references = true



[tool.hatch.build.targets.wheel]
packages = ["levanter"]


[project.urls]
"Homepage" = "https://github.com/stanford-crfm/levanter"
"Bug Tracker" = "https://github.com/stanford-crfm/levanter/issues"


[tool.black]
line-length = 119
target-version = ["py310"]
preview = true

[tool.isort]
profile = "black"
multi_line_output = 3
lines_after_imports = 2
include_trailing_comma = true
force_grid_wrap = 0
use_parentheses = true
ensure_newline_before_comments = true
line_length = 119
src_paths = ["src", "tests"]
known_haliax = ["haliax"]
sections=["FUTURE", "STDLIB", "THIRDPARTY", "HALIAX", "FIRSTPARTY", "LOCALFOLDER"]

[tool.mypy]
python_version = "3.10"
mypy_path = ["src"]

[tool.mypy-haliax.core]
ignore_missing_imports = true

[tool.pytest.ini_options]
markers = [
    "slow: marks tests as slow (deselect with '-m \"not slow\"')",
    "entry: marks tests as entry point tests (deselect with '-m \"not entry\"')",
    "ray: marks tests that require Ray (deselect with '-m \"not ray\"')",
]
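An aside on the [tool.isort] settings above: `known_haliax` plus the custom `sections` order puts `haliax` imports in their own block between third-party and first-party imports. A hypothetical module (illustrative names only, not from this repo) would be grouped roughly like this:

# Illustrative import grouping under the isort config above (hypothetical imports).
import json  # STDLIB

import numpy as np  # THIRDPARTY

import haliax as hax  # HALIAX (custom section)

import levanter  # FIRSTPARTY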
44 changes: 11 additions & 33 deletions src/multivalue/Dialects.py
@@ -20,9 +20,7 @@ def __init__(self, vector=[], dialect_name=None, **kwargs):
         self.feature_id_to_function_name = self.load_dict(
             self.relatavize_file("resources/feature_id_to_function_name.json")
         )
-        self.attestation_vectors = pd.read_csv(
-            self.relatavize_file("resources/attestation_vectors.csv")
-        )
+        self.attestation_vectors = pd.read_csv(self.relatavize_file("resources/attestation_vectors.csv"))

         self.dialect_code = None
         self.latitude = None
@@ -54,9 +52,7 @@ def __str__(self):
     def initialize_from_vector(self):
         for idx, feature_attestation in enumerate(self.vector):
             feature_id = str(idx)
-            if (feature_attestation) and (
-                feature_id in self.feature_id_to_function_name
-            ):
+            if (feature_attestation) and (feature_id in self.feature_id_to_function_name):
                 for feature_name in self.feature_id_to_function_name[feature_id]:
                     self.morphosyntax_transforms[feature_name] = feature_attestation

@@ -70,12 +66,8 @@ def manhattan_distance(self, other, normalized=True):

     def geographical_distance(self, other, metric=True):
         if metric:
-            return geopy.distance.geodesic(
-                (self.latitude, self.longitude), (other.latitude, other.longitude)
-            ).km
-        return geopy.distance.geodesic(
-            (self.latitude, self.longitude), (other.latitude, other.longitude)
-        ).miles
+            return geopy.distance.geodesic((self.latitude, self.longitude), (other.latitude, other.longitude)).km
+        return geopy.distance.geodesic((self.latitude, self.longitude), (other.latitude, other.longitude)).miles


 class DialectFromFeatureList(BaseDialect):
@@ -110,9 +102,7 @@ def __init__(self, **kwargs):

 class AfricanAmericanVernacular(DialectFromVector):
     def __init__(self, **kwargs):
-        super().__init__(
-            dialect_name="Urban African American Vernacular English", **kwargs
-        )
+        super().__init__(dialect_name="Urban African American Vernacular English", **kwargs)


 class AppalachianDialect(DialectFromVector):
@@ -167,16 +157,12 @@ def __init__(self, **kwargs):

 class ColloquialSingaporeDialect(DialectFromVector):
     def __init__(self, **kwargs):
-        super().__init__(
-            dialect_name="Colloquial Singapore English (Singlish)", **kwargs
-        )
+        super().__init__(dialect_name="Colloquial Singapore English (Singlish)", **kwargs)


 class EarlyAfricanAmericanVernacular(DialectFromVector):
     def __init__(self, **kwargs):
-        super().__init__(
-            dialect_name="Earlier African American Vernacular English", **kwargs
-        )
+        super().__init__(dialect_name="Earlier African American Vernacular English", **kwargs)


 class EastAnglicanDialect(DialectFromVector):
@@ -271,9 +257,7 @@ def __init__(self, **kwargs):

 class NorthEnglandDialect(DialectFromVector):
     def __init__(self, **kwargs):
-        super().__init__(
-            dialect_name="English dialects in the North of England", **kwargs
-        )
+        super().__init__(dialect_name="English dialects in the North of England", **kwargs)


 class OrkneyShetlandDialect(DialectFromVector):
@@ -298,9 +282,7 @@ def __init__(self, **kwargs):

 class RuralAfricanAmericanVernacular(DialectFromVector):
     def __init__(self, **kwargs):
-        super().__init__(
-            dialect_name="Rural African American Vernacular English", **kwargs
-        )
+        super().__init__(dialect_name="Rural African American Vernacular English", **kwargs)


 class ScottishDialect(DialectFromVector):
@@ -325,16 +307,12 @@ def __init__(self, **kwargs):

 class SoutheastEnglandDialect(DialectFromVector):
     def __init__(self, **kwargs):
-        super().__init__(
-            dialect_name="English dialects in the Southeast of England", **kwargs
-        )
+        super().__init__(dialect_name="English dialects in the Southeast of England", **kwargs)


 class SouthwestEnglandDialect(DialectFromVector):
     def __init__(self, **kwargs):
-        super().__init__(
-            dialect_name="English dialects in the Southwest of England", **kwargs
-        )
+        super().__init__(dialect_name="English dialects in the Southwest of England", **kwargs)


 class TanzanianDialect(DialectFromVector):
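For orientation, a usage sketch (not part of the commit): each class above just forwards a fixed `dialect_name` to `DialectFromVector`, which is assumed here to populate `vector`, `latitude`, and `longitude` from the attestation resources. The two distance helpers shown in the diff would then be called like this:

# Minimal sketch, assuming the package and its resource files are importable as `multivalue`.
from multivalue.Dialects import AfricanAmericanVernacular, ScottishDialect

aave = AfricanAmericanVernacular()
scots = ScottishDialect()
print(aave.manhattan_distance(scots, normalized=True))  # distance between attestation vectors
print(aave.geographical_distance(scots, metric=True))   # geodesic distance in km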
28 changes: 14 additions & 14 deletions src/multivalue/build_attestation_vectors.py
@@ -2,25 +2,25 @@
 import pandas as pd
 import numpy as np

-if __name__=='__main__':
+if __name__ == "__main__":

     parser = argparse.ArgumentParser()
-    parser.add_argument('--A', default=1.0, help="The probability of executing a feature with attestation level A")
-    parser.add_argument('--B', default=0.6, help="The probability of executing a feature with attestation level B")
-    parser.add_argument('--C', default=0.3, help="The probability of executing a feature with attestation level C")
+    parser.add_argument("--A", default=1.0, help="The probability of executing a feature with attestation level A")
+    parser.add_argument("--B", default=0.6, help="The probability of executing a feature with attestation level B")
+    parser.add_argument("--C", default=0.3, help="The probability of executing a feature with attestation level C")
     args = parser.parse_args()
-    ewave = pd.read_csv('resources/ewave.csv')
-    language_vectors = {key: np.zeros(max(ewave['parameter_pk'])+1) for key in sorted(set(ewave['language_name']))}
+
+    ewave = pd.read_csv("resources/ewave.csv")
+    language_vectors = {key: np.zeros(max(ewave["parameter_pk"]) + 1) for key in sorted(set(ewave["language_name"]))}
     for _, row in ewave.iterrows():
         pct = 0.0
-        if row['attestation'] == 'A':
+        if row["attestation"] == "A":
             pct = args.A
-        elif row['attestation'] == 'B':
+        elif row["attestation"] == "B":
             pct = args.B
-        elif row['attestation'] == 'C':
+        elif row["attestation"] == "C":
             pct = args.C
-        language_vectors[row['language_name']][row['parameter_pk']] = pct
+        language_vectors[row["language_name"]][row["parameter_pk"]] = pct
     df = pd.DataFrame(language_vectors)
-    df.index.rename('feature_id', inplace=True)
-    df.to_csv('resources/attestation_vectors.csv')
+    df.index.rename("feature_id", inplace=True)
+    df.to_csv("resources/attestation_vectors.csv")
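In plain terms (a sketch, not part of the diff): each (feature, dialect) cell of the output CSV gets the execution probability tied to that feature's eWAVE attestation level, with unattested features left at 0.0. A toy version of the mapping:

# Toy illustration of the attestation -> probability mapping; values match the argparse defaults above.
defaults = {"A": 1.0, "B": 0.6, "C": 0.3}
pct = defaults.get("B", 0.0)  # a feature attested at level B executes with probability 0.6
print(pct)  # 0.6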
55 changes: 31 additions & 24 deletions src/multivalue/build_benefactive_ditransitive_verb_list.py
@@ -1,23 +1,29 @@
 from nltk.corpus import verbnet
 from collections import defaultdict

+
 def get_theta_roles(ex):
     theta_roles = set()
-    for s in ex['semantics']:
-        for a in s['arguments']:
-            if a['type']=='ThemRole':
-                theta_roles.add(a['value'])
+    for s in ex["semantics"]:
+        for a in s["arguments"]:
+            if a["type"] == "ThemRole":
+                theta_roles.add(a["value"])
     return theta_roles

+
 def ditransitive_dobj(theta_roles):
-    return ('Agent' in theta_roles and 'Destination' in theta_roles and 'Theme' in theta_roles) or \
-           ('Agent' in theta_roles and 'Beneficiary' in theta_roles and 'Theme' in theta_roles)
+    return ("Agent" in theta_roles and "Destination" in theta_roles and "Theme" in theta_roles) or (
+        "Agent" in theta_roles and "Beneficiary" in theta_roles and "Theme" in theta_roles
+    )

+
 def transitive_dobj(theta_roles):
-    return 'Agent' in theta_roles and ('Theme' in theta_roles or 'Topic' in theta_roles)
+    return "Agent" in theta_roles and ("Theme" in theta_roles or "Topic" in theta_roles)

+
 def beneficiary(theta_roles):
-    return 'Beneficiary' in theta_roles
+    return "Beneficiary" in theta_roles

+
 def main():
     thematic_roles = defaultdict(set)
@@ -29,22 +35,23 @@ def main():
             theta_roles.update(get_theta_roles(ex))
         for lemma in verbnet.lemmas(classid):
             thematic_roles[str(lemma)].update(theta_roles)
-            thematic_roles[classid.split('-')[0]].update(theta_roles)
-    with open('resources/ditransitive_dobj_verbs.txt', 'w') as outfile:
+            thematic_roles[classid.split("-")[0]].update(theta_roles)
+
+    with open("resources/ditransitive_dobj_verbs.txt", "w") as outfile:
         for lemma in sorted(thematic_roles.keys()):
-            if ditransitive_dobj(thematic_roles[lemma]) and '_' not in lemma:
-                outfile.write(lemma+'\n')
-    with open('resources/transitive_dobj_verbs.txt', 'w') as outfile:
+            if ditransitive_dobj(thematic_roles[lemma]) and "_" not in lemma:
+                outfile.write(lemma + "\n")
+
+    with open("resources/transitive_dobj_verbs.txt", "w") as outfile:
         for lemma in sorted(thematic_roles.keys()):
-            if transitive_dobj(thematic_roles[lemma]) and '_' not in lemma:
-                outfile.write(lemma+'\n')
-    with open('resources/benefactive_verbs.txt', 'w') as outfile:
+            if transitive_dobj(thematic_roles[lemma]) and "_" not in lemma:
+                outfile.write(lemma + "\n")
+
+    with open("resources/benefactive_verbs.txt", "w") as outfile:
         for lemma in sorted(thematic_roles.keys()):
-            if beneficiary(thematic_roles[lemma]) and '_' not in lemma:
-                outfile.write(lemma+'\n')
-
-if __name__=='__main__':
-    main()
+            if beneficiary(thematic_roles[lemma]) and "_" not in lemma:
+                outfile.write(lemma + "\n")
+
+
+if __name__ == "__main__":
+    main()
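To make the predicate logic concrete, here is a self-contained check with toy theta-role sets (the function body copies ditransitive_dobj from the diff above):

# Self-contained illustration of the theta-role predicate above.
def ditransitive_dobj(theta_roles):
    return ("Agent" in theta_roles and "Destination" in theta_roles and "Theme" in theta_roles) or (
        "Agent" in theta_roles and "Beneficiary" in theta_roles and "Theme" in theta_roles
    )

print(ditransitive_dobj({"Agent", "Beneficiary", "Theme"}))  # True: benefactive ditransitive frame
print(ditransitive_dobj({"Agent", "Theme"}))                 # False: plain transitive frame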
