Commit: Black
Helw150 committed May 10, 2024
1 parent 542532a commit 06efeee
Showing 11 changed files with 517 additions and 309 deletions.
13 changes: 13 additions & 0 deletions LICENSE~
@@ -0,0 +1,13 @@
Copyright 2019 William Held, Caleb Ziems, Diyi Yang

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
106 changes: 106 additions & 0 deletions pyproject.toml~
@@ -0,0 +1,106 @@
[project]
name = "value"
version = "1.1"
authors = [
    { name="David Hall", email="[email protected]" },
    { name="Ivan Zhou", email="[email protected]" }
]
description = "Scalable Training for Foundation Models with Named Tensors and JAX"
readme = "README.md"
requires-python = ">=3.10"
classifiers = [
    "Programming Language :: Python :: 3",
    "License :: OSI Approved :: Apache Software License",
    "Operating System :: POSIX :: Linux",
    "Operating System :: MacOS :: MacOS X",
    "Development Status :: 4 - Beta",
    "Intended Audience :: Science/Research",
]
dependencies = [
    # we require that you install jax yourself, since the extras vary by system.
    # jax = {version = ">=0.4.10,<0.5.0"}
    # "haliax>=1.3,<2.0",
    # Haliax changes in step with levanter, so we'll just use the git version except for releases.
    # "haliax @ git+https://github.com/stanford-crfm/haliax.git@main",
    "haliax>=1.4.dev291",
    "equinox>=0.11.4",
    "jaxtyping>=0.2.20",
    "transformers>=4.39.3",
    "optax>=0.1.9",
    "wandb~=0.16.6",
    # We don't actually directly depend on scipy, but recent JAX had an issue
    "scipy<=1.12.0",
    "draccus>=0.7.2",
    "pyarrow>=11.0.0",
    "zstandard>=0.20.0",
    "datasets~=2.18",
    "gcsfs>=2024.2,<2024.4",
    "braceexpand>=0.1.7",
    "jmp>=0.0.3",
    "fsspec[http]>=2024.2,<2024.4",
    "tensorstore==0.1.56",
    "pytimeparse>=1.1.8",
    "humanfriendly==10.0",
    "safetensors[numpy]~=0.4.2",
    "matplotlib>=3.7.0",
    "tblib>=1.7.0,<4.0.0",
    "dataclasses-json~=0.6.4",
    "ray[default]~=2.10",
    "pydantic<3",  # temporary pin until Ray supports pydantic 2.0
    "rich~=13.0",
    "filelock~=3.13",
]

[tool.hatch.build]
include = ["config/*.yaml", "config/*/*.yaml", "*.py"]
dev-mode-dirs = [".", "src"]

[tool.hatch.build.sources]
"src/levanter" = "levanter"
"config" = "levanter/config"

[tool.hatch.metadata]
allow-direct-references = true



[tool.hatch.build.targets.wheel]
packages = ["levanter"]


[project.urls]
"Homepage" = "https://github.com/stanford-crfm/levanter"
"Bug Tracker" = "https://github.com/stanford-crfm/levanter/issues"


[tool.black]
line-length = 119
target-version = ["py310"]
preview = true

[tool.isort]
profile = "black"
multi_line_output = 3
lines_after_imports = 2
include_trailing_comma = true
force_grid_wrap = 0
use_parentheses = true
ensure_newline_before_comments = true
line_length = 119
src_paths = ["src", "tests"]
known_haliax = ["haliax"]
sections=["FUTURE", "STDLIB", "THIRDPARTY", "HALIAX", "FIRSTPARTY", "LOCALFOLDER"]

[tool.mypy]
python_version = "3.10"
mypy_path = ["src"]

[tool.mypy-haliax.core]
ignore_missing_imports = true

[tool.pytest.ini_options]
markers = [
    "slow: marks tests as slow (deselect with '-m \"not slow\"')",
    "entry: marks tests as entry point tests (deselect with '-m \"not entry\"')",
    "ray: marks tests that require Ray (deselect with '-m \"not ray\"')",
]
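An aside on the [tool.isort] settings above: `known_haliax` plus the custom `sections` order puts `haliax` imports in their own block between third-party and first-party imports. A hypothetical module (illustrative names only, not from this repo) would be grouped roughly like this:

# Illustrative import grouping under the isort config above (hypothetical imports).
import json  # STDLIB

import numpy as np  # THIRDPARTY

import haliax as hax  # HALIAX (custom section)

import levanter  # FIRSTPARTY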
44 changes: 11 additions & 33 deletions src/multivalue/Dialects.py
@@ -20,9 +20,7 @@ def __init__(self, vector=[], dialect_name=None, **kwargs):
         self.feature_id_to_function_name = self.load_dict(
             self.relatavize_file("resources/feature_id_to_function_name.json")
         )
-        self.attestation_vectors = pd.read_csv(
-            self.relatavize_file("resources/attestation_vectors.csv")
-        )
+        self.attestation_vectors = pd.read_csv(self.relatavize_file("resources/attestation_vectors.csv"))

         self.dialect_code = None
         self.latitude = None
@@ -54,9 +52,7 @@ def __str__(self):
     def initialize_from_vector(self):
         for idx, feature_attestation in enumerate(self.vector):
             feature_id = str(idx)
-            if (feature_attestation) and (
-                feature_id in self.feature_id_to_function_name
-            ):
+            if (feature_attestation) and (feature_id in self.feature_id_to_function_name):
                 for feature_name in self.feature_id_to_function_name[feature_id]:
                     self.morphosyntax_transforms[feature_name] = feature_attestation

@@ -70,12 +66,8 @@ def manhattan_distance(self, other, normalized=True):

     def geographical_distance(self, other, metric=True):
         if metric:
-            return geopy.distance.geodesic(
-                (self.latitude, self.longitude), (other.latitude, other.longitude)
-            ).km
-        return geopy.distance.geodesic(
-            (self.latitude, self.longitude), (other.latitude, other.longitude)
-        ).miles
+            return geopy.distance.geodesic((self.latitude, self.longitude), (other.latitude, other.longitude)).km
+        return geopy.distance.geodesic((self.latitude, self.longitude), (other.latitude, other.longitude)).miles


 class DialectFromFeatureList(BaseDialect):
@@ -110,9 +102,7 @@ def __init__(self, **kwargs):

 class AfricanAmericanVernacular(DialectFromVector):
     def __init__(self, **kwargs):
-        super().__init__(
-            dialect_name="Urban African American Vernacular English", **kwargs
-        )
+        super().__init__(dialect_name="Urban African American Vernacular English", **kwargs)


 class AppalachianDialect(DialectFromVector):
@@ -167,16 +157,12 @@ def __init__(self, **kwargs):

 class ColloquialSingaporeDialect(DialectFromVector):
     def __init__(self, **kwargs):
-        super().__init__(
-            dialect_name="Colloquial Singapore English (Singlish)", **kwargs
-        )
+        super().__init__(dialect_name="Colloquial Singapore English (Singlish)", **kwargs)


 class EarlyAfricanAmericanVernacular(DialectFromVector):
     def __init__(self, **kwargs):
-        super().__init__(
-            dialect_name="Earlier African American Vernacular English", **kwargs
-        )
+        super().__init__(dialect_name="Earlier African American Vernacular English", **kwargs)


 class EastAnglicanDialect(DialectFromVector):
@@ -271,9 +257,7 @@ def __init__(self, **kwargs):

 class NorthEnglandDialect(DialectFromVector):
     def __init__(self, **kwargs):
-        super().__init__(
-            dialect_name="English dialects in the North of England", **kwargs
-        )
+        super().__init__(dialect_name="English dialects in the North of England", **kwargs)


 class OrkneyShetlandDialect(DialectFromVector):
@@ -298,9 +282,7 @@ def __init__(self, **kwargs):

 class RuralAfricanAmericanVernacular(DialectFromVector):
     def __init__(self, **kwargs):
-        super().__init__(
-            dialect_name="Rural African American Vernacular English", **kwargs
-        )
+        super().__init__(dialect_name="Rural African American Vernacular English", **kwargs)


 class ScottishDialect(DialectFromVector):
@@ -325,16 +307,12 @@ def __init__(self, **kwargs):

 class SoutheastEnglandDialect(DialectFromVector):
     def __init__(self, **kwargs):
-        super().__init__(
-            dialect_name="English dialects in the Southeast of England", **kwargs
-        )
+        super().__init__(dialect_name="English dialects in the Southeast of England", **kwargs)


 class SouthwestEnglandDialect(DialectFromVector):
     def __init__(self, **kwargs):
-        super().__init__(
-            dialect_name="English dialects in the Southwest of England", **kwargs
-        )
+        super().__init__(dialect_name="English dialects in the Southwest of England", **kwargs)


 class TanzanianDialect(DialectFromVector):
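For orientation, a usage sketch (not part of the commit): each class above just forwards a fixed `dialect_name` to `DialectFromVector`, which is assumed here to populate `vector`, `latitude`, and `longitude` from the attestation resources. The two distance helpers shown in the diff would then be called like this:

# Minimal sketch, assuming the package and its resource files are importable as `multivalue`.
from multivalue.Dialects import AfricanAmericanVernacular, ScottishDialect

aave = AfricanAmericanVernacular()
scots = ScottishDialect()
print(aave.manhattan_distance(scots, normalized=True))  # distance between attestation vectors
print(aave.geographical_distance(scots, metric=True))   # geodesic distance in km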
28 changes: 14 additions & 14 deletions src/multivalue/build_attestation_vectors.py
@@ -2,25 +2,25 @@
 import pandas as pd
 import numpy as np

-if __name__=='__main__':
+if __name__ == "__main__":

     parser = argparse.ArgumentParser()
-    parser.add_argument('--A', default=1.0, help="The probability of executing a feature with attestation level A")
-    parser.add_argument('--B', default=0.6, help="The probability of executing a feature with attestation level B")
-    parser.add_argument('--C', default=0.3, help="The probability of executing a feature with attestation level C")
+    parser.add_argument("--A", default=1.0, help="The probability of executing a feature with attestation level A")
+    parser.add_argument("--B", default=0.6, help="The probability of executing a feature with attestation level B")
+    parser.add_argument("--C", default=0.3, help="The probability of executing a feature with attestation level C")
     args = parser.parse_args()
-    ewave = pd.read_csv('resources/ewave.csv')
-    language_vectors = {key: np.zeros(max(ewave['parameter_pk'])+1) for key in sorted(set(ewave['language_name']))}
+
+    ewave = pd.read_csv("resources/ewave.csv")
+    language_vectors = {key: np.zeros(max(ewave["parameter_pk"]) + 1) for key in sorted(set(ewave["language_name"]))}
     for _, row in ewave.iterrows():
         pct = 0.0
-        if row['attestation'] == 'A':
+        if row["attestation"] == "A":
             pct = args.A
-        elif row['attestation'] == 'B':
+        elif row["attestation"] == "B":
             pct = args.B
-        elif row['attestation'] == 'C':
+        elif row["attestation"] == "C":
             pct = args.C
-        language_vectors[row['language_name']][row['parameter_pk']] = pct
+        language_vectors[row["language_name"]][row["parameter_pk"]] = pct
     df = pd.DataFrame(language_vectors)
-    df.index.rename('feature_id', inplace=True)
-    df.to_csv('resources/attestation_vectors.csv')
+    df.index.rename("feature_id", inplace=True)
+    df.to_csv("resources/attestation_vectors.csv")
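In plain terms (a sketch, not part of the diff): each (feature, dialect) cell of the output CSV gets the execution probability tied to that feature's eWAVE attestation level, with unattested features left at 0.0. A toy version of the mapping:

# Toy illustration of the attestation -> probability mapping; values match the argparse defaults above.
defaults = {"A": 1.0, "B": 0.6, "C": 0.3}
pct = defaults.get("B", 0.0)  # a feature attested at level B executes with probability 0.6
print(pct)  # 0.6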
55 changes: 31 additions & 24 deletions src/multivalue/build_benefactive_ditransitive_verb_list.py
@@ -1,23 +1,29 @@
 from nltk.corpus import verbnet
 from collections import defaultdict

+
 def get_theta_roles(ex):
     theta_roles = set()
-    for s in ex['semantics']:
-        for a in s['arguments']:
-            if a['type']=='ThemRole':
-                theta_roles.add(a['value'])
+    for s in ex["semantics"]:
+        for a in s["arguments"]:
+            if a["type"] == "ThemRole":
+                theta_roles.add(a["value"])
     return theta_roles

+
 def ditransitive_dobj(theta_roles):
-    return ('Agent' in theta_roles and 'Destination' in theta_roles and 'Theme' in theta_roles) or \
-           ('Agent' in theta_roles and 'Beneficiary' in theta_roles and 'Theme' in theta_roles)
+    return ("Agent" in theta_roles and "Destination" in theta_roles and "Theme" in theta_roles) or (
+        "Agent" in theta_roles and "Beneficiary" in theta_roles and "Theme" in theta_roles
+    )

+
 def transitive_dobj(theta_roles):
-    return 'Agent' in theta_roles and ('Theme' in theta_roles or 'Topic' in theta_roles)
+    return "Agent" in theta_roles and ("Theme" in theta_roles or "Topic" in theta_roles)

+
 def beneficiary(theta_roles):
-    return 'Beneficiary' in theta_roles
+    return "Beneficiary" in theta_roles

+
 def main():
     thematic_roles = defaultdict(set)
@@ -29,22 +35,23 @@ def main():
             theta_roles.update(get_theta_roles(ex))
         for lemma in verbnet.lemmas(classid):
             thematic_roles[str(lemma)].update(theta_roles)
-            thematic_roles[classid.split('-')[0]].update(theta_roles)
-    with open('resources/ditransitive_dobj_verbs.txt', 'w') as outfile:
+            thematic_roles[classid.split("-")[0]].update(theta_roles)
+
+    with open("resources/ditransitive_dobj_verbs.txt", "w") as outfile:
         for lemma in sorted(thematic_roles.keys()):
-            if ditransitive_dobj(thematic_roles[lemma]) and '_' not in lemma:
-                outfile.write(lemma+'\n')
-    with open('resources/transitive_dobj_verbs.txt', 'w') as outfile:
+            if ditransitive_dobj(thematic_roles[lemma]) and "_" not in lemma:
+                outfile.write(lemma + "\n")
+
+    with open("resources/transitive_dobj_verbs.txt", "w") as outfile:
         for lemma in sorted(thematic_roles.keys()):
-            if transitive_dobj(thematic_roles[lemma]) and '_' not in lemma:
-                outfile.write(lemma+'\n')
-    with open('resources/benefactive_verbs.txt', 'w') as outfile:
+            if transitive_dobj(thematic_roles[lemma]) and "_" not in lemma:
+                outfile.write(lemma + "\n")
+
+    with open("resources/benefactive_verbs.txt", "w") as outfile:
         for lemma in sorted(thematic_roles.keys()):
-            if beneficiary(thematic_roles[lemma]) and '_' not in lemma:
-                outfile.write(lemma+'\n')
-
-if __name__=='__main__':
-    main()
+            if beneficiary(thematic_roles[lemma]) and "_" not in lemma:
+                outfile.write(lemma + "\n")
+
+
+if __name__ == "__main__":
+    main()
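To make the predicate logic concrete, here is a self-contained check with toy theta-role sets (the function body copies ditransitive_dobj from the diff above):

# Self-contained illustration of the theta-role predicate above.
def ditransitive_dobj(theta_roles):
    return ("Agent" in theta_roles and "Destination" in theta_roles and "Theme" in theta_roles) or (
        "Agent" in theta_roles and "Beneficiary" in theta_roles and "Theme" in theta_roles
    )

print(ditransitive_dobj({"Agent", "Beneficiary", "Theme"}))  # True: benefactive ditransitive frame
print(ditransitive_dobj({"Agent", "Theme"}))                 # False: plain transitive frame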
