Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

MidiDataset optimizations #68

Merged
merged 11 commits into from
Nov 22, 2023
Prev Previous commit
format and small changes
  • Loading branch information
loubbrad committed Nov 22, 2023
commit 607c4387ceb8c445f0d0d3025c3265072ff35dfe
17 changes: 13 additions & 4 deletions aria/data/datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,14 +77,18 @@ def save(self, save_path: str):
writer.write(midi_dict.get_msg_dict())

@classmethod
def load(cls, load_path: str, stream=True):
    """Load a dataset from a JSONL file.

    Args:
        load_path (str): Path to the jsonlines file produced by save().
        stream (bool): When True (default) entries are decoded lazily
            from disk via a generator, keeping memory flat for large
            datasets; when False the entire file is materialized into a
            list before the dataset is constructed.

    Returns:
        cls: Dataset wrapping either a generator (stream=True) or a
        fully-loaded list (stream=False) of MidiDict objects.
    """

    def _load():
        # Decode each JSON-line entry back into a MidiDict as it is read.
        with jsonlines.open(load_path) as reader:
            for entry in reader:
                yield MidiDict.from_msg_dict(entry)

    # PEP 8: test truthiness rather than comparing to a boolean literal
    # (was `if stream == False:`).
    if stream:
        return cls(_load())
    return cls(list(_load()))

@classmethod
def split_from_file(
Expand Down Expand Up @@ -552,8 +556,13 @@ def _get_tokenized_seqs_mp(_midi_dict_iter: Iterable):
oq = Queue()

_num_proc = os.cpu_count()
workers = [Process(target=functools.partial(_worker, tokenizer=tokenizer), args=(iq, oq)) for _ in
range(_num_proc)]
workers = [
Process(
target=functools.partial(_worker, tokenizer=tokenizer),
args=(iq, oq),
)
for _ in range(_num_proc)
]
for w in workers:
w.start()

Expand Down
10 changes: 5 additions & 5 deletions aria/data/jsonl_zst.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ def __init__(self, path: str):
self.path = path

def __iter__(self):
    """Lazily yield decoded JSON objects from the zstd-compressed JSONL file.

    The file is opened fresh on every iteration, decompressed as a
    stream, and parsed line-by-line, so arbitrarily large files can be
    read without loading them into memory.
    """
    with builtins.open(self.path, "rb") as fh:
        decompressor = zstandard.ZstdDecompressor()
        buffered = io.BufferedReader(decompressor.stream_reader(fh))
        for obj in jsonlines.Reader(buffered):
            yield obj
Expand All @@ -36,13 +36,13 @@ def __init__(self, path: str):
self.path = path

def __enter__(self):
    """Open the target file and attach a streaming zstd compressor.

    Sets up `self.fh`, `self.cctx`, and `self.compressor` for use by
    write()/__exit__(), then returns self so the object can be used as
    a context manager.
    """
    # The compressor context is independent of the file handle, so it
    # can be created first.
    self.cctx = zstandard.ZstdCompressor()
    self.fh = builtins.open(self.path, "wb")
    self.compressor = self.cctx.stream_writer(self.fh)
    return self

def write(self, obj):
    """Serialize *obj* as one JSON line and feed it to the compressor."""
    encoded = json.dumps(obj).encode("UTF-8")
    self.compressor.write(encoded + b"\n")

def __exit__(self, exc_type, exc_value, traceback):
self.compressor.flush(zstandard.FLUSH_FRAME)
Expand All @@ -62,9 +62,9 @@ def open(path: str, mode: str = "r"):
Returns:
Reader or Writer: Reader if mode is 'r', Writer if mode is 'w'.
"""
if mode == 'r':
if mode == "r":
yield Reader(path)
elif mode == 'w':
elif mode == "w":
with Writer(path) as writer:
yield writer
else:
Expand Down
2 changes: 1 addition & 1 deletion aria/model/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ class YaRNConfig:
mscale_coeff (int): Temperature scaling factor t follows `a ln s + 1.0`,
and the coefficient `a` is this `mscale_coeff` here.
"""

beta_fast: int = 16
beta_slow: int = 1
scale: int = 1.0
Expand All @@ -44,7 +45,6 @@ def __post_init__(self):
if self.yarn_config is not None and isinstance(self.yarn_config, dict):
self.yarn_config = YaRNConfig(**self.yarn_config)


def set_vocab_size(self, vocab_size: int):
self.vocab_size = vocab_size

Expand Down
2 changes: 1 addition & 1 deletion aria/run.py
Original file line number Diff line number Diff line change
Expand Up @@ -124,7 +124,7 @@ def _parse_tokenized_dataset_args():
argp.add_argument("load_path", help="path midi_dict dataset")
argp.add_argument("save_path", help="path to save dataset")
argp.add_argument("-s", help="also produce shuffled", action="store_true")
argp.add_argument("-l", help="max sequence length", type=int, default=2048)
argp.add_argument("-l", help="max sequence length", type=int)

return argp.parse_args(sys.argv[2:])

Expand Down
3 changes: 0 additions & 3 deletions config/config.json
Original file line number Diff line number Diff line change
Expand Up @@ -70,9 +70,6 @@
"composer_names": ["bach", "beethoven", "mozart", "chopin", "rachmaninoff", "liszt", "debussy", "schubert", "brahms", "ravel", "satie", "scarlatti"]
}
}
},
"dataset_gen_args": {
"max_seq_len": 2048
}
},

Expand Down
2 changes: 1 addition & 1 deletion tests/test_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -248,7 +248,7 @@ def test_augmentation(self):

class TestReaderWriter(unittest.TestCase):
def test_jsonl_zst(self):
data = [{"a": i, "b": i+1} for i in range(0, 100, 4)]
data = [{"a": i, "b": i + 1} for i in range(0, 100, 4)]
filename = "tests/test_results/test.jsonl.zst"
# if test.jsonl.zst exists, delete it
if os.path.isfile(filename):
Expand Down