Fix deprecation warning #42

Merged
merged 7 commits into from
Jan 5, 2021
Changes from 1 commit
cleanup
sid committed Jan 5, 2021
commit 07b37d2d3b2daab4f1513597c3193340be896233
26 changes: 14 additions & 12 deletions gpt_neox/data_downloader_registry.py
@@ -4,19 +4,20 @@
from glob import glob
import shutil
import random

"""
This registry is for automatically downloading and extracting datasets.

-To register a class you need to inherit the DataDownloader class and provide name, filetype and url attributes, and (optionally)
-provide download / extract / exists functions to check if the data exists, and, if it doesn't, download and extract the data
-and move it to the correct directory.
+To register a class you need to inherit the DataDownloader class and provide name, filetype and url attributes, and
+(optionally) provide download / extract / exists functions to check if the data exists, and, if it doesn't, download and
+extract the data and move it to the correct directory.

-When done, add it to the DATA_DOWNLOADERS dict. The function process_data runs the pre-processing for the selected dataset.
+When done, add it to the DATA_DOWNLOADERS dict. The function process_data runs the pre-processing for the selected
+dataset.
"""

-class DataDownloader(ABC):
+
+class DataDownloader(ABC):
    """Dataset registry class to automatically download / extract datasets"""

    @property
@@ -35,7 +36,7 @@ def name(self):
    def filetype(self):
        """filetype of dataset"""
        pass

    @property
    @abstractmethod
    def url(self):
@@ -60,16 +61,16 @@ def exists(self):

    def download(self):
        """downloads dataset"""
-        os.makedirs(self.base_dir, exist_ok = True)
+        os.makedirs(self.base_dir, exist_ok=True)
        os.system(f"wget {self.url} -O {os.path.join(self.base_dir, os.path.basename(self.url))}")

    def prepare(self):
        if not self.exists():
            self.download()
            self.extract()

-class OWT2(DataDownloader):
+
+class OWT2(DataDownloader):
    name = "owt2"
    filetype = "tfrecords"
    url = "http://eaidata.bmk.sh/data/owt2_new.tar.gz"
@@ -89,7 +90,7 @@ def extract(self):
        n_eval_tfrecords = total_tfrecords // 10
        # owt2 doesn't have an official train/test split, so sample at random from tfrecords
        random.seed(self.seed)
-        random.shuffle(all_files)
+        random.shuffle(all_files)
        eval_set = all_files[:n_eval_tfrecords]
        train_set = all_files[n_eval_tfrecords:]
        for f in train_set:
@@ -100,8 +101,8 @@ def extract(self):
        for d in dirs_to_remove:
            shutil.rmtree(d)

-class Enwik8(DataDownloader):
+
+class Enwik8(DataDownloader):
    name = "owt2"
    filetype = "tar.gz"
    url = "http://eaidata.bmk.sh/data/enwik8.gz"
@@ -118,6 +119,7 @@ def exists(self):
"enwik8": Enwik8
}


def prepare_data(dataset_name):
DownloaderClass = DATA_DOWNLOADERS.get(dataset_name, None)
if DownloaderClass is None:
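For context, the registry documented in the docstring above is used by subclassing DataDownloader, setting the name, filetype and url attributes, adding the class to the DATA_DOWNLOADERS dict, and calling prepare_data with the dataset name. A minimal sketch of that flow follows; the MyDataset class, its "mydataset" key and its URL are hypothetical examples, not part of this PR.

from gpt_neox.data_downloader_registry import DATA_DOWNLOADERS, DataDownloader, prepare_data


class MyDataset(DataDownloader):
    # required attributes described in the registry docstring
    name = "mydataset"                           # hypothetical dataset name
    filetype = "tfrecords"                       # extension the loader looks for
    url = "http://example.com/mydataset.tar.gz"  # hypothetical download URL
    # download / extract / exists can optionally be overridden, as OWT2 and Enwik8 do


# register the class, then prepare_data() can fetch and extract it by name
DATA_DOWNLOADERS["mydataset"] = MyDataset
prepare_data("mydataset")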
4 changes: 3 additions & 1 deletion gpt_neox/data_utils.py
@@ -2,7 +2,9 @@
from itertools import islice
import re
from collections import OrderedDict

import gzip
import numpy as np
import torch

class FixedSizeOrderedDict(OrderedDict):
    def __init__(self, *args, max=0, **kwargs):
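The hunk above only shows the constructor of FixedSizeOrderedDict. The usual pattern for such a bounded mapping is to evict the oldest entry once the size limit is exceeded; a rough sketch of that pattern is below, with eviction logic that is illustrative rather than the repository's actual implementation.

from collections import OrderedDict


class FixedSizeOrderedDict(OrderedDict):
    """OrderedDict that keeps at most `max` items, dropping the oldest first."""

    def __init__(self, *args, max=0, **kwargs):
        self._max = max
        super().__init__(*args, **kwargs)

    def __setitem__(self, key, value):
        super().__setitem__(key, value)
        if self._max > 0 and len(self) > self._max:
            self.popitem(last=False)  # evict the oldest (first-inserted) entry

With max=2, for example, inserting a third key silently drops the first one.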
5 changes: 2 additions & 3 deletions gpt_neox/utils.py
@@ -1,8 +1,5 @@
-import gzip
import os
import tarfile
-import numpy as np
-import torch
import argparse
import deepspeed
import json
@@ -27,12 +24,14 @@ def get_params(model):
        params = json.load(f)
    return defaultdict(lambda: None, params)

+
def is_main(args):
    """
    returns True if process is being run on the main GPU
    """
    return args.local_rank in [0, -1]

+
def get_all_files(filetype, files_dir):
    files = []
    for (dir_path, _, filenames) in os.walk(files_dir):
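One detail worth noting from get_params above: wrapping the loaded JSON in defaultdict(lambda: None, ...) means missing config keys read back as None instead of raising KeyError. A tiny illustration, with made-up config keys:

from collections import defaultdict

params = defaultdict(lambda: None, {"seq_len": 2048, "batch_size": 256})
print(params["seq_len"])   # 2048
print(params["dataset"])   # None, absent keys do not raise KeyError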
18 changes: 5 additions & 13 deletions train.py
@@ -1,9 +1,4 @@
import argparse
-import json
-import os
-import random
-from collections import defaultdict

import deepspeed
import torch
from torch.utils.data import DataLoader
@@ -13,12 +8,9 @@
from gpt_neox import (GPTNeoX, AutoregressiveWrapper, GPT2Dataset, extract_tarfile,
                      prepare_optimizer_parameters, get_tokenizer, is_main, prepare_data)

-from gpt_neox.utils import prepare_enwik8_data, get_args, get_params


+from gpt_neox.utils import get_args, get_params

train_args = get_args()
print("RANK: ", train_args.local_rank)
params = get_params(train_args.model)

# tokenizer
@@ -43,13 +35,13 @@
dset_params = params["dataset"]
assert dset_params is not None

-torch.distributed.barrier() # barrier will force processes to stop until *all* processes have reached the barrier
+torch.distributed.barrier()  # barrier will force processes to stop until *all* processes have reached the barrier
if is_main(train_args):
    prepare_data(dset_params["name"])
-    torch.distributed.barrier() # barrier will force processes to stop until *all* processes have reached the barrier
+    torch.distributed.barrier()  # barrier will force processes to stop until *all* processes have reached the barrier
else:
-    torch.distributed.barrier()
+    torch.distributed.barrier()

train_dataset = GPT2Dataset(glob_pattern=dset_params["train_path"],
                            seq_len=params["seq_len"],
                            train=True,
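The barrier calls touched above implement a common pattern in distributed training: all ranks line up at a barrier, only the main rank prepares the data, and a second barrier keeps the remaining ranks from touching the dataset before it exists. The sketch below shows the same pattern in isolation; the gloo backend and single-process setup are illustrative only, since in the PR the process group is initialised by the deepspeed/torch launcher.

import os
import torch.distributed as dist


def is_main(local_rank):
    # mirrors gpt_neox.utils.is_main: rank 0, or -1 when not distributed, is "main"
    return local_rank in [0, -1]


def prepare_on_main_only(local_rank, prepare_fn):
    """Run prepare_fn on the main rank while the other ranks wait at a barrier."""
    dist.barrier()        # everyone stops here before data preparation starts
    if is_main(local_rank):
        prepare_fn()      # e.g. prepare_data(dset_params["name"])
        dist.barrier()    # releases the waiting ranks once the data exists
    else:
        dist.barrier()    # non-main ranks block until the main rank is done


if __name__ == "__main__":
    # illustrative single-process setup; real runs get these from the launcher
    os.environ.setdefault("MASTER_ADDR", "127.0.0.1")
    os.environ.setdefault("MASTER_PORT", "29500")
    dist.init_process_group("gloo", rank=0, world_size=1)
    prepare_on_main_only(local_rank=0, prepare_fn=lambda: print("preparing data"))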