
Commit

Merge branch 'main' into fused-kernels-msg
StellaAthena committed Mar 14, 2023
2 parents 577126c + 6534f19 commit 76dde6c
Showing 5 changed files with 36 additions and 10 deletions.
7 changes: 7 additions & 0 deletions README.md
@@ -9,6 +9,13 @@ For those looking for a TPU-centric codebase, we recommend [Mesh Transformer JAX

**If you are not looking to train models with billions of parameters from scratch, this is likely the wrong library to use. For generic inference needs, we recommend you use the Hugging Face `transformers` library instead which supports GPT-NeoX models.**
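
For example, a minimal inference sketch with `transformers` (the checkpoint name and generation settings are illustrative, not mandated by this repository):

```python
# Sketch only: generic inference with a GPT-NeoX-architecture checkpoint via
# Hugging Face transformers. The checkpoint name below is an illustrative choice.
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "EleutherAI/gpt-neox-20b"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

inputs = tokenizer("GPT-NeoX is", return_tensors="pt")
outputs = model.generate(**inputs, max_new_tokens=20)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```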

## GPT-NeoX 2.0

Prior to 3/9/2023, GPT-NeoX relied on [DeeperSpeed](https://github.com/EleutherAI/DeeperSpeed), which was based on an old version of DeepSpeed (0.3.15). In order to migrate to the latest upstream DeepSpeed version while allowing users to access the old versions of GPT-NeoX and DeeperSpeed, we have introduced two versioned releases for both libraries:

- Version 1.0 of [GPT-NeoX](https://github.com/EleutherAI/gpt-neox/releases/tag/v1.0) and [DeeperSpeed](https://github.com/EleutherAI/DeeperSpeed/releases/tag/v1.0) maintain snapshots of the old stable versions that [GPT-NeoX-20B](https://arxiv.org/abs/2204.06745) and the [Pythia Suite](https://github.com/EleutherAI/pythia) were trained on.
- Version 2.0 of [GPT-NeoX](https://github.com/EleutherAI/gpt-neox/releases/tag/v2.0) and [DeeperSpeed](https://github.com/EleutherAI/DeeperSpeed/releases/tag/v2.0) are the latest versions built on the latest DeepSpeed, and will be maintained going forward.

# Contents

* [Quick Start](#quick-start)
5 changes: 0 additions & 5 deletions megatron/neox_arguments/deepspeed_args.py
@@ -216,8 +216,3 @@ class NeoXArgsDeepspeedRunner(NeoXArgsTemplate):
"""
Adds a `--comment` to the DeepSpeed launch command. In DeeperSpeed this is passed on to the SlurmLauncher as well. Sometimes necessary for cluster rules, or so I've heard.
"""

no_ssh_check: bool = False
"""
If `True` and running with multiple nodes, then DeepSpeed doesn't conduct a check to ensure the head node is reachable with ssh.
"""
8 changes: 8 additions & 0 deletions prepare_data.py
@@ -54,6 +54,13 @@ def get_args():
parser.add_argument(
"-m", "--merge-file", default=None, help=f"Tokenizer merge file (if required)"
)
parser.add_argument(
"-f",
"--force-redownload",
dest="force_redownload",
default=False,
action="store_true",
)
return parser.parse_args()


@@ -65,4 +72,5 @@ def get_args():
data_dir=args.data_dir,
vocab_file=args.vocab_file,
merge_file=args.merge_file,
force_redownload=args.force_redownload,
)
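
For reference, a small sketch (not from the repository) of how the new `store_true` flag behaves when parsed; the CLI invocation in the comment is an assumption:

```python
# Sketch only: behavior of the new --force-redownload flag.
# Roughly equivalent CLI usage (exact positional arguments are assumptions):
#   python prepare_data.py enron -f -d ./data
import argparse

parser = argparse.ArgumentParser()
parser.add_argument(
    "-f",
    "--force-redownload",
    dest="force_redownload",
    default=False,
    action="store_true",
)

print(parser.parse_args([]).force_redownload)                      # False (flag absent)
print(parser.parse_args(["-f"]).force_redownload)                  # True
print(parser.parse_args(["--force-redownload"]).force_redownload)  # True
```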
1 change: 1 addition & 0 deletions requirements/requirements.txt
@@ -9,6 +9,7 @@ pybind11>=2.6.2
regex
sentencepiece
six
best_download
tiktoken>=0.1.2
tokenizers>=0.12.1
transformers>=4.24.0
25 changes: 20 additions & 5 deletions tools/corpora.py
@@ -41,6 +41,7 @@ def __init__(
merge_file=None,
vocab_file=None,
data_dir=None,
force_redownload=None,
num_workers=None,
):
if tokenizer_type is None:
@@ -49,6 +50,8 @@ def __init__(
data_dir = os.environ.get("DATA_DIR", "./data")
if merge_file is None:
merge_file = f"{data_dir}/gpt2-merges.txt"
if force_redownload is None:
force_redownload = False
if vocab_file is None:
if tokenizer_type == "GPT2BPETokenizer":
vocab_file = f"{data_dir}/gpt2-vocab.json"
@@ -64,6 +67,7 @@ def __init__(
self._merge_file = merge_file
self._vocab_file = vocab_file
self._data_dir = data_dir
self._force_redownload = force_redownload
self._num_workers = num_workers

@property
@@ -121,9 +125,14 @@ def download(self):
"""downloads dataset"""
os.makedirs(os.path.join(self.base_dir, self.name), exist_ok=True)
for url in self.urls:
-            os.system(
-                f"wget {url} -O {os.path.join(self.base_dir, self.name, os.path.basename(url))}"
-            )
+            try:
+                os_cmd = f"wget {url} -O {os.path.join(self.base_dir, self.name, os.path.basename(url))}"
+                if os.system(os_cmd) != 0:
+                    raise Exception(
+                        f"Cannot download file at URL {url}: server may be down"
+                    )
+            except Exception as e:
+                raise Exception(f"Download error: {e}")

def tokenize(self):
"""tokenizes dataset"""
@@ -151,9 +160,13 @@ def tokenize(self):
os.system(cmd)

def prepare(self):
-        if not self.exists():
+        if self._force_redownload:
             self.download()
-        self.tokenize()
+            self.tokenize()
+        else:
+            if not self.exists():
+                self.download()

+            self.tokenize()


class Enron(DataDownloader):
@@ -325,6 +338,7 @@ def prepare_dataset(
data_dir: str = None,
vocab_file: str = None,
merge_file: str = None,
force_redownload: bool = None,
num_workers: int = None,
):
"""
@@ -349,6 +363,7 @@
vocab_file=vocab_file,
merge_file=merge_file,
data_dir=data_dir,
force_redownload=force_redownload,
num_workers=num_workers,
)
d.prepare()
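
Putting the pieces together, the flag flows from `prepare_data.py` into `prepare_dataset` and on to the downloader's `prepare()`. A usage sketch follows; the positional dataset name, the `tokenizer_type` keyword, and the import path are assumptions not shown in this diff:

```python
# Sketch only: calling prepare_dataset with the new force_redownload option.
# The first positional argument (dataset key), the tokenizer_type keyword, and
# the import path are assumptions based on the surrounding code.
from tools.corpora import prepare_dataset

prepare_dataset(
    "enron",                           # assumed dataset key (cf. the Enron downloader above)
    tokenizer_type="GPT2BPETokenizer",
    data_dir="./data",                 # default DATA_DIR fallback shown above
    vocab_file=None,                   # falls back to {data_dir}/gpt2-vocab.json
    merge_file=None,                   # falls back to {data_dir}/gpt2-merges.txt
    force_redownload=True,             # re-download even if the data already exists
    num_workers=1,
)
```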
