From 592acbead89c7778c80a70c4a7c6485d598a26fc Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Sun, 13 Aug 2023 20:58:59 +0900 Subject: [PATCH 001/183] add for ja --- configs/local_setup_ja.yml | 27 +++++++++++++++++++++++++++ prepare_data_ja.sh | 5 +++++ preprocess_ja.sh | 8 ++++++++ tools/corpora.py | 9 +++++++++ 4 files changed, 49 insertions(+) create mode 100644 configs/local_setup_ja.yml create mode 100644 prepare_data_ja.sh create mode 100644 preprocess_ja.sh diff --git a/configs/local_setup_ja.yml b/configs/local_setup_ja.yml new file mode 100644 index 000000000..04af5b6fa --- /dev/null +++ b/configs/local_setup_ja.yml @@ -0,0 +1,27 @@ +# Suggested data paths when using GPT-NeoX locally +{ + "data_path": "data/wiki_ja_en", + + # or for weighted datasets: + # "train-data-paths": ["data/enwik8/enwik8_text_document", "data/enwik8/enwik8_text_document"], + # "test-data-paths": ["data/enwik8/enwik8_text_document", "data/enwik8/enwik8_text_document"], + # "valid-data-paths": ["data/enwik8/enwik8_text_document", "data/enwik8/enwik8_text_document"], + # "train-data-weights": [1., 2.], + # "test-data-weights": [2., 1.], + # "valid-data-weights": [0.5, 0.4], + + # If weight_by_num_documents is True, Builds dataset weights from a multinomial distribution over groups of data according to the number of documents in each group. + # WARNING: setting this to True will override any user provided weights + # "weight_by_num_documents": false, + # "weighted_sampler_alpha": 0.3, + + "vocab_file": "./novelAI/tokenizer.model", + + "save": "checkpoints", + "load": "checkpoints", + "checkpoint_validation_with_forward_pass": False, + + "tensorboard_dir": "tensorboard", + "log_dir": "logs", + "use_wandb": False +} diff --git a/prepare_data_ja.sh b/prepare_data_ja.sh new file mode 100644 index 000000000..cb15ffb82 --- /dev/null +++ b/prepare_data_ja.sh @@ -0,0 +1,5 @@ +#!/bin/sh +python prepare_data.py -d ./data \ +-t SPMTokenizer \ +--vocab-file ./novelAI/tokenizer.model \ +wiki_ja_en diff --git a/preprocess_ja.sh b/preprocess_ja.sh new file mode 100644 index 000000000..7de047c6d --- /dev/null +++ b/preprocess_ja.sh @@ -0,0 +1,8 @@ +#!/bin/sh +python tools/preprocess_data.py \ + --input ./data/mydataset.jsonl.zst \ + --output-prefix ./data/wiki_ja_en \ + --vocab-file ./novelAI/tokenizer.model \ + --dataset-impl mmap \ + --tokenizer-type SPMTokenizer \ + --append-eod diff --git a/tools/corpora.py b/tools/corpora.py index b9e846454..fb35477a3 100644 --- a/tools/corpora.py +++ b/tools/corpora.py @@ -293,6 +293,14 @@ class Enwik8(DataDownloader): urls = ["https://data.deepai.org/enwik8.zip"] +class WikiJaEn(DataDownloader): + name = "wiki_ja_en" + urls = [ + "jawikibooks-20230807-cirrussearch-content.json.gz", + "enwiki-20230807-cirrussearch-content.json.gz" + ] + + def maybe_download_gpt2_tokenizer_data(tokenizer_type, data_dir): if tokenizer_type is None or tokenizer_type == "GPT2BPETokenizer": GPT2_VOCAB_FP = f"{data_dir}//gpt2-vocab.json" @@ -324,6 +332,7 @@ def maybe_download_gpt2_tokenizer_data(tokenizer_type, data_dir): "c4": C4, "c4_openwebtext": C4OpenWebText, "enwik8": Enwik8, + 'wiki_ja_en': WikiJaEn } From cd99f474c8aadd7e904bf62a69414702b7842ccf Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Sun, 13 Aug 2023 21:06:13 +0900 Subject: [PATCH 002/183] fix link --- tools/corpora.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/corpora.py b/tools/corpora.py index fb35477a3..48a9930af 100644 --- a/tools/corpora.py +++ b/tools/corpora.py @@ -296,8 +296,8 @@ class 
Enwik8(DataDownloader): class WikiJaEn(DataDownloader): name = "wiki_ja_en" urls = [ - "jawikibooks-20230807-cirrussearch-content.json.gz", - "enwiki-20230807-cirrussearch-content.json.gz" + "https://dumps.wikimedia.org/other/cirrussearch/20230807/jawiki-20230807-cirrussearch-content.json.gz", + "https://dumps.wikimedia.org/other/cirrussearch/20230807/enwiki-20230807-cirrussearch-content.json.gz" ] From fc24f602855002d7a88b3771acdef39e9a713698 Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Mon, 14 Aug 2023 19:45:59 +0900 Subject: [PATCH 003/183] fix --- tools/corpora.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/tools/corpora.py b/tools/corpora.py index 48a9930af..7bcde2484 100644 --- a/tools/corpora.py +++ b/tools/corpora.py @@ -301,6 +301,13 @@ class WikiJaEn(DataDownloader): ] +class WikiJa(DataDownloader): + name = "wiki_ja" + urls = [ + "https://dumps.wikimedia.org/other/cirrussearch/20230807/jawiki-20230807-cirrussearch-content.json.gz", + ] + + def maybe_download_gpt2_tokenizer_data(tokenizer_type, data_dir): if tokenizer_type is None or tokenizer_type == "GPT2BPETokenizer": GPT2_VOCAB_FP = f"{data_dir}//gpt2-vocab.json" @@ -332,7 +339,8 @@ def maybe_download_gpt2_tokenizer_data(tokenizer_type, data_dir): "c4": C4, "c4_openwebtext": C4OpenWebText, "enwik8": Enwik8, - 'wiki_ja_en': WikiJaEn + 'wiki_ja_en': WikiJaEn, + 'wiki_ja': WikiJa } From ed91147f77f959bdff83a5e567d6bf8a22f8ff61 Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Mon, 14 Aug 2023 20:38:19 +0900 Subject: [PATCH 004/183] debug --- tools/preprocess_data.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tools/preprocess_data.py b/tools/preprocess_data.py index 862620eb8..7ff994527 100644 --- a/tools/preprocess_data.py +++ b/tools/preprocess_data.py @@ -187,6 +187,9 @@ def main(): encoded_docs = pool.imap(encoder.encode, fin, chunksize=25) else: encoder.initializer() + for doc in fin: + a = encoder.encode(doc) + print('a,', a) encoded_docs = (encoder.encode(doc) for doc in fin) # make a dataset builder for each key in args.jsonl_keys From a3ae99807ba395fe7bd8266a65e01137f19dff46 Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Mon, 14 Aug 2023 20:54:37 +0900 Subject: [PATCH 005/183] debug --- tools/preprocess_data.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/preprocess_data.py b/tools/preprocess_data.py index 7ff994527..984646587 100644 --- a/tools/preprocess_data.py +++ b/tools/preprocess_data.py @@ -49,6 +49,7 @@ def initializer(self): def encode(self, text): if self.args.ftfy: text = ftfy.fix_text(text) + print('text', text) ids = {} for key in self.args.jsonl_keys: doc_ids = [] From 81301573f83ce36b8271fb323eb86facdfb4c7b2 Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Mon, 14 Aug 2023 21:07:25 +0900 Subject: [PATCH 006/183] fix --- tools/preprocess_data.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/tools/preprocess_data.py b/tools/preprocess_data.py index 984646587..280265365 100644 --- a/tools/preprocess_data.py +++ b/tools/preprocess_data.py @@ -49,7 +49,7 @@ def initializer(self): def encode(self, text): if self.args.ftfy: text = ftfy.fix_text(text) - print('text', text) + ids = {} for key in self.args.jsonl_keys: doc_ids = [] @@ -188,10 +188,13 @@ def main(): encoded_docs = pool.imap(encoder.encode, fin, chunksize=25) else: encoder.initializer() + new_fin = [] for doc in fin: - a = encoder.encode(doc) - print('a,', a) - encoded_docs = (encoder.encode(doc) for doc in fin) + if 'text' in doc: + 
new_fin.append(doc['text']) + encoded_docs = (encoder.encode(doc) for doc in new_fin) + + # encoded_docs = (encoder.encode(doc) for doc in fin) # make a dataset builder for each key in args.jsonl_keys # each key will output to a different file beginning with args.output_prefix From e5be7b0001ebfc8b04cc266ac8ccc3ada98eb967 Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Mon, 14 Aug 2023 21:08:26 +0900 Subject: [PATCH 007/183] debug --- tools/preprocess_data.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/tools/preprocess_data.py b/tools/preprocess_data.py index 280265365..601ed8fc2 100644 --- a/tools/preprocess_data.py +++ b/tools/preprocess_data.py @@ -49,7 +49,7 @@ def initializer(self): def encode(self, text): if self.args.ftfy: text = ftfy.fix_text(text) - + print('text,', text) ids = {} for key in self.args.jsonl_keys: doc_ids = [] @@ -192,8 +192,7 @@ def main(): for doc in fin: if 'text' in doc: new_fin.append(doc['text']) - encoded_docs = (encoder.encode(doc) for doc in new_fin) - + encoded_docs = (encoder.encode(doc) for doc in new_fin) # encoded_docs = (encoder.encode(doc) for doc in fin) # make a dataset builder for each key in args.jsonl_keys From 910d8944b9b06b4cb754840e11540ad0dac6f5e6 Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Mon, 14 Aug 2023 21:09:31 +0900 Subject: [PATCH 008/183] debug --- tools/preprocess_data.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/preprocess_data.py b/tools/preprocess_data.py index 601ed8fc2..05c986eef 100644 --- a/tools/preprocess_data.py +++ b/tools/preprocess_data.py @@ -183,6 +183,7 @@ def main(): # use multiprocessing to iterate over input documents fin = yield_from_files(args.input.split(","), semaphore) + print('args.workers', args.workers) if args.workers > 1: pool = multiprocessing.Pool(args.workers, initializer=encoder.initializer) encoded_docs = pool.imap(encoder.encode, fin, chunksize=25) From bd323688d1009d731b56356a485ac242e2243856 Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Mon, 14 Aug 2023 21:10:13 +0900 Subject: [PATCH 009/183] fix --- tools/preprocess_data.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/tools/preprocess_data.py b/tools/preprocess_data.py index 05c986eef..30a446158 100644 --- a/tools/preprocess_data.py +++ b/tools/preprocess_data.py @@ -182,16 +182,20 @@ def main(): # use multiprocessing to iterate over input documents fin = yield_from_files(args.input.split(","), semaphore) - + new_fin = [] + for doc in fin: + if 'text' in doc: + new_fin.append(doc['text']) print('args.workers', args.workers) if args.workers > 1: pool = multiprocessing.Pool(args.workers, initializer=encoder.initializer) - encoded_docs = pool.imap(encoder.encode, fin, chunksize=25) + # encoded_docs = pool.imap(encoder.encode, fin, chunksize=25) + encoded_docs = pool.imap(encoder.encode, new_fin, chunksize=25) else: encoder.initializer() new_fin = [] for doc in fin: - if 'text' in doc: + if 'text' in doc: new_fin.append(doc['text']) encoded_docs = (encoder.encode(doc) for doc in new_fin) # encoded_docs = (encoder.encode(doc) for doc in fin) From 50b273cd27ee538fc60e42539c9c443ac3b5f5d5 Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Tue, 15 Aug 2023 19:33:18 +0900 Subject: [PATCH 010/183] debug --- tools/preprocess_data.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/preprocess_data.py b/tools/preprocess_data.py index 30a446158..9e70de5ce 100644 --- a/tools/preprocess_data.py +++ b/tools/preprocess_data.py @@ -160,6 +160,7 @@ def 
yield_from_files(fnames: list, semaphore): def yielder(fname, semaphore): for f in filter(lambda x: x, lmd.Reader(fname).stream_data()): + print('f', f) semaphore.acquire() yield f From e4c653c42b5186db554a05301a9d0f3c3da920de Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Tue, 15 Aug 2023 20:12:11 +0900 Subject: [PATCH 011/183] add filter --- tools/preprocess_data.py | 20 ++++++-------------- 1 file changed, 6 insertions(+), 14 deletions(-) diff --git a/tools/preprocess_data.py b/tools/preprocess_data.py index 9e70de5ce..0b66432a1 100644 --- a/tools/preprocess_data.py +++ b/tools/preprocess_data.py @@ -159,7 +159,8 @@ def yield_from_files(fnames: list, semaphore): """ def yielder(fname, semaphore): - for f in filter(lambda x: x, lmd.Reader(fname).stream_data()): + stream = filter(lambda x: x, lmd.Reader(fname).stream_data()) + for f in filter(lambda x: x, 'text' in stream): print('f', f) semaphore.acquire() yield f @@ -182,23 +183,14 @@ def main(): semaphore = Semaphore(10000 + args.workers) # use multiprocessing to iterate over input documents - fin = yield_from_files(args.input.split(","), semaphore) - new_fin = [] - for doc in fin: - if 'text' in doc: - new_fin.append(doc['text']) - print('args.workers', args.workers) + fin = yield_from_files(args.input.split(","), semaphore) if args.workers > 1: pool = multiprocessing.Pool(args.workers, initializer=encoder.initializer) # encoded_docs = pool.imap(encoder.encode, fin, chunksize=25) - encoded_docs = pool.imap(encoder.encode, new_fin, chunksize=25) + encoded_docs = pool.imap(encoder.encode, fin, chunksize=25) else: - encoder.initializer() - new_fin = [] - for doc in fin: - if 'text' in doc: - new_fin.append(doc['text']) - encoded_docs = (encoder.encode(doc) for doc in new_fin) + encoder.initializer() + encoded_docs = (encoder.encode(doc) for doc in fin) # encoded_docs = (encoder.encode(doc) for doc in fin) # make a dataset builder for each key in args.jsonl_keys From 45c6951e055aacb9f4b8f1a7a9e67d4172abee4b Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Tue, 15 Aug 2023 20:14:20 +0900 Subject: [PATCH 012/183] add filter --- tools/preprocess_data.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/preprocess_data.py b/tools/preprocess_data.py index 0b66432a1..a90288e48 100644 --- a/tools/preprocess_data.py +++ b/tools/preprocess_data.py @@ -160,7 +160,7 @@ def yield_from_files(fnames: list, semaphore): def yielder(fname, semaphore): stream = filter(lambda x: x, lmd.Reader(fname).stream_data()) - for f in filter(lambda x: x, 'text' in stream): + for f in filter(lambda x: 'text' in x, stream): print('f', f) semaphore.acquire() yield f From 37bbb2aaab786e46ca0b57ff72cae3b27d045506 Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Tue, 15 Aug 2023 20:17:33 +0900 Subject: [PATCH 013/183] debug --- tools/preprocess_data.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/preprocess_data.py b/tools/preprocess_data.py index a90288e48..b57b1def2 100644 --- a/tools/preprocess_data.py +++ b/tools/preprocess_data.py @@ -161,7 +161,7 @@ def yield_from_files(fnames: list, semaphore): def yielder(fname, semaphore): stream = filter(lambda x: x, lmd.Reader(fname).stream_data()) for f in filter(lambda x: 'text' in x, stream): - print('f', f) + # print('f', f) semaphore.acquire() yield f From 4bc3a9d2601884a17f94a0671614d6096ded9326 Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Tue, 15 Aug 2023 20:18:46 +0900 Subject: [PATCH 014/183] fix --- tools/preprocess_data.py | 5 ++--- 1 file changed, 2 
insertions(+), 3 deletions(-) diff --git a/tools/preprocess_data.py b/tools/preprocess_data.py index b57b1def2..24f6546a0 100644 --- a/tools/preprocess_data.py +++ b/tools/preprocess_data.py @@ -160,10 +160,9 @@ def yield_from_files(fnames: list, semaphore): def yielder(fname, semaphore): stream = filter(lambda x: x, lmd.Reader(fname).stream_data()) - for f in filter(lambda x: 'text' in x, stream): - # print('f', f) + for f in filter(lambda x: 'text' in x, stream): semaphore.acquire() - yield f + yield f['text'] for fname in fnames: semaphore.acquire() From 152acc47aa0e24d006cf5c491ede769f7281a38a Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Tue, 15 Aug 2023 20:23:22 +0900 Subject: [PATCH 015/183] for wiki --- tools/preprocess_data.py | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/tools/preprocess_data.py b/tools/preprocess_data.py index 24f6546a0..79787abfe 100644 --- a/tools/preprocess_data.py +++ b/tools/preprocess_data.py @@ -158,16 +158,26 @@ def yield_from_files(fnames: list, semaphore): :param fnames: list of filenames """ - def yielder(fname, semaphore): + def yielder(fname, semaphore): + for f in filter(lambda x: x, lmd.Reader(fname).stream_data()): + semaphore.acquire() + yield f + + def wiki_yielder(fname, semaphore): stream = filter(lambda x: x, lmd.Reader(fname).stream_data()) - for f in filter(lambda x: 'text' in x, stream): + for f in filter(lambda x: 'text' in x, stream): semaphore.acquire() yield f['text'] for fname in fnames: semaphore.acquire() - - yield from yielder(fname, semaphore) + print('fname', fname) + if 'wiki' in fname: + yield from wiki_yielder(fname, semaphore) + else: + yield from yielder(fname, semaphore) + + def main(): @@ -182,7 +192,7 @@ def main(): semaphore = Semaphore(10000 + args.workers) # use multiprocessing to iterate over input documents - fin = yield_from_files(args.input.split(","), semaphore) + fin = yield_from_files(args.input.split(","), semaphore) if args.workers > 1: pool = multiprocessing.Pool(args.workers, initializer=encoder.initializer) # encoded_docs = pool.imap(encoder.encode, fin, chunksize=25) From b3871c007a0bbeb211c7988255e5aa61f057ae94 Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Tue, 15 Aug 2023 20:23:52 +0900 Subject: [PATCH 016/183] rm debug --- tools/preprocess_data.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/preprocess_data.py b/tools/preprocess_data.py index 79787abfe..44eb6e952 100644 --- a/tools/preprocess_data.py +++ b/tools/preprocess_data.py @@ -49,7 +49,7 @@ def initializer(self): def encode(self, text): if self.args.ftfy: text = ftfy.fix_text(text) - print('text,', text) + # print('text,', text) ids = {} for key in self.args.jsonl_keys: doc_ids = [] From f3a0a6619e7c482ab06151000bbec896ab31f108 Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Tue, 15 Aug 2023 20:25:37 +0900 Subject: [PATCH 017/183] debug --- tools/preprocess_data.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/preprocess_data.py b/tools/preprocess_data.py index 44eb6e952..31b3196f1 100644 --- a/tools/preprocess_data.py +++ b/tools/preprocess_data.py @@ -57,6 +57,7 @@ def encode(self, text): if len(text_ids) > 0: doc_ids.append(text_ids) if self.args.append_eod: + print('doc_ids', doc_ids) doc_ids[-1].append(Encoder.tokenizer.eod) ids[key] = doc_ids return ids, len(text) From 5750065e0298883e78efc79ea02d1de4725ab24e Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Tue, 15 Aug 2023 20:29:17 +0900 Subject: [PATCH 018/183] debug --- 
tools/preprocess_data.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/tools/preprocess_data.py b/tools/preprocess_data.py index 31b3196f1..ebd1f851b 100644 --- a/tools/preprocess_data.py +++ b/tools/preprocess_data.py @@ -57,8 +57,11 @@ def encode(self, text): if len(text_ids) > 0: doc_ids.append(text_ids) if self.args.append_eod: - print('doc_ids', doc_ids) - doc_ids[-1].append(Encoder.tokenizer.eod) + try: + doc_ids[-1].append(Encoder.tokenizer.eod) + except Exception as e: + print('text', text) + print('doc_ids', doc_ids) ids[key] = doc_ids return ids, len(text) From 4cf6be8f7affea91c1d21bd75b7be259c0ad186c Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Tue, 15 Aug 2023 21:29:31 +0900 Subject: [PATCH 019/183] fix filter --- tools/preprocess_data.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/preprocess_data.py b/tools/preprocess_data.py index ebd1f851b..a923afcc3 100644 --- a/tools/preprocess_data.py +++ b/tools/preprocess_data.py @@ -169,7 +169,7 @@ def yielder(fname, semaphore): def wiki_yielder(fname, semaphore): stream = filter(lambda x: x, lmd.Reader(fname).stream_data()) - for f in filter(lambda x: 'text' in x, stream): + for f in filter(lambda x: 'text' in x and len(x['text']) != 0, stream): semaphore.acquire() yield f['text'] From c06e201da09f24a296ce1d7dcdfc4e824d18faef Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Tue, 15 Aug 2023 22:42:47 +0900 Subject: [PATCH 020/183] fix --- configs/local_setup_ja.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configs/local_setup_ja.yml b/configs/local_setup_ja.yml index 04af5b6fa..bc7035c4a 100644 --- a/configs/local_setup_ja.yml +++ b/configs/local_setup_ja.yml @@ -1,6 +1,6 @@ # Suggested data paths when using GPT-NeoX locally { - "data_path": "data/wiki_ja_en", + "data_path": "data/wiki_ja", # or for weighted datasets: # "train-data-paths": ["data/enwik8/enwik8_text_document", "data/enwik8/enwik8_text_document"], From b3723144894afba6caf95385c47367bd53b04d23 Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Tue, 15 Aug 2023 23:04:17 +0900 Subject: [PATCH 021/183] fix --- configs/1-3B.yml | 3 +++ configs/local_setup_ja.yml | 1 + 2 files changed, 4 insertions(+) diff --git a/configs/1-3B.yml b/configs/1-3B.yml index 3e80ae7fc..0a093f271 100644 --- a/configs/1-3B.yml +++ b/configs/1-3B.yml @@ -88,4 +88,7 @@ "steps_per_print": 10, "keep_last_n_checkpoints": 4, "wall_clock_breakdown": true, + + ## tokenizer type + "tokenizer_type": "HFTokenizer", } diff --git a/configs/local_setup_ja.yml b/configs/local_setup_ja.yml index bc7035c4a..e22de21f5 100644 --- a/configs/local_setup_ja.yml +++ b/configs/local_setup_ja.yml @@ -15,6 +15,7 @@ # "weight_by_num_documents": false, # "weighted_sampler_alpha": 0.3, + "tokenizer_type": "SPMTokenizer", "vocab_file": "./novelAI/tokenizer.model", "save": "checkpoints", From 74818bbd6050e16dec4515a7191aab5e254e28ae Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Tue, 15 Aug 2023 23:05:26 +0900 Subject: [PATCH 022/183] fix --- configs/1-3B.yml | 2 +- configs/local_setup_ja.yml | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/configs/1-3B.yml b/configs/1-3B.yml index 0a093f271..f5523c6ba 100644 --- a/configs/1-3B.yml +++ b/configs/1-3B.yml @@ -90,5 +90,5 @@ "wall_clock_breakdown": true, ## tokenizer type - "tokenizer_type": "HFTokenizer", + "tokenizer_type": "SPMTokenizer", } diff --git a/configs/local_setup_ja.yml b/configs/local_setup_ja.yml index e22de21f5..8ad4e2d30 100644 --- 
a/configs/local_setup_ja.yml +++ b/configs/local_setup_ja.yml @@ -14,8 +14,7 @@ # WARNING: setting this to True will override any user provided weights # "weight_by_num_documents": false, # "weighted_sampler_alpha": 0.3, - - "tokenizer_type": "SPMTokenizer", + "vocab_file": "./novelAI/tokenizer.model", "save": "checkpoints", From 62fde152260271e8ecc7394ddbfe1d6e44f56f39 Mon Sep 17 00:00:00 2001 From: if001 Date: Wed, 23 Aug 2023 17:24:28 +0900 Subject: [PATCH 023/183] fix data_path --- configs/local_setup_ja.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/configs/local_setup_ja.yml b/configs/local_setup_ja.yml index 8ad4e2d30..9644c6515 100644 --- a/configs/local_setup_ja.yml +++ b/configs/local_setup_ja.yml @@ -1,6 +1,7 @@ # Suggested data paths when using GPT-NeoX locally { - "data_path": "data/wiki_ja", + # "data_path": "data/wiki_ja", + "data_path": "data/wiki_ja/wiki_ja_text_document", # or for weighted datasets: # "train-data-paths": ["data/enwik8/enwik8_text_document", "data/enwik8/enwik8_text_document"], From 75c0ba3fa87002595615ede7a9c0b31f4a135861 Mon Sep 17 00:00:00 2001 From: if001 Date: Wed, 23 Aug 2023 18:26:31 +0900 Subject: [PATCH 024/183] Update local_setup_ja.yml --- configs/local_setup_ja.yml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/configs/local_setup_ja.yml b/configs/local_setup_ja.yml index 9644c6515..5ffa64d02 100644 --- a/configs/local_setup_ja.yml +++ b/configs/local_setup_ja.yml @@ -21,6 +21,11 @@ "save": "checkpoints", "load": "checkpoints", "checkpoint_validation_with_forward_pass": False, + "log_dir": "logs", + "save_interval": 10000, + "eval_interval": 1000, + "eval_iters": 10, + "keep_last_n_checkpoints": 4, "tensorboard_dir": "tensorboard", "log_dir": "logs", From ab9f79f8c8b47382c7b3d577475b578986540520 Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Fri, 1 Sep 2023 14:04:50 +0900 Subject: [PATCH 025/183] fix config --- configs/49M.yml | 1 + configs/local_setup_ja.yml | 7 +++---- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/configs/49M.yml b/configs/49M.yml index 9852320b0..f9822de9b 100644 --- a/configs/49M.yml +++ b/configs/49M.yml @@ -87,5 +87,6 @@ # logging "log_interval": 10, "steps_per_print": 10, + "keep_last_n_checkpoints": 4, "wall_clock_breakdown": true, } diff --git a/configs/local_setup_ja.yml b/configs/local_setup_ja.yml index 5ffa64d02..286f33d76 100644 --- a/configs/local_setup_ja.yml +++ b/configs/local_setup_ja.yml @@ -21,11 +21,10 @@ "save": "checkpoints", "load": "checkpoints", "checkpoint_validation_with_forward_pass": False, + + ## logging "log_dir": "logs", - "save_interval": 10000, - "eval_interval": 1000, - "eval_iters": 10, - "keep_last_n_checkpoints": 4, + "save_interval": 10000, "tensorboard_dir": "tensorboard", "log_dir": "logs", From 0f48e56c164efb02ffc9aabd1fa09fa106cbe44d Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Fri, 1 Sep 2023 14:04:57 +0900 Subject: [PATCH 026/183] add dataset --- tools/corpora.py | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/tools/corpora.py b/tools/corpora.py index 7bcde2484..f36285d07 100644 --- a/tools/corpora.py +++ b/tools/corpora.py @@ -307,6 +307,27 @@ class WikiJa(DataDownloader): "https://dumps.wikimedia.org/other/cirrussearch/20230807/jawiki-20230807-cirrussearch-content.json.gz", ] +class DataDownloaderWithHF(DataDownloader): + def __init__(self, hf_repo_ids = [], *args, **kwargs): + super().__init__(*args, **kwargs) + self.hf_repo_ids = hf_repo_ids + + def download(self): + 
super().download() + from huggingface_hub import snapshot_download + save_dir = os.path.join(self.base_dir, self.name) + for repo_id in self.hf_repo_ids: + snapshot_download(repo_id=repo_id, revision="main", allow_patterns="*.jsonl", local_dir=save_dir) + +class WikiOSCARJa(DataDownloader): + name = "wiki_oscar_ja" + urls = [ + "https://dumps.wikimedia.org/other/cirrussearch/20230807/jawiki-20230807-cirrussearch-content.json.gz", + ] + hf_repo_ids = [ + 'if001/oscar_2023_filtered' + ] + def maybe_download_gpt2_tokenizer_data(tokenizer_type, data_dir): if tokenizer_type is None or tokenizer_type == "GPT2BPETokenizer": @@ -340,7 +361,8 @@ def maybe_download_gpt2_tokenizer_data(tokenizer_type, data_dir): "c4_openwebtext": C4OpenWebText, "enwik8": Enwik8, 'wiki_ja_en': WikiJaEn, - 'wiki_ja': WikiJa + 'wiki_ja': WikiJa, + 'wiki_oscar_ja': WikiOSCARJa } From c4a8876128f766ae9bc5d433fe051d24d4329498 Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Fri, 1 Sep 2023 14:17:50 +0900 Subject: [PATCH 027/183] oscar --- tools/corpora.py | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/tools/corpora.py b/tools/corpora.py index f36285d07..5e3745837 100644 --- a/tools/corpora.py +++ b/tools/corpora.py @@ -319,7 +319,7 @@ def download(self): for repo_id in self.hf_repo_ids: snapshot_download(repo_id=repo_id, revision="main", allow_patterns="*.jsonl", local_dir=save_dir) -class WikiOSCARJa(DataDownloader): +class WikiOSCARJa(DataDownloaderWithHF): name = "wiki_oscar_ja" urls = [ "https://dumps.wikimedia.org/other/cirrussearch/20230807/jawiki-20230807-cirrussearch-content.json.gz", @@ -329,6 +329,25 @@ class WikiOSCARJa(DataDownloader): ] +class HFDataDownloader(DataDownloader): + def __init__(self, hf_repo_ids = [], *args, **kwargs): + super().__init__(*args, **kwargs) + self.hf_repo_ids = hf_repo_ids + + def download(self): + from huggingface_hub import snapshot_download + save_dir = os.path.join(self.base_dir, self.name) + for repo_id in self.hf_repo_ids: + snapshot_download(repo_id=repo_id, revision="main", allow_patterns="*.jsonl", local_dir=save_dir) + +class OSCARJa(HFDataDownloader): + name = "oscar_ja" + urls = [""] + hf_repo_ids = [ + 'if001/oscar_2023_filtered' + ] + + def maybe_download_gpt2_tokenizer_data(tokenizer_type, data_dir): if tokenizer_type is None or tokenizer_type == "GPT2BPETokenizer": GPT2_VOCAB_FP = f"{data_dir}//gpt2-vocab.json" @@ -362,6 +381,7 @@ def maybe_download_gpt2_tokenizer_data(tokenizer_type, data_dir): "enwik8": Enwik8, 'wiki_ja_en': WikiJaEn, 'wiki_ja': WikiJa, + 'oscar_ja': OSCARJa, 'wiki_oscar_ja': WikiOSCARJa } From 0a334487b76edac307d71a3cb6b87041cf2e98c8 Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Fri, 1 Sep 2023 14:19:45 +0900 Subject: [PATCH 028/183] debug --- tools/corpora.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/corpora.py b/tools/corpora.py index 5e3745837..39bd24b75 100644 --- a/tools/corpora.py +++ b/tools/corpora.py @@ -338,6 +338,7 @@ def download(self): from huggingface_hub import snapshot_download save_dir = os.path.join(self.base_dir, self.name) for repo_id in self.hf_repo_ids: + print('download', save_dir) snapshot_download(repo_id=repo_id, revision="main", allow_patterns="*.jsonl", local_dir=save_dir) class OSCARJa(HFDataDownloader): From 713f669c0af415689eb467926a8a314f3d9d59f0 Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Fri, 1 Sep 2023 14:21:58 +0900 Subject: [PATCH 029/183] debug --- tools/corpora.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git 
a/tools/corpora.py b/tools/corpora.py index 39bd24b75..5351cf22b 100644 --- a/tools/corpora.py +++ b/tools/corpora.py @@ -334,9 +334,10 @@ def __init__(self, hf_repo_ids = [], *args, **kwargs): super().__init__(*args, **kwargs) self.hf_repo_ids = hf_repo_ids - def download(self): + def download(self): from huggingface_hub import snapshot_download save_dir = os.path.join(self.base_dir, self.name) + print('donwload0', self.hf_repo_ids) for repo_id in self.hf_repo_ids: print('download', save_dir) snapshot_download(repo_id=repo_id, revision="main", allow_patterns="*.jsonl", local_dir=save_dir) From 1778c12437cd6976d62ede13b374986b823e21b1 Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Fri, 1 Sep 2023 14:24:25 +0900 Subject: [PATCH 030/183] debug --- tools/corpora.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/corpora.py b/tools/corpora.py index 5351cf22b..aad9644fa 100644 --- a/tools/corpora.py +++ b/tools/corpora.py @@ -345,7 +345,7 @@ def download(self): class OSCARJa(HFDataDownloader): name = "oscar_ja" urls = [""] - hf_repo_ids = [ + super().hf_repo_ids = [ 'if001/oscar_2023_filtered' ] From e80130c947c051994375e6cbf274ef43d38564d0 Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Fri, 1 Sep 2023 14:27:27 +0900 Subject: [PATCH 031/183] debug --- tools/corpora.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/tools/corpora.py b/tools/corpora.py index aad9644fa..3512af1d6 100644 --- a/tools/corpora.py +++ b/tools/corpora.py @@ -330,11 +330,15 @@ class WikiOSCARJa(DataDownloaderWithHF): class HFDataDownloader(DataDownloader): - def __init__(self, hf_repo_ids = [], *args, **kwargs): + def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) - self.hf_repo_ids = hf_repo_ids - def download(self): + @property + @abstractmethod + def hf_repo_ids(self): + pass + + def download(self): from huggingface_hub import snapshot_download save_dir = os.path.join(self.base_dir, self.name) print('donwload0', self.hf_repo_ids) @@ -345,9 +349,7 @@ def download(self): class OSCARJa(HFDataDownloader): name = "oscar_ja" urls = [""] - super().hf_repo_ids = [ - 'if001/oscar_2023_filtered' - ] + hf_repo_ids = ['if001/oscar_2023_filtered'] def maybe_download_gpt2_tokenizer_data(tokenizer_type, data_dir): From 84a60e4e9edeb81d4c3efea86f1da9f201940135 Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Fri, 1 Sep 2023 14:31:11 +0900 Subject: [PATCH 032/183] fix save --- tools/corpora.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/tools/corpora.py b/tools/corpora.py index 3512af1d6..8694a9533 100644 --- a/tools/corpora.py +++ b/tools/corpora.py @@ -339,12 +339,11 @@ def hf_repo_ids(self): pass def download(self): - from huggingface_hub import snapshot_download + from datasets import load_dataset save_dir = os.path.join(self.base_dir, self.name) - print('donwload0', self.hf_repo_ids) for repo_id in self.hf_repo_ids: - print('download', save_dir) - snapshot_download(repo_id=repo_id, revision="main", allow_patterns="*.jsonl", local_dir=save_dir) + ds=load_dataset(repo_id) + ds.save_to_disk(save_dir) class OSCARJa(HFDataDownloader): name = "oscar_ja" From 82805ce67118cfef4d81722ac2b3fbf7133af762 Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Fri, 1 Sep 2023 15:52:11 +0900 Subject: [PATCH 033/183] debug --- tools/corpora.py | 1 + tools/preprocess_data.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/tools/corpora.py b/tools/corpora.py index 8694a9533..f6699e427 100644 --- a/tools/corpora.py 
+++ b/tools/corpora.py @@ -343,6 +343,7 @@ def download(self): save_dir = os.path.join(self.base_dir, self.name) for repo_id in self.hf_repo_ids: ds=load_dataset(repo_id) + print('save to', save_dir) ds.save_to_disk(save_dir) class OSCARJa(HFDataDownloader): diff --git a/tools/preprocess_data.py b/tools/preprocess_data.py index a923afcc3..852342658 100644 --- a/tools/preprocess_data.py +++ b/tools/preprocess_data.py @@ -49,7 +49,7 @@ def initializer(self): def encode(self, text): if self.args.ftfy: text = ftfy.fix_text(text) - # print('text,', text) + print('text,', text) ids = {} for key in self.args.jsonl_keys: doc_ids = [] From 77edca715a539dfb942d4f3caf14ea7f511eae5f Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Fri, 1 Sep 2023 16:04:56 +0900 Subject: [PATCH 034/183] fix --- tools/corpora.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tools/corpora.py b/tools/corpora.py index f6699e427..e686b1aa6 100644 --- a/tools/corpora.py +++ b/tools/corpora.py @@ -342,9 +342,8 @@ def download(self): from datasets import load_dataset save_dir = os.path.join(self.base_dir, self.name) for repo_id in self.hf_repo_ids: - ds=load_dataset(repo_id) print('save to', save_dir) - ds.save_to_disk(save_dir) + load_dataset(repo_id, data_dir=save_dir) class OSCARJa(HFDataDownloader): name = "oscar_ja" From d3954c263c786baf5d5fa20f05603be839d4f25b Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Fri, 1 Sep 2023 16:13:20 +0900 Subject: [PATCH 035/183] fix save dir --- tools/corpora.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tools/corpora.py b/tools/corpora.py index e686b1aa6..5d4097b4c 100644 --- a/tools/corpora.py +++ b/tools/corpora.py @@ -339,11 +339,13 @@ def hf_repo_ids(self): pass def download(self): - from datasets import load_dataset + from datasets import load_dataset, config + from pathlib import Path save_dir = os.path.join(self.base_dir, self.name) for repo_id in self.hf_repo_ids: print('save to', save_dir) - load_dataset(repo_id, data_dir=save_dir) + config.DOWNLOADED_DATASETS_PATH = Path(save_dir) + load_dataset(repo_id) class OSCARJa(HFDataDownloader): name = "oscar_ja" From bcb1311a2fbe49b32f7d0eb6cf689200b9810527 Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Fri, 1 Sep 2023 16:53:07 +0900 Subject: [PATCH 036/183] ix --- tools/corpora.py | 13 ++++++------- tools/preprocess_data.py | 2 +- 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/tools/corpora.py b/tools/corpora.py index 5d4097b4c..ef944aa71 100644 --- a/tools/corpora.py +++ b/tools/corpora.py @@ -316,8 +316,8 @@ def download(self): super().download() from huggingface_hub import snapshot_download save_dir = os.path.join(self.base_dir, self.name) - for repo_id in self.hf_repo_ids: - snapshot_download(repo_id=repo_id, revision="main", allow_patterns="*.jsonl", local_dir=save_dir) + for repo_id in self.hf_repo_ids: + snapshot_download(repo_id=repo_id, revision="main", allow_patterns="*.jsonl", local_dir=save_dir, repo_type='dataset') class WikiOSCARJa(DataDownloaderWithHF): name = "wiki_oscar_ja" @@ -338,14 +338,13 @@ def __init__(self, *args, **kwargs): def hf_repo_ids(self): pass - def download(self): - from datasets import load_dataset, config - from pathlib import Path + def download(self): + from huggingface_hub import snapshot_download save_dir = os.path.join(self.base_dir, self.name) for repo_id in self.hf_repo_ids: print('save to', save_dir) - config.DOWNLOADED_DATASETS_PATH = Path(save_dir) - load_dataset(repo_id) + snapshot_download(repo_id=repo_id, 
allow_patterns="*.jsonl.zst", local_dir=save_dir) + class OSCARJa(HFDataDownloader): name = "oscar_ja" diff --git a/tools/preprocess_data.py b/tools/preprocess_data.py index 852342658..a923afcc3 100644 --- a/tools/preprocess_data.py +++ b/tools/preprocess_data.py @@ -49,7 +49,7 @@ def initializer(self): def encode(self, text): if self.args.ftfy: text = ftfy.fix_text(text) - print('text,', text) + # print('text,', text) ids = {} for key in self.args.jsonl_keys: doc_ids = [] From e0fb9c32b4501825974b535ea8b993f2796ccdce Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Fri, 1 Sep 2023 20:04:59 +0900 Subject: [PATCH 037/183] fix --- configs/local_setup_ja.yml | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/configs/local_setup_ja.yml b/configs/local_setup_ja.yml index 286f33d76..156f2196a 100644 --- a/configs/local_setup_ja.yml +++ b/configs/local_setup_ja.yml @@ -1,15 +1,15 @@ # Suggested data paths when using GPT-NeoX locally { # "data_path": "data/wiki_ja", - "data_path": "data/wiki_ja/wiki_ja_text_document", + # "data_path": "data/wiki_ja/wiki_ja_text_document", # or for weighted datasets: - # "train-data-paths": ["data/enwik8/enwik8_text_document", "data/enwik8/enwik8_text_document"], - # "test-data-paths": ["data/enwik8/enwik8_text_document", "data/enwik8/enwik8_text_document"], - # "valid-data-paths": ["data/enwik8/enwik8_text_document", "data/enwik8/enwik8_text_document"], - # "train-data-weights": [1., 2.], - # "test-data-weights": [2., 1.], - # "valid-data-weights": [0.5, 0.4], + "train-data-paths": ["data/wiki_ja/wiki_ja_text_document", "data/oscar_ja/oscar_ja_text_document"], + "test-data-paths": ["data/wiki_ja/wiki_ja_text_document", "data/oscar_ja/oscar_ja_text_document"], + "valid-data-paths": ["data/wiki_ja/wiki_ja_text_document", "data/oscar_ja/oscar_ja_text_document"], + "train-data-weights": [1., 1.], + "test-data-weights": [0.1, 0.1], + "valid-data-weights": [0.1, 0.1], # If weight_by_num_documents is True, Builds dataset weights from a multinomial distribution over groups of data according to the number of documents in each group. 
# WARNING: setting this to True will override any user provided weights From 3c210a0e68ab37bc08fdd93246a9692e76de00da Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Fri, 1 Sep 2023 20:38:21 +0900 Subject: [PATCH 038/183] fix config --- configs/49M.yml | 1 + configs/local_setup_ja.yml | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/configs/49M.yml b/configs/49M.yml index f9822de9b..71a0bb6dd 100644 --- a/configs/49M.yml +++ b/configs/49M.yml @@ -83,6 +83,7 @@ "checkpoint_factor": 1000, "eval_interval": 100000, "eval_iters": 10, + "save_interval": 10000, # logging "log_interval": 10, diff --git a/configs/local_setup_ja.yml b/configs/local_setup_ja.yml index 156f2196a..66d72be44 100644 --- a/configs/local_setup_ja.yml +++ b/configs/local_setup_ja.yml @@ -24,7 +24,6 @@ ## logging "log_dir": "logs", - "save_interval": 10000, "tensorboard_dir": "tensorboard", "log_dir": "logs", From afedf90a7540eafb23b4de2ee1befc77144635a1 Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Fri, 1 Sep 2023 20:40:44 +0900 Subject: [PATCH 039/183] fix config --- configs/49M.yml | 1 - configs/local_setup_ja.yml | 2 ++ 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/configs/49M.yml b/configs/49M.yml index 71a0bb6dd..f9822de9b 100644 --- a/configs/49M.yml +++ b/configs/49M.yml @@ -83,7 +83,6 @@ "checkpoint_factor": 1000, "eval_interval": 100000, "eval_iters": 10, - "save_interval": 10000, # logging "log_interval": 10, diff --git a/configs/local_setup_ja.yml b/configs/local_setup_ja.yml index 66d72be44..16fe3fb8a 100644 --- a/configs/local_setup_ja.yml +++ b/configs/local_setup_ja.yml @@ -21,6 +21,8 @@ "save": "checkpoints", "load": "checkpoints", "checkpoint_validation_with_forward_pass": False, + + "save_interval": 10000, ## logging "log_dir": "logs", From 7f9da23c7509ed46ceb81b4ba59c7cd1b55d696a Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Fri, 1 Sep 2023 20:41:31 +0900 Subject: [PATCH 040/183] fix config --- configs/local_setup_ja.yml | 2 -- 1 file changed, 2 deletions(-) diff --git a/configs/local_setup_ja.yml b/configs/local_setup_ja.yml index 16fe3fb8a..66d72be44 100644 --- a/configs/local_setup_ja.yml +++ b/configs/local_setup_ja.yml @@ -21,8 +21,6 @@ "save": "checkpoints", "load": "checkpoints", "checkpoint_validation_with_forward_pass": False, - - "save_interval": 10000, ## logging "log_dir": "logs", From e7d90d349821977fe22859f324ebd63a80d640d9 Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Fri, 1 Sep 2023 20:54:15 +0900 Subject: [PATCH 041/183] fix tokenizer --- configs/125M.yml | 5 ++++- configs/49M.yml | 3 +++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/configs/125M.yml b/configs/125M.yml index 15a4b3b01..504879123 100644 --- a/configs/125M.yml +++ b/configs/125M.yml @@ -90,5 +90,8 @@ "wall_clock_breakdown": true, # networking - "hostfile": "/mock_path" + "hostfile": "/mock_path", + + ## tokenizer type + "tokenizer_type": "SPMTokenizer" } diff --git a/configs/49M.yml b/configs/49M.yml index f9822de9b..94e08ea2d 100644 --- a/configs/49M.yml +++ b/configs/49M.yml @@ -89,4 +89,7 @@ "steps_per_print": 10, "keep_last_n_checkpoints": 4, "wall_clock_breakdown": true, + + ## tokenizer type + "tokenizer_type": "SPMTokenizer" } From 36ee68c70265f1717fd8eb2c28d883bc0fbf0177 Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Fri, 1 Sep 2023 21:00:45 +0900 Subject: [PATCH 042/183] add --- configs/19M.yml | 6 ++++-- configs/20B.yml | 3 ++- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/configs/19M.yml b/configs/19M.yml index 
83e5c594a..39d6247b8 100644 --- a/configs/19M.yml +++ b/configs/19M.yml @@ -90,6 +90,8 @@ "prof_all": true, "debug": false }, - } - + }, + + ## tokenizer type + "tokenizer_type": "SPMTokenizer" } diff --git a/configs/20B.yml b/configs/20B.yml index 243f794d0..46b44c04b 100644 --- a/configs/20B.yml +++ b/configs/20B.yml @@ -104,7 +104,8 @@ "wall_clock_breakdown": false, ### NEW DATA: #### - "tokenizer_type": "HFTokenizer", + # "tokenizer_type": "HFTokenizer", + "tokenizer_type": "SPMTokenizer" "tensorboard-dir": "./tensorboard", "log_dir": "./logs", From 2afafe03d2736c34e276d26aa6b4d3831f7bd7dc Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Sun, 3 Sep 2023 11:39:13 +0900 Subject: [PATCH 043/183] fix conf --- configs/19M.yml | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/configs/19M.yml b/configs/19M.yml index 39d6247b8..794c0c4ec 100644 --- a/configs/19M.yml +++ b/configs/19M.yml @@ -77,9 +77,11 @@ "checkpoint_factor": 1000, "eval_interval": 100000, "eval_iters": 10, + "keep_last_n_checkpoints": 4, + "save_interval": 10000, - "log_interval": 10, - "steps_per_print": 10, + "log_interval": 100, + "steps_per_print": 100, "wall_clock_breakdown": true, # additional deepspeed args not specified above From 43535a3ecb5ff634d78c4e1caa1910e998a45881 Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Sun, 3 Sep 2023 11:39:20 +0900 Subject: [PATCH 044/183] add en wiki --- tools/corpora.py | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/tools/corpora.py b/tools/corpora.py index ef944aa71..b878d4cad 100644 --- a/tools/corpora.py +++ b/tools/corpora.py @@ -293,10 +293,9 @@ class Enwik8(DataDownloader): urls = ["https://data.deepai.org/enwik8.zip"] -class WikiJaEn(DataDownloader): - name = "wiki_ja_en" - urls = [ - "https://dumps.wikimedia.org/other/cirrussearch/20230807/jawiki-20230807-cirrussearch-content.json.gz", +class WikiEn(DataDownloader): + name = "wiki_en" + urls = [ "https://dumps.wikimedia.org/other/cirrussearch/20230807/enwiki-20230807-cirrussearch-content.json.gz" ] @@ -328,7 +327,6 @@ class WikiOSCARJa(DataDownloaderWithHF): 'if001/oscar_2023_filtered' ] - class HFDataDownloader(DataDownloader): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) @@ -351,6 +349,11 @@ class OSCARJa(HFDataDownloader): urls = [""] hf_repo_ids = ['if001/oscar_2023_filtered'] +class AozoraJa(HFDataDownloader): + name = "aozora_ja" + urls = [""] + hf_repo_ids = ['globis-university/aozorabunko-clean'] + def maybe_download_gpt2_tokenizer_data(tokenizer_type, data_dir): if tokenizer_type is None or tokenizer_type == "GPT2BPETokenizer": @@ -383,10 +386,11 @@ def maybe_download_gpt2_tokenizer_data(tokenizer_type, data_dir): "c4": C4, "c4_openwebtext": C4OpenWebText, "enwik8": Enwik8, - 'wiki_ja_en': WikiJaEn, + 'wiki_en': WikiEn, 'wiki_ja': WikiJa, 'oscar_ja': OSCARJa, - 'wiki_oscar_ja': WikiOSCARJa + 'wiki_oscar_ja': WikiOSCARJa, + 'aozora_ja': AozoraJa } From acbb2792e6a17bbfd2fd3ffa51b81653dc703cd7 Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Sun, 3 Sep 2023 22:49:04 +0900 Subject: [PATCH 045/183] fix pattern --- tools/corpora.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tools/corpora.py b/tools/corpora.py index b878d4cad..efbcd497e 100644 --- a/tools/corpora.py +++ b/tools/corpora.py @@ -341,7 +341,11 @@ def download(self): save_dir = os.path.join(self.base_dir, self.name) for repo_id in self.hf_repo_ids: print('save to', save_dir) - snapshot_download(repo_id=repo_id, 
allow_patterns="*.jsonl.zst", local_dir=save_dir) + if 'if001/oscar_2023_filtered' == repo_id: + allow_patterns="*.jsonl.zst" + if 'globis-university/aozorabunko-clean' == repo_id: + allow_patterns="*.jsonl.gz" + snapshot_download(repo_id=repo_id, allow_patterns=allow_patterns, local_dir=save_dir) class OSCARJa(HFDataDownloader): From dc4d9994aba628d8da9b69fef36dc22ed5e7d2bd Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Sun, 3 Sep 2023 22:51:54 +0900 Subject: [PATCH 046/183] fix type --- tools/corpora.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/corpora.py b/tools/corpora.py index efbcd497e..e615e4107 100644 --- a/tools/corpora.py +++ b/tools/corpora.py @@ -345,7 +345,7 @@ def download(self): allow_patterns="*.jsonl.zst" if 'globis-university/aozorabunko-clean' == repo_id: allow_patterns="*.jsonl.gz" - snapshot_download(repo_id=repo_id, allow_patterns=allow_patterns, local_dir=save_dir) + snapshot_download(repo_id=repo_id, allow_patterns=allow_patterns, local_dir=save_dir, repo_type="dataset") class OSCARJa(HFDataDownloader): From 8d75b6dae8ff05f7dc7c0a5183121dbfc081a029 Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Tue, 5 Sep 2023 20:54:12 +0900 Subject: [PATCH 047/183] for aozora --- tools/corpora.py | 4 ++-- tools/preprocess_data.py | 10 +++++++--- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/tools/corpora.py b/tools/corpora.py index e615e4107..9430c66c0 100644 --- a/tools/corpora.py +++ b/tools/corpora.py @@ -343,7 +343,7 @@ def download(self): print('save to', save_dir) if 'if001/oscar_2023_filtered' == repo_id: allow_patterns="*.jsonl.zst" - if 'globis-university/aozorabunko-clean' == repo_id: + if 'if001/aozorabunko-clean-sin' == repo_id: allow_patterns="*.jsonl.gz" snapshot_download(repo_id=repo_id, allow_patterns=allow_patterns, local_dir=save_dir, repo_type="dataset") @@ -356,7 +356,7 @@ class OSCARJa(HFDataDownloader): class AozoraJa(HFDataDownloader): name = "aozora_ja" urls = [""] - hf_repo_ids = ['globis-university/aozorabunko-clean'] + hf_repo_ids = ['if001/aozorabunko-clean-sin'] def maybe_download_gpt2_tokenizer_data(tokenizer_type, data_dir): diff --git a/tools/preprocess_data.py b/tools/preprocess_data.py index a923afcc3..1a780a1ed 100644 --- a/tools/preprocess_data.py +++ b/tools/preprocess_data.py @@ -173,16 +173,20 @@ def wiki_yielder(fname, semaphore): semaphore.acquire() yield f['text'] + def aozora_yielder(fname, semaphore): + for f in filter(lambda x: x, lmd.Reader(fname).stream_data()): + semaphore.acquire() + yield f['text'] + for fname in fnames: semaphore.acquire() print('fname', fname) if 'wiki' in fname: yield from wiki_yielder(fname, semaphore) + if 'aozora' in fname: + yield from aozora_yielder(fname, semaphore) else: yield from yielder(fname, semaphore) - - - def main(): args = get_args() From 45c24c3fac5fe683c025449dbdd17f28d3424159 Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Tue, 5 Sep 2023 20:59:37 +0900 Subject: [PATCH 048/183] debug --- tools/preprocess_data.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/preprocess_data.py b/tools/preprocess_data.py index 1a780a1ed..19550a53a 100644 --- a/tools/preprocess_data.py +++ b/tools/preprocess_data.py @@ -176,6 +176,7 @@ def wiki_yielder(fname, semaphore): def aozora_yielder(fname, semaphore): for f in filter(lambda x: x, lmd.Reader(fname).stream_data()): semaphore.acquire() + print('f', f) yield f['text'] for fname in fnames: From fde35c223d5aed3ab416d1eda9c51eb991a15f66 Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Tue, 5 
Sep 2023 21:01:47 +0900 Subject: [PATCH 049/183] debug --- tools/preprocess_data.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/preprocess_data.py b/tools/preprocess_data.py index 19550a53a..ef6f8d750 100644 --- a/tools/preprocess_data.py +++ b/tools/preprocess_data.py @@ -177,6 +177,7 @@ def aozora_yielder(fname, semaphore): for f in filter(lambda x: x, lmd.Reader(fname).stream_data()): semaphore.acquire() print('f', f) + print('f text', f['text']) yield f['text'] for fname in fnames: From 2391f3a1cf572a03f9612c1bda0d08f1a7a24d16 Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Tue, 5 Sep 2023 21:03:21 +0900 Subject: [PATCH 050/183] debug --- tools/preprocess_data.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/preprocess_data.py b/tools/preprocess_data.py index ef6f8d750..3e64b5709 100644 --- a/tools/preprocess_data.py +++ b/tools/preprocess_data.py @@ -177,6 +177,7 @@ def aozora_yielder(fname, semaphore): for f in filter(lambda x: x, lmd.Reader(fname).stream_data()): semaphore.acquire() print('f', f) + print('f type', type(f)) print('f text', f['text']) yield f['text'] From a2ba8e46c1cb6b67777b7ebdb8b4ee8563395fdb Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Tue, 5 Sep 2023 21:04:33 +0900 Subject: [PATCH 051/183] debug --- tools/preprocess_data.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tools/preprocess_data.py b/tools/preprocess_data.py index 3e64b5709..14b8b8aeb 100644 --- a/tools/preprocess_data.py +++ b/tools/preprocess_data.py @@ -32,6 +32,7 @@ import tqdm import torch import ftfy +import json from megatron.tokenizer import build_tokenizer from megatron.data import indexed_dataset @@ -178,8 +179,8 @@ def aozora_yielder(fname, semaphore): semaphore.acquire() print('f', f) print('f type', type(f)) - print('f text', f['text']) - yield f['text'] + print('f text', f['text']) + yield json.load(f)['text'] for fname in fnames: semaphore.acquire() From 0b633fc239b4ca3a7bdf3981942705d9920caf57 Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Tue, 5 Sep 2023 21:06:46 +0900 Subject: [PATCH 052/183] debug --- tools/preprocess_data.py | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/tools/preprocess_data.py b/tools/preprocess_data.py index 14b8b8aeb..909baba05 100644 --- a/tools/preprocess_data.py +++ b/tools/preprocess_data.py @@ -174,21 +174,13 @@ def wiki_yielder(fname, semaphore): semaphore.acquire() yield f['text'] - def aozora_yielder(fname, semaphore): - for f in filter(lambda x: x, lmd.Reader(fname).stream_data()): - semaphore.acquire() - print('f', f) - print('f type', type(f)) - print('f text', f['text']) - yield json.load(f)['text'] - for fname in fnames: semaphore.acquire() print('fname', fname) if 'wiki' in fname: yield from wiki_yielder(fname, semaphore) if 'aozora' in fname: - yield from aozora_yielder(fname, semaphore) + yield from wiki_yielder(fname, semaphore) else: yield from yielder(fname, semaphore) From 50eeaeaee36f655793c004a93e65285dcb64e934 Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Tue, 5 Sep 2023 21:12:27 +0900 Subject: [PATCH 053/183] debug --- tools/preprocess_data.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tools/preprocess_data.py b/tools/preprocess_data.py index 909baba05..158312caa 100644 --- a/tools/preprocess_data.py +++ b/tools/preprocess_data.py @@ -174,6 +174,12 @@ def wiki_yielder(fname, semaphore): semaphore.acquire() yield f['text'] + def wiki_yielder(fname, semaphore): + for f in filter(lambda x: x, lmd.Reader(fname).stream_data()): + 
semaphore.acquire() + print('type', type(f)) + yield f['text'] + for fname in fnames: semaphore.acquire() print('fname', fname) From 0a7957ea6deb50edd6dbd4342ea10f2ac7f69ab8 Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Tue, 5 Sep 2023 21:13:08 +0900 Subject: [PATCH 054/183] debug --- tools/preprocess_data.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tools/preprocess_data.py b/tools/preprocess_data.py index 158312caa..7d5a5ca18 100644 --- a/tools/preprocess_data.py +++ b/tools/preprocess_data.py @@ -177,8 +177,7 @@ def wiki_yielder(fname, semaphore): def wiki_yielder(fname, semaphore): for f in filter(lambda x: x, lmd.Reader(fname).stream_data()): semaphore.acquire() - print('type', type(f)) - yield f['text'] + yield json.load(f)['text'] for fname in fnames: semaphore.acquire() From 73496e70acf4fe4aaa3758279b7dc59100fc1f0c Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Tue, 5 Sep 2023 21:13:35 +0900 Subject: [PATCH 055/183] debug --- tools/preprocess_data.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/preprocess_data.py b/tools/preprocess_data.py index 7d5a5ca18..bb38ddeff 100644 --- a/tools/preprocess_data.py +++ b/tools/preprocess_data.py @@ -174,7 +174,7 @@ def wiki_yielder(fname, semaphore): semaphore.acquire() yield f['text'] - def wiki_yielder(fname, semaphore): + def aozora_yielder(fname, semaphore): for f in filter(lambda x: x, lmd.Reader(fname).stream_data()): semaphore.acquire() yield json.load(f)['text'] @@ -185,7 +185,7 @@ def wiki_yielder(fname, semaphore): if 'wiki' in fname: yield from wiki_yielder(fname, semaphore) if 'aozora' in fname: - yield from wiki_yielder(fname, semaphore) + yield from aozora_yielder(fname, semaphore) else: yield from yielder(fname, semaphore) From 23df3e1f038e0e5fcd420fe38cce2a2751777044 Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Tue, 5 Sep 2023 21:14:18 +0900 Subject: [PATCH 056/183] debug --- tools/preprocess_data.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/preprocess_data.py b/tools/preprocess_data.py index bb38ddeff..052799fb9 100644 --- a/tools/preprocess_data.py +++ b/tools/preprocess_data.py @@ -177,7 +177,7 @@ def wiki_yielder(fname, semaphore): def aozora_yielder(fname, semaphore): for f in filter(lambda x: x, lmd.Reader(fname).stream_data()): semaphore.acquire() - yield json.load(f)['text'] + yield json.loads(f)['text'] for fname in fnames: semaphore.acquire() From 2a47daf1adcc799e2206405b440f700571577170 Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Wed, 6 Sep 2023 13:38:51 +0900 Subject: [PATCH 057/183] fix config --- configs/local_setup_ja.yml | 12 ++++++------ tools/corpora.py | 39 ++++++++++++++++++++++++++++++++++---- 2 files changed, 41 insertions(+), 10 deletions(-) diff --git a/configs/local_setup_ja.yml b/configs/local_setup_ja.yml index 66d72be44..d0c4d19d0 100644 --- a/configs/local_setup_ja.yml +++ b/configs/local_setup_ja.yml @@ -4,12 +4,12 @@ # "data_path": "data/wiki_ja/wiki_ja_text_document", # or for weighted datasets: - "train-data-paths": ["data/wiki_ja/wiki_ja_text_document", "data/oscar_ja/oscar_ja_text_document"], - "test-data-paths": ["data/wiki_ja/wiki_ja_text_document", "data/oscar_ja/oscar_ja_text_document"], - "valid-data-paths": ["data/wiki_ja/wiki_ja_text_document", "data/oscar_ja/oscar_ja_text_document"], - "train-data-weights": [1., 1.], - "test-data-weights": [0.1, 0.1], - "valid-data-weights": [0.1, 0.1], + "train-data-paths": ["data/wiki_ja_novelAI_bin/wiki_ja_text_document", 
"data/oscar_ja_novelAI_bin/oscar_ja_text_document","wiki_en_novelAI_bin/wiki_en_text_document" ,"aozora_ja_novelAI_bin/aozora_ja_text_document"], + "test-data-paths": ["data/wiki_ja_novelAI_bin/wiki_ja_text_document", "data/oscar_ja_novelAI_bin/oscar_ja_text_document","wiki_en_novelAI_bin/wiki_en_text_document" ,"aozora_ja_novelAI_bin/aozora_ja_text_document"], + "valid-data-paths": ["data/wiki_ja_novelAI_bin/wiki_ja_text_document", "data/oscar_ja_novelAI_bin/oscar_ja_text_document","wiki_en_novelAI_bin/wiki_en_text_document" ,"aozora_ja_novelAI_bin/aozora_ja_text_document"], + "train-data-weights": [0.9, 0.9, 0.9, 0.9], + "test-data-weights": [0.1, 0.1, 0.1, 0.1], + "valid-data-weights": [0.1, 0.1, 0.1, 0.1], # If weight_by_num_documents is True, Builds dataset weights from a multinomial distribution over groups of data according to the number of documents in each group. # WARNING: setting this to True will override any user provided weights diff --git a/tools/corpora.py b/tools/corpora.py index 9430c66c0..390b47954 100644 --- a/tools/corpora.py +++ b/tools/corpora.py @@ -327,7 +327,7 @@ class WikiOSCARJa(DataDownloaderWithHF): 'if001/oscar_2023_filtered' ] -class HFDataDownloader(DataDownloader): +class HFSnapshotDownloader(DataDownloader): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) @@ -341,6 +341,7 @@ def download(self): save_dir = os.path.join(self.base_dir, self.name) for repo_id in self.hf_repo_ids: print('save to', save_dir) + allow_patterns = None if 'if001/oscar_2023_filtered' == repo_id: allow_patterns="*.jsonl.zst" if 'if001/aozorabunko-clean-sin' == repo_id: @@ -348,16 +349,45 @@ def download(self): snapshot_download(repo_id=repo_id, allow_patterns=allow_patterns, local_dir=save_dir, repo_type="dataset") -class OSCARJa(HFDataDownloader): +class OSCARJa(HFSnapshotDownloader): name = "oscar_ja" urls = [""] hf_repo_ids = ['if001/oscar_2023_filtered'] -class AozoraJa(HFDataDownloader): +class AozoraJa(HFSnapshotDownloader): name = "aozora_ja" urls = [""] hf_repo_ids = ['if001/aozorabunko-clean-sin'] +class HFDataDownloader(DataDownloader): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + @property + @abstractmethod + def hf_repo_ids(self): + pass + + def download(self): + from datasets import load_dataset + save_dir = os.path.join(self.base_dir, self.name) + for repo_id in self.hf_repo_ids: + ds = load_dataset(repo_id) + name = repo_id.split('/')[0] + save_path = f'{save_dir}/{name}.json' + print('save to', save_path) + ds['train'].to_json(save_path) + + +class IzumiDataset(HFSnapshotDownloader): + name = "izumi_dataset" + urls = [""] + hf_repo_ids = [ + "izumi-lab/wikipedia-ja-20230720", + "izumi-lab/wikipedia-en-20230720", + "izumi-lab/wikinews-ja-20230728" + ] + def maybe_download_gpt2_tokenizer_data(tokenizer_type, data_dir): if tokenizer_type is None or tokenizer_type == "GPT2BPETokenizer": @@ -394,7 +424,8 @@ def maybe_download_gpt2_tokenizer_data(tokenizer_type, data_dir): 'wiki_ja': WikiJa, 'oscar_ja': OSCARJa, 'wiki_oscar_ja': WikiOSCARJa, - 'aozora_ja': AozoraJa + 'aozora_ja': AozoraJa, + 'izumi_dataset': IzumiDataset } From 0e6f1e268ac9676438499ec388ea92e6d8dee694 Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Wed, 6 Sep 2023 14:04:32 +0900 Subject: [PATCH 058/183] fix config --- configs/19M.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/configs/19M.yml b/configs/19M.yml index 794c0c4ec..af111c57d 100644 --- a/configs/19M.yml +++ b/configs/19M.yml @@ -76,12 +76,12 @@ "warmup": 
0.01, "checkpoint_factor": 1000, "eval_interval": 100000, - "eval_iters": 10, + "eval_iters": 1000, "keep_last_n_checkpoints": 4, - "save_interval": 10000, + "save_iters": 1000, - "log_interval": 100, - "steps_per_print": 100, + "log_interval": 1000, + "steps_per_print": 1000, "wall_clock_breakdown": true, # additional deepspeed args not specified above From 5e12b35e11e7e8dcf27626bad80adb07962fd709 Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Wed, 6 Sep 2023 14:13:37 +0900 Subject: [PATCH 059/183] fix config --- configs/local_setup_ja.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/configs/local_setup_ja.yml b/configs/local_setup_ja.yml index d0c4d19d0..29af25041 100644 --- a/configs/local_setup_ja.yml +++ b/configs/local_setup_ja.yml @@ -4,9 +4,9 @@ # "data_path": "data/wiki_ja/wiki_ja_text_document", # or for weighted datasets: - "train-data-paths": ["data/wiki_ja_novelAI_bin/wiki_ja_text_document", "data/oscar_ja_novelAI_bin/oscar_ja_text_document","wiki_en_novelAI_bin/wiki_en_text_document" ,"aozora_ja_novelAI_bin/aozora_ja_text_document"], - "test-data-paths": ["data/wiki_ja_novelAI_bin/wiki_ja_text_document", "data/oscar_ja_novelAI_bin/oscar_ja_text_document","wiki_en_novelAI_bin/wiki_en_text_document" ,"aozora_ja_novelAI_bin/aozora_ja_text_document"], - "valid-data-paths": ["data/wiki_ja_novelAI_bin/wiki_ja_text_document", "data/oscar_ja_novelAI_bin/oscar_ja_text_document","wiki_en_novelAI_bin/wiki_en_text_document" ,"aozora_ja_novelAI_bin/aozora_ja_text_document"], + "train-data-paths": ["data/wiki_ja_novelAI_bin/wiki_ja_text_document", "data/oscar_ja_novelAI_bin/oscar_ja_text_document","data/wiki_en_novelAI_bin/wiki_en_text_document" ,"data/aozora_ja_novelAI_bin/aozora_ja_text_document"], + "test-data-paths": ["data/wiki_ja_novelAI_bin/wiki_ja_text_document", "data/oscar_ja_novelAI_bin/oscar_ja_text_document","data/wiki_en_novelAI_bin/wiki_en_text_document" ,"data/aozora_ja_novelAI_bin/aozora_ja_text_document"], + "valid-data-paths": ["data/wiki_ja_novelAI_bin/wiki_ja_text_document", "data/oscar_ja_novelAI_bin/oscar_ja_text_document","data/wiki_en_novelAI_bin/wiki_en_text_document" ,"data/aozora_ja_novelAI_bin/aozora_ja_text_document"], "train-data-weights": [0.9, 0.9, 0.9, 0.9], "test-data-weights": [0.1, 0.1, 0.1, 0.1], "valid-data-weights": [0.1, 0.1, 0.1, 0.1], From 56e601468acc20782d9aca3c25431ade74be14e7 Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Wed, 6 Sep 2023 15:04:33 +0900 Subject: [PATCH 060/183] debug --- megatron/training.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/megatron/training.py b/megatron/training.py index 96a94a1d0..ac33ea6cd 100644 --- a/megatron/training.py +++ b/megatron/training.py @@ -814,6 +814,7 @@ def train( lr = 0 # Logging. + print('hoge'*100) report_memory_flag = training_log( neox_args=neox_args, timers=timers, @@ -828,7 +829,7 @@ def train( optimizer=optimizer, noise_scale_logger=noise_scale_logger, ) - + print('bbbb'*100) # Checkpointing if neox_args.save and iteration in neox_args.save_iters: save_checkpoint( From 70e9094721fb81540da6a303304837424b921e2a Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Wed, 6 Sep 2023 15:06:19 +0900 Subject: [PATCH 061/183] debug --- megatron/training.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/megatron/training.py b/megatron/training.py index ac33ea6cd..4422ebb58 100644 --- a/megatron/training.py +++ b/megatron/training.py @@ -814,7 +814,7 @@ def train( lr = 0 # Logging. 
- print('hoge'*100) + print('aaaa'*100) report_memory_flag = training_log( neox_args=neox_args, timers=timers, @@ -846,6 +846,7 @@ def train( and iteration % neox_args.eval_interval == 0 and neox_args.do_valid ): + print('cccc'*100) prefix = "iteration {}".format(iteration) evaluate_and_print_results( neox_args=neox_args, @@ -857,7 +858,8 @@ def train( verbose=False, timers=timers, ) - + print('dddd'*100) + print('eeee'*100) if neox_args.exit_interval and iteration % neox_args.exit_interval == 0: torch.distributed.barrier() time_str = datetime.now().strftime("%Y-%m-%d %H:%M:%S") @@ -868,7 +870,7 @@ def train( ) ) sys.exit() - + print('ffff'*100) return iteration From 92761651f1f0e475cf4a68261cb1566ecc98e3d7 Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Wed, 6 Sep 2023 15:08:14 +0900 Subject: [PATCH 062/183] debug --- megatron/training.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/megatron/training.py b/megatron/training.py index 4422ebb58..493c5fe52 100644 --- a/megatron/training.py +++ b/megatron/training.py @@ -791,6 +791,7 @@ def train( # to monitor if we've skipped many iterations in a row and trigger an early exit overflow_monitor = OverflowMonitor(optimizer) while iteration < neox_args.train_iters: + print('0000'*100) loss_dict, skipped_iter = train_step( neox_args=neox_args, timers=timers, @@ -799,13 +800,14 @@ def train( optimizer=optimizer, lr_scheduler=lr_scheduler, ) + print('1111'*100) iteration += 1 neox_args.iteration = iteration if neox_args.precision == "fp16": overflow_monitor.check(skipped_iter) # check for repeated overflow if neox_args.log_gradient_noise_scale: # log noise scale if applicable noise_scale_logger.update() - + print('2222'*100) # get learning rate (if present) - if doing soft prompt tuning + pipe parallel, you # may have no tunable parameters on a specific rank if optimizer.param_groups: From 621241096462e02fee315a820a8eeffc208ffe04 Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Wed, 6 Sep 2023 15:10:04 +0900 Subject: [PATCH 063/183] debug --- megatron/training.py | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/megatron/training.py b/megatron/training.py index 493c5fe52..58afe1030 100644 --- a/megatron/training.py +++ b/megatron/training.py @@ -705,6 +705,7 @@ def train_step(neox_args, timers, data_iterator, model, optimizer, lr_scheduler) for _ in range(neox_args.gradient_accumulation_steps): # Forward model for one step. timers("forward").start() + print('aaaa'*100) loss = forward_step( neox_args=neox_args, timers=timers, @@ -712,6 +713,7 @@ def train_step(neox_args, timers, data_iterator, model, optimizer, lr_scheduler) model=model, is_train=True, ) + print('bbbb'*100) timers("forward").stop() losses.append(loss) # Calculate gradients, reduce across processes, and clip. @@ -723,6 +725,7 @@ def train_step(neox_args, timers, data_iterator, model, optimizer, lr_scheduler) model=model, loss=loss, ) + print('ccccc'*100) timers("backward").stop() # Update parameters. 
timers("optimizer").start() @@ -734,7 +737,7 @@ def train_step(neox_args, timers, data_iterator, model, optimizer, lr_scheduler) reduced_loss = { "lm_loss": reduce_losses(losses).mean() } # reduces losses across machines for logging - + print('dddd'*100) if neox_args.precision == "fp16" and model.optimizer.overflow: skipped_iter = 1 else: @@ -807,7 +810,7 @@ def train( overflow_monitor.check(skipped_iter) # check for repeated overflow if neox_args.log_gradient_noise_scale: # log noise scale if applicable noise_scale_logger.update() - print('2222'*100) + # get learning rate (if present) - if doing soft prompt tuning + pipe parallel, you # may have no tunable parameters on a specific rank if optimizer.param_groups: @@ -816,7 +819,6 @@ def train( lr = 0 # Logging. - print('aaaa'*100) report_memory_flag = training_log( neox_args=neox_args, timers=timers, @@ -831,7 +833,6 @@ def train( optimizer=optimizer, noise_scale_logger=noise_scale_logger, ) - print('bbbb'*100) # Checkpointing if neox_args.save and iteration in neox_args.save_iters: save_checkpoint( @@ -848,7 +849,6 @@ def train( and iteration % neox_args.eval_interval == 0 and neox_args.do_valid ): - print('cccc'*100) prefix = "iteration {}".format(iteration) evaluate_and_print_results( neox_args=neox_args, @@ -860,8 +860,7 @@ def train( verbose=False, timers=timers, ) - print('dddd'*100) - print('eeee'*100) + if neox_args.exit_interval and iteration % neox_args.exit_interval == 0: torch.distributed.barrier() time_str = datetime.now().strftime("%Y-%m-%d %H:%M:%S") @@ -872,7 +871,7 @@ def train( ) ) sys.exit() - print('ffff'*100) + return iteration From 176f4ea6db323b904eeef4f6244b9b2f86a7016b Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Wed, 6 Sep 2023 15:11:57 +0900 Subject: [PATCH 064/183] debug --- megatron/training.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/megatron/training.py b/megatron/training.py index 58afe1030..fe0f5904f 100644 --- a/megatron/training.py +++ b/megatron/training.py @@ -705,7 +705,6 @@ def train_step(neox_args, timers, data_iterator, model, optimizer, lr_scheduler) for _ in range(neox_args.gradient_accumulation_steps): # Forward model for one step. timers("forward").start() - print('aaaa'*100) loss = forward_step( neox_args=neox_args, timers=timers, @@ -713,7 +712,6 @@ def train_step(neox_args, timers, data_iterator, model, optimizer, lr_scheduler) model=model, is_train=True, ) - print('bbbb'*100) timers("forward").stop() losses.append(loss) # Calculate gradients, reduce across processes, and clip. @@ -724,8 +722,7 @@ def train_step(neox_args, timers, data_iterator, model, optimizer, lr_scheduler) optimizer=optimizer, model=model, loss=loss, - ) - print('ccccc'*100) + ) timers("backward").stop() # Update parameters. timers("optimizer").start() @@ -737,7 +734,7 @@ def train_step(neox_args, timers, data_iterator, model, optimizer, lr_scheduler) reduced_loss = { "lm_loss": reduce_losses(losses).mean() } # reduces losses across machines for logging - print('dddd'*100) + if neox_args.precision == "fp16" and model.optimizer.overflow: skipped_iter = 1 else: @@ -750,7 +747,9 @@ def train_step_pipe(neox_args, timers, model, data_iterator): """Single training step with DeepSpeed's pipeline parallel engine.""" assert neox_args.deepspeed + print('aaaa'*100) loss = model.train_batch(data_iter=data_iterator) + print('bbbb'*100) loss_dict = {"lm_loss": loss} # Don't break Megatron's timers because we changed code paths. 
for t in [ From 4412f2021ab4c0da7ae69be8cc144be1b7c5246d Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Wed, 6 Sep 2023 15:16:53 +0900 Subject: [PATCH 065/183] debug --- megatron/training.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/megatron/training.py b/megatron/training.py index fe0f5904f..dd7b4a054 100644 --- a/megatron/training.py +++ b/megatron/training.py @@ -746,10 +746,8 @@ def train_step(neox_args, timers, data_iterator, model, optimizer, lr_scheduler) def train_step_pipe(neox_args, timers, model, data_iterator): """Single training step with DeepSpeed's pipeline parallel engine.""" - assert neox_args.deepspeed - print('aaaa'*100) - loss = model.train_batch(data_iter=data_iterator) - print('bbbb'*100) + assert neox_args.deepspeed + loss = model.train_batch(data_iter=data_iterator) loss_dict = {"lm_loss": loss} # Don't break Megatron's timers because we changed code paths. for t in [ From da157ea02cd6b0a2dd82a4805ad2bf7cf15e4c2f Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Wed, 6 Sep 2023 15:17:03 +0900 Subject: [PATCH 066/183] debug --- configs/19M.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configs/19M.yml b/configs/19M.yml index af111c57d..8d470be8a 100644 --- a/configs/19M.yml +++ b/configs/19M.yml @@ -88,7 +88,7 @@ "deepspeed_extra_args": { "comms_logger": { "enabled": true, - "verbose": true, + "verbose": false, "prof_all": true, "debug": false }, From 788a87124bef62d7a9d09fa46a5fcf97b8d20470 Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Wed, 6 Sep 2023 15:19:33 +0900 Subject: [PATCH 067/183] debug --- configs/19M.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/configs/19M.yml b/configs/19M.yml index 8d470be8a..94648b3a8 100644 --- a/configs/19M.yml +++ b/configs/19M.yml @@ -87,9 +87,9 @@ # additional deepspeed args not specified above "deepspeed_extra_args": { "comms_logger": { - "enabled": true, + "enabled": false, "verbose": false, - "prof_all": true, + "prof_all": false, "debug": false }, }, From 99060c291b93ac89f3ce0591a1e3d1ef7a46a360 Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Wed, 6 Sep 2023 15:20:05 +0900 Subject: [PATCH 068/183] debug --- megatron/training.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/megatron/training.py b/megatron/training.py index dd7b4a054..ab598cc5e 100644 --- a/megatron/training.py +++ b/megatron/training.py @@ -790,8 +790,7 @@ def train( # to monitor if we've skipped many iterations in a row and trigger an early exit overflow_monitor = OverflowMonitor(optimizer) - while iteration < neox_args.train_iters: - print('0000'*100) + while iteration < neox_args.train_iters: loss_dict, skipped_iter = train_step( neox_args=neox_args, timers=timers, @@ -800,7 +799,6 @@ def train( optimizer=optimizer, lr_scheduler=lr_scheduler, ) - print('1111'*100) iteration += 1 neox_args.iteration = iteration if neox_args.precision == "fp16": From ee3b182a4ae451d243238bf5ce5cdaa72a0866f6 Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Wed, 6 Sep 2023 15:23:10 +0900 Subject: [PATCH 069/183] debug --- megatron/training.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/megatron/training.py b/megatron/training.py index ab598cc5e..5c0f60cb1 100644 --- a/megatron/training.py +++ b/megatron/training.py @@ -746,8 +746,10 @@ def train_step(neox_args, timers, data_iterator, model, optimizer, lr_scheduler) def train_step_pipe(neox_args, timers, model, data_iterator): """Single training step with DeepSpeed's pipeline 
parallel engine.""" - assert neox_args.deepspeed - loss = model.train_batch(data_iter=data_iterator) + assert neox_args.deepspeed + print('0000'*100) + loss = model.train_batch(data_iter=data_iterator) + print('1111'*100) loss_dict = {"lm_loss": loss} # Don't break Megatron's timers because we changed code paths. for t in [ @@ -805,7 +807,7 @@ def train( overflow_monitor.check(skipped_iter) # check for repeated overflow if neox_args.log_gradient_noise_scale: # log noise scale if applicable noise_scale_logger.update() - + print('2222'*100) # get learning rate (if present) - if doing soft prompt tuning + pipe parallel, you # may have no tunable parameters on a specific rank if optimizer.param_groups: @@ -828,6 +830,7 @@ def train( optimizer=optimizer, noise_scale_logger=noise_scale_logger, ) + print('3333'*100) # Checkpointing if neox_args.save and iteration in neox_args.save_iters: save_checkpoint( From d3851e113e86613a752fa1d41138b0cff5c42313 Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Wed, 6 Sep 2023 15:34:15 +0900 Subject: [PATCH 070/183] debug --- megatron/training.py | 5 +---- megatron/utils.py | 3 +++ 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/megatron/training.py b/megatron/training.py index 5c0f60cb1..8a4c8cf8f 100644 --- a/megatron/training.py +++ b/megatron/training.py @@ -747,9 +747,7 @@ def train_step_pipe(neox_args, timers, model, data_iterator): """Single training step with DeepSpeed's pipeline parallel engine.""" assert neox_args.deepspeed - print('0000'*100) loss = model.train_batch(data_iter=data_iterator) - print('1111'*100) loss_dict = {"lm_loss": loss} # Don't break Megatron's timers because we changed code paths. for t in [ @@ -806,8 +804,7 @@ def train( if neox_args.precision == "fp16": overflow_monitor.check(skipped_iter) # check for repeated overflow if neox_args.log_gradient_noise_scale: # log noise scale if applicable - noise_scale_logger.update() - print('2222'*100) + noise_scale_logger.update() # get learning rate (if present) - if doing soft prompt tuning + pipe parallel, you # may have no tunable parameters on a specific rank if optimizer.param_groups: diff --git a/megatron/utils.py b/megatron/utils.py index 0071ef872..e1e835835 100644 --- a/megatron/utils.py +++ b/megatron/utils.py @@ -301,6 +301,9 @@ def log(self, names, normalizer=1.0, reset=True): for name in names: elapsed_time = self.timers[name].elapsed(reset=reset) * 1000.0 / normalizer string += " | {}: {:.2f}".format(name, elapsed_time) + print('log'*100) + print(string) + print('log'*100) if torch.distributed.is_initialized(): if torch.distributed.get_rank() == 0: print(string, flush=True) From fea9ef72f406ebf4b0137207f53289dee86d0581 Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Wed, 6 Sep 2023 15:39:30 +0900 Subject: [PATCH 071/183] debug --- megatron/__init__.py | 3 ++- megatron/training.py | 5 +++-- megatron/utils.py | 5 +---- 3 files changed, 6 insertions(+), 7 deletions(-) diff --git a/megatron/__init__.py b/megatron/__init__.py index 4a9f98a31..2b07725c3 100644 --- a/megatron/__init__.py +++ b/megatron/__init__.py @@ -16,12 +16,13 @@ def print_rank_0(*message): """If distributed is initialized print only on rank 0.""" + print('call1111'*100) if torch.distributed.is_initialized(): if torch.distributed.get_rank() == 0: print(*message, flush=True) else: print(*message, flush=True) - + print('call2222'*100) from .initialize import initialize_megatron from .neox_arguments import NeoXArgs diff --git a/megatron/training.py b/megatron/training.py index 
8a4c8cf8f..583683bbf 100644 --- a/megatron/training.py +++ b/megatron/training.py @@ -747,7 +747,9 @@ def train_step_pipe(neox_args, timers, model, data_iterator): """Single training step with DeepSpeed's pipeline parallel engine.""" assert neox_args.deepspeed + print('1'*100) loss = model.train_batch(data_iter=data_iterator) + print('2'*100) loss_dict = {"lm_loss": loss} # Don't break Megatron's timers because we changed code paths. for t in [ @@ -826,8 +828,7 @@ def train( model=model, optimizer=optimizer, noise_scale_logger=noise_scale_logger, - ) - print('3333'*100) + ) # Checkpointing if neox_args.save and iteration in neox_args.save_iters: save_checkpoint( diff --git a/megatron/utils.py b/megatron/utils.py index e1e835835..44fa98a1a 100644 --- a/megatron/utils.py +++ b/megatron/utils.py @@ -300,10 +300,7 @@ def log(self, names, normalizer=1.0, reset=True): string = "time (ms)" for name in names: elapsed_time = self.timers[name].elapsed(reset=reset) * 1000.0 / normalizer - string += " | {}: {:.2f}".format(name, elapsed_time) - print('log'*100) - print(string) - print('log'*100) + string += " | {}: {:.2f}".format(name, elapsed_time) if torch.distributed.is_initialized(): if torch.distributed.get_rank() == 0: print(string, flush=True) From 31f2ec45d91e83009bd8460e7254398a41d7cbbc Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Wed, 6 Sep 2023 15:54:22 +0900 Subject: [PATCH 072/183] debug --- megatron/__init__.py | 6 ++---- megatron/training.py | 4 +--- 2 files changed, 3 insertions(+), 7 deletions(-) diff --git a/megatron/__init__.py b/megatron/__init__.py index 2b07725c3..a2cf4df99 100644 --- a/megatron/__init__.py +++ b/megatron/__init__.py @@ -15,14 +15,12 @@ def print_rank_0(*message): - """If distributed is initialized print only on rank 0.""" - print('call1111'*100) + """If distributed is initialized print only on rank 0.""" if torch.distributed.is_initialized(): if torch.distributed.get_rank() == 0: print(*message, flush=True) else: - print(*message, flush=True) - print('call2222'*100) + print(*message, flush=True) from .initialize import initialize_megatron from .neox_arguments import NeoXArgs diff --git a/megatron/training.py b/megatron/training.py index 583683bbf..bca3057b7 100644 --- a/megatron/training.py +++ b/megatron/training.py @@ -747,9 +747,7 @@ def train_step_pipe(neox_args, timers, model, data_iterator): """Single training step with DeepSpeed's pipeline parallel engine.""" assert neox_args.deepspeed - print('1'*100) - loss = model.train_batch(data_iter=data_iterator) - print('2'*100) + loss = model.train_batch(data_iter=data_iterator) loss_dict = {"lm_loss": loss} # Don't break Megatron's timers because we changed code paths. 
for t in [ From 8a11228a33e21ba318ac30f7453ed2056d58a8c4 Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Thu, 7 Sep 2023 19:55:00 +0900 Subject: [PATCH 073/183] add convert settings --- configs/convert_settings.yml | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) create mode 100644 configs/convert_settings.yml diff --git a/configs/convert_settings.yml b/configs/convert_settings.yml new file mode 100644 index 000000000..edb2aa560 --- /dev/null +++ b/configs/convert_settings.yml @@ -0,0 +1,30 @@ +{ + "tokenizer_type": "SPMTokenizer" + + "pipe_parallel_size": 1, + "model_parallel_size": 1, + + # model settings + "num_layers": 6, + "hidden_size": 512, + "num_attention_heads": 8, + "seq_length": 2048, + "max_position_embeddings": 2048, + "pos_emb": "rotary", + "no_weight_tying": true, + "gpt_j_residual": false, + "output_layer_parallelism": "column", + + "scaled_upper_triang_masked_softmax_fusion": false, + "bias_gelu_fusion": false, + + "optimizer": { + "type": "Adam", + "params": { + "lr": 0.001, + "betas": [0.9, 0.95], + "eps": 1.0e-8, + } + }, + "min_lr": 0.0001 +} From 9a0bf455e2ebd2538997ab0dbb2f68b374698ba0 Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Thu, 7 Sep 2023 20:08:05 +0900 Subject: [PATCH 074/183] add convert settings --- configs/convert_settings.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configs/convert_settings.yml b/configs/convert_settings.yml index edb2aa560..8cf817987 100644 --- a/configs/convert_settings.yml +++ b/configs/convert_settings.yml @@ -1,5 +1,5 @@ { - "tokenizer_type": "SPMTokenizer" + "tokenizer_type": "SPMTokenizer", "pipe_parallel_size": 1, "model_parallel_size": 1, From f5cb60676d8c18db77bfad328744fa1db83c6551 Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Thu, 7 Sep 2023 20:17:47 +0900 Subject: [PATCH 075/183] add --- configs/convert_settings.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configs/convert_settings.yml b/configs/convert_settings.yml index 8cf817987..1fe676739 100644 --- a/configs/convert_settings.yml +++ b/configs/convert_settings.yml @@ -1,6 +1,6 @@ { "tokenizer_type": "SPMTokenizer", - + "vocab-file": "./novelAI/tokenizer.model" "pipe_parallel_size": 1, "model_parallel_size": 1, From fbe2e2b9816170e38c0b224553feca8faa7bb539 Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Thu, 7 Sep 2023 20:23:50 +0900 Subject: [PATCH 076/183] add --- configs/convert_settings.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/configs/convert_settings.yml b/configs/convert_settings.yml index 1fe676739..baf797385 100644 --- a/configs/convert_settings.yml +++ b/configs/convert_settings.yml @@ -1,6 +1,7 @@ { "tokenizer_type": "SPMTokenizer", - "vocab-file": "./novelAI/tokenizer.model" + "vocab-file": "./novelAI/tokenizer.model", + "pipe_parallel_size": 1, "model_parallel_size": 1, From 288175581ba72bed26a9923204528bbc3b2e086d Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Thu, 7 Sep 2023 20:26:23 +0900 Subject: [PATCH 077/183] debug --- tools/convert_module_to_hf.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/convert_module_to_hf.py b/tools/convert_module_to_hf.py index 905bdfa16..3f7ccb080 100644 --- a/tools/convert_module_to_hf.py +++ b/tools/convert_module_to_hf.py @@ -183,6 +183,7 @@ def convert(input_checkpoint_path, loaded_config, output_checkpoint_path): # get layer from hf model hf_layer = hf_model.gpt_neox.layers[layer_i] + print('state_dict: ', hf_layer.state_dict()) # + 2 bc of embed layer and a dummy _pre_transformer_block 
loaded_tp_ranks = load_partitions( From 4bede8c1181f9b16801183c39aa1f7d13dd8964e Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Thu, 7 Sep 2023 20:45:11 +0900 Subject: [PATCH 078/183] fix --- tools/convert_module_to_hf.py | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/tools/convert_module_to_hf.py b/tools/convert_module_to_hf.py index 3f7ccb080..a79e13d28 100644 --- a/tools/convert_module_to_hf.py +++ b/tools/convert_module_to_hf.py @@ -183,7 +183,9 @@ def convert(input_checkpoint_path, loaded_config, output_checkpoint_path): # get layer from hf model hf_layer = hf_model.gpt_neox.layers[layer_i] - print('state_dict: ', hf_layer.state_dict()) + for v, _ in hf_layer.state_dict(): + print('state_dict: ', v) + print('-'*200) # + 2 bc of embed layer and a dummy _pre_transformer_block loaded_tp_ranks = load_partitions( @@ -228,10 +230,16 @@ def convert(input_checkpoint_path, loaded_config, output_checkpoint_path): state_dict["attention.rotary_emb.inv_freq"] = loaded_tp_ranks[0][ "attention.rotary_emb.inv_freq" ] - state_dict["attention.bias"] = hf_layer.state_dict()["attention.bias"] - state_dict["attention.masked_bias"] = hf_layer.state_dict()[ - "attention.masked_bias" - ] + + state_dict["attention.dense.bias"] = hf_layer.state_dict()["attention.dense.bias"] + + if "attention.bias" in hf_layer.state_dict(): + state_dict["attention.bias"] = hf_layer.state_dict()["attention.bias"] + + if "attention.masked_bias" in hf_layer.state_dict(): + state_dict["attention.masked_bias"] = hf_layer.state_dict()[ + "attention.masked_bias" + ] # load state_dict into layer hf_layer.load_state_dict(state_dict) From dea764f6ac78a767c817882f170a289d2ba50283 Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Thu, 7 Sep 2023 20:49:00 +0900 Subject: [PATCH 079/183] fix --- tools/convert_module_to_hf.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/convert_module_to_hf.py b/tools/convert_module_to_hf.py index a79e13d28..c46d78402 100644 --- a/tools/convert_module_to_hf.py +++ b/tools/convert_module_to_hf.py @@ -183,7 +183,7 @@ def convert(input_checkpoint_path, loaded_config, output_checkpoint_path): # get layer from hf model hf_layer = hf_model.gpt_neox.layers[layer_i] - for v, _ in hf_layer.state_dict(): + for v in hf_layer.state_dict(): print('state_dict: ', v) print('-'*200) @@ -235,7 +235,7 @@ def convert(input_checkpoint_path, loaded_config, output_checkpoint_path): if "attention.bias" in hf_layer.state_dict(): state_dict["attention.bias"] = hf_layer.state_dict()["attention.bias"] - + if "attention.masked_bias" in hf_layer.state_dict(): state_dict["attention.masked_bias"] = hf_layer.state_dict()[ "attention.masked_bias" From 643a3d18db54417a82a1e494c3bd47431e5e870a Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Thu, 7 Sep 2023 22:06:40 +0900 Subject: [PATCH 080/183] add dataset --- tools/corpora.py | 26 +++++++++++++++++++++++--- 1 file changed, 23 insertions(+), 3 deletions(-) diff --git a/tools/corpora.py b/tools/corpora.py index 390b47954..698336d8d 100644 --- a/tools/corpora.py +++ b/tools/corpora.py @@ -379,8 +379,8 @@ def download(self): ds['train'].to_json(save_path) -class IzumiDataset(HFSnapshotDownloader): - name = "izumi_dataset" +class IzumiFullDataset(HFSnapshotDownloader): + name = "izumi_full_dataset" urls = [""] hf_repo_ids = [ "izumi-lab/wikipedia-ja-20230720", @@ -388,6 +388,26 @@ class IzumiDataset(HFSnapshotDownloader): "izumi-lab/wikinews-ja-20230728" ] +class IzumiWikiJaDataset(HFSnapshotDownloader): + name = 
"izumi_wiki_ja_dataset" + urls = [""] + hf_repo_ids = [ + "izumi-lab/wikipedia-ja-20230720", + ] + +class IzumiWikiJaDataset(HFSnapshotDownloader): + name = "izumi_wiki_en_dataset" + urls = [""] + hf_repo_ids = [ + "izumi-lab/wikipedia-en-20230720", + ] + +class IzumiWikiNewsJaDataset(HFSnapshotDownloader): + name = "izumi_wiki_news_dataset" + urls = [""] + hf_repo_ids = [ + "izumi-lab/wikinews-ja-20230728" + ] def maybe_download_gpt2_tokenizer_data(tokenizer_type, data_dir): if tokenizer_type is None or tokenizer_type == "GPT2BPETokenizer": @@ -425,7 +445,7 @@ def maybe_download_gpt2_tokenizer_data(tokenizer_type, data_dir): 'oscar_ja': OSCARJa, 'wiki_oscar_ja': WikiOSCARJa, 'aozora_ja': AozoraJa, - 'izumi_dataset': IzumiDataset + 'izumi_dataset': IzumiFullDataset } From 8d7ba794e3adc30a608bdf9c32df22ea30766b7b Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Thu, 7 Sep 2023 22:20:57 +0900 Subject: [PATCH 081/183] fix --- tools/corpora.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tools/corpora.py b/tools/corpora.py index 698336d8d..5aa783b5f 100644 --- a/tools/corpora.py +++ b/tools/corpora.py @@ -379,7 +379,7 @@ def download(self): ds['train'].to_json(save_path) -class IzumiFullDataset(HFSnapshotDownloader): +class IzumiFullDataset(HFDataDownloader): name = "izumi_full_dataset" urls = [""] hf_repo_ids = [ @@ -388,21 +388,21 @@ class IzumiFullDataset(HFSnapshotDownloader): "izumi-lab/wikinews-ja-20230728" ] -class IzumiWikiJaDataset(HFSnapshotDownloader): +class IzumiWikiJaDataset(HFDataDownloader): name = "izumi_wiki_ja_dataset" urls = [""] hf_repo_ids = [ "izumi-lab/wikipedia-ja-20230720", ] -class IzumiWikiJaDataset(HFSnapshotDownloader): +class IzumiWikiJaDataset(HFDataDownloader): name = "izumi_wiki_en_dataset" urls = [""] hf_repo_ids = [ "izumi-lab/wikipedia-en-20230720", ] -class IzumiWikiNewsJaDataset(HFSnapshotDownloader): +class IzumiWikiNewsJaDataset(HFDataDownloader): name = "izumi_wiki_news_dataset" urls = [""] hf_repo_ids = [ From 2fa07ee0142e2fa135f705b627efc91b206587ba Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Thu, 7 Sep 2023 22:34:05 +0900 Subject: [PATCH 082/183] add config --- tools/corpora.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/tools/corpora.py b/tools/corpora.py index 5aa783b5f..50141cbc5 100644 --- a/tools/corpora.py +++ b/tools/corpora.py @@ -395,7 +395,7 @@ class IzumiWikiJaDataset(HFDataDownloader): "izumi-lab/wikipedia-ja-20230720", ] -class IzumiWikiJaDataset(HFDataDownloader): +class IzumiWikiEnDataset(HFDataDownloader): name = "izumi_wiki_en_dataset" urls = [""] hf_repo_ids = [ @@ -445,7 +445,10 @@ def maybe_download_gpt2_tokenizer_data(tokenizer_type, data_dir): 'oscar_ja': OSCARJa, 'wiki_oscar_ja': WikiOSCARJa, 'aozora_ja': AozoraJa, - 'izumi_dataset': IzumiFullDataset + 'izumi_dataset': IzumiFullDataset, + 'izumi_wiki_ja_dataset': IzumiWikiJaDataset, + 'izumi_wiki_en_dataset': IzumiWikiEnDataset, + 'izumi_wiki_news_dataset': IzumiWikiNewsJaDataset } From 11d6bdb3a19c7c82201fdc0cc35586897a6cfdad Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Thu, 7 Sep 2023 22:40:54 +0900 Subject: [PATCH 083/183] fix save name --- tools/corpora.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/corpora.py b/tools/corpora.py index 50141cbc5..d12fa839a 100644 --- a/tools/corpora.py +++ b/tools/corpora.py @@ -374,7 +374,7 @@ def download(self): for repo_id in self.hf_repo_ids: ds = load_dataset(repo_id) name = repo_id.split('/')[0] - save_path = 
f'{save_dir}/{name}.json' + save_path = f'{save_dir}/{name}.jsonl' print('save to', save_path) ds['train'].to_json(save_path) From f4157e51574d6b9ee1aa3078b3358887f200ce22 Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Thu, 7 Sep 2023 22:46:50 +0900 Subject: [PATCH 084/183] fix to_json --- tools/corpora.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/corpora.py b/tools/corpora.py index d12fa839a..daeae07ed 100644 --- a/tools/corpora.py +++ b/tools/corpora.py @@ -376,7 +376,7 @@ def download(self): name = repo_id.split('/')[0] save_path = f'{save_dir}/{name}.jsonl' print('save to', save_path) - ds['train'].to_json(save_path) + ds['train'].to_json(save_path, force_ascii=False) class IzumiFullDataset(HFDataDownloader): From a8c1a318188c33fc5125e9c90810de35642d69b6 Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Thu, 7 Sep 2023 22:50:22 +0900 Subject: [PATCH 085/183] fix yielder --- tools/preprocess_data.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/tools/preprocess_data.py b/tools/preprocess_data.py index 052799fb9..fa16a6087 100644 --- a/tools/preprocess_data.py +++ b/tools/preprocess_data.py @@ -168,6 +168,13 @@ def yielder(fname, semaphore): semaphore.acquire() yield f + + def hf_yielder(fname, semaphore): + stream = filter(lambda x: x, lmd.Reader(fname).stream_data()) + for f in filter(lambda x: 'text' in x and len(json.leads(x)['text']) != 0, stream): + semaphore.acquire() + yield f['text'] + def wiki_yielder(fname, semaphore): stream = filter(lambda x: x, lmd.Reader(fname).stream_data()) for f in filter(lambda x: 'text' in x and len(x['text']) != 0, stream): @@ -182,6 +189,8 @@ def aozora_yielder(fname, semaphore): for fname in fnames: semaphore.acquire() print('fname', fname) + if 'izumi' in fname: + yield from hf_yielder(fname, semaphore) if 'wiki' in fname: yield from wiki_yielder(fname, semaphore) if 'aozora' in fname: From 2841cca50881606092f4b323b60b9e2077a69d75 Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Thu, 7 Sep 2023 22:51:10 +0900 Subject: [PATCH 086/183] fix yielder --- tools/preprocess_data.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/preprocess_data.py b/tools/preprocess_data.py index fa16a6087..ebdcfc4c2 100644 --- a/tools/preprocess_data.py +++ b/tools/preprocess_data.py @@ -171,7 +171,7 @@ def yielder(fname, semaphore): def hf_yielder(fname, semaphore): stream = filter(lambda x: x, lmd.Reader(fname).stream_data()) - for f in filter(lambda x: 'text' in x and len(json.leads(x)['text']) != 0, stream): + for f in filter(lambda x: 'text' in x and len(json.loads(x)['text']) != 0, stream): semaphore.acquire() yield f['text'] From 8b9d57beba556765a1caf4a4474e15145678318c Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Thu, 7 Sep 2023 22:54:39 +0900 Subject: [PATCH 087/183] fix yielder --- tools/preprocess_data.py | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/tools/preprocess_data.py b/tools/preprocess_data.py index ebdcfc4c2..b2a54e052 100644 --- a/tools/preprocess_data.py +++ b/tools/preprocess_data.py @@ -168,13 +168,6 @@ def yielder(fname, semaphore): semaphore.acquire() yield f - - def hf_yielder(fname, semaphore): - stream = filter(lambda x: x, lmd.Reader(fname).stream_data()) - for f in filter(lambda x: 'text' in x and len(json.loads(x)['text']) != 0, stream): - semaphore.acquire() - yield f['text'] - def wiki_yielder(fname, semaphore): stream = filter(lambda x: x, lmd.Reader(fname).stream_data()) for f in filter(lambda x: 'text' in x and 
len(x['text']) != 0, stream): @@ -190,7 +183,7 @@ def aozora_yielder(fname, semaphore): semaphore.acquire() print('fname', fname) if 'izumi' in fname: - yield from hf_yielder(fname, semaphore) + yield from aozora_yielder(fname, semaphore) if 'wiki' in fname: yield from wiki_yielder(fname, semaphore) if 'aozora' in fname: From 87ee88df1eb3fdccf8848163b1af5c078710f6cf Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Thu, 7 Sep 2023 22:56:06 +0900 Subject: [PATCH 088/183] debug --- tools/preprocess_data.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/preprocess_data.py b/tools/preprocess_data.py index b2a54e052..e6c8193a7 100644 --- a/tools/preprocess_data.py +++ b/tools/preprocess_data.py @@ -177,6 +177,7 @@ def wiki_yielder(fname, semaphore): def aozora_yielder(fname, semaphore): for f in filter(lambda x: x, lmd.Reader(fname).stream_data()): semaphore.acquire() + print('debug: ', type(f), f) yield json.loads(f)['text'] for fname in fnames: From 31e28b89c748858b8cdffa339efeef1d05331940 Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Thu, 7 Sep 2023 22:57:03 +0900 Subject: [PATCH 089/183] debug --- tools/preprocess_data.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/tools/preprocess_data.py b/tools/preprocess_data.py index e6c8193a7..876fae8a9 100644 --- a/tools/preprocess_data.py +++ b/tools/preprocess_data.py @@ -176,15 +176,14 @@ def wiki_yielder(fname, semaphore): def aozora_yielder(fname, semaphore): for f in filter(lambda x: x, lmd.Reader(fname).stream_data()): - semaphore.acquire() - print('debug: ', type(f), f) + semaphore.acquire() yield json.loads(f)['text'] for fname in fnames: semaphore.acquire() print('fname', fname) if 'izumi' in fname: - yield from aozora_yielder(fname, semaphore) + yield from yielder(fname, semaphore) if 'wiki' in fname: yield from wiki_yielder(fname, semaphore) if 'aozora' in fname: From e405194bb2d4baf8168f4b91404c879edcc932a8 Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Thu, 7 Sep 2023 23:05:01 +0900 Subject: [PATCH 090/183] fix --- tools/preprocess_data.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/preprocess_data.py b/tools/preprocess_data.py index 876fae8a9..f0c8822e3 100644 --- a/tools/preprocess_data.py +++ b/tools/preprocess_data.py @@ -184,9 +184,9 @@ def aozora_yielder(fname, semaphore): print('fname', fname) if 'izumi' in fname: yield from yielder(fname, semaphore) - if 'wiki' in fname: + elif 'wiki' in fname: yield from wiki_yielder(fname, semaphore) - if 'aozora' in fname: + elif 'aozora' in fname: yield from aozora_yielder(fname, semaphore) else: yield from yielder(fname, semaphore) From 1852b8e47be3aa7bd5c412adc843c6a530ed7837 Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Fri, 8 Sep 2023 14:32:42 +0900 Subject: [PATCH 091/183] fix config --- configs/49M.yml | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/configs/49M.yml b/configs/49M.yml index 94e08ea2d..6cfb4e4fa 100644 --- a/configs/49M.yml +++ b/configs/49M.yml @@ -80,13 +80,15 @@ "distributed_backend": "nccl", "lr_decay_style": "cosine", "warmup": 0.01, - "checkpoint_factor": 1000, + "checkpoint_factor": 10000, "eval_interval": 100000, - "eval_iters": 10, + "eval_iters": 1000, + "keep_last_n_checkpoints": 4, + "save_iters": 10000, # logging - "log_interval": 10, - "steps_per_print": 10, + "log_interval": 1000, + "steps_per_print": 1000, "keep_last_n_checkpoints": 4, "wall_clock_breakdown": true, From f561019dcf3e319f50342f169a7f31c849e3409d Mon Sep 17 00:00:00 2001 From: 
windows_on_wsl Date: Fri, 8 Sep 2023 15:13:43 +0900 Subject: [PATCH 092/183] fix config --- configs/49M.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configs/49M.yml b/configs/49M.yml index 6cfb4e4fa..9621346fa 100644 --- a/configs/49M.yml +++ b/configs/49M.yml @@ -46,7 +46,7 @@ }, # batch / data settings - "train_micro_batch_size_per_gpu": 32, + "train_micro_batch_size_per_gpu": 16, "gas": 1, "data_impl": "mmap", "num_workers": 1, From 12a545c9579bd09fac7a79da262cd9b0bf2c6da8 Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Fri, 8 Sep 2023 15:15:44 +0900 Subject: [PATCH 093/183] fix config --- configs/49M.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/configs/49M.yml b/configs/49M.yml index 9621346fa..4896c1ba0 100644 --- a/configs/49M.yml +++ b/configs/49M.yml @@ -46,7 +46,8 @@ }, # batch / data settings - "train_micro_batch_size_per_gpu": 16, + "train_micro_batch_size_per_gpu": 8, + # "train_micro_batch_size_per_gpu": 32, "gas": 1, "data_impl": "mmap", "num_workers": 1, From fe8ebcf0b9d7ee1cfbb65043faca42ae44b6158f Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Fri, 8 Sep 2023 18:40:40 +0900 Subject: [PATCH 094/183] force hide log --- megatron/utils.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/megatron/utils.py b/megatron/utils.py index 44fa98a1a..77cc38ff4 100644 --- a/megatron/utils.py +++ b/megatron/utils.py @@ -300,7 +300,11 @@ def log(self, names, normalizer=1.0, reset=True): string = "time (ms)" for name in names: elapsed_time = self.timers[name].elapsed(reset=reset) * 1000.0 / normalizer - string += " | {}: {:.2f}".format(name, elapsed_time) + string += " | {}: {:.2f}".format(name, elapsed_time) + + if "optimizer_allgather" in string: + return + if torch.distributed.is_initialized(): if torch.distributed.get_rank() == 0: print(string, flush=True) From eae78d5c387120360d82d1c800dc94eb68dccadb Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Fri, 8 Sep 2023 18:42:29 +0900 Subject: [PATCH 095/183] debug --- megatron/utils.py | 1 + 1 file changed, 1 insertion(+) diff --git a/megatron/utils.py b/megatron/utils.py index 77cc38ff4..a6aea1e90 100644 --- a/megatron/utils.py +++ b/megatron/utils.py @@ -302,6 +302,7 @@ def log(self, names, normalizer=1.0, reset=True): elapsed_time = self.timers[name].elapsed(reset=reset) * 1000.0 / normalizer string += " | {}: {:.2f}".format(name, elapsed_time) + print('--------------', string, flush=True) if "optimizer_allgather" in string: return From 1354502306d4ab67d629bffdf56d6f4284609908 Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Fri, 8 Sep 2023 19:01:10 +0900 Subject: [PATCH 096/183] set log level --- configs/49M.yml | 2 ++ megatron/utils.py | 6 +----- train.py | 3 +++ 3 files changed, 6 insertions(+), 5 deletions(-) diff --git a/configs/49M.yml b/configs/49M.yml index 4896c1ba0..eae03552f 100644 --- a/configs/49M.yml +++ b/configs/49M.yml @@ -14,6 +14,8 @@ "no_weight_tying": true, "gpt_j_residual": true, "output_layer_parallelism": "column", + "use_bnb_optimizer": true, + # these should provide some speedup but takes a while to build, set to true if desired "scaled_upper_triang_masked_softmax_fusion": false, diff --git a/megatron/utils.py b/megatron/utils.py index a6aea1e90..dced16b0c 100644 --- a/megatron/utils.py +++ b/megatron/utils.py @@ -301,11 +301,7 @@ def log(self, names, normalizer=1.0, reset=True): for name in names: elapsed_time = self.timers[name].elapsed(reset=reset) * 1000.0 / normalizer string += " | {}: {:.2f}".format(name, elapsed_time) 
- - print('--------------', string, flush=True) - if "optimizer_allgather" in string: - return - + if torch.distributed.is_initialized(): if torch.distributed.get_rank() == 0: print(string, flush=True) diff --git a/train.py b/train.py index 358ab3a81..a16887036 100644 --- a/train.py +++ b/train.py @@ -18,6 +18,9 @@ """Train""" from megatron.neox_arguments import NeoXArgs from megatron.training import pretrain +import logging + +logging.basicConfig(level=logging.WARNING) if __name__ == "__main__": neox_args = NeoXArgs.consume_neox_args() From 599829363414a8de3afe57e05bf941a8c58538c8 Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Fri, 8 Sep 2023 19:06:05 +0900 Subject: [PATCH 097/183] debug --- configs/49M.yml | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/configs/49M.yml b/configs/49M.yml index eae03552f..bde1938a8 100644 --- a/configs/49M.yml +++ b/configs/49M.yml @@ -97,4 +97,13 @@ ## tokenizer type "tokenizer_type": "SPMTokenizer" + + "deepspeed_extra_args": { + "comms_logger": { + "enabled": false, + "verbose": false, + "prof_all": false, + "debug": false + } + } } From 3a8efcba50197521a11333767786141de12f95f9 Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Fri, 8 Sep 2023 19:07:00 +0900 Subject: [PATCH 098/183] debug --- configs/49M.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/configs/49M.yml b/configs/49M.yml index bde1938a8..7084e294d 100644 --- a/configs/49M.yml +++ b/configs/49M.yml @@ -96,8 +96,8 @@ "wall_clock_breakdown": true, ## tokenizer type - "tokenizer_type": "SPMTokenizer" - + "tokenizer_type": "SPMTokenizer", + "deepspeed_extra_args": { "comms_logger": { "enabled": false, From 6b3f64f787c179d78fca5d48bb22e07a7f2c9beb Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Fri, 8 Sep 2023 19:09:38 +0900 Subject: [PATCH 099/183] debug --- configs/49M.yml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/configs/49M.yml b/configs/49M.yml index 7084e294d..5e87d5035 100644 --- a/configs/49M.yml +++ b/configs/49M.yml @@ -44,8 +44,9 @@ "overlap_comm": True, "reduce_scatter": True, "reduce_bucket_size": 500000000, - "contiguous_gradients": True, + "contiguous_gradients": True }, + "zero_allow_untested_optimizer": True # batch / data settings "train_micro_batch_size_per_gpu": 8, @@ -97,7 +98,7 @@ ## tokenizer type "tokenizer_type": "SPMTokenizer", - + "deepspeed_extra_args": { "comms_logger": { "enabled": false, From 032b708105fca436514319dfbf75a4a58236e9c4 Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Fri, 8 Sep 2023 19:10:47 +0900 Subject: [PATCH 100/183] debug --- configs/49M.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configs/49M.yml b/configs/49M.yml index 5e87d5035..b09b62f48 100644 --- a/configs/49M.yml +++ b/configs/49M.yml @@ -46,7 +46,7 @@ "reduce_bucket_size": 500000000, "contiguous_gradients": True }, - "zero_allow_untested_optimizer": True + "zero_allow_untested_optimizer": True, # batch / data settings "train_micro_batch_size_per_gpu": 8, From 0f8ac2975d80c27ac4dafed4242dc32d3c778063 Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Fri, 8 Sep 2023 19:11:23 +0900 Subject: [PATCH 101/183] debug --- configs/49M.yml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/configs/49M.yml b/configs/49M.yml index b09b62f48..da90946d0 100644 --- a/configs/49M.yml +++ b/configs/49M.yml @@ -44,9 +44,10 @@ "overlap_comm": True, "reduce_scatter": True, "reduce_bucket_size": 500000000, - "contiguous_gradients": True + "contiguous_gradients": True, + 
"zero_allow_untested_optimizer": True, }, - "zero_allow_untested_optimizer": True, + # batch / data settings "train_micro_batch_size_per_gpu": 8, From 52952c950d426cfb2e7548e346a388f3e2d0245f Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Fri, 8 Sep 2023 19:15:13 +0900 Subject: [PATCH 102/183] debug --- configs/49M.yml | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/configs/49M.yml b/configs/49M.yml index da90946d0..1d1c13926 100644 --- a/configs/49M.yml +++ b/configs/49M.yml @@ -44,10 +44,9 @@ "overlap_comm": True, "reduce_scatter": True, "reduce_bucket_size": 500000000, - "contiguous_gradients": True, - "zero_allow_untested_optimizer": True, + "contiguous_gradients": True, }, - + "zero_allow_untested_optimizer": true, # batch / data settings "train_micro_batch_size_per_gpu": 8, From 4e26eaf106979c29af0eae3636f617a19cc66527 Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Fri, 8 Sep 2023 19:17:15 +0900 Subject: [PATCH 103/183] debug --- configs/49M.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/configs/49M.yml b/configs/49M.yml index 1d1c13926..68a0f9ab1 100644 --- a/configs/49M.yml +++ b/configs/49M.yml @@ -38,7 +38,7 @@ # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training "zero_optimization": { - "stage": 1, + "stage": 0, "allgather_partitions": True, "allgather_bucket_size": 500000000, "overlap_comm": True, @@ -46,7 +46,7 @@ "reduce_bucket_size": 500000000, "contiguous_gradients": True, }, - "zero_allow_untested_optimizer": true, + "zero_allow_untested_optimizer": false, # batch / data settings "train_micro_batch_size_per_gpu": 8, From 8579e3c41944b50b38a23e2aefac560b720a2776 Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Fri, 8 Sep 2023 19:18:10 +0900 Subject: [PATCH 104/183] debug --- configs/49M.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configs/49M.yml b/configs/49M.yml index 68a0f9ab1..8e9f71002 100644 --- a/configs/49M.yml +++ b/configs/49M.yml @@ -46,7 +46,7 @@ "reduce_bucket_size": 500000000, "contiguous_gradients": True, }, - "zero_allow_untested_optimizer": false, + # "zero_allow_untested_optimizer": true, # batch / data settings "train_micro_batch_size_per_gpu": 8, From a5eb327d1c07b31cc0c06c340c463bbfa3a498a3 Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Fri, 8 Sep 2023 19:19:13 +0900 Subject: [PATCH 105/183] debug --- configs/49M.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configs/49M.yml b/configs/49M.yml index 8e9f71002..9127e6823 100644 --- a/configs/49M.yml +++ b/configs/49M.yml @@ -14,7 +14,7 @@ "no_weight_tying": true, "gpt_j_residual": true, "output_layer_parallelism": "column", - "use_bnb_optimizer": true, + # "use_bnb_optimizer": true, # these should provide some speedup but takes a while to build, set to true if desired From fc5e9076ed79cd6e07cc37a1e769c5e0442d5b68 Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Fri, 8 Sep 2023 19:36:28 +0900 Subject: [PATCH 106/183] debug --- megatron/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/megatron/utils.py b/megatron/utils.py index dced16b0c..ab51f0667 100644 --- a/megatron/utils.py +++ b/megatron/utils.py @@ -301,7 +301,7 @@ def log(self, names, normalizer=1.0, reset=True): for name in names: elapsed_time = self.timers[name].elapsed(reset=reset) * 1000.0 / normalizer string += " | {}: {:.2f}".format(name, elapsed_time) - + print("-"*10, string) if torch.distributed.is_initialized(): if 
torch.distributed.get_rank() == 0: print(string, flush=True) From b4ea1bbaeaf18bc34ff2d68f1774a5d0259ae997 Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Fri, 8 Sep 2023 19:47:44 +0900 Subject: [PATCH 107/183] debug --- configs/49M.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configs/49M.yml b/configs/49M.yml index 9127e6823..cf7a63b3c 100644 --- a/configs/49M.yml +++ b/configs/49M.yml @@ -38,7 +38,7 @@ # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training "zero_optimization": { - "stage": 0, + "stage": 2, "allgather_partitions": True, "allgather_bucket_size": 500000000, "overlap_comm": True, From fb70e4f89481fc5dae8fe1695f70ecc61ea3bce6 Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Fri, 8 Sep 2023 19:48:35 +0900 Subject: [PATCH 108/183] debug --- configs/49M.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configs/49M.yml b/configs/49M.yml index cf7a63b3c..560eb4de6 100644 --- a/configs/49M.yml +++ b/configs/49M.yml @@ -38,7 +38,7 @@ # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training "zero_optimization": { - "stage": 2, + "stage": 1, "allgather_partitions": True, "allgather_bucket_size": 500000000, "overlap_comm": True, From 12ac35236138d58643e052e95d134c02d60d9183 Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Fri, 8 Sep 2023 19:51:53 +0900 Subject: [PATCH 109/183] debug --- train.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/train.py b/train.py index a16887036..d7c065a2e 100644 --- a/train.py +++ b/train.py @@ -18,9 +18,9 @@ """Train""" from megatron.neox_arguments import NeoXArgs from megatron.training import pretrain -import logging -logging.basicConfig(level=logging.WARNING) +import logging +logging.getLogger('deepspeed').setLevel(logging.WARNING) if __name__ == "__main__": neox_args = NeoXArgs.consume_neox_args() From b94532dbeed22b3a5b51eff0910a81e1034a2079 Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Fri, 8 Sep 2023 19:53:19 +0900 Subject: [PATCH 110/183] debug --- megatron/training.py | 2 ++ train.py | 3 --- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/megatron/training.py b/megatron/training.py index bca3057b7..eed4b10d6 100644 --- a/megatron/training.py +++ b/megatron/training.py @@ -57,6 +57,8 @@ from megatron.model.gpt2_model import cross_entropy from eval_tasks import run_eval_harness +import logging +logging.getLogger('deepspeed').setLevel(logging.WARNING) def mup_weights_reinit(neox_args, model): def has_method(o, name): diff --git a/train.py b/train.py index d7c065a2e..358ab3a81 100644 --- a/train.py +++ b/train.py @@ -19,9 +19,6 @@ from megatron.neox_arguments import NeoXArgs from megatron.training import pretrain -import logging -logging.getLogger('deepspeed').setLevel(logging.WARNING) - if __name__ == "__main__": neox_args = NeoXArgs.consume_neox_args() neox_args.configure_distributed_args() From 1c232a791dd49fbb482c8d893c741bdf6daf8241 Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Fri, 8 Sep 2023 19:56:15 +0900 Subject: [PATCH 111/183] debug --- megatron/training.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/megatron/training.py b/megatron/training.py index eed4b10d6..1f2231acd 100644 --- a/megatron/training.py +++ b/megatron/training.py @@ -58,7 +58,9 @@ from eval_tasks import run_eval_harness import logging -logging.getLogger('deepspeed').setLevel(logging.WARNING) +from deepspeed.utils 
import logger as ds_logger +ds_logger.setLevel(logging.WARNING) + def mup_weights_reinit(neox_args, model): def has_method(o, name): From 0644f3c1079ea51f3281dc5a34910ed86854ee5e Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Sun, 10 Sep 2023 14:57:51 +0900 Subject: [PATCH 112/183] add swiglu --- megatron/model/activations.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/megatron/model/activations.py b/megatron/model/activations.py index 5c4ba1d5a..fc4c73445 100644 --- a/megatron/model/activations.py +++ b/megatron/model/activations.py @@ -46,7 +46,9 @@ def get_activation(neox_args): elif neox_args.activation == "mish": activation_func = mish elif neox_args.activation == "silu": - activation_func = F.silu + activation_func = F.silu + elif neox_args.activation == "swiglu": + activation_func = swiglu else: raise ValueError(f"Activation function {neox_args.activation} not recognized") return activation_func @@ -120,6 +122,10 @@ def swish(x, beta: float = 1.0): def mish(x): return x * torch.tanh(F.softplus(x)) +@torch.jit.script +def swiglu(x): + x = torch.chunk(x, 2, dim=-1) + return F.silu(x[0]) * x[1] class GEGLU(torch.nn.Module): def __init__(self, neox_args): From c6264b0fe24c893097816f50cc6a5a47a1093f74 Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Sun, 10 Sep 2023 15:00:53 +0900 Subject: [PATCH 113/183] add swiglu --- megatron/neox_arguments/neox_args.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/megatron/neox_arguments/neox_args.py b/megatron/neox_arguments/neox_args.py index e1ea16a16..6ddae6e92 100644 --- a/megatron/neox_arguments/neox_args.py +++ b/megatron/neox_arguments/neox_args.py @@ -229,10 +229,10 @@ class NeoXArgsModel(NeoXArgsTemplate): """ activation: Literal[ - "gelu", "geglu", "relu", "softsign", "swish", "mish", "silu" + "gelu", "geglu", "relu", "softsign", "swish", "mish", "silu", "swiglu" ] = "gelu" """ - Activation function to use - choose from ["gelu", "geglu", "relu", "softsign", "swish", "mish", "silu"] + Activation function to use - choose from ["gelu", "geglu", "relu", "softsign", "swish", "mish", "silu", "swiglu"] """ scaled_upper_triang_masked_softmax_fusion: bool = False From ea72bd8063903bba89438991720973d23b6388af Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Sun, 10 Sep 2023 15:05:26 +0900 Subject: [PATCH 114/183] curriculum flash activation --- configs/49M.yml | 30 +++++++++++++++++++++++++++--- 1 file changed, 27 insertions(+), 3 deletions(-) diff --git a/configs/49M.yml b/configs/49M.yml index 560eb4de6..496210217 100644 --- a/configs/49M.yml +++ b/configs/49M.yml @@ -14,12 +14,36 @@ "no_weight_tying": true, "gpt_j_residual": true, "output_layer_parallelism": "column", + # "activation": "glue", + "activation": "swiglu", + "norm": "rmsnorm", # "use_bnb_optimizer": true, - # these should provide some speedup but takes a while to build, set to true if desired - "scaled_upper_triang_masked_softmax_fusion": false, - "bias_gelu_fusion": false, + #"scaled_upper_triang_masked_softmax_fusion": false, + #"bias_gelu_fusion": false, + "scaled_upper_triang_masked_softmax_fusion": true, + "bias-gelu-fusion": true, + "attention-config": [ + [ + [ + "flash" + ], + 10 + ] + ], + "curriculum_learning": { + "enabled": true, + "curriculum_type": "seqlen", + "min_difficulty": 64, + "max_difficulty": 2048, + "schedule_type": "fixed_linear", + "schedule_config": { + "total_curriculum_step": 20000, + "difficulty_step": 8 + } + }, + # init methods "init_method": "small_init", From 
f02ab6959433a1fbf401ea565803e4c1058ff204 Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Sun, 10 Sep 2023 15:30:38 +0900 Subject: [PATCH 115/183] fix --- configs/49M.yml | 6 +++--- configs/local_setup_ja.yml | 10 +++++++--- 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/configs/49M.yml b/configs/49M.yml index 496210217..fa9fdd7fc 100644 --- a/configs/49M.yml +++ b/configs/49M.yml @@ -14,15 +14,15 @@ "no_weight_tying": true, "gpt_j_residual": true, "output_layer_parallelism": "column", - # "activation": "glue", - "activation": "swiglu", + "activation": "glue", + # "activation": "swiglu", "norm": "rmsnorm", # "use_bnb_optimizer": true, # these should provide some speedup but takes a while to build, set to true if desired #"scaled_upper_triang_masked_softmax_fusion": false, #"bias_gelu_fusion": false, - "scaled_upper_triang_masked_softmax_fusion": true, + "scaled_upper_triang_masked_softmax_fusion": true, "bias-gelu-fusion": true, "attention-config": [ [ diff --git a/configs/local_setup_ja.yml b/configs/local_setup_ja.yml index 29af25041..972fe7fc2 100644 --- a/configs/local_setup_ja.yml +++ b/configs/local_setup_ja.yml @@ -18,14 +18,18 @@ "vocab_file": "./novelAI/tokenizer.model", - "save": "checkpoints", - "load": "checkpoints", + #"save": "checkpoints", + # "load": "checkpoints", + "save": "/content/drive/MyDrive/pre_trained/49M" + "load": "/content/drive/MyDrive/pre_trained/49M" + "checkpoint_validation_with_forward_pass": False, ## logging "log_dir": "logs", - "tensorboard_dir": "tensorboard", + # "tensorboard_dir": "tensorboard", + "tensorboard_dir": "/content/drive/MyDrive/pre_trained/49M" "log_dir": "logs", "use_wandb": False } From b5a05302c17a5988c33d2aab199dc89f98e5648d Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Sun, 10 Sep 2023 15:33:39 +0900 Subject: [PATCH 116/183] fix --- configs/local_setup_ja.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/configs/local_setup_ja.yml b/configs/local_setup_ja.yml index 972fe7fc2..547f8b164 100644 --- a/configs/local_setup_ja.yml +++ b/configs/local_setup_ja.yml @@ -20,8 +20,8 @@ #"save": "checkpoints", # "load": "checkpoints", - "save": "/content/drive/MyDrive/pre_trained/49M" - "load": "/content/drive/MyDrive/pre_trained/49M" + "save": "/content/drive/MyDrive/pre_trained/49M", + "load": "/content/drive/MyDrive/pre_trained/49M", "checkpoint_validation_with_forward_pass": False, @@ -29,7 +29,7 @@ "log_dir": "logs", # "tensorboard_dir": "tensorboard", - "tensorboard_dir": "/content/drive/MyDrive/pre_trained/49M" + "tensorboard_dir": "/content/drive/MyDrive/pre_trained/49M", "log_dir": "logs", "use_wandb": False } From 4b17a49970272d1b5d186b4be26dabdfec6480db Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Sun, 10 Sep 2023 15:35:03 +0900 Subject: [PATCH 117/183] fix --- configs/49M.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configs/49M.yml b/configs/49M.yml index fa9fdd7fc..b24ee884e 100644 --- a/configs/49M.yml +++ b/configs/49M.yml @@ -14,7 +14,7 @@ "no_weight_tying": true, "gpt_j_residual": true, "output_layer_parallelism": "column", - "activation": "glue", + "activation": "gelu", # "activation": "swiglu", "norm": "rmsnorm", # "use_bnb_optimizer": true, From 401bf31cba0dfc55511018f2a71556f6e5fa60dc Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Sun, 10 Sep 2023 16:45:17 +0900 Subject: [PATCH 118/183] wsiglu --- configs/49M.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/configs/49M.yml b/configs/49M.yml index 
b24ee884e..52443e367 100644 --- a/configs/49M.yml +++ b/configs/49M.yml @@ -14,8 +14,8 @@ "no_weight_tying": true, "gpt_j_residual": true, "output_layer_parallelism": "column", - "activation": "gelu", - # "activation": "swiglu", + # "activation": "gelu", + "activation": "swiglu", "norm": "rmsnorm", # "use_bnb_optimizer": true, From 9dfe92f8bf49d3787d2412abb0ecda64caeed870 Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Sun, 10 Sep 2023 16:50:49 +0900 Subject: [PATCH 119/183] swiglu --- megatron/model/transformer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/megatron/model/transformer.py b/megatron/model/transformer.py index 4e81b70b6..890703e80 100644 --- a/megatron/model/transformer.py +++ b/megatron/model/transformer.py @@ -120,7 +120,7 @@ def forward(self, hidden_states): if ( self.activation_type == "gelu" and self.bias_gelu_fusion - ) or self.activation_type == "geglu": + ) or self.activation_type == "geglu" or self.activation_type == "swiglu": intermediate_parallel = self.activation_func( intermediate_parallel, bias_parallel ) From 322deed28914105a425887e0a92bfb5192bcd46e Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Sun, 10 Sep 2023 17:17:53 +0900 Subject: [PATCH 120/183] xpos --- configs/49M.yml | 7 +- megatron/model/positional_embeddings.py | 107 ++++++++++++++++++++++++ megatron/model/transformer.py | 18 +++- megatron/neox_arguments/neox_args.py | 2 +- 4 files changed, 129 insertions(+), 5 deletions(-) diff --git a/configs/49M.yml b/configs/49M.yml index 52443e367..4e162b9f1 100644 --- a/configs/49M.yml +++ b/configs/49M.yml @@ -9,13 +9,14 @@ "num_attention_heads": 10, "seq_length": 2048, "max_position_embeddings": 2048, - "pos_emb": "rotary", + # "pos_emb": "rotary", + "pos_emb": "xpos", "rotary_pct": 0.25, "no_weight_tying": true, "gpt_j_residual": true, "output_layer_parallelism": "column", - # "activation": "gelu", - "activation": "swiglu", + "activation": "gelu", + # "activation": "swiglu", "norm": "rmsnorm", # "use_bnb_optimizer": true, diff --git a/megatron/model/positional_embeddings.py b/megatron/model/positional_embeddings.py index 68815075a..59f1bb02e 100644 --- a/megatron/model/positional_embeddings.py +++ b/megatron/model/positional_embeddings.py @@ -221,3 +221,110 @@ def forward(self, x): ) # seq_len_k - 1 points to the last token index in the current inference batch. return x + a + + +# Original implementation adjusted from https://github.com/sunyt32/torchscale + +def fixed_pos_embedding(x, base): + seq_len, dim = x.shape + inv_freq = 1.0 / (base ** (torch.arange(0, dim) / dim)) + sinusoid_inp = ( + torch.einsum("i , j -> i j", torch.arange(0, seq_len, dtype=torch.float), inv_freq).to(x) + ) + return torch.cos(sinusoid_inp), torch.sin(sinusoid_inp) + + +class XPosEmbedding(torch.nn.Module): + """ + xPos positional embeddings from https://arxiv.org/abs/2212.10554. 
+ """ + + def __init__(self, head_dim, freq_base=10000, scale_base=512, gamma=0.4, precision=torch.half): + super().__init__() + self.scale_base = scale_base + self.register_buffer( + "scale", + ( + (torch.arange(0, head_dim, 2) + gamma * head_dim) + / ((1.0 + gamma) * head_dim) + ), + ) + self.max_seq_len_cached = None + self.precision = precision + self.freq_base = freq_base + + def forward(self, x, seq_dim=1, seq_len=None): + if seq_len is None: + seq_len = x.shape[seq_dim] + scale = ( + self.scale + ** ( + torch.arange(0, seq_len, 1) - seq_len // 2 + ).to(self.scale).div(self.scale_base)[:, None] + ) + + if ( + self.max_seq_len_cached is None + or (seq_len > self.max_seq_len_cached) + ): + self.max_seq_len_cached = seq_len + cos, sin = fixed_pos_embedding(scale, self.freq_base) + self.cos_cached = cos + self.sin_cached = sin + if self.precision == torch.bfloat16: + self.cos_cached = self.cos_cached.bfloat16() + self.sin_cached = self.sin_cached.bfloat16() + return ( + self.cos_cached[:seq_len], + self.sin_cached[:seq_len], + scale, + ) + + +def rotate_every_two(x): + x1 = x[:, :, ::2] + x2 = x[:, :, 1::2] + x = torch.stack((-x2, x1), dim=-1) + return x.flatten(-2) # in einsum notation: rearrange(x, '... d j -> ... (d j)')\ + + +def duplicate_interleave(m): + """ + A simple version of `torch.repeat_interleave` for duplicating a matrix while interleaving the copy. + """ + dim0 = m.shape[0] + m = m.view(-1, 1) # flatten the matrix + m = m.repeat(1, 2) # repeat all elements into the 2nd dimension + m = m.view(dim0, -1) # reshape into a matrix, interleaving the copy + return m.unsqueeze(1) + + +def _apply_xpos_emb(x, cos, sin, scale): + # x is assumed to be (seq_len, batch_size, dim) here. + cos = duplicate_interleave(cos * scale) + sin = duplicate_interleave(sin * scale) + # einsum notation for lambda t: repeat(t[offset:x.shape[1]+offset,:], "n d -> () n () (d j)", j=2) + return (x * cos) + (rotate_every_two(x) * sin) + + +@torch.jit.script +def apply_xpos_emb(q, k, cos, sin, scale, offset: int = 0): + # q/k are assumed to be (seq_len, batch_size, dim) here. + cos = cos[offset:q.shape[0] + offset] + sin = sin[offset:q.shape[0] + offset] + scale = scale[offset:q.shape[0] + offset] + return ( + _apply_xpos_emb(q, cos, sin, scale), + _apply_xpos_emb(k, cos, sin, 1.0 / scale), + ) + + +def apply_xpos_emb_torch(q, k, cos, sin, scale, offset: int = 0): + # q/k are assumed to be (seq_len, batch_size, dim) here. 
+ cos = cos[offset:q.shape[0] + offset] + sin = sin[offset:q.shape[0] + offset] + scale = scale[offset:q.shape[0] + offset] + return ( + _apply_xpos_emb(q, cos, sin, scale), + _apply_xpos_emb(k, cos, sin, 1.0 / scale), + ) \ No newline at end of file diff --git a/megatron/model/transformer.py b/megatron/model/transformer.py index 890703e80..1eefacae6 100644 --- a/megatron/model/transformer.py +++ b/megatron/model/transformer.py @@ -32,6 +32,9 @@ apply_rotary_pos_emb_torch, apply_rotary_pos_emb, AliBi, + XPosEmbedding, + apply_xpos_emb_torch, + apply_xpos_emb ) from megatron.model.fused_bias_dropout import ( get_bias_dropout_add, @@ -120,7 +123,7 @@ def forward(self, hidden_states): if ( self.activation_type == "gelu" and self.bias_gelu_fusion - ) or self.activation_type == "geglu" or self.activation_type == "swiglu": + ) or self.activation_type == "geglu": intermediate_parallel = self.activation_func( intermediate_parallel, bias_parallel ) @@ -332,6 +335,11 @@ def __init__( else: self.rotary_emb = None + ## xpos + if neox_args.pos_emb == "xpos": + self.xpos_emb = XPosEmbedding(self.hidden_size_per_attention_head, precision=neox_args.params_dtype) + else: + self.xpos_emb = None self.attention_type = neox_args.attention_config[layer_number] self.use_flash_attention = self.attention_type == "flash" self.sparse = self.attention_type not in ("global", "flash") @@ -665,6 +673,14 @@ def forward(self, hidden_states, attention_mask, layer_past=None): query_layer = torch.cat((query_layer, query_pass), dim=-1) key_layer = torch.cat((key_layer, key_pass), dim=-1) + ## xpos + if exists(self.xpos_emb): + apply_xpos_fn = apply_xpos_emb_torch if self.bf16 else apply_xpos_emb + cos, sin, scale = self.xpos_emb(value_layer, seq_len=seq_len) + query_layer, key_layer = apply_xpos_fn( + query_layer, key_layer, cos, sin, scale, offset=offset) + + # ================================== # Cache key and value for inference # ================================== diff --git a/megatron/neox_arguments/neox_args.py b/megatron/neox_arguments/neox_args.py index 6ddae6e92..98707e267 100644 --- a/megatron/neox_arguments/neox_args.py +++ b/megatron/neox_arguments/neox_args.py @@ -141,7 +141,7 @@ class NeoXArgsModel(NeoXArgsTemplate): """ pos_emb: Literal[ - "learned", "rotary", "sinusoidal", "rpe", "alibi", "none" + "learned", "rotary", "sinusoidal", "rpe", "alibi", "none", "xpos" ] = "learned" """ Type of positional embedding to use - choose from 'learned', 'rotary', 'sinusoidal', 'rpe', 'none' From 2e95d1dc056332ea5058d6e92035a7dc4f2f771b Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Sun, 10 Sep 2023 17:19:31 +0900 Subject: [PATCH 121/183] fix --- configs/local_setup_ja.yml | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/configs/local_setup_ja.yml b/configs/local_setup_ja.yml index 547f8b164..69ab5effb 100644 --- a/configs/local_setup_ja.yml +++ b/configs/local_setup_ja.yml @@ -20,16 +20,17 @@ #"save": "checkpoints", # "load": "checkpoints", - "save": "/content/drive/MyDrive/pre_trained/49M", - "load": "/content/drive/MyDrive/pre_trained/49M", + "save": "/content/drive/MyDrive/pre_trained/49M/checkpoints", + "load": "/content/drive/MyDrive/pre_trained/49M/checkpoints", "checkpoint_validation_with_forward_pass": False, ## logging - "log_dir": "logs", + # "log_dir": "logs", + "log_dir": "/content/drive/MyDrive/pre_trained/49M/logs", # "tensorboard_dir": "tensorboard", - "tensorboard_dir": "/content/drive/MyDrive/pre_trained/49M", - "log_dir": "logs", + "tensorboard_dir": 
"/content/drive/MyDrive/pre_trained/49M/tensorboard", + # "log_dir": "logs", "use_wandb": False } From c5fd89a188f3798afcde82142acf767fe83f10ae Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Sun, 10 Sep 2023 17:22:31 +0900 Subject: [PATCH 122/183] fix xpos --- megatron/model/transformer.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/megatron/model/transformer.py b/megatron/model/transformer.py index 1eefacae6..8b6627a3f 100644 --- a/megatron/model/transformer.py +++ b/megatron/model/transformer.py @@ -675,6 +675,12 @@ def forward(self, hidden_states, attention_mask, layer_past=None): ## xpos if exists(self.xpos_emb): + seq_len = key_layer.shape[0] + offset = 0 + if exists(layer_past) and layer_past.numel() > 0: + offset = layer_past[0].shape[0] + seq_len += offset + apply_xpos_fn = apply_xpos_emb_torch if self.bf16 else apply_xpos_emb cos, sin, scale = self.xpos_emb(value_layer, seq_len=seq_len) query_layer, key_layer = apply_xpos_fn( From db18c2dd829a5b9bef650bde443cddc3a3f86114 Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Sun, 10 Sep 2023 17:36:23 +0900 Subject: [PATCH 123/183] fix --- megatron/model/transformer.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/megatron/model/transformer.py b/megatron/model/transformer.py index 8b6627a3f..ebb40baea 100644 --- a/megatron/model/transformer.py +++ b/megatron/model/transformer.py @@ -640,6 +640,14 @@ def forward(self, hidden_states, attention_mask, layer_past=None): mixed_x_layer, 3 ) + ## for xpos + if layer_past is not None: + past_key, past_value = layer_past + key_layer = torch.cat((past_key.type_as(key_layer), + key_layer), dim=0) + value_layer = torch.cat((past_value.type_as(value_layer), + value_layer), dim=0) + if exists(self.rotary_emb): if exists(self.rotary_ndims): # partial rotary From e2ec22c7004afbac32fa8552d397e0fc64bcf61e Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Sun, 10 Sep 2023 17:38:26 +0900 Subject: [PATCH 124/183] debug --- megatron/model/transformer.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/megatron/model/transformer.py b/megatron/model/transformer.py index ebb40baea..e8dec142c 100644 --- a/megatron/model/transformer.py +++ b/megatron/model/transformer.py @@ -640,14 +640,6 @@ def forward(self, hidden_states, attention_mask, layer_past=None): mixed_x_layer, 3 ) - ## for xpos - if layer_past is not None: - past_key, past_value = layer_past - key_layer = torch.cat((past_key.type_as(key_layer), - key_layer), dim=0) - value_layer = torch.cat((past_value.type_as(value_layer), - value_layer), dim=0) - if exists(self.rotary_emb): if exists(self.rotary_ndims): # partial rotary @@ -669,9 +661,17 @@ def forward(self, hidden_states, attention_mask, layer_past=None): seq_len = key_layer.shape[0] offset = 0 + if layer_past is not None: + past_key, past_value = layer_past + key_layer = torch.cat((past_key.type_as(key_layer), + key_layer), dim=0) + value_layer = torch.cat((past_value.type_as(value_layer), + value_layer), dim=0) + if exists(layer_past) and layer_past.numel() > 0: offset = layer_past[0].shape[0] seq_len += offset + print('has layer_past', exists(layer_past)) cos, sin = self.rotary_emb(value_layer, seq_len=seq_len) query_layer, key_layer = apply_rotary_fn( query_rot, key_rot, cos, sin, offset=offset From c7aa2386bbaa10d06ae47f6303ae054a0a959e6b Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Sun, 10 Sep 2023 17:40:03 +0900 Subject: [PATCH 125/183] debug --- megatron/model/transformer.py | 18 ++++++++++-------- 1 file changed, 10 
insertions(+), 8 deletions(-) diff --git a/megatron/model/transformer.py b/megatron/model/transformer.py index e8dec142c..3ae8bbffb 100644 --- a/megatron/model/transformer.py +++ b/megatron/model/transformer.py @@ -661,17 +661,10 @@ def forward(self, hidden_states, attention_mask, layer_past=None): seq_len = key_layer.shape[0] offset = 0 - if layer_past is not None: - past_key, past_value = layer_past - key_layer = torch.cat((past_key.type_as(key_layer), - key_layer), dim=0) - value_layer = torch.cat((past_value.type_as(value_layer), - value_layer), dim=0) - if exists(layer_past) and layer_past.numel() > 0: offset = layer_past[0].shape[0] seq_len += offset - print('has layer_past', exists(layer_past)) + cos, sin = self.rotary_emb(value_layer, seq_len=seq_len) query_layer, key_layer = apply_rotary_fn( query_rot, key_rot, cos, sin, offset=offset @@ -685,6 +678,15 @@ def forward(self, hidden_states, attention_mask, layer_past=None): if exists(self.xpos_emb): seq_len = key_layer.shape[0] offset = 0 + print('has layer_past', exists(layer_past)) + if exists(layer_past): + past_key, past_value = layer_past + key_layer = torch.cat((past_key.type_as(key_layer), + key_layer), dim=0) + value_layer = torch.cat((past_value.type_as(value_layer), + value_layer), dim=0) + + if exists(layer_past) and layer_past.numel() > 0: offset = layer_past[0].shape[0] seq_len += offset From 9de41b1bd081898c5495f9ddafeef35a2ab484f3 Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Sun, 10 Sep 2023 17:43:21 +0900 Subject: [PATCH 126/183] debug --- megatron/model/transformer.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/megatron/model/transformer.py b/megatron/model/transformer.py index 3ae8bbffb..f0e786715 100644 --- a/megatron/model/transformer.py +++ b/megatron/model/transformer.py @@ -676,6 +676,23 @@ def forward(self, hidden_states, attention_mask, layer_past=None): ## xpos if exists(self.xpos_emb): + # =================================== + # Raw attention scores. 
[b, np, s, s] + # =================================== + + # [b, np, sq, sk] + output_size = (query_layer.size(1), + query_layer.size(2), + query_layer.size(0), + key_layer.size(0)) + + # [sq, b, np, hn] -> [sq, b * np, hn] + query_layer = query_layer.view(output_size[2], + output_size[0] * output_size[1], -1) + # [sk, b, np, hn] -> [sk, b * np, hn] + key_layer = key_layer.view(output_size[3], + output_size[0] * output_size[1], -1) + seq_len = key_layer.shape[0] offset = 0 print('has layer_past', exists(layer_past)) From 6478c1c7e285553d74cb4fb8d767472368e85a30 Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Sun, 10 Sep 2023 17:49:28 +0900 Subject: [PATCH 127/183] debug --- megatron/model/transformer.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/megatron/model/transformer.py b/megatron/model/transformer.py index f0e786715..293c94c8f 100644 --- a/megatron/model/transformer.py +++ b/megatron/model/transformer.py @@ -728,6 +728,12 @@ def forward(self, hidden_states, attention_mask, layer_past=None): if self.use_cache: present = torch.stack((key_layer, value_layer)) + if exists(self.xpos_emb): + # change view [sk, b * np, hn] + value_layer = value_layer.view(value_layer.size(0), + output_size[0] * output_size[1], -1) + + if self.use_flash_attention: context_layer = self.flash_attention(query_layer, key_layer, value_layer) elif not self.sparse: From 5025cb230faf905d06d27aae18cf0e300b2cb398 Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Sun, 10 Sep 2023 18:02:41 +0900 Subject: [PATCH 128/183] fix --- configs/49M.yml | 16 ++++++++-------- megatron/model/transformer.py | 7 ------- 2 files changed, 8 insertions(+), 15 deletions(-) diff --git a/configs/49M.yml b/configs/49M.yml index 4e162b9f1..a32dec0c0 100644 --- a/configs/49M.yml +++ b/configs/49M.yml @@ -25,14 +25,14 @@ #"bias_gelu_fusion": false, "scaled_upper_triang_masked_softmax_fusion": true, "bias-gelu-fusion": true, - "attention-config": [ - [ - [ - "flash" - ], - 10 - ] - ], + # "attention-config": [ + # [ + # [ + # "flash" + # ], + # 10 + # ] + # ], "curriculum_learning": { "enabled": true, "curriculum_type": "seqlen", diff --git a/megatron/model/transformer.py b/megatron/model/transformer.py index 293c94c8f..c644219b2 100644 --- a/megatron/model/transformer.py +++ b/megatron/model/transformer.py @@ -578,7 +578,6 @@ def flash_attention(self, query_layer, key_layer, value_layer): ) # [b, sq, np, hn] -> [b, np, sq, hn] matmul_result = matmul_result.transpose(1, 2) - else: # [sq, b, np, hn] -> [b, sq, np, hn] sq = query_layer.size(0) @@ -728,12 +727,6 @@ def forward(self, hidden_states, attention_mask, layer_past=None): if self.use_cache: present = torch.stack((key_layer, value_layer)) - if exists(self.xpos_emb): - # change view [sk, b * np, hn] - value_layer = value_layer.view(value_layer.size(0), - output_size[0] * output_size[1], -1) - - if self.use_flash_attention: context_layer = self.flash_attention(query_layer, key_layer, value_layer) elif not self.sparse: From f3979b8f7f1450b7208e74a6e727fa6782c32571 Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Sun, 10 Sep 2023 18:17:25 +0900 Subject: [PATCH 129/183] debug --- configs/49M.yml | 4 ++-- megatron/model/transformer.py | 8 +++++++- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/configs/49M.yml b/configs/49M.yml index a32dec0c0..dcc196cbb 100644 --- a/configs/49M.yml +++ b/configs/49M.yml @@ -9,8 +9,8 @@ "num_attention_heads": 10, "seq_length": 2048, "max_position_embeddings": 2048, - # "pos_emb": "rotary", - "pos_emb": "xpos", + "pos_emb": 
"rotary", + # "pos_emb": "xpos", "rotary_pct": 0.25, "no_weight_tying": true, "gpt_j_residual": true, diff --git a/megatron/model/transformer.py b/megatron/model/transformer.py index c644219b2..69de6ad4a 100644 --- a/megatron/model/transformer.py +++ b/megatron/model/transformer.py @@ -672,7 +672,10 @@ def forward(self, hidden_states, attention_mask, layer_past=None): if exists(self.rotary_ndims): query_layer = torch.cat((query_layer, query_pass), dim=-1) key_layer = torch.cat((key_layer, key_pass), dim=-1) - + print('query_layer', query_layer.size()) + print('key_layer', key_layer.size()) + print('value_layer', value_layer.size()) + ## xpos if exists(self.xpos_emb): # =================================== @@ -711,6 +714,9 @@ def forward(self, hidden_states, attention_mask, layer_past=None): cos, sin, scale = self.xpos_emb(value_layer, seq_len=seq_len) query_layer, key_layer = apply_xpos_fn( query_layer, key_layer, cos, sin, scale, offset=offset) + print('query_layer', query_layer.size()) + print('key_layer', key_layer.size()) + print('value_layer', value_layer.size()) # ================================== From fd9a93156e20b9695bfbaa262e44b5cc51dde730 Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Sun, 10 Sep 2023 18:19:58 +0900 Subject: [PATCH 130/183] debug --- configs/49M.yml | 4 ++-- megatron/model/transformer.py | 7 ++++--- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/configs/49M.yml b/configs/49M.yml index dcc196cbb..a32dec0c0 100644 --- a/configs/49M.yml +++ b/configs/49M.yml @@ -9,8 +9,8 @@ "num_attention_heads": 10, "seq_length": 2048, "max_position_embeddings": 2048, - "pos_emb": "rotary", - # "pos_emb": "xpos", + # "pos_emb": "rotary", + "pos_emb": "xpos", "rotary_pct": 0.25, "no_weight_tying": true, "gpt_j_residual": true, diff --git a/megatron/model/transformer.py b/megatron/model/transformer.py index 69de6ad4a..92fcd642d 100644 --- a/megatron/model/transformer.py +++ b/megatron/model/transformer.py @@ -672,9 +672,10 @@ def forward(self, hidden_states, attention_mask, layer_past=None): if exists(self.rotary_ndims): query_layer = torch.cat((query_layer, query_pass), dim=-1) key_layer = torch.cat((key_layer, key_pass), dim=-1) - print('query_layer', query_layer.size()) - print('key_layer', key_layer.size()) - print('value_layer', value_layer.size()) + + # print('query_layer', query_layer.size()) #torch.Size([64, 8, 10, 64]) + # print('key_layer', key_layer.size()) #torch.Size([64, 8, 10, 64]) + # print('value_layer', value_layer.size()) #torch.Size([64, 8, 10, 64]) ## xpos if exists(self.xpos_emb): From 31327fdd41a8df66385bebac6530b586df132705 Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Sun, 10 Sep 2023 18:22:51 +0900 Subject: [PATCH 131/183] debug --- megatron/model/transformer.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/megatron/model/transformer.py b/megatron/model/transformer.py index 92fcd642d..e8b987312 100644 --- a/megatron/model/transformer.py +++ b/megatron/model/transformer.py @@ -688,6 +688,8 @@ def forward(self, hidden_states, attention_mask, layer_past=None): query_layer.size(2), query_layer.size(0), key_layer.size(0)) + print('query_layer0', query_layer.size()) # torch.Size([64, 80, 64]) + print('key_layer0', key_layer.size()) # torch.Size([64, 80, 64]) # [sq, b, np, hn] -> [sq, b * np, hn] query_layer = query_layer.view(output_size[2], @@ -695,6 +697,9 @@ def forward(self, hidden_states, attention_mask, layer_past=None): # [sk, b, np, hn] -> [sk, b * np, hn] key_layer = 
key_layer.view(output_size[3], output_size[0] * output_size[1], -1) + + print('query_layer1', query_layer.size()) # torch.Size([64, 80, 64]) + print('key_layer1', key_layer.size()) # torch.Size([64, 80, 64]) seq_len = key_layer.shape[0] offset = 0 @@ -710,14 +715,16 @@ def forward(self, hidden_states, attention_mask, layer_past=None): if exists(layer_past) and layer_past.numel() > 0: offset = layer_past[0].shape[0] seq_len += offset + print('query_layer2', query_layer.size()) # torch.Size([64, 80, 64]) + print('key_laye2', key_layer.size()) # torch.Size([64, 80, 64]) apply_xpos_fn = apply_xpos_emb_torch if self.bf16 else apply_xpos_emb cos, sin, scale = self.xpos_emb(value_layer, seq_len=seq_len) query_layer, key_layer = apply_xpos_fn( - query_layer, key_layer, cos, sin, scale, offset=offset) - print('query_layer', query_layer.size()) - print('key_layer', key_layer.size()) - print('value_layer', value_layer.size()) + query_layer, key_layer, cos, sin, scale, offset=offset) + print('query_layer3', query_layer.size()) # torch.Size([64, 80, 64]) + print('key_layer3', key_layer.size()) # torch.Size([64, 80, 64]) + print('value_layer3', value_layer.size()) # torch.Size([64, 8, 10, 64]) # ================================== From 7d568d549e648f51d5038d3e03202231d5cb2413 Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Sun, 10 Sep 2023 18:30:01 +0900 Subject: [PATCH 132/183] fix --- megatron/model/transformer.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/megatron/model/transformer.py b/megatron/model/transformer.py index e8b987312..264ab68e8 100644 --- a/megatron/model/transformer.py +++ b/megatron/model/transformer.py @@ -722,9 +722,17 @@ def forward(self, hidden_states, attention_mask, layer_past=None): cos, sin, scale = self.xpos_emb(value_layer, seq_len=seq_len) query_layer, key_layer = apply_xpos_fn( query_layer, key_layer, cos, sin, scale, offset=offset) + print('query_layer3', query_layer.size()) # torch.Size([64, 80, 64]) print('key_layer3', key_layer.size()) # torch.Size([64, 80, 64]) - print('value_layer3', value_layer.size()) # torch.Size([64, 8, 10, 64]) + print('value_layer3', value_layer.size()) # torch.Size([64, 8, 10, 64]) + query_layer = query_layer.view(*output_size) + key_layer = key_layer.view(*output_size) + + print('query_layer4', query_layer.size()) # torch.Size([64, 80, 64]) + print('key_layer4', key_layer.size()) # torch.Size([64, 80, 64]) + print('value_layer4', value_layer.size()) # torch.Size([64, 8, 10, 64]) + # ================================== From 7bbc614b9ea7ca7fcca32cb7a59a95acb96083ff Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Sun, 10 Sep 2023 18:34:00 +0900 Subject: [PATCH 133/183] fix --- megatron/model/transformer.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/megatron/model/transformer.py b/megatron/model/transformer.py index 264ab68e8..ef79eded7 100644 --- a/megatron/model/transformer.py +++ b/megatron/model/transformer.py @@ -684,6 +684,8 @@ def forward(self, hidden_states, attention_mask, layer_past=None): # =================================== # [b, np, sq, sk] + + _b, _s, _s2, _e = query_layer.size() output_size = (query_layer.size(1), query_layer.size(2), query_layer.size(0), @@ -726,8 +728,8 @@ def forward(self, hidden_states, attention_mask, layer_past=None): print('query_layer3', query_layer.size()) # torch.Size([64, 80, 64]) print('key_layer3', key_layer.size()) # torch.Size([64, 80, 64]) print('value_layer3', value_layer.size()) # torch.Size([64, 8, 10, 64]) - query_layer = 
query_layer.view(*output_size) - key_layer = key_layer.view(*output_size) + query_layer = query_layer.view(_b, _s, _s2, _e) + key_layer = key_layer.view(_b, _s, _s2, _e) print('query_layer4', query_layer.size()) # torch.Size([64, 80, 64]) print('key_layer4', key_layer.size()) # torch.Size([64, 80, 64]) From 4b1521e1b380edd8641da4be0a063012b60c3597 Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Sun, 10 Sep 2023 18:34:16 +0900 Subject: [PATCH 134/183] fix --- megatron/model/transformer.py | 1 + 1 file changed, 1 insertion(+) diff --git a/megatron/model/transformer.py b/megatron/model/transformer.py index ef79eded7..eaff6ab6e 100644 --- a/megatron/model/transformer.py +++ b/megatron/model/transformer.py @@ -690,6 +690,7 @@ def forward(self, hidden_states, attention_mask, layer_past=None): query_layer.size(2), query_layer.size(0), key_layer.size(0)) + print('output_size', output_size) print('query_layer0', query_layer.size()) # torch.Size([64, 80, 64]) print('key_layer0', key_layer.size()) # torch.Size([64, 80, 64]) From f80ee3570db37d17121ecf9ca580b127c592210b Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Sun, 10 Sep 2023 18:48:38 +0900 Subject: [PATCH 135/183] debug --- megatron/model/transformer.py | 43 ++++++++++++----------------------- 1 file changed, 14 insertions(+), 29 deletions(-) diff --git a/megatron/model/transformer.py b/megatron/model/transformer.py index eaff6ab6e..1d2b39ea9 100644 --- a/megatron/model/transformer.py +++ b/megatron/model/transformer.py @@ -682,62 +682,47 @@ def forward(self, hidden_states, attention_mask, layer_past=None): # =================================== # Raw attention scores. [b, np, s, s] # =================================== - + _sq, _b, _np, _hn = query_layer.size() + # [b, np, sq, sk] - - _b, _s, _s2, _e = query_layer.size() output_size = (query_layer.size(1), query_layer.size(2), query_layer.size(0), key_layer.size(0)) - print('output_size', output_size) - print('query_layer0', query_layer.size()) # torch.Size([64, 80, 64]) - print('key_layer0', key_layer.size()) # torch.Size([64, 80, 64]) + print('key_layer 1' , key_layer.size()) + # [sq, b, np, hn] -> [sq, b * np, hn] query_layer = query_layer.view(output_size[2], output_size[0] * output_size[1], -1) # [sk, b, np, hn] -> [sk, b * np, hn] key_layer = key_layer.view(output_size[3], - output_size[0] * output_size[1], -1) - - print('query_layer1', query_layer.size()) # torch.Size([64, 80, 64]) - print('key_layer1', key_layer.size()) # torch.Size([64, 80, 64]) - + output_size[0] * output_size[1], -1) + print('key_layer 2' , key_layer.size()) seq_len = key_layer.shape[0] offset = 0 - print('has layer_past', exists(layer_past)) if exists(layer_past): past_key, past_value = layer_past key_layer = torch.cat((past_key.type_as(key_layer), key_layer), dim=0) value_layer = torch.cat((past_value.type_as(value_layer), value_layer), dim=0) - - + + print('key_layer 3' , key_layer.size()) if exists(layer_past) and layer_past.numel() > 0: offset = layer_past[0].shape[0] seq_len += offset - print('query_layer2', query_layer.size()) # torch.Size([64, 80, 64]) - print('key_laye2', key_layer.size()) # torch.Size([64, 80, 64]) apply_xpos_fn = apply_xpos_emb_torch if self.bf16 else apply_xpos_emb cos, sin, scale = self.xpos_emb(value_layer, seq_len=seq_len) query_layer, key_layer = apply_xpos_fn( query_layer, key_layer, cos, sin, scale, offset=offset) - - print('query_layer3', query_layer.size()) # torch.Size([64, 80, 64]) - print('key_layer3', key_layer.size()) # torch.Size([64, 80, 64]) - 
print('value_layer3', value_layer.size()) # torch.Size([64, 8, 10, 64]) - query_layer = query_layer.view(_b, _s, _s2, _e) - key_layer = key_layer.view(_b, _s, _s2, _e) - - print('query_layer4', query_layer.size()) # torch.Size([64, 80, 64]) - print('key_layer4', key_layer.size()) # torch.Size([64, 80, 64]) - print('value_layer4', value_layer.size()) # torch.Size([64, 8, 10, 64]) - - - + print('key_layer 4' , key_layer.size()) + ## [b, np*sq, hn] -> [b, np, sq, hn] + query_layer = query_layer.view(_b, _np, _sq, _hn) + key_layer = key_layer.view(_b, _np, _sq, _hn) + print('key_layer 5' , key_layer.size()) + exit(0) # ================================== # Cache key and value for inference # ================================== From e41a7f258825c400894be549a45ffb5670ae7bd9 Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Sun, 10 Sep 2023 18:55:44 +0900 Subject: [PATCH 136/183] fix --- megatron/model/transformer.py | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/megatron/model/transformer.py b/megatron/model/transformer.py index 1d2b39ea9..82817d5c2 100644 --- a/megatron/model/transformer.py +++ b/megatron/model/transformer.py @@ -689,16 +689,13 @@ def forward(self, hidden_states, attention_mask, layer_past=None): query_layer.size(2), query_layer.size(0), key_layer.size(0)) - print('key_layer 1' , key_layer.size()) - # [sq, b, np, hn] -> [sq, b * np, hn] query_layer = query_layer.view(output_size[2], output_size[0] * output_size[1], -1) # [sk, b, np, hn] -> [sk, b * np, hn] key_layer = key_layer.view(output_size[3], output_size[0] * output_size[1], -1) - print('key_layer 2' , key_layer.size()) seq_len = key_layer.shape[0] offset = 0 if exists(layer_past): @@ -708,7 +705,6 @@ def forward(self, hidden_states, attention_mask, layer_past=None): value_layer = torch.cat((past_value.type_as(value_layer), value_layer), dim=0) - print('key_layer 3' , key_layer.size()) if exists(layer_past) and layer_past.numel() > 0: offset = layer_past[0].shape[0] seq_len += offset @@ -717,12 +713,12 @@ def forward(self, hidden_states, attention_mask, layer_past=None): cos, sin, scale = self.xpos_emb(value_layer, seq_len=seq_len) query_layer, key_layer = apply_xpos_fn( query_layer, key_layer, cos, sin, scale, offset=offset) - print('key_layer 4' , key_layer.size()) - ## [b, np*sq, hn] -> [b, np, sq, hn] - query_layer = query_layer.view(_b, _np, _sq, _hn) - key_layer = key_layer.view(_b, _np, _sq, _hn) - print('key_layer 5' , key_layer.size()) - exit(0) + + ## [sq, b * np, hn] -> [sq, b, np, hn] + query_layer = query_layer.view(_sq, _b, _np, _hn) + ## [sq, b * np, hn] -> [sk, b, np, hn] + key_layer = key_layer.view(_sq, _b, _np, _hn) + # ================================== # Cache key and value for inference # ================================== From 3ce96f1fe0cb746b367bc698c2c4ea15fbc78913 Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Sun, 10 Sep 2023 19:02:15 +0900 Subject: [PATCH 137/183] debug --- configs/49M.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/configs/49M.yml b/configs/49M.yml index a32dec0c0..75cd38c16 100644 --- a/configs/49M.yml +++ b/configs/49M.yml @@ -15,8 +15,8 @@ "no_weight_tying": true, "gpt_j_residual": true, "output_layer_parallelism": "column", - "activation": "gelu", - # "activation": "swiglu", + # "activation": "gelu", + "activation": "swiglu", "norm": "rmsnorm", # "use_bnb_optimizer": true, From 19299bc2008c11ccb4c6228bc6442878ff13fda0 Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Sun, 10 Sep 2023 19:05:12 +0900 
Subject: [PATCH 138/183] debug --- megatron/mpu/layers.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/megatron/mpu/layers.py b/megatron/mpu/layers.py index 92edbd6eb..4c7698ba3 100644 --- a/megatron/mpu/layers.py +++ b/megatron/mpu/layers.py @@ -742,6 +742,10 @@ def forward(self, input_): else: input_parallel = scatter_to_model_parallel_region(input_) # Matrix multiply. + + ## (512x1280 and 2560x640) + print('debug: ', input_parallel.size(), self.weight.size()) + exit(0) output_parallel = F.linear(input_parallel, self.weight) # All-reduce across all the partitions. if not self.parallel_output: From 314ee87fce7170eabd8927141a9b1249b87d42df Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Sun, 10 Sep 2023 19:07:51 +0900 Subject: [PATCH 139/183] debug --- configs/49M.yml | 4 ++-- megatron/mpu/layers.py | 1 + 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/configs/49M.yml b/configs/49M.yml index 75cd38c16..801168cfc 100644 --- a/configs/49M.yml +++ b/configs/49M.yml @@ -9,8 +9,8 @@ "num_attention_heads": 10, "seq_length": 2048, "max_position_embeddings": 2048, - # "pos_emb": "rotary", - "pos_emb": "xpos", + "pos_emb": "rotary", + # "pos_emb": "xpos", "rotary_pct": 0.25, "no_weight_tying": true, "gpt_j_residual": true, diff --git a/megatron/mpu/layers.py b/megatron/mpu/layers.py index 4c7698ba3..12787a655 100644 --- a/megatron/mpu/layers.py +++ b/megatron/mpu/layers.py @@ -746,6 +746,7 @@ def forward(self, input_): ## (512x1280 and 2560x640) print('debug: ', input_parallel.size(), self.weight.size()) exit(0) + ## xpos debug: torch.Size([64, 8, 640]) torch.Size([640, 640]) output_parallel = F.linear(input_parallel, self.weight) # All-reduce across all the partitions. if not self.parallel_output: From d565c48b864cba8a578919f7d6774e8ad9480548 Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Sun, 10 Sep 2023 19:09:35 +0900 Subject: [PATCH 140/183] debug --- configs/49M.yml | 4 ++-- megatron/mpu/layers.py | 5 +++-- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/configs/49M.yml b/configs/49M.yml index 801168cfc..75cd38c16 100644 --- a/configs/49M.yml +++ b/configs/49M.yml @@ -9,8 +9,8 @@ "num_attention_heads": 10, "seq_length": 2048, "max_position_embeddings": 2048, - "pos_emb": "rotary", - # "pos_emb": "xpos", + # "pos_emb": "rotary", + "pos_emb": "xpos", "rotary_pct": 0.25, "no_weight_tying": true, "gpt_j_residual": true, diff --git a/megatron/mpu/layers.py b/megatron/mpu/layers.py index 12787a655..a75681c7d 100644 --- a/megatron/mpu/layers.py +++ b/megatron/mpu/layers.py @@ -744,9 +744,10 @@ def forward(self, input_): # Matrix multiply. ## (512x1280 and 2560x640) - print('debug: ', input_parallel.size(), self.weight.size()) - exit(0) + ## print('debug: ', input_parallel.size(), self.weight.size()) + ## exit(0) ## xpos debug: torch.Size([64, 8, 640]) torch.Size([640, 640]) + ## torch.Size([64, 8, 640]) torch.Size([640, 640]) output_parallel = F.linear(input_parallel, self.weight) # All-reduce across all the partitions. 
if not self.parallel_output: From b5185367358a5b2d1edc2bea7e1cb361ef1a2c81 Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Sun, 10 Sep 2023 19:30:13 +0900 Subject: [PATCH 141/183] debug --- megatron/model/activations.py | 1 + megatron/mpu/layers.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/megatron/model/activations.py b/megatron/model/activations.py index fc4c73445..40c3ff906 100644 --- a/megatron/model/activations.py +++ b/megatron/model/activations.py @@ -125,6 +125,7 @@ def mish(x): @torch.jit.script def swiglu(x): x = torch.chunk(x, 2, dim=-1) + print("x0 x1: ", x[0].size(), x[1].size()) return F.silu(x[0]) * x[1] class GEGLU(torch.nn.Module): diff --git a/megatron/mpu/layers.py b/megatron/mpu/layers.py index a75681c7d..3afe1ff3b 100644 --- a/megatron/mpu/layers.py +++ b/megatron/mpu/layers.py @@ -744,7 +744,7 @@ def forward(self, input_): # Matrix multiply. ## (512x1280 and 2560x640) - ## print('debug: ', input_parallel.size(), self.weight.size()) + print('debug: ', input_parallel.size(), self.weight.size()) ## exit(0) ## xpos debug: torch.Size([64, 8, 640]) torch.Size([640, 640]) ## torch.Size([64, 8, 640]) torch.Size([640, 640]) From 3396a71452fcceb0a04525a447502ebd0b9faac4 Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Sun, 10 Sep 2023 19:43:35 +0900 Subject: [PATCH 142/183] debug --- megatron/model/activations.py | 3 ++- megatron/mpu/layers.py | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/megatron/model/activations.py b/megatron/model/activations.py index 40c3ff906..81ab3dbab 100644 --- a/megatron/model/activations.py +++ b/megatron/model/activations.py @@ -124,8 +124,9 @@ def mish(x): @torch.jit.script def swiglu(x): + print("x0 x1 111: ", x) x = torch.chunk(x, 2, dim=-1) - print("x0 x1: ", x[0].size(), x[1].size()) + print("x0 x1 222: ", x[0].size(), x[1].size()) return F.silu(x[0]) * x[1] class GEGLU(torch.nn.Module): diff --git a/megatron/mpu/layers.py b/megatron/mpu/layers.py index 3afe1ff3b..ad1b3e44c 100644 --- a/megatron/mpu/layers.py +++ b/megatron/mpu/layers.py @@ -734,6 +734,7 @@ def set_parallel_output(self, parallel_output: bool): self.parallel_output = parallel_output def forward(self, input_): + print('debug1: ', input_.size()) if self.use_mup and self.mup_rescale_parameters: input_ /= self.width_mult() # Set up backprop all-reduce. @@ -744,7 +745,7 @@ def forward(self, input_): # Matrix multiply. 
## (512x1280 and 2560x640) - print('debug: ', input_parallel.size(), self.weight.size()) + print('debug2: ', input_parallel.size(), self.weight.size()) ## exit(0) ## xpos debug: torch.Size([64, 8, 640]) torch.Size([640, 640]) ## torch.Size([64, 8, 640]) torch.Size([640, 640]) From 6951f10661a5a80a106a96294974138add15077d Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Sun, 10 Sep 2023 19:44:55 +0900 Subject: [PATCH 143/183] debug --- megatron/model/activations.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/megatron/model/activations.py b/megatron/model/activations.py index 81ab3dbab..5f50d307a 100644 --- a/megatron/model/activations.py +++ b/megatron/model/activations.py @@ -124,7 +124,7 @@ def mish(x): @torch.jit.script def swiglu(x): - print("x0 x1 111: ", x) + print("x0 x1 111: ", x.size()) x = torch.chunk(x, 2, dim=-1) print("x0 x1 222: ", x[0].size(), x[1].size()) return F.silu(x[0]) * x[1] From 0f3dfe35251c65925e2952f180ca0db480d2c5de Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Sun, 10 Sep 2023 20:06:35 +0900 Subject: [PATCH 144/183] debug --- megatron/mpu/layers.py | 1 + 1 file changed, 1 insertion(+) diff --git a/megatron/mpu/layers.py b/megatron/mpu/layers.py index ad1b3e44c..4650aa814 100644 --- a/megatron/mpu/layers.py +++ b/megatron/mpu/layers.py @@ -734,6 +734,7 @@ def set_parallel_output(self, parallel_output: bool): self.parallel_output = parallel_output def forward(self, input_): + print('self.input_is_parallel', self.input_is_parallel) print('debug1: ', input_.size()) if self.use_mup and self.mup_rescale_parameters: input_ /= self.width_mult() From 558c2d3b41c908ec9081c88fda5a75cd49f21a39 Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Sun, 10 Sep 2023 20:09:13 +0900 Subject: [PATCH 145/183] debug --- megatron/model/activations.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/megatron/model/activations.py b/megatron/model/activations.py index 5f50d307a..057449e30 100644 --- a/megatron/model/activations.py +++ b/megatron/model/activations.py @@ -125,9 +125,9 @@ def mish(x): @torch.jit.script def swiglu(x): print("x0 x1 111: ", x.size()) - x = torch.chunk(x, 2, dim=-1) - print("x0 x1 222: ", x[0].size(), x[1].size()) - return F.silu(x[0]) * x[1] + return F.silu(x) * x + # x = torch.chunk(x, 2, dim=-1) + # return F.silu(x[0]) * x[1] class GEGLU(torch.nn.Module): def __init__(self, neox_args): From 3c8109b3da66a056fa6e076caf78e3b1f36744cd Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Sun, 10 Sep 2023 20:11:39 +0900 Subject: [PATCH 146/183] fix --- megatron/model/activations.py | 1 - megatron/mpu/layers.py | 7 ------- 2 files changed, 8 deletions(-) diff --git a/megatron/model/activations.py b/megatron/model/activations.py index 057449e30..ea7935d8a 100644 --- a/megatron/model/activations.py +++ b/megatron/model/activations.py @@ -124,7 +124,6 @@ def mish(x): @torch.jit.script def swiglu(x): - print("x0 x1 111: ", x.size()) return F.silu(x) * x # x = torch.chunk(x, 2, dim=-1) # return F.silu(x[0]) * x[1] diff --git a/megatron/mpu/layers.py b/megatron/mpu/layers.py index 4650aa814..f335df98c 100644 --- a/megatron/mpu/layers.py +++ b/megatron/mpu/layers.py @@ -734,8 +734,6 @@ def set_parallel_output(self, parallel_output: bool): self.parallel_output = parallel_output def forward(self, input_): - print('self.input_is_parallel', self.input_is_parallel) - print('debug1: ', input_.size()) if self.use_mup and self.mup_rescale_parameters: input_ /= self.width_mult() # Set up backprop all-reduce. 
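
The (512x1280 and 2560x640) note above is consistent with the chunked swiglu halving the 4h activation (2560 -> 1280 for hidden size 640) before the dense 4h-to-h projection, whose weight still expects 2560 inputs. The later patches switch to F.silu(x) * x, which keeps the width but is closer to SiLU self-gating than to canonical SwiGLU. For comparison, a minimal sketch of the conventional layout, where the first projection is doubled so the chunk into (gate, value) is well-formed; the sizes mirror the 49M config, and this is an illustration only, not this repository's ParallelMLP.

    import torch
    import torch.nn as nn
    import torch.nn.functional as F

    class SwiGLUMLP(nn.Module):
        # Conventional SwiGLU block: the first projection produces 2*ffn features so
        # that chunking still leaves ffn features for the down-projection.
        def __init__(self, hidden=640, ffn=2560):
            super().__init__()
            self.w_in = nn.Linear(hidden, 2 * ffn)
            self.w_out = nn.Linear(ffn, hidden)  # the 2560 -> 640 matmul that cannot
                                                 # consume a 1280-wide input, i.e. the
                                                 # (512x1280 and 2560x640) mismatch above

        def forward(self, x):
            gate, value = self.w_in(x).chunk(2, dim=-1)
            return self.w_out(F.silu(gate) * value)

    x = torch.randn(8, 64, 640)          # [batch, seq, hidden]
    print(SwiGLUMLP()(x).shape)          # torch.Size([8, 64, 640])
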
@@ -745,11 +743,6 @@ def forward(self, input_): input_parallel = scatter_to_model_parallel_region(input_) # Matrix multiply. - ## (512x1280 and 2560x640) - print('debug2: ', input_parallel.size(), self.weight.size()) - ## exit(0) - ## xpos debug: torch.Size([64, 8, 640]) torch.Size([640, 640]) - ## torch.Size([64, 8, 640]) torch.Size([640, 640]) output_parallel = F.linear(input_parallel, self.weight) # All-reduce across all the partitions. if not self.parallel_output: From 9579b62491e883cea57ed06eae66e11d41222de3 Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Mon, 11 Sep 2023 19:43:23 +0900 Subject: [PATCH 147/183] fix save iter --- configs/49M.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configs/49M.yml b/configs/49M.yml index 75cd38c16..fb63b7568 100644 --- a/configs/49M.yml +++ b/configs/49M.yml @@ -113,7 +113,7 @@ "eval_interval": 100000, "eval_iters": 1000, "keep_last_n_checkpoints": 4, - "save_iters": 10000, + "save_iters": 5000, # logging "log_interval": 1000, From 81e4460f57d10c520e55b699a49aacbd70d40a43 Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Tue, 12 Sep 2023 17:46:23 +0900 Subject: [PATCH 148/183] fix --- configs/49M.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configs/49M.yml b/configs/49M.yml index fb63b7568..c688a54a3 100644 --- a/configs/49M.yml +++ b/configs/49M.yml @@ -109,7 +109,7 @@ "distributed_backend": "nccl", "lr_decay_style": "cosine", "warmup": 0.01, - "checkpoint_factor": 10000, + "checkpoint_factor": 5000, "eval_interval": 100000, "eval_iters": 1000, "keep_last_n_checkpoints": 4, From c9042c57a137257a237d75487ba6780cae91f78a Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Sat, 16 Sep 2023 08:19:02 +0900 Subject: [PATCH 149/183] fix --- configs/convert_19M_settings.yml | 31 ++++++++++++++++++++++++++ configs/convert_49M_settings.yml | 37 ++++++++++++++++++++++++++++++++ 2 files changed, 68 insertions(+) create mode 100644 configs/convert_19M_settings.yml create mode 100644 configs/convert_49M_settings.yml diff --git a/configs/convert_19M_settings.yml b/configs/convert_19M_settings.yml new file mode 100644 index 000000000..baf797385 --- /dev/null +++ b/configs/convert_19M_settings.yml @@ -0,0 +1,31 @@ +{ + "tokenizer_type": "SPMTokenizer", + "vocab-file": "./novelAI/tokenizer.model", + + "pipe_parallel_size": 1, + "model_parallel_size": 1, + + # model settings + "num_layers": 6, + "hidden_size": 512, + "num_attention_heads": 8, + "seq_length": 2048, + "max_position_embeddings": 2048, + "pos_emb": "rotary", + "no_weight_tying": true, + "gpt_j_residual": false, + "output_layer_parallelism": "column", + + "scaled_upper_triang_masked_softmax_fusion": false, + "bias_gelu_fusion": false, + + "optimizer": { + "type": "Adam", + "params": { + "lr": 0.001, + "betas": [0.9, 0.95], + "eps": 1.0e-8, + } + }, + "min_lr": 0.0001 +} diff --git a/configs/convert_49M_settings.yml b/configs/convert_49M_settings.yml new file mode 100644 index 000000000..9287a60f1 --- /dev/null +++ b/configs/convert_49M_settings.yml @@ -0,0 +1,37 @@ +{ + "tokenizer_type": "SPMTokenizer", + "vocab-file": "./novelAI/tokenizer.model", + + "pipe_parallel_size": 1, + "model_parallel_size": 1, + + # model settings + "num_layers": 10, + "hidden_size": 640, + "num_attention_heads": 10, + "seq_length": 2048, + "max_position_embeddings": 2048, + + "activation": "swiglu", + "norm": "rmsnorm", + "pos_emb": "xpos", + + ## ------------------- + "pos_emb": "rotary", + "no_weight_tying": true, + "gpt_j_residual": false, + "output_layer_parallelism": 
"column", + + "scaled_upper_triang_masked_softmax_fusion": false, + "bias_gelu_fusion": false, + + "optimizer": { + "type": "Adam", + "params": { + "lr": 0.001, + "betas": [0.9, 0.95], + "eps": 1.0e-8, + } + }, + "min_lr": 0.0001 +} From b62cf6f52955517b14ddafbe5d864b75b7b780b1 Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Sat, 16 Sep 2023 08:43:38 +0900 Subject: [PATCH 150/183] add for hf gptneox --- hf_gptneox.py | 54 +++++++++++++++++++++++++++++++++++ tools/convert_module_to_hf.py | 5 +++- 2 files changed, 58 insertions(+), 1 deletion(-) create mode 100644 hf_gptneox.py diff --git a/hf_gptneox.py b/hf_gptneox.py new file mode 100644 index 000000000..33e95c8cb --- /dev/null +++ b/hf_gptneox.py @@ -0,0 +1,54 @@ +from transformers.models.gpt_neox import GPTNeoXPreTrainedModel, GPTNeoXModel, GPTNeoXLayer, GPTNeoXMLP +from transformers.activations import ClassInstantier, ACT2CLS +from torch import Tensor, nn + +from typing import Callable, Optional +import torch.nn.functional as F + + +class SwiGLUFFN(nn.Module): + def __init__( + self, + in_features: int, + hidden_features: Optional[int] = None, + out_features: Optional[int] = None, + act_layer: Callable[..., nn.Module] = None, + drop: float = 0.0, + bias: bool = True, + ) -> None: + super().__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features + self.w12 = nn.Linear(in_features, 2 * hidden_features, bias=bias) + self.w3 = nn.Linear(hidden_features, out_features, bias=bias) + + def forward(self, x: Tensor) -> Tensor: + x12 = self.w12(x) + x1, x2 = x12.chunk(2, dim=-1) + hidden = F.silu(x1) * x2 + return self.w3(hidden) + +ACT2CLS['swiglu'] = SwiGLUFFN +ACT2FN = ClassInstantier(ACT2CLS) + +class GPTNeoX2MLP(GPTNeoXMLP): + def __init__(self, config): + super().__init__() + self.act = ACT2FN[config.hidden_act] + +class GPTNeoX2Layer(GPTNeoXModel): + def __init__(self, config): + super().__init__() + self.mlp = GPTNeoX2MLP(config) + +class GPTNeoX2Model(GPTNeoXModel): + def __init__(self, config): + super().__init__(config) + self.layers = nn.ModuleList([GPTNeoX2Layer(config) for _ in range(config.num_hidden_layers)]) + +class GPTNeoX2ForCausalLM(GPTNeoXPreTrainedModel): + _tied_weights_keys = ["embed_out.weight"] + + def __init__(self, config): + super().__init__(config) + self.gpt_neox = GPTNeoX2Model(config) \ No newline at end of file diff --git a/tools/convert_module_to_hf.py b/tools/convert_module_to_hf.py index c46d78402..130c81675 100644 --- a/tools/convert_module_to_hf.py +++ b/tools/convert_module_to_hf.py @@ -23,6 +23,7 @@ import torch from transformers import GPTNeoXConfig, GPTNeoXForCausalLM +from ..hf_gptneox import GPTNeoX2ForCausalLM sys.path.append( os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir)) @@ -145,7 +146,9 @@ def convert(input_checkpoint_path, loaded_config, output_checkpoint_path): hf_config = create_config(loaded_config) - hf_model = GPTNeoXForCausalLM(hf_config) + # hf_model = GPTNeoXForCausalLM(hf_config) + ## for swiglu + hf_model = GPTNeoX2ForCausalLM(hf_config) # save model in fp16/bf16 if Deepspeed fp16 or bf16 mixed precision was used in config, else 32 bit weights fp16 = get_key(loaded_config, "fp16") From ba18cd300ee1f1a4b6f46c8f69427188ba34564b Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Sat, 16 Sep 2023 08:44:45 +0900 Subject: [PATCH 151/183] add --- hf_gptneox.py | 54 ----------------------------------- tools/convert_module_to_hf.py | 2 +- 2 files changed, 1 insertion(+), 55 deletions(-) delete mode 100644 
hf_gptneox.py diff --git a/hf_gptneox.py b/hf_gptneox.py deleted file mode 100644 index 33e95c8cb..000000000 --- a/hf_gptneox.py +++ /dev/null @@ -1,54 +0,0 @@ -from transformers.models.gpt_neox import GPTNeoXPreTrainedModel, GPTNeoXModel, GPTNeoXLayer, GPTNeoXMLP -from transformers.activations import ClassInstantier, ACT2CLS -from torch import Tensor, nn - -from typing import Callable, Optional -import torch.nn.functional as F - - -class SwiGLUFFN(nn.Module): - def __init__( - self, - in_features: int, - hidden_features: Optional[int] = None, - out_features: Optional[int] = None, - act_layer: Callable[..., nn.Module] = None, - drop: float = 0.0, - bias: bool = True, - ) -> None: - super().__init__() - out_features = out_features or in_features - hidden_features = hidden_features or in_features - self.w12 = nn.Linear(in_features, 2 * hidden_features, bias=bias) - self.w3 = nn.Linear(hidden_features, out_features, bias=bias) - - def forward(self, x: Tensor) -> Tensor: - x12 = self.w12(x) - x1, x2 = x12.chunk(2, dim=-1) - hidden = F.silu(x1) * x2 - return self.w3(hidden) - -ACT2CLS['swiglu'] = SwiGLUFFN -ACT2FN = ClassInstantier(ACT2CLS) - -class GPTNeoX2MLP(GPTNeoXMLP): - def __init__(self, config): - super().__init__() - self.act = ACT2FN[config.hidden_act] - -class GPTNeoX2Layer(GPTNeoXModel): - def __init__(self, config): - super().__init__() - self.mlp = GPTNeoX2MLP(config) - -class GPTNeoX2Model(GPTNeoXModel): - def __init__(self, config): - super().__init__(config) - self.layers = nn.ModuleList([GPTNeoX2Layer(config) for _ in range(config.num_hidden_layers)]) - -class GPTNeoX2ForCausalLM(GPTNeoXPreTrainedModel): - _tied_weights_keys = ["embed_out.weight"] - - def __init__(self, config): - super().__init__(config) - self.gpt_neox = GPTNeoX2Model(config) \ No newline at end of file diff --git a/tools/convert_module_to_hf.py b/tools/convert_module_to_hf.py index 130c81675..4b790d04c 100644 --- a/tools/convert_module_to_hf.py +++ b/tools/convert_module_to_hf.py @@ -23,7 +23,7 @@ import torch from transformers import GPTNeoXConfig, GPTNeoXForCausalLM -from ..hf_gptneox import GPTNeoX2ForCausalLM +from hf_gptneox import GPTNeoX2ForCausalLM sys.path.append( os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir)) From 0542bbd35da5949b53c42d55f5c39e4259eef7a5 Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Sat, 16 Sep 2023 08:45:06 +0900 Subject: [PATCH 152/183] fix --- tools/hf_gptneox.py | 54 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 54 insertions(+) create mode 100644 tools/hf_gptneox.py diff --git a/tools/hf_gptneox.py b/tools/hf_gptneox.py new file mode 100644 index 000000000..33e95c8cb --- /dev/null +++ b/tools/hf_gptneox.py @@ -0,0 +1,54 @@ +from transformers.models.gpt_neox import GPTNeoXPreTrainedModel, GPTNeoXModel, GPTNeoXLayer, GPTNeoXMLP +from transformers.activations import ClassInstantier, ACT2CLS +from torch import Tensor, nn + +from typing import Callable, Optional +import torch.nn.functional as F + + +class SwiGLUFFN(nn.Module): + def __init__( + self, + in_features: int, + hidden_features: Optional[int] = None, + out_features: Optional[int] = None, + act_layer: Callable[..., nn.Module] = None, + drop: float = 0.0, + bias: bool = True, + ) -> None: + super().__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features + self.w12 = nn.Linear(in_features, 2 * hidden_features, bias=bias) + self.w3 = nn.Linear(hidden_features, out_features, bias=bias) + + def forward(self, x: Tensor) -> Tensor: 
+ x12 = self.w12(x) + x1, x2 = x12.chunk(2, dim=-1) + hidden = F.silu(x1) * x2 + return self.w3(hidden) + +ACT2CLS['swiglu'] = SwiGLUFFN +ACT2FN = ClassInstantier(ACT2CLS) + +class GPTNeoX2MLP(GPTNeoXMLP): + def __init__(self, config): + super().__init__() + self.act = ACT2FN[config.hidden_act] + +class GPTNeoX2Layer(GPTNeoXModel): + def __init__(self, config): + super().__init__() + self.mlp = GPTNeoX2MLP(config) + +class GPTNeoX2Model(GPTNeoXModel): + def __init__(self, config): + super().__init__(config) + self.layers = nn.ModuleList([GPTNeoX2Layer(config) for _ in range(config.num_hidden_layers)]) + +class GPTNeoX2ForCausalLM(GPTNeoXPreTrainedModel): + _tied_weights_keys = ["embed_out.weight"] + + def __init__(self, config): + super().__init__(config) + self.gpt_neox = GPTNeoX2Model(config) \ No newline at end of file From 6c956e00f0c8c52d184b5324d149f6e11b15e0c6 Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Sat, 16 Sep 2023 08:46:52 +0900 Subject: [PATCH 153/183] fix --- tools/hf_gptneox.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tools/hf_gptneox.py b/tools/hf_gptneox.py index 33e95c8cb..efbf9fe84 100644 --- a/tools/hf_gptneox.py +++ b/tools/hf_gptneox.py @@ -1,4 +1,5 @@ -from transformers.models.gpt_neox import GPTNeoXPreTrainedModel, GPTNeoXModel, GPTNeoXLayer, GPTNeoXMLP +from transformers.models.gpt_neox import GPTNeoXPreTrainedModel, GPTNeoXModel, GPTNeoXLayer +from transformers.models.gpt_neox.modeling_gpt_neox import GPTNeoXMLP from transformers.activations import ClassInstantier, ACT2CLS from torch import Tensor, nn @@ -36,7 +37,7 @@ def __init__(self, config): super().__init__() self.act = ACT2FN[config.hidden_act] -class GPTNeoX2Layer(GPTNeoXModel): +class GPTNeoX2Layer(GPTNeoXLayer): def __init__(self, config): super().__init__() self.mlp = GPTNeoX2MLP(config) From c21da19eed32d5c6eb5fcff6cf91ec5999be8744 Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Sat, 16 Sep 2023 08:50:18 +0900 Subject: [PATCH 154/183] ix --- tools/hf_gptneox.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tools/hf_gptneox.py b/tools/hf_gptneox.py index efbf9fe84..38b9a92e2 100644 --- a/tools/hf_gptneox.py +++ b/tools/hf_gptneox.py @@ -44,7 +44,9 @@ def __init__(self, config): class GPTNeoX2Model(GPTNeoXModel): def __init__(self, config): - super().__init__(config) + _config = config.deepcopy() + _config.hidden_act = "gelu" + super().__init__(_config) self.layers = nn.ModuleList([GPTNeoX2Layer(config) for _ in range(config.num_hidden_layers)]) class GPTNeoX2ForCausalLM(GPTNeoXPreTrainedModel): From 955a943dd0ba4d933bbbc98460cfef6630b50eae Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Sat, 16 Sep 2023 08:55:24 +0900 Subject: [PATCH 155/183] copy --- tools/hf_gptneox.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/tools/hf_gptneox.py b/tools/hf_gptneox.py index 38b9a92e2..54e337650 100644 --- a/tools/hf_gptneox.py +++ b/tools/hf_gptneox.py @@ -44,9 +44,11 @@ def __init__(self, config): class GPTNeoX2Model(GPTNeoXModel): def __init__(self, config): - _config = config.deepcopy() - _config.hidden_act = "gelu" - super().__init__(_config) + _copy_hidden_act = config.hidden_act + config.hidden_act = "gelu" + super().__init__(config) + + config.hidden_act = _copy_hidden_act self.layers = nn.ModuleList([GPTNeoX2Layer(config) for _ in range(config.num_hidden_layers)]) class GPTNeoX2ForCausalLM(GPTNeoXPreTrainedModel): From 289f49600b34502b75c8614beb6ffa514ad60826 Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: 
Sat, 16 Sep 2023 09:03:48 +0900 Subject: [PATCH 156/183] fix --- tools/hf_gptneox.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/hf_gptneox.py b/tools/hf_gptneox.py index 54e337650..b700f2d06 100644 --- a/tools/hf_gptneox.py +++ b/tools/hf_gptneox.py @@ -34,12 +34,12 @@ def forward(self, x: Tensor) -> Tensor: class GPTNeoX2MLP(GPTNeoXMLP): def __init__(self, config): - super().__init__() + super().__init__(config) self.act = ACT2FN[config.hidden_act] class GPTNeoX2Layer(GPTNeoXLayer): def __init__(self, config): - super().__init__() + super().__init__(config) self.mlp = GPTNeoX2MLP(config) class GPTNeoX2Model(GPTNeoXModel): From a417387dce88c2ba0a6d54f71f1525028ed3e79d Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Sat, 16 Sep 2023 09:07:32 +0900 Subject: [PATCH 157/183] fix --- tools/hf_gptneox.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/tools/hf_gptneox.py b/tools/hf_gptneox.py index b700f2d06..75a3201ee 100644 --- a/tools/hf_gptneox.py +++ b/tools/hf_gptneox.py @@ -34,17 +34,25 @@ def forward(self, x: Tensor) -> Tensor: class GPTNeoX2MLP(GPTNeoXMLP): def __init__(self, config): + _copy_hidden_act = config.hidden_act + config.hidden_act = "gelu" super().__init__(config) + + config.hidden_act = _copy_hidden_act self.act = ACT2FN[config.hidden_act] class GPTNeoX2Layer(GPTNeoXLayer): def __init__(self, config): + _copy_hidden_act = config.hidden_act + config.hidden_act = "gelu" super().__init__(config) + + config.hidden_act = _copy_hidden_act self.mlp = GPTNeoX2MLP(config) class GPTNeoX2Model(GPTNeoXModel): def __init__(self, config): - _copy_hidden_act = config.hidden_act + _copy_hidden_act = config.hidden_act config.hidden_act = "gelu" super().__init__(config) From 068c6aea6ce7e1198a2f2a7016535d3721d39d15 Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Sat, 16 Sep 2023 09:12:51 +0900 Subject: [PATCH 158/183] fix act --- tools/hf_gptneox.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/tools/hf_gptneox.py b/tools/hf_gptneox.py index 75a3201ee..b4eb0ae3c 100644 --- a/tools/hf_gptneox.py +++ b/tools/hf_gptneox.py @@ -28,8 +28,16 @@ def forward(self, x: Tensor) -> Tensor: x1, x2 = x12.chunk(2, dim=-1) hidden = F.silu(x1) * x2 return self.w3(hidden) + +class SwiGLU(nn.Module): + def __init__(self) -> None: + super().__init__() + + def forward(self, x: Tensor) -> Tensor: + return F.silu(x) * x -ACT2CLS['swiglu'] = SwiGLUFFN +# ACT2CLS['swiglu'] = SwiGLUFFN +ACT2CLS['swiglu'] = SwiGLU ACT2FN = ClassInstantier(ACT2CLS) class GPTNeoX2MLP(GPTNeoXMLP): @@ -37,7 +45,7 @@ def __init__(self, config): _copy_hidden_act = config.hidden_act config.hidden_act = "gelu" super().__init__(config) - + config.hidden_act = _copy_hidden_act self.act = ACT2FN[config.hidden_act] From d4a77a4a0f4a0d539dbbf504cfe1319b97636f9b Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Sat, 16 Sep 2023 09:17:37 +0900 Subject: [PATCH 159/183] fix --- tools/convert_module_to_hf.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/tools/convert_module_to_hf.py b/tools/convert_module_to_hf.py index 4b790d04c..ab97b1361 100644 --- a/tools/convert_module_to_hf.py +++ b/tools/convert_module_to_hf.py @@ -203,12 +203,18 @@ def convert(input_checkpoint_path, loaded_config, output_checkpoint_path): state_dict[key] = torch.cat([t[key] for t in loaded_tp_ranks], dim=1) # average layernorm stats over mp ranks - for key in [ + keysForOriginGPTNeoX=[ "input_layernorm.weight", "input_layernorm.bias", 
"post_attention_layernorm.weight", "post_attention_layernorm.bias", - ]: + ] + keysForSwiglu = [ + "input_layernorm.bias", + "post_attention_layernorm.weight", + "post_attention_layernorm.bias", + ] + for key in keysForSwiglu: state_dict[key] = (sum([t[key] for t in loaded_tp_ranks])) / len( loaded_tp_ranks ) From 0db8946ab8bebca45163ed41f3643b7ae21ddd65 Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Sat, 16 Sep 2023 09:18:49 +0900 Subject: [PATCH 160/183] fix --- tools/convert_module_to_hf.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tools/convert_module_to_hf.py b/tools/convert_module_to_hf.py index ab97b1361..d46dcec66 100644 --- a/tools/convert_module_to_hf.py +++ b/tools/convert_module_to_hf.py @@ -209,8 +209,7 @@ def convert(input_checkpoint_path, loaded_config, output_checkpoint_path): "post_attention_layernorm.weight", "post_attention_layernorm.bias", ] - keysForSwiglu = [ - "input_layernorm.bias", + keysForSwiglu = [ "post_attention_layernorm.weight", "post_attention_layernorm.bias", ] From d2e315934b64878e722fe766e2571f3d6376a646 Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Sat, 16 Sep 2023 09:20:34 +0900 Subject: [PATCH 161/183] fix --- tools/convert_module_to_hf.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tools/convert_module_to_hf.py b/tools/convert_module_to_hf.py index d46dcec66..94a09b8f1 100644 --- a/tools/convert_module_to_hf.py +++ b/tools/convert_module_to_hf.py @@ -195,6 +195,9 @@ def convert(input_checkpoint_path, loaded_config, output_checkpoint_path): input_checkpoint_path, mp_partitions, layer_i + 2 ) + for t in loaded_tp_ranks: + print('t', t.keys()) + state_dict = {} for key in [ "attention.dense.weight", From 98b5968f68ee84bb5e717218c394c9dd2f8fce3f Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Sat, 16 Sep 2023 09:22:07 +0900 Subject: [PATCH 162/183] debug --- tools/convert_module_to_hf.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/tools/convert_module_to_hf.py b/tools/convert_module_to_hf.py index 94a09b8f1..7244e76f1 100644 --- a/tools/convert_module_to_hf.py +++ b/tools/convert_module_to_hf.py @@ -187,7 +187,7 @@ def convert(input_checkpoint_path, loaded_config, output_checkpoint_path): # get layer from hf model hf_layer = hf_model.gpt_neox.layers[layer_i] for v in hf_layer.state_dict(): - print('state_dict: ', v) + print('debug state_dict: ', v) print('-'*200) # + 2 bc of embed layer and a dummy _pre_transformer_block @@ -196,7 +196,7 @@ def convert(input_checkpoint_path, loaded_config, output_checkpoint_path): ) for t in loaded_tp_ranks: - print('t', t.keys()) + print('debug loaded_tp_ranks: ', t.keys()) state_dict = {} for key in [ @@ -212,10 +212,7 @@ def convert(input_checkpoint_path, loaded_config, output_checkpoint_path): "post_attention_layernorm.weight", "post_attention_layernorm.bias", ] - keysForSwiglu = [ - "post_attention_layernorm.weight", - "post_attention_layernorm.bias", - ] + keysForSwiglu = [] for key in keysForSwiglu: state_dict[key] = (sum([t[key] for t in loaded_tp_ranks])) / len( loaded_tp_ranks From e4c1208fa760d3fc55ae4ba461b0222757d7cde9 Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Sat, 16 Sep 2023 09:27:12 +0900 Subject: [PATCH 163/183] debug --- tools/convert_module_to_hf.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tools/convert_module_to_hf.py b/tools/convert_module_to_hf.py index 7244e76f1..033639a95 100644 --- a/tools/convert_module_to_hf.py +++ b/tools/convert_module_to_hf.py @@ -235,9 +235,11 @@ def 
convert(input_checkpoint_path, loaded_config, output_checkpoint_path): state_dict[key] = sum([t[key] for t in loaded_tp_ranks]) # Just take one + print('debug: ', loaded_config) state_dict["attention.rotary_emb.inv_freq"] = loaded_tp_ranks[0][ "attention.rotary_emb.inv_freq" ] + state_dict["attention.dense.bias"] = hf_layer.state_dict()["attention.dense.bias"] From 7994b05fc78462f1d9276d4e855f724d480f8dee Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Sat, 16 Sep 2023 09:29:32 +0900 Subject: [PATCH 164/183] debug --- tools/convert_module_to_hf.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tools/convert_module_to_hf.py b/tools/convert_module_to_hf.py index 033639a95..138bef1ac 100644 --- a/tools/convert_module_to_hf.py +++ b/tools/convert_module_to_hf.py @@ -141,7 +141,7 @@ def convert(input_checkpoint_path, loaded_config, output_checkpoint_path): should perform model-parallel merging correctly but only supports features allowed by HF GPT-NeoX implementation (e.g. rotary embeddings) """ - + print('debug: ', loaded_config) hf_config = GPTNeoXConfig() hf_config = create_config(loaded_config) @@ -235,10 +235,10 @@ def convert(input_checkpoint_path, loaded_config, output_checkpoint_path): state_dict[key] = sum([t[key] for t in loaded_tp_ranks]) # Just take one - print('debug: ', loaded_config) - state_dict["attention.rotary_emb.inv_freq"] = loaded_tp_ranks[0][ - "attention.rotary_emb.inv_freq" - ] + if loaded_config['pos_emb'] == 'rotary': + state_dict["attention.rotary_emb.inv_freq"] = loaded_tp_ranks[0][ + "attention.rotary_emb.inv_freq" + ] state_dict["attention.dense.bias"] = hf_layer.state_dict()["attention.dense.bias"] From 86e23b0c08ef3c907192fcd57f7f1355934812ab Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Sat, 16 Sep 2023 09:30:05 +0900 Subject: [PATCH 165/183] fix config --- configs/convert_49M_settings.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configs/convert_49M_settings.yml b/configs/convert_49M_settings.yml index 9287a60f1..9dd0301bd 100644 --- a/configs/convert_49M_settings.yml +++ b/configs/convert_49M_settings.yml @@ -17,7 +17,7 @@ "pos_emb": "xpos", ## ------------------- - "pos_emb": "rotary", + # "pos_emb": "rotary", "no_weight_tying": true, "gpt_j_residual": false, "output_layer_parallelism": "column", From 6f0e4566a802476a4e0c5485d61f2c61d16b79b5 Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Sun, 24 Sep 2023 18:40:27 +0900 Subject: [PATCH 166/183] add text gen --- configs/text_generation.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/configs/text_generation.yml b/configs/text_generation.yml index 5a49d61e5..cae624c5e 100644 --- a/configs/text_generation.yml +++ b/configs/text_generation.yml @@ -2,7 +2,7 @@ # Make sure `load` is specified somewhere else { # Text gen type: `input-file`, `unconditional` or `interactive` - "text_gen_type": "unconditional", + "text_gen_type": "interactive", # Params for all "maximum_tokens": 102, @@ -13,9 +13,9 @@ "recompute": false, # `unconditional`: samples - "num_samples": 10, + # "num_samples": 10, # input/output file - "sample_input_file": "sample_input.txt", - "sample_output_file": "sample_output.txt", + #"sample_input_file": "sample_input.txt", + #"sample_output_file": "sample_output.txt", } From d8b4a1a0b81425740f03c968f63eaed0205f6cfd Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Sun, 24 Sep 2023 18:50:59 +0900 Subject: [PATCH 167/183] debug --- eval_tasks/eval_adapter.py | 2 +- 1 file changed, 1 insertion(+), 1 
deletion(-) diff --git a/eval_tasks/eval_adapter.py b/eval_tasks/eval_adapter.py index e0a32797d..66a0dd1a6 100644 --- a/eval_tasks/eval_adapter.py +++ b/eval_tasks/eval_adapter.py @@ -24,7 +24,7 @@ def _download_file(*args, **kwargs): fn(*args, **kwargs) -best_download.download_file = _download_file +# best_download.download_file = _download_file import os import sys From 7f82f08344cec1fdd7491b4b9920fc989d997e16 Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Sun, 24 Sep 2023 18:52:10 +0900 Subject: [PATCH 168/183] debug --- eval_tasks/eval_adapter.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/eval_tasks/eval_adapter.py b/eval_tasks/eval_adapter.py index 66a0dd1a6..cede4f93b 100644 --- a/eval_tasks/eval_adapter.py +++ b/eval_tasks/eval_adapter.py @@ -13,15 +13,15 @@ # limitations under the License. from megatron.utils import is_local_main, print_rank_0 -import best_download +# import best_download -# patch best_download (eval harness downloader) to only happen on the first local rank -fn = best_download.download_file +# # patch best_download (eval harness downloader) to only happen on the first local rank +# fn = best_download.download_file -def _download_file(*args, **kwargs): - if is_local_main(): - fn(*args, **kwargs) +# def _download_file(*args, **kwargs): +# if is_local_main(): +# fn(*args, **kwargs) # best_download.download_file = _download_file From c2f6214afe029b6625fdbebb44bda4cbda4cc718 Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Sun, 24 Sep 2023 19:20:41 +0900 Subject: [PATCH 169/183] debug --- configs/text_generation.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/configs/text_generation.yml b/configs/text_generation.yml index cae624c5e..871e3412c 100644 --- a/configs/text_generation.yml +++ b/configs/text_generation.yml @@ -18,4 +18,5 @@ # input/output file #"sample_input_file": "sample_input.txt", #"sample_output_file": "sample_output.txt", + "deepspeed": False } From 6e682f5957e076e81f6aebe2976d577e72dc3c98 Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Sun, 24 Sep 2023 19:26:37 +0900 Subject: [PATCH 170/183] debug --- configs/text_generation.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configs/text_generation.yml b/configs/text_generation.yml index 871e3412c..68b6ff3ee 100644 --- a/configs/text_generation.yml +++ b/configs/text_generation.yml @@ -18,5 +18,5 @@ # input/output file #"sample_input_file": "sample_input.txt", #"sample_output_file": "sample_output.txt", - "deepspeed": False + "deepspeed": false } From 12800e9c8256a09a73d3ef154193bfec941f27c1 Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Sun, 24 Sep 2023 19:27:08 +0900 Subject: [PATCH 171/183] debug --- configs/text_generation.yml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/configs/text_generation.yml b/configs/text_generation.yml index 68b6ff3ee..637105563 100644 --- a/configs/text_generation.yml +++ b/configs/text_generation.yml @@ -17,6 +17,5 @@ # input/output file #"sample_input_file": "sample_input.txt", - #"sample_output_file": "sample_output.txt", - "deepspeed": false + #"sample_output_file": "sample_output.txt", } From 72409875f6f40aa3d514642b3a2cc382f87f820f Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Sun, 24 Sep 2023 19:28:08 +0900 Subject: [PATCH 172/183] debug --- megatron/training.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/megatron/training.py b/megatron/training.py index 1f2231acd..08a611ae6 100644 --- a/megatron/training.py +++ b/megatron/training.py @@ -620,6 
+620,9 @@ def setup_model_and_optimizer(neox_args, use_cache=False, iteration=None): optimizer, param_groups = get_optimizer(model=model, neox_args=neox_args) lr_scheduler = get_learning_rate_scheduler(optimizer=optimizer, neox_args=neox_args) + ## force enable + neox_args.deepspeed = False + if neox_args.deepspeed: print_rank_0("DeepSpeed is enabled.") if neox_args.no_load_optim: From 2ce7091fac65dc4862324c48ceb709022dc60025 Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Sun, 24 Sep 2023 19:29:35 +0900 Subject: [PATCH 173/183] debug --- megatron/training.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/megatron/training.py b/megatron/training.py index 08a611ae6..1f2231acd 100644 --- a/megatron/training.py +++ b/megatron/training.py @@ -620,9 +620,6 @@ def setup_model_and_optimizer(neox_args, use_cache=False, iteration=None): optimizer, param_groups = get_optimizer(model=model, neox_args=neox_args) lr_scheduler = get_learning_rate_scheduler(optimizer=optimizer, neox_args=neox_args) - ## force enable - neox_args.deepspeed = False - if neox_args.deepspeed: print_rank_0("DeepSpeed is enabled.") if neox_args.no_load_optim: From 7c1a46faefcde45dc2512aa6d9fa0485615ca3b7 Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Sun, 24 Sep 2023 19:35:31 +0900 Subject: [PATCH 174/183] for gptneox2 --- tools/convert_module_to_hf_gptneox2.py | 359 +++++++++++++++++++++++++ 1 file changed, 359 insertions(+) create mode 100644 tools/convert_module_to_hf_gptneox2.py diff --git a/tools/convert_module_to_hf_gptneox2.py b/tools/convert_module_to_hf_gptneox2.py new file mode 100644 index 000000000..7ea38a4e3 --- /dev/null +++ b/tools/convert_module_to_hf_gptneox2.py @@ -0,0 +1,359 @@ +# Copyright (c) 2023, EleutherAI +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import sys + +import yaml +import argparse +from tqdm import tqdm +from typing import List + +import torch +from transformers import GPTNeoXConfig, GPTNeoXForCausalLM + +from hf_gptneox import GPTNeoX2ForCausalLM + +sys.path.append( + os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir)) +) +from megatron.tokenizer import build_tokenizer + + +""" +A script for converting saved NeoX Checkpoints to Huggingface (HF) compatible GPT-NeoX type models. + +Note that this script does not support all NeoX features. +Please investigate carefully whether your model is compatible with all architectures supported by the GPTNeoXForCausalLM class in HF. + +(e.g. position embeddings such as AliBi may not be supported by Huggingface's GPT-NeoX architecture. 
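Checkpoints are expected in NeoX's per-layer layout, one file per layer and model-parallel rank named layer_{idx:02}-model_{rank:02}-model_states.pt: index 0 holds the word embeddings, the first transformer layer sits at index 2 (hence the `layer_i + 2` offset below), the final layer norm at num-layers + 3, and the output embedding at num-layers + 4. See load_partitions() below.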
+""" + + +def load_partitions( + input_checkpoint_path, mp_partitions, layer_idx +) -> List[torch.Tensor]: + """Returns a list containing all weights in a given layer from a model (across MP partitions)""" + + loaded_tp_ranks = [ + torch.load( + os.path.join( + input_checkpoint_path, + f"layer_{layer_idx:02}-model_{i:02}-model_states.pt", + ), + map_location=torch.device("cuda" if torch.cuda.is_available() else "cpu"), + ) + for i in range(mp_partitions) + ] + + return loaded_tp_ranks + + +def get_key(loaded_config, key, default=None): + """ + Search for a given key in a NeoX yaml. normalizes underscores -> hyphens + """ + key = key.replace("_", "-") + try: + return loaded_config[key] + except KeyError: + key = key.replace("-", "_") + try: + return loaded_config[key] + except KeyError: + return default + + +def create_config(neox_config): + """take in a loaded yaml from NeoX and assign relevant values to HF config. + Returns: GPTNeoXConfig() object + """ + + class TokenizerArgs: + # kinda hacky. + # this is to get something with the same interface as is used in build_tokenizer() + # without diving into loading a neox_args object or using argparse etc. + def __init__(self, neox_config): + self.make_vocab_size_divisible_by = get_key( + neox_config, "make-vocab-size-divisible-by", default=128 + ) + self.model_parallel_size = get_key(neox_config, "model-parallel-size") + self.vocab_file = get_key(neox_config, "vocab-file") + self.merge_file = get_key(neox_config, "merge-file") + self.tokenizer_type = get_key(neox_config, "tokenizer-type") + + self.rank = 0 + + args = TokenizerArgs(neox_config) + tokenizer = build_tokenizer(args) + try: # GPT2TokenizerFast raises NotImplementedError + pad_token = tokenizer.pad + except: + pad_token = ( + 1 # pad defaulting to 1. follows convention from GPT-NeoX-20b tokenizer + ) + + # TODO: change the default value here based on discussion regarding `gpt_j_tied` config parameter's default + use_tied_lns = get_key(neox_config, "gpt-j-tied", False) + + if use_tied_lns: + raise NotImplementedError( + """ERROR: Huggingface Transformers does not yet support a single shared layernorm + per transformer block for GPT-NeoX models trained w/ GPT-J parallel residuals. + See https://github.com/EleutherAI/gpt-neox/pull/481 for further details.""" + ) + + # set all config values. + hf_config = GPTNeoXConfig( + vocab_size=args.padded_vocab_size, + hidden_size=get_key(neox_config, "hidden-size"), + num_hidden_layers=get_key(neox_config, "num-layers"), + num_attention_heads=get_key(neox_config, "num-attention-heads"), + intermediate_size=(get_key(neox_config, "hidden-size") * 4), + hidden_act=get_key(neox_config, "activation", default="gelu"), + rotary_pct=get_key(neox_config, "rotary-pct", default=1.0), + rotary_emb_base=get_key(neox_config, "rotary-emb-base", default=10000), + max_position_embeddings=get_key(neox_config, "max-position-embeddings"), + initializer_range=get_key(neox_config, "init-method-std", 0.02), + layer_norm_eps=get_key(neox_config, "layernorm-epsilon", 1e-5), + use_cache=True, + bos_token_id=tokenizer.eod, + eos_token_id=tokenizer.eod, + tie_word_embeddings=(not get_key(neox_config, "no-weight-tying", False)), + use_parallel_residual=get_key(neox_config, "gpt-j-residual", False), + ) + return hf_config + + +def convert(input_checkpoint_path, loaded_config, output_checkpoint_path): + """convert a NeoX checkpoint to a HF model format. + should perform model-parallel merging correctly + but only supports features allowed by HF GPT-NeoX implementation (e.g. 
rotary embeddings) + """ + print('debug: ', loaded_config) + hf_config = GPTNeoXConfig() + + hf_config = create_config(loaded_config) + + # hf_model = GPTNeoXForCausalLM(hf_config) + ## for swiglu + hf_model = GPTNeoX2ForCausalLM(hf_config) + + # save model in fp16/bf16 if Deepspeed fp16 or bf16 mixed precision was used in config, else 32 bit weights + fp16 = get_key(loaded_config, "fp16") + if fp16: + try: + # this conditional is quite messy because there were a number of ways to specify bf16 or fp16 training + # in DeeperSpeed v1.0 . + if (fp16.get("fp16", None) or fp16["enabled"]) and not (fp16.get("type", None) == "bfloat16"): + hf_model.half() + print("Saving weights in fp16 precision...") + elif fp16.get("type", None) == "bfloat16": + hf_model.to(dtype=torch.bfloat16) + print("Saving weights in bf16 precision...") + except: + print("Model not trained in fp16 / bf16 mixed precision, saving weights in fp32...") + + mp_partitions = get_key(loaded_config, "model-parallel-size") + + ### Embedding layer ### + loaded_tp_ranks = load_partitions(input_checkpoint_path, mp_partitions, 0) + hf_model.gpt_neox.embed_in.load_state_dict( + { + "weight": torch.cat( + [t["word_embeddings.weight"] for t in loaded_tp_ranks], dim=0 + ) + } + ) + + assert ( + hf_config.vocab_size == hf_model.gpt_neox.embed_in.weight.shape[0] + ), f"ERROR: calculated vocab size {hf_config.vocab_size} != embed param size {hf_model.gpt_neox.embed_in.shape[0]}" + ### End Embedding Layer ### + + for layer_i in tqdm(range(get_key(loaded_config, "num-layers"))): + + # get layer from hf model + hf_layer = hf_model.gpt_neox.layers[layer_i] + for v in hf_layer.state_dict(): + print('debug state_dict: ', v) + print('-'*200) + + # + 2 bc of embed layer and a dummy _pre_transformer_block + loaded_tp_ranks = load_partitions( + input_checkpoint_path, mp_partitions, layer_i + 2 + ) + + for t in loaded_tp_ranks: + print('debug loaded_tp_ranks: ', t.keys()) + + state_dict = {} + + + + for key in [ + "attention.dense.weight", + "mlp.dense_4h_to_h.weight", + ]: + state_dict[key] = torch.cat([t[key] for t in loaded_tp_ranks], dim=1) + + # average layernorm stats over mp ranks + keysForOriginGPTNeoX=[ + "input_layernorm.weight", + "input_layernorm.bias", + "post_attention_layernorm.weight", + "post_attention_layernorm.bias", + ] + keysForSwiglu = [ + 'input_layernorm.scale', + 'post_attention_layernorm.scale' + ] + for key in keysForSwiglu: + state_dict[key] = (sum([t[key] for t in loaded_tp_ranks])) / len( + loaded_tp_ranks + ) + + # LinearWithTPMerge + for key in [ + "mlp.dense_h_to_4h.weight", + "mlp.dense_h_to_4h.bias", + "attention.query_key_value.weight", + "attention.query_key_value.bias", + ]: + state_dict[key] = torch.cat([t[key] for t in loaded_tp_ranks], dim=0) + + # LinearWithTPSplitBias + for key in [ + "mlp.dense_4h_to_h.bias", + "attention.dense.bias", + ]: + state_dict[key] = sum([t[key] for t in loaded_tp_ranks]) + + # Just take one + if loaded_config['pos_emb'] == 'rotary': + state_dict["attention.rotary_emb.inv_freq"] = loaded_tp_ranks[0][ + "attention.rotary_emb.inv_freq" + ] + + + state_dict["attention.dense.bias"] = hf_layer.state_dict()["attention.dense.bias"] + + if "attention.bias" in hf_layer.state_dict(): + state_dict["attention.bias"] = hf_layer.state_dict()["attention.bias"] + + if "attention.masked_bias" in hf_layer.state_dict(): + state_dict["attention.masked_bias"] = hf_layer.state_dict()[ + "attention.masked_bias" + ] + + # load state_dict into layer + hf_layer.load_state_dict(state_dict) + + # Load final layer 
norm + loaded_tp_ranks = load_partitions( + input_checkpoint_path, mp_partitions, get_key(loaded_config, "num-layers") + 3 + ) + + hf_model.gpt_neox.final_layer_norm.load_state_dict( + { + "weight": (sum([t["norm.weight"] for t in loaded_tp_ranks])) + / len(loaded_tp_ranks), + "bias": (sum([t["norm.bias"] for t in loaded_tp_ranks])) + / len(loaded_tp_ranks), + } + ) + del loaded_tp_ranks + + # Load output embedding + loaded_tp_ranks = load_partitions( + input_checkpoint_path, mp_partitions, get_key(loaded_config, "num-layers") + 4 + ) + + hf_model.embed_out.load_state_dict( + { + "weight": torch.cat( + [t["final_linear.weight"] for t in loaded_tp_ranks], dim=0 + ), + } + ) + + del loaded_tp_ranks + + return hf_model + + +if __name__ == "__main__": + + # before running script: + # `pip install --upgrade transformers` + # `huggingface-cli login` + # + from huggingface_hub import create_repo, HfApi + + parser = argparse.ArgumentParser( + description="Merge MP partitions and convert to HF Model." + ) + parser.add_argument( + "--input_dir", + type=str, + help="Path to NeoX checkpoint, e.g. /path/to/model/global_step143000", + ) + parser.add_argument( + "--config_file", + type=str, + help="Path to config file for the input NeoX checkpoint.", + ) + parser.add_argument( + "--output_dir", + type=str, + help="Output dir, where to save the HF Model, tokenizer, and configs", + ) + parser.add_argument( + "--upload", + action="store_true", + help="Set to true in order to upload to the HF Hub directly.", + ) + args = parser.parse_args() + + with open(args.config_file) as f: + loaded_config = yaml.full_load(f) + + hf_model = convert(args.input_dir, loaded_config, args.output_dir) + + hf_model.save_pretrained(args.output_dir) + + # save tokenizer to directory as well, for easy loading of model as a HF model + tokenizer_type = get_key(loaded_config, "tokenizer-type") + + if tokenizer_type == "HFTokenizer": + print(f"saving tokenizer from file {get_key(loaded_config, 'vocab-file')}") + from transformers import PreTrainedTokenizerFast + + tokenizer = PreTrainedTokenizerFast( + tokenizer_file=get_key(loaded_config, "vocab-file") + ) + print("loaded tokenizer: ", tokenizer) + tokenizer.save_pretrained(args.output_dir) + print("tokenizer saved!") + + if args.upload: + repo_name = input("Provide a repository name for the HF Hub: ") + create_repo(repo_name, repo_type="model", private=False, use_auth_token=True) + + api = HfApi() + api.upload_folder( + folder_path=args.output_dir, + repo_id=repo_name, + repo_type="model", + ) From 3e5283ed43eeab313cf1b10757794c16c0f60021 Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Sun, 24 Sep 2023 19:41:10 +0900 Subject: [PATCH 175/183] debug --- tools/convert_module_to_hf_gptneox2.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tools/convert_module_to_hf_gptneox2.py b/tools/convert_module_to_hf_gptneox2.py index 7ea38a4e3..8c875de20 100644 --- a/tools/convert_module_to_hf_gptneox2.py +++ b/tools/convert_module_to_hf_gptneox2.py @@ -21,7 +21,7 @@ from typing import List import torch -from transformers import GPTNeoXConfig, GPTNeoXForCausalLM +from transformers import GPTNeoXConfig, GPTNeoXForCausalLM, AutoModelForCausalLM from hf_gptneox import GPTNeoX2ForCausalLM @@ -148,7 +148,8 @@ def convert(input_checkpoint_path, loaded_config, output_checkpoint_path): # hf_model = GPTNeoXForCausalLM(hf_config) ## for swiglu - hf_model = GPTNeoX2ForCausalLM(hf_config) + # hf_model = GPTNeoX2ForCausalLM(hf_config) + hf_model = AutoModelForCausalLM(hf_config) # 
save model in fp16/bf16 if Deepspeed fp16 or bf16 mixed precision was used in config, else 32 bit weights fp16 = get_key(loaded_config, "fp16") From 7c00e854bb3bbc66ef513846cf459aeaa399e5a3 Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Sun, 24 Sep 2023 19:42:20 +0900 Subject: [PATCH 176/183] debug --- tools/convert_module_to_hf_gptneox2.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/convert_module_to_hf_gptneox2.py b/tools/convert_module_to_hf_gptneox2.py index 8c875de20..5e90b5985 100644 --- a/tools/convert_module_to_hf_gptneox2.py +++ b/tools/convert_module_to_hf_gptneox2.py @@ -149,7 +149,7 @@ def convert(input_checkpoint_path, loaded_config, output_checkpoint_path): # hf_model = GPTNeoXForCausalLM(hf_config) ## for swiglu # hf_model = GPTNeoX2ForCausalLM(hf_config) - hf_model = AutoModelForCausalLM(hf_config) + hf_model = AutoModelForCausalLM.from_config(hf_config) # save model in fp16/bf16 if Deepspeed fp16 or bf16 mixed precision was used in config, else 32 bit weights fp16 = get_key(loaded_config, "fp16") From 0be6712fbea8ee142f15f971b6652e388e4a511d Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Sun, 24 Sep 2023 19:56:35 +0900 Subject: [PATCH 177/183] debug --- tools/convert_module_to_hf_gptneox2.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/tools/convert_module_to_hf_gptneox2.py b/tools/convert_module_to_hf_gptneox2.py index 5e90b5985..07807672c 100644 --- a/tools/convert_module_to_hf_gptneox2.py +++ b/tools/convert_module_to_hf_gptneox2.py @@ -146,10 +146,9 @@ def convert(input_checkpoint_path, loaded_config, output_checkpoint_path): hf_config = create_config(loaded_config) - # hf_model = GPTNeoXForCausalLM(hf_config) + # hf_model = GPTNeoXForCausalLM(hf_config) ## for swiglu - # hf_model = GPTNeoX2ForCausalLM(hf_config) - hf_model = AutoModelForCausalLM.from_config(hf_config) + hf_model = GPTNeoX2ForCausalLM(hf_config) # save model in fp16/bf16 if Deepspeed fp16 or bf16 mixed precision was used in config, else 32 bit weights fp16 = get_key(loaded_config, "fp16") From 02651236265db44725223b542f903efbe7096008 Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Sun, 24 Sep 2023 20:54:50 +0900 Subject: [PATCH 178/183] fix model --- tools/hf_gptneox.py | 345 +++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 343 insertions(+), 2 deletions(-) diff --git a/tools/hf_gptneox.py b/tools/hf_gptneox.py index b4eb0ae3c..3ee257e10 100644 --- a/tools/hf_gptneox.py +++ b/tools/hf_gptneox.py @@ -1,9 +1,10 @@ from transformers.models.gpt_neox import GPTNeoXPreTrainedModel, GPTNeoXModel, GPTNeoXLayer -from transformers.models.gpt_neox.modeling_gpt_neox import GPTNeoXMLP +from transformers.models.gpt_neox.modeling_gpt_neox import GPTNeoXMLP, GPTNeoXAttention from transformers.activations import ClassInstantier, ACT2CLS from torch import Tensor, nn +import torch -from typing import Callable, Optional +from typing import Callable, Optional, Tuple import torch.nn.functional as F @@ -49,6 +50,297 @@ def __init__(self, config): config.hidden_act = _copy_hidden_act self.act = ACT2FN[config.hidden_act] + +def rotate_half(x): + x1, x2 = x.chunk(2, dim=-1) + return torch.cat((-x2, x1), dim=-1) + +def apply_rotary_pos_emb(q, k, cos, sin, position_ids, cos_k=None, sin_k=None): + """ + q, k: [bs, num_heads, seq_len, rot_dim] + cos, sin: [seq_len, rot_dim / 2] + position_ids: [bs, seq_len] + """ + # print(f"q: {q.shape}, k: {k.shape}, cos: {cos.shape}, sin: {sin.shape}, position_ids: {position_ids.shape}") + import einops + cos = 
einops.repeat(cos, 's r -> s (2 r)') + sin = einops.repeat(sin, 's r -> s (2 r)') + cos_k = einops.repeat(cos_k, 's r -> s (2 r)') + sin_k = einops.repeat(sin_k, 's r -> s (2 r)') + cos = cos[position_ids].unsqueeze(1) # [bs, 1, seq_len, rot_dim] + sin = sin[position_ids].unsqueeze(1) # [bs, 1, seq_len, rot_dim] + cos_k = cos_k[position_ids].unsqueeze(1) # [bs, 1, seq_len, rot_dim] + sin_k = sin_k[position_ids].unsqueeze(1) # [bs, 1, seq_len, rot_dim] + + q_embed = (q * cos) + (rotate_half(q) * sin) + k_embed = (k * cos_k) + (rotate_half(k) * sin_k) + return q_embed, k_embed + +class RotaryEmbedding(torch.nn.Module): + """Based on Tri Dao's XPos: https://github.com/HazyResearch/flash-attention/blob/main/flash_attn/layers/rotary.py""" + def __init__( + self, + dim: int, + max_position_embeddings: int, + base: int = 10_000, + scale_base: int = 512, + device: str = None + ): + super().__init__() + self.dim = dim + self.seq_len_cached = max_position_embeddings + + # Set up `inv_freq` term + inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2, device=device, dtype=torch.float32) / dim)) + self.register_buffer("inv_freq", inv_freq) + + # Set up `scale` term + self.scale_base = scale_base + scale = ( + (torch.arange(0, dim, 2, device=device, dtype=torch.float32) + 0.4 * dim) / (1.4 * dim) + if scale_base is not None else None + ) + self.register_buffer("scale", scale) + + # Seet up `cos..` and `sin...` cache terms + t = torch.arange(self.seq_len_cached, device=device, dtype=torch.float32) + freqs = torch.outer(t, self.inv_freq) + # freqs = torch.cat((freqs, freqs), dim=-1) + seq_range = torch.arange(self.seq_len_cached, dtype=self.scale.dtype, device=self.scale.device) + power = (seq_range - self.seq_len_cached // 2) / self.scale_base + scale_cached = self.scale.to(device=power.device) ** power.unsqueeze(-1) + # scale_cached = torch.cat((scale_cached, scale_cached), dim=-1) + self.register_buffer("cos_cached", torch.cos(freqs) * scale_cached, persistent=False) + self.register_buffer("sin_cached", torch.sin(freqs) * scale_cached, persistent=False) + self.register_buffer("cos_k_cached", torch.cos(freqs) / scale_cached, persistent=False) + self.register_buffer("sin_k_cached", torch.sin(freqs) / scale_cached, persistent=False) + + def forward(self, x, seq_len=None): + if seq_len > self.seq_len_cached: + self.seq_len_cached = seq_len + t = torch.arange(seq_len, device=x.device, dtype=torch.float32) + freqs = torch.outer(t, self.inv_freq) + freqs = torch.cat((freqs, freqs), dim=-1) + seq_range = torch.arange(self.seq_len_cached, dtype=self.scale.dtype, device=self.scale.device) + power = (seq_range - self.seq_len_cached // 2) / self.scale_base + scale_cached = self.scale.to(device=power.device) ** power.unsqueeze(-1) + scale_cached = torch.cat((scale_cached, scale_cached), dim=-1) + self.register_buffer("cos_cached", torch.cos(freqs) * scale_cached, persistent=False) + self.register_buffer("sin_cached", torch.sin(freqs) * scale_cached, persistent=False) + self.register_buffer("cos_k_cached", torch.cos(freqs) / scale_cached, persistent=False) + self.register_buffer("sin_k_cached", torch.sin(freqs) / scale_cached, persistent=False) + return ( + self.cos_cached[:seq_len, ...], + self.sin_cached[:seq_len, ...], + self.cos_k_cached[:seq_len, ...], + self.sin_k_cached[:seq_len, ...], + ) + +class GPTNeoX2Attention(nn.Module): + def __init__(self, config): + super().__init__() + self.num_attention_heads = config.num_attention_heads + self.hidden_size = config.hidden_size + if self.hidden_size % 
self.num_attention_heads != 0: + raise ValueError( + "The hidden size is not divisble by the number of attention heads! Make sure to update them" + ) + self.head_size = self.hidden_size // self.num_attention_heads + + max_positions = config.max_position_embeddings + self.register_buffer( + "bias", + torch.tril(torch.ones((max_positions, max_positions), dtype=torch.bool)).view( + 1, 1, max_positions, max_positions + ), + persistent=False, + ) + self.register_buffer("masked_bias", torch.tensor(-1e9), persistent=False) + + self.rotary_ndims = int(self.head_size * config.rotary_pct) + self.rotary_emb = RotaryEmbedding( + self.rotary_ndims, + max_position_embeddings=config.max_position_embeddings, + base=config.rotary_emb_base, + scale_base=config.rotary_scale_base, + ) + + self.register_buffer( + "norm_factor", + torch.sqrt(torch.tensor(self.head_size, dtype=torch.float32)).to(torch.get_default_dtype()), + persistent=False, + ) + + self.query_key_value = nn.Linear(self.hidden_size, 3 * self.hidden_size, bias=False) + self.dense = nn.Linear(self.hidden_size, self.hidden_size, bias=False) + + def forward( + self, + hidden_states: torch.FloatTensor, + attention_mask: torch.FloatTensor, + position_ids: torch.LongTensor, + head_mask: Optional[torch.FloatTensor] = None, + layer_past: Optional[Tuple[torch.Tensor]] = None, + use_cache: Optional[bool] = False, + output_attentions: Optional[bool] = False, + ): + has_layer_past = layer_past is not None + + # Compute QKV + # Attention heads [batch, seq_len, hidden_size] + # --> [batch, seq_len, (np * 3 * head_size)] + qkv = self.query_key_value(hidden_states) + + # [batch, seq_len, (num_heads * 3 * head_size)] + # --> [batch, seq_len, num_heads, 3 * head_size] + new_qkv_shape = qkv.size()[:-1] + (self.num_attention_heads, 3 * self.head_size) + qkv = qkv.view(*new_qkv_shape) + + # [batch, seq_len, num_attention_heads, 3 * head_size] --> 3 [batch, num_attention_heads, seq_len, head_size] + query = qkv[..., : self.head_size].permute(0, 2, 1, 3) + key = qkv[..., self.head_size : 2 * self.head_size].permute(0, 2, 1, 3) + value = qkv[..., 2 * self.head_size :].permute(0, 2, 1, 3) + + # Compute rotary embeddings on rotary_ndims + query_rot = query[..., : self.rotary_ndims] + query_pass = query[..., self.rotary_ndims :] + key_rot = key[..., : self.rotary_ndims] + key_pass = key[..., self.rotary_ndims :] + + # Compute token offset for rotary embeddings (when decoding) + kv_seq_len = key.shape[-2] + if has_layer_past: + kv_seq_len += layer_past[0].shape[-2] + + # Add rotary embeddings to query and key + # TODO: Check if using xpos + cos, sin, cos_k, sin_k = self.rotary_emb(value, seq_len=kv_seq_len) + query, key = apply_rotary_pos_emb( + query_rot, key_rot, cos, sin, position_ids, cos_k=cos_k, sin_k=sin_k) + + query = torch.cat((query, query_pass), dim=-1) + key = torch.cat((key, key_pass), dim=-1) + + # Cache QKV values + if has_layer_past: + past_key = layer_past[0] + past_value = layer_past[1] + key = torch.cat((past_key, key), dim=-2) + value = torch.cat((past_value, value), dim=-2) + present = (key, value) if use_cache else None + + # Compute attention + attn_output, attn_weights = self._attn(query, key, value, attention_mask, head_mask) + + # Merge attn_head_size dim and num_attn_heads dim into hidden dim + # [bs, seq_len, num_attention_heads, attn_head_size] + attn_output = attn_output.permute(0, 2, 1, 3).contiguous() + attn_output = attn_output.view(attn_output.size(0), attn_output.size(1), self.num_attention_heads * self.head_size) + + attn_output = 
self.dense(attn_output) + + outputs = (attn_output, present) + if output_attentions: + outputs += (attn_weights,) + + return outputs + + def _attn(self, query, key, value, attention_mask=None, head_mask=None): + # q, k, v: [bs, num_attention_heads, seq_len, attn_head_size] + # compute causal mask from causal mask buffer + + batch_size, num_attention_heads, query_length, attn_head_size = query.size() + key_length = key.size(-2) + + causal_mask = self.bias[:, :, key_length - query_length : key_length, :key_length] + + query = query.view(batch_size * num_attention_heads, query_length, attn_head_size) + key = key.view(batch_size * num_attention_heads, key_length, attn_head_size) + attn_scores = torch.zeros( + batch_size * num_attention_heads, + query_length, + key_length, + dtype=query.dtype, + device=key.device, + ) + attn_scores = torch.baddbmm( + attn_scores, + query, + key.transpose(1, 2), + beta=1.0, + alpha=(torch.tensor(1.0, dtype=self.norm_factor.dtype, device=self.norm_factor.device) / self.norm_factor), + ) + attn_scores = attn_scores.view(batch_size, num_attention_heads, query_length, key_length) + + mask_value = torch.finfo(attn_scores.dtype).min + # Need to be a tensor, otherwise we get error: `RuntimeError: expected scalar type float but found double`. + # Need to be on the same device, otherwise `RuntimeError: ..., x and y to be on the same device` + mask_value = torch.tensor(mask_value, dtype=attn_scores.dtype, device=attn_scores.device) + attn_scores = torch.where(causal_mask, attn_scores, mask_value) + + if attention_mask is not None: + # Apply the attention mask + attn_scores = attn_scores + attention_mask + + # NOTE: Upcast to float32 + attn_weights = nn.functional.softmax(attn_scores, dim=-1, dtype=torch.float32).type_as(value) + + # Mask heads if we want to + if head_mask is not None: + attn_weights = attn_weights * head_mask + + attn_output = torch.matmul(attn_weights, value) + return attn_output, attn_weights + + +def attention_mask_func(attention_scores, ltor_mask): + attention_scores.masked_fill_(~ltor_mask, torch.finfo(attention_scores.dtype).min) + return attention_scores + + +class RMSNorm(torch.nn.Module): + def __init__(self, dim, p=-1.0, eps=1e-8, bias=False): + """ + Root Mean Square Layer Normalization + :param dim: model size + :param p: partial RMSNorm, valid value [0, 1], default -1.0 (disabled) + :param eps: epsilon value, default 1e-8 + :param bias: whether use bias term for RMSNorm, disabled by + default because RMSNorm doesn't enforce re-centering invariance. 
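Computes scale * x / (rms(x) + eps), where rms(x) = ||x||_2 / sqrt(d). With 0 <= p <= 1, only the first int(d * p) components are used to estimate the RMS (partial RMSNorm); with bias=True a learned offset is added back after scaling.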
+ """ + super(RMSNorm, self).__init__() + + self.eps = eps + self.d = dim + self.p = p + self.bias = bias + + self.scale = torch.nn.Parameter(torch.ones(dim)) + self.register_parameter("scale", self.scale) + + if self.bias: + self.offset = torch.nn.Parameter(torch.zeros(dim)) + self.register_parameter("offset", self.offset) + + def forward(self, x): + if self.p < 0.0 or self.p > 1.0: + norm_x = x.norm(2, dim=-1, keepdim=True) + d_x = self.d + else: + partial_size = int(self.d * self.p) + partial_x, _ = torch.split(x, [partial_size, self.d - partial_size], dim=-1) + + norm_x = partial_x.norm(2, dim=-1, keepdim=True) + d_x = partial_size + + rms_x = norm_x * d_x ** (-1.0 / 2) + x_normed = x / (rms_x + self.eps) + + if self.bias: + return self.scale * x_normed + self.offset + + return self.scale * x_normed + class GPTNeoX2Layer(GPTNeoXLayer): def __init__(self, config): _copy_hidden_act = config.hidden_act @@ -56,8 +348,57 @@ def __init__(self, config): super().__init__(config) config.hidden_act = _copy_hidden_act + self.use_parallel_residual = config.use_parallel_residual + # self.input_layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.input_layernorm = RMSNorm(config.hidden_size, eps=config.layer_norm_eps) + + # self.post_attention_layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + # self.attention = GPTNeoXAttention(config) + self.attention = GPTNeoX2Attention(config) self.mlp = GPTNeoX2MLP(config) + def forward( + self, + hidden_states: Optional[torch.FloatTensor], + attention_mask: Optional[torch.FloatTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + use_cache: Optional[bool] = False, + layer_past: Optional[Tuple[torch.Tensor]] = None, + output_attentions: Optional[bool] = False, + ): + attention_layer_outputs = self.attention( + self.input_layernorm(hidden_states), + attention_mask=attention_mask, + position_ids=position_ids, + layer_past=layer_past, + head_mask=head_mask, + use_cache=use_cache, + output_attentions=output_attentions, + ) + attn_output = attention_layer_outputs[0] # output_attn: attn_output, present, (attn_weights) + outputs = attention_layer_outputs[1:] + + if self.use_parallel_residual: + # pseudocode: + # x = x + attn(ln1(x)) + mlp(ln2(x)) + mlp_output = self.mlp(self.post_attention_layernorm(hidden_states)) + hidden_states = mlp_output + attn_output + hidden_states + else: + # pseudocode: + # x = x + attn(ln1(x)) + # x = x + mlp(ln2(x)) + attn_output = attn_output + hidden_states + mlp_output = self.mlp(self.post_attention_layernorm(attn_output)) + hidden_states = mlp_output + attn_output + + if use_cache: + outputs = (hidden_states,) + outputs # hidden_states, present, (attn_weights) + else: + outputs = (hidden_states,) + outputs[1:] # hidden_states, (attn_weights) + + return outputs + class GPTNeoX2Model(GPTNeoXModel): def __init__(self, config): _copy_hidden_act = config.hidden_act From 0855a52d54e7aa1142ca0e3004eed43aaa833e1e Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Sun, 24 Sep 2023 21:00:41 +0900 Subject: [PATCH 179/183] fix model --- tools/hf_gptneox.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/hf_gptneox.py b/tools/hf_gptneox.py index 3ee257e10..f82213082 100644 --- a/tools/hf_gptneox.py +++ b/tools/hf_gptneox.py @@ -158,6 +158,7 @@ def __init__(self, config): self.register_buffer("masked_bias", torch.tensor(-1e9), persistent=False) self.rotary_ndims = int(self.head_size * config.rotary_pct) + print('config', config) 
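# Note: the RotaryEmbedding used below is the XPos variant defined above: queries are scaled by
# scale**power and keys by scale**(-power), so the factors cancel into a function of the relative
# offset only. Only the first rotary_ndims = head_size * rotary_pct dimensions are rotated; the
# remaining dimensions (query_pass / key_pass) are passed through unchanged.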
self.rotary_emb = RotaryEmbedding( self.rotary_ndims, max_position_embeddings=config.max_position_embeddings, From e4ce875c8dac0e2c81b62517fbd70bd6df076a2a Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Sun, 24 Sep 2023 21:01:43 +0900 Subject: [PATCH 180/183] fix model --- tools/hf_gptneox.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tools/hf_gptneox.py b/tools/hf_gptneox.py index f82213082..41d507a5b 100644 --- a/tools/hf_gptneox.py +++ b/tools/hf_gptneox.py @@ -162,8 +162,7 @@ def __init__(self, config): self.rotary_emb = RotaryEmbedding( self.rotary_ndims, max_position_embeddings=config.max_position_embeddings, - base=config.rotary_emb_base, - scale_base=config.rotary_scale_base, + base=config.rotary_emb_base ) self.register_buffer( From d063655edca84cf9e1da161719924e14bbd84db8 Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Sun, 24 Sep 2023 21:02:13 +0900 Subject: [PATCH 181/183] fix model --- tools/hf_gptneox.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/hf_gptneox.py b/tools/hf_gptneox.py index 41d507a5b..5d6767207 100644 --- a/tools/hf_gptneox.py +++ b/tools/hf_gptneox.py @@ -158,7 +158,7 @@ def __init__(self, config): self.register_buffer("masked_bias", torch.tensor(-1e9), persistent=False) self.rotary_ndims = int(self.head_size * config.rotary_pct) - print('config', config) + self.rotary_emb = RotaryEmbedding( self.rotary_ndims, max_position_embeddings=config.max_position_embeddings, From 45ad8383400f7cfacf48dbb821fbca91b092f763 Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Sun, 24 Sep 2023 21:26:28 +0900 Subject: [PATCH 182/183] fix model --- tools/convert_module_to_hf_gptneox2.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/convert_module_to_hf_gptneox2.py b/tools/convert_module_to_hf_gptneox2.py index 07807672c..ce77a48b4 100644 --- a/tools/convert_module_to_hf_gptneox2.py +++ b/tools/convert_module_to_hf_gptneox2.py @@ -236,7 +236,7 @@ def convert(input_checkpoint_path, loaded_config, output_checkpoint_path): # LinearWithTPSplitBias for key in [ "mlp.dense_4h_to_h.bias", - "attention.dense.bias", + # "attention.dense.bias", ]: state_dict[key] = sum([t[key] for t in loaded_tp_ranks]) From d6cf7deb41bee042b5c8cfd0618e24a10b43a1dc Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Sun, 24 Sep 2023 21:27:46 +0900 Subject: [PATCH 183/183] fix model --- tools/convert_module_to_hf_gptneox2.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/convert_module_to_hf_gptneox2.py b/tools/convert_module_to_hf_gptneox2.py index ce77a48b4..8b099f04a 100644 --- a/tools/convert_module_to_hf_gptneox2.py +++ b/tools/convert_module_to_hf_gptneox2.py @@ -247,7 +247,7 @@ def convert(input_checkpoint_path, loaded_config, output_checkpoint_path): ] - state_dict["attention.dense.bias"] = hf_layer.state_dict()["attention.dense.bias"] + # state_dict["attention.dense.bias"] = hf_layer.state_dict()["attention.dense.bias"] if "attention.bias" in hf_layer.state_dict(): state_dict["attention.bias"] = hf_layer.state_dict()["attention.bias"]
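# A toy sketch (not part of the patch series; shapes invented for illustration) of the
# tensor-parallel merge rules that convert() applies above, shown for two ranks:
#  - "LinearWithTPMerge" keys (mlp.dense_h_to_4h.*, attention.query_key_value.*) are
#    concatenated along dim 0,
#  - attention.dense.weight and mlp.dense_4h_to_h.weight are concatenated along dim 1,
#    and mlp.dense_4h_to_h.bias is summed across ranks,
#  - norm parameters (e.g. input_layernorm.scale for RMSNorm) are averaged over ranks.
import torch

tp_ranks = [
    {
        "mlp.dense_h_to_4h.weight": torch.randn(8, 4),   # output dim sharded across ranks
        "mlp.dense_4h_to_h.weight": torch.randn(4, 8),   # input dim sharded across ranks
        "input_layernorm.scale": torch.ones(4),
    }
    for _ in range(2)
]

merged = {
    "mlp.dense_h_to_4h.weight": torch.cat([t["mlp.dense_h_to_4h.weight"] for t in tp_ranks], dim=0),
    "mlp.dense_4h_to_h.weight": torch.cat([t["mlp.dense_4h_to_h.weight"] for t in tp_ranks], dim=1),
    "input_layernorm.scale": sum(t["input_layernorm.scale"] for t in tp_ranks) / len(tp_ranks),
}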