From 592acbead89c7778c80a70c4a7c6485d598a26fc Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Sun, 13 Aug 2023 20:58:59 +0900 Subject: [PATCH 001/183] add for ja --- configs/local_setup_ja.yml | 27 +++++++++++++++++++++++++++ prepare_data_ja.sh | 5 +++++ preprocess_ja.sh | 8 ++++++++ tools/corpora.py | 9 +++++++++ 4 files changed, 49 insertions(+) create mode 100644 configs/local_setup_ja.yml create mode 100644 prepare_data_ja.sh create mode 100644 preprocess_ja.sh diff --git a/configs/local_setup_ja.yml b/configs/local_setup_ja.yml new file mode 100644 index 000000000..04af5b6fa --- /dev/null +++ b/configs/local_setup_ja.yml @@ -0,0 +1,27 @@ +# Suggested data paths when using GPT-NeoX locally +{ + "data_path": "data/wiki_ja_en", + + # or for weighted datasets: + # "train-data-paths": ["data/enwik8/enwik8_text_document", "data/enwik8/enwik8_text_document"], + # "test-data-paths": ["data/enwik8/enwik8_text_document", "data/enwik8/enwik8_text_document"], + # "valid-data-paths": ["data/enwik8/enwik8_text_document", "data/enwik8/enwik8_text_document"], + # "train-data-weights": [1., 2.], + # "test-data-weights": [2., 1.], + # "valid-data-weights": [0.5, 0.4], + + # If weight_by_num_documents is True, Builds dataset weights from a multinomial distribution over groups of data according to the number of documents in each group. + # WARNING: setting this to True will override any user provided weights + # "weight_by_num_documents": false, + # "weighted_sampler_alpha": 0.3, + + "vocab_file": "./novelAI/tokenizer.model", + + "save": "checkpoints", + "load": "checkpoints", + "checkpoint_validation_with_forward_pass": False, + + "tensorboard_dir": "tensorboard", + "log_dir": "logs", + "use_wandb": False +} diff --git a/prepare_data_ja.sh b/prepare_data_ja.sh new file mode 100644 index 000000000..cb15ffb82 --- /dev/null +++ b/prepare_data_ja.sh @@ -0,0 +1,5 @@ +#!/bin/sh +python prepare_data.py -d ./data \ +-t SPMTokenizer \ +--vocab-file ./novelAI/tokenizer.model \ +wiki_ja_en diff --git a/preprocess_ja.sh b/preprocess_ja.sh new file mode 100644 index 000000000..7de047c6d --- /dev/null +++ b/preprocess_ja.sh @@ -0,0 +1,8 @@ +#!/bin/sh +python tools/preprocess_data.py \ + --input ./data/mydataset.jsonl.zst \ + --output-prefix ./data/wiki_ja_en \ + --vocab-file ./novelAI/tokenizer.model \ + --dataset-impl mmap \ + --tokenizer-type SPMTokenizer \ + --append-eod diff --git a/tools/corpora.py b/tools/corpora.py index b9e846454..fb35477a3 100644 --- a/tools/corpora.py +++ b/tools/corpora.py @@ -293,6 +293,14 @@ class Enwik8(DataDownloader): urls = ["https://data.deepai.org/enwik8.zip"] +class WikiJaEn(DataDownloader): + name = "wiki_ja_en" + urls = [ + "jawikibooks-20230807-cirrussearch-content.json.gz", + "enwiki-20230807-cirrussearch-content.json.gz" + ] + + def maybe_download_gpt2_tokenizer_data(tokenizer_type, data_dir): if tokenizer_type is None or tokenizer_type == "GPT2BPETokenizer": GPT2_VOCAB_FP = f"{data_dir}//gpt2-vocab.json" @@ -324,6 +332,7 @@ def maybe_download_gpt2_tokenizer_data(tokenizer_type, data_dir): "c4": C4, "c4_openwebtext": C4OpenWebText, "enwik8": Enwik8, + 'wiki_ja_en': WikiJaEn } From cd99f474c8aadd7e904bf62a69414702b7842ccf Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Sun, 13 Aug 2023 21:06:13 +0900 Subject: [PATCH 002/183] fix link --- tools/corpora.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/corpora.py b/tools/corpora.py index fb35477a3..48a9930af 100644 --- a/tools/corpora.py +++ b/tools/corpora.py @@ -296,8 +296,8 @@ class 
Enwik8(DataDownloader): class WikiJaEn(DataDownloader): name = "wiki_ja_en" urls = [ - "jawikibooks-20230807-cirrussearch-content.json.gz", - "enwiki-20230807-cirrussearch-content.json.gz" + "https://dumps.wikimedia.org/other/cirrussearch/20230807/jawiki-20230807-cirrussearch-content.json.gz", + "https://dumps.wikimedia.org/other/cirrussearch/20230807/enwiki-20230807-cirrussearch-content.json.gz" ] From fc24f602855002d7a88b3771acdef39e9a713698 Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Mon, 14 Aug 2023 19:45:59 +0900 Subject: [PATCH 003/183] fix --- tools/corpora.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/tools/corpora.py b/tools/corpora.py index 48a9930af..7bcde2484 100644 --- a/tools/corpora.py +++ b/tools/corpora.py @@ -301,6 +301,13 @@ class WikiJaEn(DataDownloader): ] +class WikiJa(DataDownloader): + name = "wiki_ja" + urls = [ + "https://dumps.wikimedia.org/other/cirrussearch/20230807/jawiki-20230807-cirrussearch-content.json.gz", + ] + + def maybe_download_gpt2_tokenizer_data(tokenizer_type, data_dir): if tokenizer_type is None or tokenizer_type == "GPT2BPETokenizer": GPT2_VOCAB_FP = f"{data_dir}//gpt2-vocab.json" @@ -332,7 +339,8 @@ def maybe_download_gpt2_tokenizer_data(tokenizer_type, data_dir): "c4": C4, "c4_openwebtext": C4OpenWebText, "enwik8": Enwik8, - 'wiki_ja_en': WikiJaEn + 'wiki_ja_en': WikiJaEn, + 'wiki_ja': WikiJa } From ed91147f77f959bdff83a5e567d6bf8a22f8ff61 Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Mon, 14 Aug 2023 20:38:19 +0900 Subject: [PATCH 004/183] debug --- tools/preprocess_data.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tools/preprocess_data.py b/tools/preprocess_data.py index 862620eb8..7ff994527 100644 --- a/tools/preprocess_data.py +++ b/tools/preprocess_data.py @@ -187,6 +187,9 @@ def main(): encoded_docs = pool.imap(encoder.encode, fin, chunksize=25) else: encoder.initializer() + for doc in fin: + a = encoder.encode(doc) + print('a,', a) encoded_docs = (encoder.encode(doc) for doc in fin) # make a dataset builder for each key in args.jsonl_keys From a3ae99807ba395fe7bd8266a65e01137f19dff46 Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Mon, 14 Aug 2023 20:54:37 +0900 Subject: [PATCH 005/183] debug --- tools/preprocess_data.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/preprocess_data.py b/tools/preprocess_data.py index 7ff994527..984646587 100644 --- a/tools/preprocess_data.py +++ b/tools/preprocess_data.py @@ -49,6 +49,7 @@ def initializer(self): def encode(self, text): if self.args.ftfy: text = ftfy.fix_text(text) + print('text', text) ids = {} for key in self.args.jsonl_keys: doc_ids = [] From 81301573f83ce36b8271fb323eb86facdfb4c7b2 Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Mon, 14 Aug 2023 21:07:25 +0900 Subject: [PATCH 006/183] fix --- tools/preprocess_data.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/tools/preprocess_data.py b/tools/preprocess_data.py index 984646587..280265365 100644 --- a/tools/preprocess_data.py +++ b/tools/preprocess_data.py @@ -49,7 +49,7 @@ def initializer(self): def encode(self, text): if self.args.ftfy: text = ftfy.fix_text(text) - print('text', text) + ids = {} for key in self.args.jsonl_keys: doc_ids = [] @@ -188,10 +188,13 @@ def main(): encoded_docs = pool.imap(encoder.encode, fin, chunksize=25) else: encoder.initializer() + new_fin = [] for doc in fin: - a = encoder.encode(doc) - print('a,', a) - encoded_docs = (encoder.encode(doc) for doc in fin) + if 'text' in doc: + 
new_fin.append(doc['text']) + encoded_docs = (encoder.encode(doc) for doc in new_fin) + + # encoded_docs = (encoder.encode(doc) for doc in fin) # make a dataset builder for each key in args.jsonl_keys # each key will output to a different file beginning with args.output_prefix From e5be7b0001ebfc8b04cc266ac8ccc3ada98eb967 Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Mon, 14 Aug 2023 21:08:26 +0900 Subject: [PATCH 007/183] debug --- tools/preprocess_data.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/tools/preprocess_data.py b/tools/preprocess_data.py index 280265365..601ed8fc2 100644 --- a/tools/preprocess_data.py +++ b/tools/preprocess_data.py @@ -49,7 +49,7 @@ def initializer(self): def encode(self, text): if self.args.ftfy: text = ftfy.fix_text(text) - + print('text,', text) ids = {} for key in self.args.jsonl_keys: doc_ids = [] @@ -192,8 +192,7 @@ def main(): for doc in fin: if 'text' in doc: new_fin.append(doc['text']) - encoded_docs = (encoder.encode(doc) for doc in new_fin) - + encoded_docs = (encoder.encode(doc) for doc in new_fin) # encoded_docs = (encoder.encode(doc) for doc in fin) # make a dataset builder for each key in args.jsonl_keys From 910d8944b9b06b4cb754840e11540ad0dac6f5e6 Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Mon, 14 Aug 2023 21:09:31 +0900 Subject: [PATCH 008/183] debug --- tools/preprocess_data.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/preprocess_data.py b/tools/preprocess_data.py index 601ed8fc2..05c986eef 100644 --- a/tools/preprocess_data.py +++ b/tools/preprocess_data.py @@ -183,6 +183,7 @@ def main(): # use multiprocessing to iterate over input documents fin = yield_from_files(args.input.split(","), semaphore) + print('args.workers', args.workers) if args.workers > 1: pool = multiprocessing.Pool(args.workers, initializer=encoder.initializer) encoded_docs = pool.imap(encoder.encode, fin, chunksize=25) From bd323688d1009d731b56356a485ac242e2243856 Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Mon, 14 Aug 2023 21:10:13 +0900 Subject: [PATCH 009/183] fix --- tools/preprocess_data.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/tools/preprocess_data.py b/tools/preprocess_data.py index 05c986eef..30a446158 100644 --- a/tools/preprocess_data.py +++ b/tools/preprocess_data.py @@ -182,16 +182,20 @@ def main(): # use multiprocessing to iterate over input documents fin = yield_from_files(args.input.split(","), semaphore) - + new_fin = [] + for doc in fin: + if 'text' in doc: + new_fin.append(doc['text']) print('args.workers', args.workers) if args.workers > 1: pool = multiprocessing.Pool(args.workers, initializer=encoder.initializer) - encoded_docs = pool.imap(encoder.encode, fin, chunksize=25) + # encoded_docs = pool.imap(encoder.encode, fin, chunksize=25) + encoded_docs = pool.imap(encoder.encode, new_fin, chunksize=25) else: encoder.initializer() new_fin = [] for doc in fin: - if 'text' in doc: + if 'text' in doc: new_fin.append(doc['text']) encoded_docs = (encoder.encode(doc) for doc in new_fin) # encoded_docs = (encoder.encode(doc) for doc in fin) From 50b273cd27ee538fc60e42539c9c443ac3b5f5d5 Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Tue, 15 Aug 2023 19:33:18 +0900 Subject: [PATCH 010/183] debug --- tools/preprocess_data.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/preprocess_data.py b/tools/preprocess_data.py index 30a446158..9e70de5ce 100644 --- a/tools/preprocess_data.py +++ b/tools/preprocess_data.py @@ -160,6 +160,7 @@ def 
yield_from_files(fnames: list, semaphore): def yielder(fname, semaphore): for f in filter(lambda x: x, lmd.Reader(fname).stream_data()): + print('f', f) semaphore.acquire() yield f From e4c653c42b5186db554a05301a9d0f3c3da920de Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Tue, 15 Aug 2023 20:12:11 +0900 Subject: [PATCH 011/183] add filter --- tools/preprocess_data.py | 20 ++++++-------------- 1 file changed, 6 insertions(+), 14 deletions(-) diff --git a/tools/preprocess_data.py b/tools/preprocess_data.py index 9e70de5ce..0b66432a1 100644 --- a/tools/preprocess_data.py +++ b/tools/preprocess_data.py @@ -159,7 +159,8 @@ def yield_from_files(fnames: list, semaphore): """ def yielder(fname, semaphore): - for f in filter(lambda x: x, lmd.Reader(fname).stream_data()): + stream = filter(lambda x: x, lmd.Reader(fname).stream_data()) + for f in filter(lambda x: x, 'text' in stream): print('f', f) semaphore.acquire() yield f @@ -182,23 +183,14 @@ def main(): semaphore = Semaphore(10000 + args.workers) # use multiprocessing to iterate over input documents - fin = yield_from_files(args.input.split(","), semaphore) - new_fin = [] - for doc in fin: - if 'text' in doc: - new_fin.append(doc['text']) - print('args.workers', args.workers) + fin = yield_from_files(args.input.split(","), semaphore) if args.workers > 1: pool = multiprocessing.Pool(args.workers, initializer=encoder.initializer) # encoded_docs = pool.imap(encoder.encode, fin, chunksize=25) - encoded_docs = pool.imap(encoder.encode, new_fin, chunksize=25) + encoded_docs = pool.imap(encoder.encode, fin, chunksize=25) else: - encoder.initializer() - new_fin = [] - for doc in fin: - if 'text' in doc: - new_fin.append(doc['text']) - encoded_docs = (encoder.encode(doc) for doc in new_fin) + encoder.initializer() + encoded_docs = (encoder.encode(doc) for doc in fin) # encoded_docs = (encoder.encode(doc) for doc in fin) # make a dataset builder for each key in args.jsonl_keys From 45c6951e055aacb9f4b8f1a7a9e67d4172abee4b Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Tue, 15 Aug 2023 20:14:20 +0900 Subject: [PATCH 012/183] add filter --- tools/preprocess_data.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/preprocess_data.py b/tools/preprocess_data.py index 0b66432a1..a90288e48 100644 --- a/tools/preprocess_data.py +++ b/tools/preprocess_data.py @@ -160,7 +160,7 @@ def yield_from_files(fnames: list, semaphore): def yielder(fname, semaphore): stream = filter(lambda x: x, lmd.Reader(fname).stream_data()) - for f in filter(lambda x: x, 'text' in stream): + for f in filter(lambda x: 'text' in x, stream): print('f', f) semaphore.acquire() yield f From 37bbb2aaab786e46ca0b57ff72cae3b27d045506 Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Tue, 15 Aug 2023 20:17:33 +0900 Subject: [PATCH 013/183] debug --- tools/preprocess_data.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/preprocess_data.py b/tools/preprocess_data.py index a90288e48..b57b1def2 100644 --- a/tools/preprocess_data.py +++ b/tools/preprocess_data.py @@ -161,7 +161,7 @@ def yield_from_files(fnames: list, semaphore): def yielder(fname, semaphore): stream = filter(lambda x: x, lmd.Reader(fname).stream_data()) for f in filter(lambda x: 'text' in x, stream): - print('f', f) + # print('f', f) semaphore.acquire() yield f From 4bc3a9d2601884a17f94a0671614d6096ded9326 Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Tue, 15 Aug 2023 20:18:46 +0900 Subject: [PATCH 014/183] fix --- tools/preprocess_data.py | 5 ++--- 1 file changed, 2 
insertions(+), 3 deletions(-) diff --git a/tools/preprocess_data.py b/tools/preprocess_data.py index b57b1def2..24f6546a0 100644 --- a/tools/preprocess_data.py +++ b/tools/preprocess_data.py @@ -160,10 +160,9 @@ def yield_from_files(fnames: list, semaphore): def yielder(fname, semaphore): stream = filter(lambda x: x, lmd.Reader(fname).stream_data()) - for f in filter(lambda x: 'text' in x, stream): - # print('f', f) + for f in filter(lambda x: 'text' in x, stream): semaphore.acquire() - yield f + yield f['text'] for fname in fnames: semaphore.acquire() From 152acc47aa0e24d006cf5c491ede769f7281a38a Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Tue, 15 Aug 2023 20:23:22 +0900 Subject: [PATCH 015/183] for wiki --- tools/preprocess_data.py | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/tools/preprocess_data.py b/tools/preprocess_data.py index 24f6546a0..79787abfe 100644 --- a/tools/preprocess_data.py +++ b/tools/preprocess_data.py @@ -158,16 +158,26 @@ def yield_from_files(fnames: list, semaphore): :param fnames: list of filenames """ - def yielder(fname, semaphore): + def yielder(fname, semaphore): + for f in filter(lambda x: x, lmd.Reader(fname).stream_data()): + semaphore.acquire() + yield f + + def wiki_yielder(fname, semaphore): stream = filter(lambda x: x, lmd.Reader(fname).stream_data()) - for f in filter(lambda x: 'text' in x, stream): + for f in filter(lambda x: 'text' in x, stream): semaphore.acquire() yield f['text'] for fname in fnames: semaphore.acquire() - - yield from yielder(fname, semaphore) + print('fname', fname) + if 'wiki' in fname: + yield from wiki_yielder(fname, semaphore) + else: + yield from yielder(fname, semaphore) + + def main(): @@ -182,7 +192,7 @@ def main(): semaphore = Semaphore(10000 + args.workers) # use multiprocessing to iterate over input documents - fin = yield_from_files(args.input.split(","), semaphore) + fin = yield_from_files(args.input.split(","), semaphore) if args.workers > 1: pool = multiprocessing.Pool(args.workers, initializer=encoder.initializer) # encoded_docs = pool.imap(encoder.encode, fin, chunksize=25) From b3871c007a0bbeb211c7988255e5aa61f057ae94 Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Tue, 15 Aug 2023 20:23:52 +0900 Subject: [PATCH 016/183] rm debug --- tools/preprocess_data.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/preprocess_data.py b/tools/preprocess_data.py index 79787abfe..44eb6e952 100644 --- a/tools/preprocess_data.py +++ b/tools/preprocess_data.py @@ -49,7 +49,7 @@ def initializer(self): def encode(self, text): if self.args.ftfy: text = ftfy.fix_text(text) - print('text,', text) + # print('text,', text) ids = {} for key in self.args.jsonl_keys: doc_ids = [] From f3a0a6619e7c482ab06151000bbec896ab31f108 Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Tue, 15 Aug 2023 20:25:37 +0900 Subject: [PATCH 017/183] debug --- tools/preprocess_data.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/preprocess_data.py b/tools/preprocess_data.py index 44eb6e952..31b3196f1 100644 --- a/tools/preprocess_data.py +++ b/tools/preprocess_data.py @@ -57,6 +57,7 @@ def encode(self, text): if len(text_ids) > 0: doc_ids.append(text_ids) if self.args.append_eod: + print('doc_ids', doc_ids) doc_ids[-1].append(Encoder.tokenizer.eod) ids[key] = doc_ids return ids, len(text) From 5750065e0298883e78efc79ea02d1de4725ab24e Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Tue, 15 Aug 2023 20:29:17 +0900 Subject: [PATCH 018/183] debug --- 
tools/preprocess_data.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/tools/preprocess_data.py b/tools/preprocess_data.py index 31b3196f1..ebd1f851b 100644 --- a/tools/preprocess_data.py +++ b/tools/preprocess_data.py @@ -57,8 +57,11 @@ def encode(self, text): if len(text_ids) > 0: doc_ids.append(text_ids) if self.args.append_eod: - print('doc_ids', doc_ids) - doc_ids[-1].append(Encoder.tokenizer.eod) + try: + doc_ids[-1].append(Encoder.tokenizer.eod) + except Exception as e: + print('text', text) + print('doc_ids', doc_ids) ids[key] = doc_ids return ids, len(text) From 4cf6be8f7affea91c1d21bd75b7be259c0ad186c Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Tue, 15 Aug 2023 21:29:31 +0900 Subject: [PATCH 019/183] fix filter --- tools/preprocess_data.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/preprocess_data.py b/tools/preprocess_data.py index ebd1f851b..a923afcc3 100644 --- a/tools/preprocess_data.py +++ b/tools/preprocess_data.py @@ -169,7 +169,7 @@ def yielder(fname, semaphore): def wiki_yielder(fname, semaphore): stream = filter(lambda x: x, lmd.Reader(fname).stream_data()) - for f in filter(lambda x: 'text' in x, stream): + for f in filter(lambda x: 'text' in x and len(x['text']) != 0, stream): semaphore.acquire() yield f['text'] From c06e201da09f24a296ce1d7dcdfc4e824d18faef Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Tue, 15 Aug 2023 22:42:47 +0900 Subject: [PATCH 020/183] fix --- configs/local_setup_ja.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configs/local_setup_ja.yml b/configs/local_setup_ja.yml index 04af5b6fa..bc7035c4a 100644 --- a/configs/local_setup_ja.yml +++ b/configs/local_setup_ja.yml @@ -1,6 +1,6 @@ # Suggested data paths when using GPT-NeoX locally { - "data_path": "data/wiki_ja_en", + "data_path": "data/wiki_ja", # or for weighted datasets: # "train-data-paths": ["data/enwik8/enwik8_text_document", "data/enwik8/enwik8_text_document"], From b3723144894afba6caf95385c47367bd53b04d23 Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Tue, 15 Aug 2023 23:04:17 +0900 Subject: [PATCH 021/183] fix --- configs/1-3B.yml | 3 +++ configs/local_setup_ja.yml | 1 + 2 files changed, 4 insertions(+) diff --git a/configs/1-3B.yml b/configs/1-3B.yml index 3e80ae7fc..0a093f271 100644 --- a/configs/1-3B.yml +++ b/configs/1-3B.yml @@ -88,4 +88,7 @@ "steps_per_print": 10, "keep_last_n_checkpoints": 4, "wall_clock_breakdown": true, + + ## tokenizer type + "tokenizer_type": "HFTokenizer", } diff --git a/configs/local_setup_ja.yml b/configs/local_setup_ja.yml index bc7035c4a..e22de21f5 100644 --- a/configs/local_setup_ja.yml +++ b/configs/local_setup_ja.yml @@ -15,6 +15,7 @@ # "weight_by_num_documents": false, # "weighted_sampler_alpha": 0.3, + "tokenizer_type": "SPMTokenizer", "vocab_file": "./novelAI/tokenizer.model", "save": "checkpoints", From 74818bbd6050e16dec4515a7191aab5e254e28ae Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Tue, 15 Aug 2023 23:05:26 +0900 Subject: [PATCH 022/183] fix --- configs/1-3B.yml | 2 +- configs/local_setup_ja.yml | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/configs/1-3B.yml b/configs/1-3B.yml index 0a093f271..f5523c6ba 100644 --- a/configs/1-3B.yml +++ b/configs/1-3B.yml @@ -90,5 +90,5 @@ "wall_clock_breakdown": true, ## tokenizer type - "tokenizer_type": "HFTokenizer", + "tokenizer_type": "SPMTokenizer", } diff --git a/configs/local_setup_ja.yml b/configs/local_setup_ja.yml index e22de21f5..8ad4e2d30 100644 --- 
a/configs/local_setup_ja.yml +++ b/configs/local_setup_ja.yml @@ -14,8 +14,7 @@ # WARNING: setting this to True will override any user provided weights # "weight_by_num_documents": false, # "weighted_sampler_alpha": 0.3, - - "tokenizer_type": "SPMTokenizer", + "vocab_file": "./novelAI/tokenizer.model", "save": "checkpoints", From 62fde152260271e8ecc7394ddbfe1d6e44f56f39 Mon Sep 17 00:00:00 2001 From: if001 Date: Wed, 23 Aug 2023 17:24:28 +0900 Subject: [PATCH 023/183] fix data_path --- configs/local_setup_ja.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/configs/local_setup_ja.yml b/configs/local_setup_ja.yml index 8ad4e2d30..9644c6515 100644 --- a/configs/local_setup_ja.yml +++ b/configs/local_setup_ja.yml @@ -1,6 +1,7 @@ # Suggested data paths when using GPT-NeoX locally { - "data_path": "data/wiki_ja", + # "data_path": "data/wiki_ja", + "data_path": "data/wiki_ja/wiki_ja_text_document", # or for weighted datasets: # "train-data-paths": ["data/enwik8/enwik8_text_document", "data/enwik8/enwik8_text_document"], From 75c0ba3fa87002595615ede7a9c0b31f4a135861 Mon Sep 17 00:00:00 2001 From: if001 Date: Wed, 23 Aug 2023 18:26:31 +0900 Subject: [PATCH 024/183] Update local_setup_ja.yml --- configs/local_setup_ja.yml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/configs/local_setup_ja.yml b/configs/local_setup_ja.yml index 9644c6515..5ffa64d02 100644 --- a/configs/local_setup_ja.yml +++ b/configs/local_setup_ja.yml @@ -21,6 +21,11 @@ "save": "checkpoints", "load": "checkpoints", "checkpoint_validation_with_forward_pass": False, + "log_dir": "logs", + "save_interval": 10000, + "eval_interval": 1000, + "eval_iters": 10, + "keep_last_n_checkpoints": 4, "tensorboard_dir": "tensorboard", "log_dir": "logs", From ab9f79f8c8b47382c7b3d577475b578986540520 Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Fri, 1 Sep 2023 14:04:50 +0900 Subject: [PATCH 025/183] fix config --- configs/49M.yml | 1 + configs/local_setup_ja.yml | 7 +++---- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/configs/49M.yml b/configs/49M.yml index 9852320b0..f9822de9b 100644 --- a/configs/49M.yml +++ b/configs/49M.yml @@ -87,5 +87,6 @@ # logging "log_interval": 10, "steps_per_print": 10, + "keep_last_n_checkpoints": 4, "wall_clock_breakdown": true, } diff --git a/configs/local_setup_ja.yml b/configs/local_setup_ja.yml index 5ffa64d02..286f33d76 100644 --- a/configs/local_setup_ja.yml +++ b/configs/local_setup_ja.yml @@ -21,11 +21,10 @@ "save": "checkpoints", "load": "checkpoints", "checkpoint_validation_with_forward_pass": False, + + ## logging "log_dir": "logs", - "save_interval": 10000, - "eval_interval": 1000, - "eval_iters": 10, - "keep_last_n_checkpoints": 4, + "save_interval": 10000, "tensorboard_dir": "tensorboard", "log_dir": "logs", From 0f48e56c164efb02ffc9aabd1fa09fa106cbe44d Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Fri, 1 Sep 2023 14:04:57 +0900 Subject: [PATCH 026/183] add dataset --- tools/corpora.py | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/tools/corpora.py b/tools/corpora.py index 7bcde2484..f36285d07 100644 --- a/tools/corpora.py +++ b/tools/corpora.py @@ -307,6 +307,27 @@ class WikiJa(DataDownloader): "https://dumps.wikimedia.org/other/cirrussearch/20230807/jawiki-20230807-cirrussearch-content.json.gz", ] +class DataDownloaderWithHF(DataDownloader): + def __init__(self, hf_repo_ids = [], *args, **kwargs): + super().__init__(*args, **kwargs) + self.hf_repo_ids = hf_repo_ids + + def download(self): + 
super().download() + from huggingface_hub import snapshot_download + save_dir = os.path.join(self.base_dir, self.name) + for repo_id in self.hf_repo_ids: + snapshot_download(repo_id=repo_id, revision="main", allow_patterns="*.jsonl", local_dir=save_dir) + +class WikiOSCARJa(DataDownloader): + name = "wiki_oscar_ja" + urls = [ + "https://dumps.wikimedia.org/other/cirrussearch/20230807/jawiki-20230807-cirrussearch-content.json.gz", + ] + hf_repo_ids = [ + 'if001/oscar_2023_filtered' + ] + def maybe_download_gpt2_tokenizer_data(tokenizer_type, data_dir): if tokenizer_type is None or tokenizer_type == "GPT2BPETokenizer": @@ -340,7 +361,8 @@ def maybe_download_gpt2_tokenizer_data(tokenizer_type, data_dir): "c4_openwebtext": C4OpenWebText, "enwik8": Enwik8, 'wiki_ja_en': WikiJaEn, - 'wiki_ja': WikiJa + 'wiki_ja': WikiJa, + 'wiki_oscar_ja': WikiOSCARJa } From c4a8876128f766ae9bc5d433fe051d24d4329498 Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Fri, 1 Sep 2023 14:17:50 +0900 Subject: [PATCH 027/183] oscar --- tools/corpora.py | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/tools/corpora.py b/tools/corpora.py index f36285d07..5e3745837 100644 --- a/tools/corpora.py +++ b/tools/corpora.py @@ -319,7 +319,7 @@ def download(self): for repo_id in self.hf_repo_ids: snapshot_download(repo_id=repo_id, revision="main", allow_patterns="*.jsonl", local_dir=save_dir) -class WikiOSCARJa(DataDownloader): +class WikiOSCARJa(DataDownloaderWithHF): name = "wiki_oscar_ja" urls = [ "https://dumps.wikimedia.org/other/cirrussearch/20230807/jawiki-20230807-cirrussearch-content.json.gz", @@ -329,6 +329,25 @@ class WikiOSCARJa(DataDownloader): ] +class HFDataDownloader(DataDownloader): + def __init__(self, hf_repo_ids = [], *args, **kwargs): + super().__init__(*args, **kwargs) + self.hf_repo_ids = hf_repo_ids + + def download(self): + from huggingface_hub import snapshot_download + save_dir = os.path.join(self.base_dir, self.name) + for repo_id in self.hf_repo_ids: + snapshot_download(repo_id=repo_id, revision="main", allow_patterns="*.jsonl", local_dir=save_dir) + +class OSCARJa(HFDataDownloader): + name = "oscar_ja" + urls = [""] + hf_repo_ids = [ + 'if001/oscar_2023_filtered' + ] + + def maybe_download_gpt2_tokenizer_data(tokenizer_type, data_dir): if tokenizer_type is None or tokenizer_type == "GPT2BPETokenizer": GPT2_VOCAB_FP = f"{data_dir}//gpt2-vocab.json" @@ -362,6 +381,7 @@ def maybe_download_gpt2_tokenizer_data(tokenizer_type, data_dir): "enwik8": Enwik8, 'wiki_ja_en': WikiJaEn, 'wiki_ja': WikiJa, + 'oscar_ja': OSCARJa, 'wiki_oscar_ja': WikiOSCARJa } From 0a334487b76edac307d71a3cb6b87041cf2e98c8 Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Fri, 1 Sep 2023 14:19:45 +0900 Subject: [PATCH 028/183] debug --- tools/corpora.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/corpora.py b/tools/corpora.py index 5e3745837..39bd24b75 100644 --- a/tools/corpora.py +++ b/tools/corpora.py @@ -338,6 +338,7 @@ def download(self): from huggingface_hub import snapshot_download save_dir = os.path.join(self.base_dir, self.name) for repo_id in self.hf_repo_ids: + print('download', save_dir) snapshot_download(repo_id=repo_id, revision="main", allow_patterns="*.jsonl", local_dir=save_dir) class OSCARJa(HFDataDownloader): From 713f669c0af415689eb467926a8a314f3d9d59f0 Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Fri, 1 Sep 2023 14:21:58 +0900 Subject: [PATCH 029/183] debug --- tools/corpora.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git 
a/tools/corpora.py b/tools/corpora.py index 39bd24b75..5351cf22b 100644 --- a/tools/corpora.py +++ b/tools/corpora.py @@ -334,9 +334,10 @@ def __init__(self, hf_repo_ids = [], *args, **kwargs): super().__init__(*args, **kwargs) self.hf_repo_ids = hf_repo_ids - def download(self): + def download(self): from huggingface_hub import snapshot_download save_dir = os.path.join(self.base_dir, self.name) + print('donwload0', self.hf_repo_ids) for repo_id in self.hf_repo_ids: print('download', save_dir) snapshot_download(repo_id=repo_id, revision="main", allow_patterns="*.jsonl", local_dir=save_dir) From 1778c12437cd6976d62ede13b374986b823e21b1 Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Fri, 1 Sep 2023 14:24:25 +0900 Subject: [PATCH 030/183] debug --- tools/corpora.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/corpora.py b/tools/corpora.py index 5351cf22b..aad9644fa 100644 --- a/tools/corpora.py +++ b/tools/corpora.py @@ -345,7 +345,7 @@ def download(self): class OSCARJa(HFDataDownloader): name = "oscar_ja" urls = [""] - hf_repo_ids = [ + super().hf_repo_ids = [ 'if001/oscar_2023_filtered' ] From e80130c947c051994375e6cbf274ef43d38564d0 Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Fri, 1 Sep 2023 14:27:27 +0900 Subject: [PATCH 031/183] debug --- tools/corpora.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/tools/corpora.py b/tools/corpora.py index aad9644fa..3512af1d6 100644 --- a/tools/corpora.py +++ b/tools/corpora.py @@ -330,11 +330,15 @@ class WikiOSCARJa(DataDownloaderWithHF): class HFDataDownloader(DataDownloader): - def __init__(self, hf_repo_ids = [], *args, **kwargs): + def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) - self.hf_repo_ids = hf_repo_ids - def download(self): + @property + @abstractmethod + def hf_repo_ids(self): + pass + + def download(self): from huggingface_hub import snapshot_download save_dir = os.path.join(self.base_dir, self.name) print('donwload0', self.hf_repo_ids) @@ -345,9 +349,7 @@ def download(self): class OSCARJa(HFDataDownloader): name = "oscar_ja" urls = [""] - super().hf_repo_ids = [ - 'if001/oscar_2023_filtered' - ] + hf_repo_ids = ['if001/oscar_2023_filtered'] def maybe_download_gpt2_tokenizer_data(tokenizer_type, data_dir): From 84a60e4e9edeb81d4c3efea86f1da9f201940135 Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Fri, 1 Sep 2023 14:31:11 +0900 Subject: [PATCH 032/183] fix save --- tools/corpora.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/tools/corpora.py b/tools/corpora.py index 3512af1d6..8694a9533 100644 --- a/tools/corpora.py +++ b/tools/corpora.py @@ -339,12 +339,11 @@ def hf_repo_ids(self): pass def download(self): - from huggingface_hub import snapshot_download + from datasets import load_dataset save_dir = os.path.join(self.base_dir, self.name) - print('donwload0', self.hf_repo_ids) for repo_id in self.hf_repo_ids: - print('download', save_dir) - snapshot_download(repo_id=repo_id, revision="main", allow_patterns="*.jsonl", local_dir=save_dir) + ds=load_dataset(repo_id) + ds.save_to_disk(save_dir) class OSCARJa(HFDataDownloader): name = "oscar_ja" From 82805ce67118cfef4d81722ac2b3fbf7133af762 Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Fri, 1 Sep 2023 15:52:11 +0900 Subject: [PATCH 033/183] debug --- tools/corpora.py | 1 + tools/preprocess_data.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/tools/corpora.py b/tools/corpora.py index 8694a9533..f6699e427 100644 --- a/tools/corpora.py 
+++ b/tools/corpora.py @@ -343,6 +343,7 @@ def download(self): save_dir = os.path.join(self.base_dir, self.name) for repo_id in self.hf_repo_ids: ds=load_dataset(repo_id) + print('save to', save_dir) ds.save_to_disk(save_dir) class OSCARJa(HFDataDownloader): diff --git a/tools/preprocess_data.py b/tools/preprocess_data.py index a923afcc3..852342658 100644 --- a/tools/preprocess_data.py +++ b/tools/preprocess_data.py @@ -49,7 +49,7 @@ def initializer(self): def encode(self, text): if self.args.ftfy: text = ftfy.fix_text(text) - # print('text,', text) + print('text,', text) ids = {} for key in self.args.jsonl_keys: doc_ids = [] From 77edca715a539dfb942d4f3caf14ea7f511eae5f Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Fri, 1 Sep 2023 16:04:56 +0900 Subject: [PATCH 034/183] fix --- tools/corpora.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tools/corpora.py b/tools/corpora.py index f6699e427..e686b1aa6 100644 --- a/tools/corpora.py +++ b/tools/corpora.py @@ -342,9 +342,8 @@ def download(self): from datasets import load_dataset save_dir = os.path.join(self.base_dir, self.name) for repo_id in self.hf_repo_ids: - ds=load_dataset(repo_id) print('save to', save_dir) - ds.save_to_disk(save_dir) + load_dataset(repo_id, data_dir=save_dir) class OSCARJa(HFDataDownloader): name = "oscar_ja" From d3954c263c786baf5d5fa20f05603be839d4f25b Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Fri, 1 Sep 2023 16:13:20 +0900 Subject: [PATCH 035/183] fix save dir --- tools/corpora.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tools/corpora.py b/tools/corpora.py index e686b1aa6..5d4097b4c 100644 --- a/tools/corpora.py +++ b/tools/corpora.py @@ -339,11 +339,13 @@ def hf_repo_ids(self): pass def download(self): - from datasets import load_dataset + from datasets import load_dataset, config + from pathlib import Path save_dir = os.path.join(self.base_dir, self.name) for repo_id in self.hf_repo_ids: print('save to', save_dir) - load_dataset(repo_id, data_dir=save_dir) + config.DOWNLOADED_DATASETS_PATH = Path(save_dir) + load_dataset(repo_id) class OSCARJa(HFDataDownloader): name = "oscar_ja" From bcb1311a2fbe49b32f7d0eb6cf689200b9810527 Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Fri, 1 Sep 2023 16:53:07 +0900 Subject: [PATCH 036/183] ix --- tools/corpora.py | 13 ++++++------- tools/preprocess_data.py | 2 +- 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/tools/corpora.py b/tools/corpora.py index 5d4097b4c..ef944aa71 100644 --- a/tools/corpora.py +++ b/tools/corpora.py @@ -316,8 +316,8 @@ def download(self): super().download() from huggingface_hub import snapshot_download save_dir = os.path.join(self.base_dir, self.name) - for repo_id in self.hf_repo_ids: - snapshot_download(repo_id=repo_id, revision="main", allow_patterns="*.jsonl", local_dir=save_dir) + for repo_id in self.hf_repo_ids: + snapshot_download(repo_id=repo_id, revision="main", allow_patterns="*.jsonl", local_dir=save_dir, repo_type='dataset') class WikiOSCARJa(DataDownloaderWithHF): name = "wiki_oscar_ja" @@ -338,14 +338,13 @@ def __init__(self, *args, **kwargs): def hf_repo_ids(self): pass - def download(self): - from datasets import load_dataset, config - from pathlib import Path + def download(self): + from huggingface_hub import snapshot_download save_dir = os.path.join(self.base_dir, self.name) for repo_id in self.hf_repo_ids: print('save to', save_dir) - config.DOWNLOADED_DATASETS_PATH = Path(save_dir) - load_dataset(repo_id) + snapshot_download(repo_id=repo_id, 
allow_patterns="*.jsonl.zst", local_dir=save_dir) + class OSCARJa(HFDataDownloader): name = "oscar_ja" diff --git a/tools/preprocess_data.py b/tools/preprocess_data.py index 852342658..a923afcc3 100644 --- a/tools/preprocess_data.py +++ b/tools/preprocess_data.py @@ -49,7 +49,7 @@ def initializer(self): def encode(self, text): if self.args.ftfy: text = ftfy.fix_text(text) - print('text,', text) + # print('text,', text) ids = {} for key in self.args.jsonl_keys: doc_ids = [] From e0fb9c32b4501825974b535ea8b993f2796ccdce Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Fri, 1 Sep 2023 20:04:59 +0900 Subject: [PATCH 037/183] fix --- configs/local_setup_ja.yml | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/configs/local_setup_ja.yml b/configs/local_setup_ja.yml index 286f33d76..156f2196a 100644 --- a/configs/local_setup_ja.yml +++ b/configs/local_setup_ja.yml @@ -1,15 +1,15 @@ # Suggested data paths when using GPT-NeoX locally { # "data_path": "data/wiki_ja", - "data_path": "data/wiki_ja/wiki_ja_text_document", + # "data_path": "data/wiki_ja/wiki_ja_text_document", # or for weighted datasets: - # "train-data-paths": ["data/enwik8/enwik8_text_document", "data/enwik8/enwik8_text_document"], - # "test-data-paths": ["data/enwik8/enwik8_text_document", "data/enwik8/enwik8_text_document"], - # "valid-data-paths": ["data/enwik8/enwik8_text_document", "data/enwik8/enwik8_text_document"], - # "train-data-weights": [1., 2.], - # "test-data-weights": [2., 1.], - # "valid-data-weights": [0.5, 0.4], + "train-data-paths": ["data/wiki_ja/wiki_ja_text_document", "data/oscar_ja/oscar_ja_text_document"], + "test-data-paths": ["data/wiki_ja/wiki_ja_text_document", "data/oscar_ja/oscar_ja_text_document"], + "valid-data-paths": ["data/wiki_ja/wiki_ja_text_document", "data/oscar_ja/oscar_ja_text_document"], + "train-data-weights": [1., 1.], + "test-data-weights": [0.1, 0.1], + "valid-data-weights": [0.1, 0.1], # If weight_by_num_documents is True, Builds dataset weights from a multinomial distribution over groups of data according to the number of documents in each group. 
# WARNING: setting this to True will override any user provided weights From 3c210a0e68ab37bc08fdd93246a9692e76de00da Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Fri, 1 Sep 2023 20:38:21 +0900 Subject: [PATCH 038/183] fix config --- configs/49M.yml | 1 + configs/local_setup_ja.yml | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/configs/49M.yml b/configs/49M.yml index f9822de9b..71a0bb6dd 100644 --- a/configs/49M.yml +++ b/configs/49M.yml @@ -83,6 +83,7 @@ "checkpoint_factor": 1000, "eval_interval": 100000, "eval_iters": 10, + "save_interval": 10000, # logging "log_interval": 10, diff --git a/configs/local_setup_ja.yml b/configs/local_setup_ja.yml index 156f2196a..66d72be44 100644 --- a/configs/local_setup_ja.yml +++ b/configs/local_setup_ja.yml @@ -24,7 +24,6 @@ ## logging "log_dir": "logs", - "save_interval": 10000, "tensorboard_dir": "tensorboard", "log_dir": "logs", From afedf90a7540eafb23b4de2ee1befc77144635a1 Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Fri, 1 Sep 2023 20:40:44 +0900 Subject: [PATCH 039/183] fix config --- configs/49M.yml | 1 - configs/local_setup_ja.yml | 2 ++ 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/configs/49M.yml b/configs/49M.yml index 71a0bb6dd..f9822de9b 100644 --- a/configs/49M.yml +++ b/configs/49M.yml @@ -83,7 +83,6 @@ "checkpoint_factor": 1000, "eval_interval": 100000, "eval_iters": 10, - "save_interval": 10000, # logging "log_interval": 10, diff --git a/configs/local_setup_ja.yml b/configs/local_setup_ja.yml index 66d72be44..16fe3fb8a 100644 --- a/configs/local_setup_ja.yml +++ b/configs/local_setup_ja.yml @@ -21,6 +21,8 @@ "save": "checkpoints", "load": "checkpoints", "checkpoint_validation_with_forward_pass": False, + + "save_interval": 10000, ## logging "log_dir": "logs", From 7f9da23c7509ed46ceb81b4ba59c7cd1b55d696a Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Fri, 1 Sep 2023 20:41:31 +0900 Subject: [PATCH 040/183] fix config --- configs/local_setup_ja.yml | 2 -- 1 file changed, 2 deletions(-) diff --git a/configs/local_setup_ja.yml b/configs/local_setup_ja.yml index 16fe3fb8a..66d72be44 100644 --- a/configs/local_setup_ja.yml +++ b/configs/local_setup_ja.yml @@ -21,8 +21,6 @@ "save": "checkpoints", "load": "checkpoints", "checkpoint_validation_with_forward_pass": False, - - "save_interval": 10000, ## logging "log_dir": "logs", From e7d90d349821977fe22859f324ebd63a80d640d9 Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Fri, 1 Sep 2023 20:54:15 +0900 Subject: [PATCH 041/183] fix tokenizer --- configs/125M.yml | 5 ++++- configs/49M.yml | 3 +++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/configs/125M.yml b/configs/125M.yml index 15a4b3b01..504879123 100644 --- a/configs/125M.yml +++ b/configs/125M.yml @@ -90,5 +90,8 @@ "wall_clock_breakdown": true, # networking - "hostfile": "/mock_path" + "hostfile": "/mock_path", + + ## tokenizer type + "tokenizer_type": "SPMTokenizer" } diff --git a/configs/49M.yml b/configs/49M.yml index f9822de9b..94e08ea2d 100644 --- a/configs/49M.yml +++ b/configs/49M.yml @@ -89,4 +89,7 @@ "steps_per_print": 10, "keep_last_n_checkpoints": 4, "wall_clock_breakdown": true, + + ## tokenizer type + "tokenizer_type": "SPMTokenizer" } From 36ee68c70265f1717fd8eb2c28d883bc0fbf0177 Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Fri, 1 Sep 2023 21:00:45 +0900 Subject: [PATCH 042/183] add --- configs/19M.yml | 6 ++++-- configs/20B.yml | 3 ++- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/configs/19M.yml b/configs/19M.yml index 
83e5c594a..39d6247b8 100644 --- a/configs/19M.yml +++ b/configs/19M.yml @@ -90,6 +90,8 @@ "prof_all": true, "debug": false }, - } - + }, + + ## tokenizer type + "tokenizer_type": "SPMTokenizer" } diff --git a/configs/20B.yml b/configs/20B.yml index 243f794d0..46b44c04b 100644 --- a/configs/20B.yml +++ b/configs/20B.yml @@ -104,7 +104,8 @@ "wall_clock_breakdown": false, ### NEW DATA: #### - "tokenizer_type": "HFTokenizer", + # "tokenizer_type": "HFTokenizer", + "tokenizer_type": "SPMTokenizer" "tensorboard-dir": "./tensorboard", "log_dir": "./logs", From 2afafe03d2736c34e276d26aa6b4d3831f7bd7dc Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Sun, 3 Sep 2023 11:39:13 +0900 Subject: [PATCH 043/183] fix conf --- configs/19M.yml | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/configs/19M.yml b/configs/19M.yml index 39d6247b8..794c0c4ec 100644 --- a/configs/19M.yml +++ b/configs/19M.yml @@ -77,9 +77,11 @@ "checkpoint_factor": 1000, "eval_interval": 100000, "eval_iters": 10, + "keep_last_n_checkpoints": 4, + "save_interval": 10000, - "log_interval": 10, - "steps_per_print": 10, + "log_interval": 100, + "steps_per_print": 100, "wall_clock_breakdown": true, # additional deepspeed args not specified above From 43535a3ecb5ff634d78c4e1caa1910e998a45881 Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Sun, 3 Sep 2023 11:39:20 +0900 Subject: [PATCH 044/183] add en wiki --- tools/corpora.py | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/tools/corpora.py b/tools/corpora.py index ef944aa71..b878d4cad 100644 --- a/tools/corpora.py +++ b/tools/corpora.py @@ -293,10 +293,9 @@ class Enwik8(DataDownloader): urls = ["https://data.deepai.org/enwik8.zip"] -class WikiJaEn(DataDownloader): - name = "wiki_ja_en" - urls = [ - "https://dumps.wikimedia.org/other/cirrussearch/20230807/jawiki-20230807-cirrussearch-content.json.gz", +class WikiEn(DataDownloader): + name = "wiki_en" + urls = [ "https://dumps.wikimedia.org/other/cirrussearch/20230807/enwiki-20230807-cirrussearch-content.json.gz" ] @@ -328,7 +327,6 @@ class WikiOSCARJa(DataDownloaderWithHF): 'if001/oscar_2023_filtered' ] - class HFDataDownloader(DataDownloader): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) @@ -351,6 +349,11 @@ class OSCARJa(HFDataDownloader): urls = [""] hf_repo_ids = ['if001/oscar_2023_filtered'] +class AozoraJa(HFDataDownloader): + name = "aozora_ja" + urls = [""] + hf_repo_ids = ['globis-university/aozorabunko-clean'] + def maybe_download_gpt2_tokenizer_data(tokenizer_type, data_dir): if tokenizer_type is None or tokenizer_type == "GPT2BPETokenizer": @@ -383,10 +386,11 @@ def maybe_download_gpt2_tokenizer_data(tokenizer_type, data_dir): "c4": C4, "c4_openwebtext": C4OpenWebText, "enwik8": Enwik8, - 'wiki_ja_en': WikiJaEn, + 'wiki_en': WikiEn, 'wiki_ja': WikiJa, 'oscar_ja': OSCARJa, - 'wiki_oscar_ja': WikiOSCARJa + 'wiki_oscar_ja': WikiOSCARJa, + 'aozora_ja': AozoraJa } From acbb2792e6a17bbfd2fd3ffa51b81653dc703cd7 Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Sun, 3 Sep 2023 22:49:04 +0900 Subject: [PATCH 045/183] fix pattern --- tools/corpora.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tools/corpora.py b/tools/corpora.py index b878d4cad..efbcd497e 100644 --- a/tools/corpora.py +++ b/tools/corpora.py @@ -341,7 +341,11 @@ def download(self): save_dir = os.path.join(self.base_dir, self.name) for repo_id in self.hf_repo_ids: print('save to', save_dir) - snapshot_download(repo_id=repo_id, 
allow_patterns="*.jsonl.zst", local_dir=save_dir) + if 'if001/oscar_2023_filtered' == repo_id: + allow_patterns="*.jsonl.zst" + if 'globis-university/aozorabunko-clean' == repo_id: + allow_patterns="*.jsonl.gz" + snapshot_download(repo_id=repo_id, allow_patterns=allow_patterns, local_dir=save_dir) class OSCARJa(HFDataDownloader): From dc4d9994aba628d8da9b69fef36dc22ed5e7d2bd Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Sun, 3 Sep 2023 22:51:54 +0900 Subject: [PATCH 046/183] fix type --- tools/corpora.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/corpora.py b/tools/corpora.py index efbcd497e..e615e4107 100644 --- a/tools/corpora.py +++ b/tools/corpora.py @@ -345,7 +345,7 @@ def download(self): allow_patterns="*.jsonl.zst" if 'globis-university/aozorabunko-clean' == repo_id: allow_patterns="*.jsonl.gz" - snapshot_download(repo_id=repo_id, allow_patterns=allow_patterns, local_dir=save_dir) + snapshot_download(repo_id=repo_id, allow_patterns=allow_patterns, local_dir=save_dir, repo_type="dataset") class OSCARJa(HFDataDownloader): From 8d75b6dae8ff05f7dc7c0a5183121dbfc081a029 Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Tue, 5 Sep 2023 20:54:12 +0900 Subject: [PATCH 047/183] for aozora --- tools/corpora.py | 4 ++-- tools/preprocess_data.py | 10 +++++++--- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/tools/corpora.py b/tools/corpora.py index e615e4107..9430c66c0 100644 --- a/tools/corpora.py +++ b/tools/corpora.py @@ -343,7 +343,7 @@ def download(self): print('save to', save_dir) if 'if001/oscar_2023_filtered' == repo_id: allow_patterns="*.jsonl.zst" - if 'globis-university/aozorabunko-clean' == repo_id: + if 'if001/aozorabunko-clean-sin' == repo_id: allow_patterns="*.jsonl.gz" snapshot_download(repo_id=repo_id, allow_patterns=allow_patterns, local_dir=save_dir, repo_type="dataset") @@ -356,7 +356,7 @@ class OSCARJa(HFDataDownloader): class AozoraJa(HFDataDownloader): name = "aozora_ja" urls = [""] - hf_repo_ids = ['globis-university/aozorabunko-clean'] + hf_repo_ids = ['if001/aozorabunko-clean-sin'] def maybe_download_gpt2_tokenizer_data(tokenizer_type, data_dir): diff --git a/tools/preprocess_data.py b/tools/preprocess_data.py index a923afcc3..1a780a1ed 100644 --- a/tools/preprocess_data.py +++ b/tools/preprocess_data.py @@ -173,16 +173,20 @@ def wiki_yielder(fname, semaphore): semaphore.acquire() yield f['text'] + def aozora_yielder(fname, semaphore): + for f in filter(lambda x: x, lmd.Reader(fname).stream_data()): + semaphore.acquire() + yield f['text'] + for fname in fnames: semaphore.acquire() print('fname', fname) if 'wiki' in fname: yield from wiki_yielder(fname, semaphore) + if 'aozora' in fname: + yield from aozora_yielder(fname, semaphore) else: yield from yielder(fname, semaphore) - - - def main(): args = get_args() From 45c24c3fac5fe683c025449dbdd17f28d3424159 Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Tue, 5 Sep 2023 20:59:37 +0900 Subject: [PATCH 048/183] debug --- tools/preprocess_data.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/preprocess_data.py b/tools/preprocess_data.py index 1a780a1ed..19550a53a 100644 --- a/tools/preprocess_data.py +++ b/tools/preprocess_data.py @@ -176,6 +176,7 @@ def wiki_yielder(fname, semaphore): def aozora_yielder(fname, semaphore): for f in filter(lambda x: x, lmd.Reader(fname).stream_data()): semaphore.acquire() + print('f', f) yield f['text'] for fname in fnames: From fde35c223d5aed3ab416d1eda9c51eb991a15f66 Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Tue, 5 
Sep 2023 21:01:47 +0900 Subject: [PATCH 049/183] debug --- tools/preprocess_data.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/preprocess_data.py b/tools/preprocess_data.py index 19550a53a..ef6f8d750 100644 --- a/tools/preprocess_data.py +++ b/tools/preprocess_data.py @@ -177,6 +177,7 @@ def aozora_yielder(fname, semaphore): for f in filter(lambda x: x, lmd.Reader(fname).stream_data()): semaphore.acquire() print('f', f) + print('f text', f['text']) yield f['text'] for fname in fnames: From 2391f3a1cf572a03f9612c1bda0d08f1a7a24d16 Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Tue, 5 Sep 2023 21:03:21 +0900 Subject: [PATCH 050/183] debug --- tools/preprocess_data.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/preprocess_data.py b/tools/preprocess_data.py index ef6f8d750..3e64b5709 100644 --- a/tools/preprocess_data.py +++ b/tools/preprocess_data.py @@ -177,6 +177,7 @@ def aozora_yielder(fname, semaphore): for f in filter(lambda x: x, lmd.Reader(fname).stream_data()): semaphore.acquire() print('f', f) + print('f type', type(f)) print('f text', f['text']) yield f['text'] From a2ba8e46c1cb6b67777b7ebdb8b4ee8563395fdb Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Tue, 5 Sep 2023 21:04:33 +0900 Subject: [PATCH 051/183] debug --- tools/preprocess_data.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tools/preprocess_data.py b/tools/preprocess_data.py index 3e64b5709..14b8b8aeb 100644 --- a/tools/preprocess_data.py +++ b/tools/preprocess_data.py @@ -32,6 +32,7 @@ import tqdm import torch import ftfy +import json from megatron.tokenizer import build_tokenizer from megatron.data import indexed_dataset @@ -178,8 +179,8 @@ def aozora_yielder(fname, semaphore): semaphore.acquire() print('f', f) print('f type', type(f)) - print('f text', f['text']) - yield f['text'] + print('f text', f['text']) + yield json.load(f)['text'] for fname in fnames: semaphore.acquire() From 0b633fc239b4ca3a7bdf3981942705d9920caf57 Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Tue, 5 Sep 2023 21:06:46 +0900 Subject: [PATCH 052/183] debug --- tools/preprocess_data.py | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/tools/preprocess_data.py b/tools/preprocess_data.py index 14b8b8aeb..909baba05 100644 --- a/tools/preprocess_data.py +++ b/tools/preprocess_data.py @@ -174,21 +174,13 @@ def wiki_yielder(fname, semaphore): semaphore.acquire() yield f['text'] - def aozora_yielder(fname, semaphore): - for f in filter(lambda x: x, lmd.Reader(fname).stream_data()): - semaphore.acquire() - print('f', f) - print('f type', type(f)) - print('f text', f['text']) - yield json.load(f)['text'] - for fname in fnames: semaphore.acquire() print('fname', fname) if 'wiki' in fname: yield from wiki_yielder(fname, semaphore) if 'aozora' in fname: - yield from aozora_yielder(fname, semaphore) + yield from wiki_yielder(fname, semaphore) else: yield from yielder(fname, semaphore) From 50eeaeaee36f655793c004a93e65285dcb64e934 Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Tue, 5 Sep 2023 21:12:27 +0900 Subject: [PATCH 053/183] debug --- tools/preprocess_data.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tools/preprocess_data.py b/tools/preprocess_data.py index 909baba05..158312caa 100644 --- a/tools/preprocess_data.py +++ b/tools/preprocess_data.py @@ -174,6 +174,12 @@ def wiki_yielder(fname, semaphore): semaphore.acquire() yield f['text'] + def wiki_yielder(fname, semaphore): + for f in filter(lambda x: x, lmd.Reader(fname).stream_data()): + 
semaphore.acquire() + print('type', type(f)) + yield f['text'] + for fname in fnames: semaphore.acquire() print('fname', fname) From 0a7957ea6deb50edd6dbd4342ea10f2ac7f69ab8 Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Tue, 5 Sep 2023 21:13:08 +0900 Subject: [PATCH 054/183] debug --- tools/preprocess_data.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tools/preprocess_data.py b/tools/preprocess_data.py index 158312caa..7d5a5ca18 100644 --- a/tools/preprocess_data.py +++ b/tools/preprocess_data.py @@ -177,8 +177,7 @@ def wiki_yielder(fname, semaphore): def wiki_yielder(fname, semaphore): for f in filter(lambda x: x, lmd.Reader(fname).stream_data()): semaphore.acquire() - print('type', type(f)) - yield f['text'] + yield json.load(f)['text'] for fname in fnames: semaphore.acquire() From 73496e70acf4fe4aaa3758279b7dc59100fc1f0c Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Tue, 5 Sep 2023 21:13:35 +0900 Subject: [PATCH 055/183] debug --- tools/preprocess_data.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/preprocess_data.py b/tools/preprocess_data.py index 7d5a5ca18..bb38ddeff 100644 --- a/tools/preprocess_data.py +++ b/tools/preprocess_data.py @@ -174,7 +174,7 @@ def wiki_yielder(fname, semaphore): semaphore.acquire() yield f['text'] - def wiki_yielder(fname, semaphore): + def aozora_yielder(fname, semaphore): for f in filter(lambda x: x, lmd.Reader(fname).stream_data()): semaphore.acquire() yield json.load(f)['text'] @@ -185,7 +185,7 @@ def wiki_yielder(fname, semaphore): if 'wiki' in fname: yield from wiki_yielder(fname, semaphore) if 'aozora' in fname: - yield from wiki_yielder(fname, semaphore) + yield from aozora_yielder(fname, semaphore) else: yield from yielder(fname, semaphore) From 23df3e1f038e0e5fcd420fe38cce2a2751777044 Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Tue, 5 Sep 2023 21:14:18 +0900 Subject: [PATCH 056/183] debug --- tools/preprocess_data.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/preprocess_data.py b/tools/preprocess_data.py index bb38ddeff..052799fb9 100644 --- a/tools/preprocess_data.py +++ b/tools/preprocess_data.py @@ -177,7 +177,7 @@ def wiki_yielder(fname, semaphore): def aozora_yielder(fname, semaphore): for f in filter(lambda x: x, lmd.Reader(fname).stream_data()): semaphore.acquire() - yield json.load(f)['text'] + yield json.loads(f)['text'] for fname in fnames: semaphore.acquire() From 2a47daf1adcc799e2206405b440f700571577170 Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Wed, 6 Sep 2023 13:38:51 +0900 Subject: [PATCH 057/183] fix config --- configs/local_setup_ja.yml | 12 ++++++------ tools/corpora.py | 39 ++++++++++++++++++++++++++++++++++---- 2 files changed, 41 insertions(+), 10 deletions(-) diff --git a/configs/local_setup_ja.yml b/configs/local_setup_ja.yml index 66d72be44..d0c4d19d0 100644 --- a/configs/local_setup_ja.yml +++ b/configs/local_setup_ja.yml @@ -4,12 +4,12 @@ # "data_path": "data/wiki_ja/wiki_ja_text_document", # or for weighted datasets: - "train-data-paths": ["data/wiki_ja/wiki_ja_text_document", "data/oscar_ja/oscar_ja_text_document"], - "test-data-paths": ["data/wiki_ja/wiki_ja_text_document", "data/oscar_ja/oscar_ja_text_document"], - "valid-data-paths": ["data/wiki_ja/wiki_ja_text_document", "data/oscar_ja/oscar_ja_text_document"], - "train-data-weights": [1., 1.], - "test-data-weights": [0.1, 0.1], - "valid-data-weights": [0.1, 0.1], + "train-data-paths": ["data/wiki_ja_novelAI_bin/wiki_ja_text_document", 
"data/oscar_ja_novelAI_bin/oscar_ja_text_document","wiki_en_novelAI_bin/wiki_en_text_document" ,"aozora_ja_novelAI_bin/aozora_ja_text_document"], + "test-data-paths": ["data/wiki_ja_novelAI_bin/wiki_ja_text_document", "data/oscar_ja_novelAI_bin/oscar_ja_text_document","wiki_en_novelAI_bin/wiki_en_text_document" ,"aozora_ja_novelAI_bin/aozora_ja_text_document"], + "valid-data-paths": ["data/wiki_ja_novelAI_bin/wiki_ja_text_document", "data/oscar_ja_novelAI_bin/oscar_ja_text_document","wiki_en_novelAI_bin/wiki_en_text_document" ,"aozora_ja_novelAI_bin/aozora_ja_text_document"], + "train-data-weights": [0.9, 0.9, 0.9, 0.9], + "test-data-weights": [0.1, 0.1, 0.1, 0.1], + "valid-data-weights": [0.1, 0.1, 0.1, 0.1], # If weight_by_num_documents is True, Builds dataset weights from a multinomial distribution over groups of data according to the number of documents in each group. # WARNING: setting this to True will override any user provided weights diff --git a/tools/corpora.py b/tools/corpora.py index 9430c66c0..390b47954 100644 --- a/tools/corpora.py +++ b/tools/corpora.py @@ -327,7 +327,7 @@ class WikiOSCARJa(DataDownloaderWithHF): 'if001/oscar_2023_filtered' ] -class HFDataDownloader(DataDownloader): +class HFSnapshotDownloader(DataDownloader): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) @@ -341,6 +341,7 @@ def download(self): save_dir = os.path.join(self.base_dir, self.name) for repo_id in self.hf_repo_ids: print('save to', save_dir) + allow_patterns = None if 'if001/oscar_2023_filtered' == repo_id: allow_patterns="*.jsonl.zst" if 'if001/aozorabunko-clean-sin' == repo_id: @@ -348,16 +349,45 @@ def download(self): snapshot_download(repo_id=repo_id, allow_patterns=allow_patterns, local_dir=save_dir, repo_type="dataset") -class OSCARJa(HFDataDownloader): +class OSCARJa(HFSnapshotDownloader): name = "oscar_ja" urls = [""] hf_repo_ids = ['if001/oscar_2023_filtered'] -class AozoraJa(HFDataDownloader): +class AozoraJa(HFSnapshotDownloader): name = "aozora_ja" urls = [""] hf_repo_ids = ['if001/aozorabunko-clean-sin'] +class HFDataDownloader(DataDownloader): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + @property + @abstractmethod + def hf_repo_ids(self): + pass + + def download(self): + from datasets import load_dataset + save_dir = os.path.join(self.base_dir, self.name) + for repo_id in self.hf_repo_ids: + ds = load_dataset(repo_id) + name = repo_id.split('/')[0] + save_path = f'{save_dir}/{name}.json' + print('save to', save_path) + ds['train'].to_json(save_path) + + +class IzumiDataset(HFSnapshotDownloader): + name = "izumi_dataset" + urls = [""] + hf_repo_ids = [ + "izumi-lab/wikipedia-ja-20230720", + "izumi-lab/wikipedia-en-20230720", + "izumi-lab/wikinews-ja-20230728" + ] + def maybe_download_gpt2_tokenizer_data(tokenizer_type, data_dir): if tokenizer_type is None or tokenizer_type == "GPT2BPETokenizer": @@ -394,7 +424,8 @@ def maybe_download_gpt2_tokenizer_data(tokenizer_type, data_dir): 'wiki_ja': WikiJa, 'oscar_ja': OSCARJa, 'wiki_oscar_ja': WikiOSCARJa, - 'aozora_ja': AozoraJa + 'aozora_ja': AozoraJa, + 'izumi_dataset': IzumiDataset } From 0e6f1e268ac9676438499ec388ea92e6d8dee694 Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Wed, 6 Sep 2023 14:04:32 +0900 Subject: [PATCH 058/183] fix config --- configs/19M.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/configs/19M.yml b/configs/19M.yml index 794c0c4ec..af111c57d 100644 --- a/configs/19M.yml +++ b/configs/19M.yml @@ -76,12 +76,12 @@ "warmup": 
0.01, "checkpoint_factor": 1000, "eval_interval": 100000, - "eval_iters": 10, + "eval_iters": 1000, "keep_last_n_checkpoints": 4, - "save_interval": 10000, + "save_iters": 1000, - "log_interval": 100, - "steps_per_print": 100, + "log_interval": 1000, + "steps_per_print": 1000, "wall_clock_breakdown": true, # additional deepspeed args not specified above From 5e12b35e11e7e8dcf27626bad80adb07962fd709 Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Wed, 6 Sep 2023 14:13:37 +0900 Subject: [PATCH 059/183] fix config --- configs/local_setup_ja.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/configs/local_setup_ja.yml b/configs/local_setup_ja.yml index d0c4d19d0..29af25041 100644 --- a/configs/local_setup_ja.yml +++ b/configs/local_setup_ja.yml @@ -4,9 +4,9 @@ # "data_path": "data/wiki_ja/wiki_ja_text_document", # or for weighted datasets: - "train-data-paths": ["data/wiki_ja_novelAI_bin/wiki_ja_text_document", "data/oscar_ja_novelAI_bin/oscar_ja_text_document","wiki_en_novelAI_bin/wiki_en_text_document" ,"aozora_ja_novelAI_bin/aozora_ja_text_document"], - "test-data-paths": ["data/wiki_ja_novelAI_bin/wiki_ja_text_document", "data/oscar_ja_novelAI_bin/oscar_ja_text_document","wiki_en_novelAI_bin/wiki_en_text_document" ,"aozora_ja_novelAI_bin/aozora_ja_text_document"], - "valid-data-paths": ["data/wiki_ja_novelAI_bin/wiki_ja_text_document", "data/oscar_ja_novelAI_bin/oscar_ja_text_document","wiki_en_novelAI_bin/wiki_en_text_document" ,"aozora_ja_novelAI_bin/aozora_ja_text_document"], + "train-data-paths": ["data/wiki_ja_novelAI_bin/wiki_ja_text_document", "data/oscar_ja_novelAI_bin/oscar_ja_text_document","data/wiki_en_novelAI_bin/wiki_en_text_document" ,"data/aozora_ja_novelAI_bin/aozora_ja_text_document"], + "test-data-paths": ["data/wiki_ja_novelAI_bin/wiki_ja_text_document", "data/oscar_ja_novelAI_bin/oscar_ja_text_document","data/wiki_en_novelAI_bin/wiki_en_text_document" ,"data/aozora_ja_novelAI_bin/aozora_ja_text_document"], + "valid-data-paths": ["data/wiki_ja_novelAI_bin/wiki_ja_text_document", "data/oscar_ja_novelAI_bin/oscar_ja_text_document","data/wiki_en_novelAI_bin/wiki_en_text_document" ,"data/aozora_ja_novelAI_bin/aozora_ja_text_document"], "train-data-weights": [0.9, 0.9, 0.9, 0.9], "test-data-weights": [0.1, 0.1, 0.1, 0.1], "valid-data-weights": [0.1, 0.1, 0.1, 0.1], From 56e601468acc20782d9aca3c25431ade74be14e7 Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Wed, 6 Sep 2023 15:04:33 +0900 Subject: [PATCH 060/183] debug --- megatron/training.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/megatron/training.py b/megatron/training.py index 96a94a1d0..ac33ea6cd 100644 --- a/megatron/training.py +++ b/megatron/training.py @@ -814,6 +814,7 @@ def train( lr = 0 # Logging. + print('hoge'*100) report_memory_flag = training_log( neox_args=neox_args, timers=timers, @@ -828,7 +829,7 @@ def train( optimizer=optimizer, noise_scale_logger=noise_scale_logger, ) - + print('bbbb'*100) # Checkpointing if neox_args.save and iteration in neox_args.save_iters: save_checkpoint( From 70e9094721fb81540da6a303304837424b921e2a Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Wed, 6 Sep 2023 15:06:19 +0900 Subject: [PATCH 061/183] debug --- megatron/training.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/megatron/training.py b/megatron/training.py index ac33ea6cd..4422ebb58 100644 --- a/megatron/training.py +++ b/megatron/training.py @@ -814,7 +814,7 @@ def train( lr = 0 # Logging. 
- print('hoge'*100) + print('aaaa'*100) report_memory_flag = training_log( neox_args=neox_args, timers=timers, @@ -846,6 +846,7 @@ def train( and iteration % neox_args.eval_interval == 0 and neox_args.do_valid ): + print('cccc'*100) prefix = "iteration {}".format(iteration) evaluate_and_print_results( neox_args=neox_args, @@ -857,7 +858,8 @@ def train( verbose=False, timers=timers, ) - + print('dddd'*100) + print('eeee'*100) if neox_args.exit_interval and iteration % neox_args.exit_interval == 0: torch.distributed.barrier() time_str = datetime.now().strftime("%Y-%m-%d %H:%M:%S") @@ -868,7 +870,7 @@ def train( ) ) sys.exit() - + print('ffff'*100) return iteration From 92761651f1f0e475cf4a68261cb1566ecc98e3d7 Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Wed, 6 Sep 2023 15:08:14 +0900 Subject: [PATCH 062/183] debug --- megatron/training.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/megatron/training.py b/megatron/training.py index 4422ebb58..493c5fe52 100644 --- a/megatron/training.py +++ b/megatron/training.py @@ -791,6 +791,7 @@ def train( # to monitor if we've skipped many iterations in a row and trigger an early exit overflow_monitor = OverflowMonitor(optimizer) while iteration < neox_args.train_iters: + print('0000'*100) loss_dict, skipped_iter = train_step( neox_args=neox_args, timers=timers, @@ -799,13 +800,14 @@ def train( optimizer=optimizer, lr_scheduler=lr_scheduler, ) + print('1111'*100) iteration += 1 neox_args.iteration = iteration if neox_args.precision == "fp16": overflow_monitor.check(skipped_iter) # check for repeated overflow if neox_args.log_gradient_noise_scale: # log noise scale if applicable noise_scale_logger.update() - + print('2222'*100) # get learning rate (if present) - if doing soft prompt tuning + pipe parallel, you # may have no tunable parameters on a specific rank if optimizer.param_groups: From 621241096462e02fee315a820a8eeffc208ffe04 Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Wed, 6 Sep 2023 15:10:04 +0900 Subject: [PATCH 063/183] debug --- megatron/training.py | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/megatron/training.py b/megatron/training.py index 493c5fe52..58afe1030 100644 --- a/megatron/training.py +++ b/megatron/training.py @@ -705,6 +705,7 @@ def train_step(neox_args, timers, data_iterator, model, optimizer, lr_scheduler) for _ in range(neox_args.gradient_accumulation_steps): # Forward model for one step. timers("forward").start() + print('aaaa'*100) loss = forward_step( neox_args=neox_args, timers=timers, @@ -712,6 +713,7 @@ def train_step(neox_args, timers, data_iterator, model, optimizer, lr_scheduler) model=model, is_train=True, ) + print('bbbb'*100) timers("forward").stop() losses.append(loss) # Calculate gradients, reduce across processes, and clip. @@ -723,6 +725,7 @@ def train_step(neox_args, timers, data_iterator, model, optimizer, lr_scheduler) model=model, loss=loss, ) + print('ccccc'*100) timers("backward").stop() # Update parameters. 
timers("optimizer").start() @@ -734,7 +737,7 @@ def train_step(neox_args, timers, data_iterator, model, optimizer, lr_scheduler) reduced_loss = { "lm_loss": reduce_losses(losses).mean() } # reduces losses across machines for logging - + print('dddd'*100) if neox_args.precision == "fp16" and model.optimizer.overflow: skipped_iter = 1 else: @@ -807,7 +810,7 @@ def train( overflow_monitor.check(skipped_iter) # check for repeated overflow if neox_args.log_gradient_noise_scale: # log noise scale if applicable noise_scale_logger.update() - print('2222'*100) + # get learning rate (if present) - if doing soft prompt tuning + pipe parallel, you # may have no tunable parameters on a specific rank if optimizer.param_groups: @@ -816,7 +819,6 @@ def train( lr = 0 # Logging. - print('aaaa'*100) report_memory_flag = training_log( neox_args=neox_args, timers=timers, @@ -831,7 +833,6 @@ def train( optimizer=optimizer, noise_scale_logger=noise_scale_logger, ) - print('bbbb'*100) # Checkpointing if neox_args.save and iteration in neox_args.save_iters: save_checkpoint( @@ -848,7 +849,6 @@ def train( and iteration % neox_args.eval_interval == 0 and neox_args.do_valid ): - print('cccc'*100) prefix = "iteration {}".format(iteration) evaluate_and_print_results( neox_args=neox_args, @@ -860,8 +860,7 @@ def train( verbose=False, timers=timers, ) - print('dddd'*100) - print('eeee'*100) + if neox_args.exit_interval and iteration % neox_args.exit_interval == 0: torch.distributed.barrier() time_str = datetime.now().strftime("%Y-%m-%d %H:%M:%S") @@ -872,7 +871,7 @@ def train( ) ) sys.exit() - print('ffff'*100) + return iteration From 176f4ea6db323b904eeef4f6244b9b2f86a7016b Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Wed, 6 Sep 2023 15:11:57 +0900 Subject: [PATCH 064/183] debug --- megatron/training.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/megatron/training.py b/megatron/training.py index 58afe1030..fe0f5904f 100644 --- a/megatron/training.py +++ b/megatron/training.py @@ -705,7 +705,6 @@ def train_step(neox_args, timers, data_iterator, model, optimizer, lr_scheduler) for _ in range(neox_args.gradient_accumulation_steps): # Forward model for one step. timers("forward").start() - print('aaaa'*100) loss = forward_step( neox_args=neox_args, timers=timers, @@ -713,7 +712,6 @@ def train_step(neox_args, timers, data_iterator, model, optimizer, lr_scheduler) model=model, is_train=True, ) - print('bbbb'*100) timers("forward").stop() losses.append(loss) # Calculate gradients, reduce across processes, and clip. @@ -724,8 +722,7 @@ def train_step(neox_args, timers, data_iterator, model, optimizer, lr_scheduler) optimizer=optimizer, model=model, loss=loss, - ) - print('ccccc'*100) + ) timers("backward").stop() # Update parameters. timers("optimizer").start() @@ -737,7 +734,7 @@ def train_step(neox_args, timers, data_iterator, model, optimizer, lr_scheduler) reduced_loss = { "lm_loss": reduce_losses(losses).mean() } # reduces losses across machines for logging - print('dddd'*100) + if neox_args.precision == "fp16" and model.optimizer.overflow: skipped_iter = 1 else: @@ -750,7 +747,9 @@ def train_step_pipe(neox_args, timers, model, data_iterator): """Single training step with DeepSpeed's pipeline parallel engine.""" assert neox_args.deepspeed + print('aaaa'*100) loss = model.train_batch(data_iter=data_iterator) + print('bbbb'*100) loss_dict = {"lm_loss": loss} # Don't break Megatron's timers because we changed code paths. 
for t in [ From 4412f2021ab4c0da7ae69be8cc144be1b7c5246d Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Wed, 6 Sep 2023 15:16:53 +0900 Subject: [PATCH 065/183] debug --- megatron/training.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/megatron/training.py b/megatron/training.py index fe0f5904f..dd7b4a054 100644 --- a/megatron/training.py +++ b/megatron/training.py @@ -746,10 +746,8 @@ def train_step(neox_args, timers, data_iterator, model, optimizer, lr_scheduler) def train_step_pipe(neox_args, timers, model, data_iterator): """Single training step with DeepSpeed's pipeline parallel engine.""" - assert neox_args.deepspeed - print('aaaa'*100) - loss = model.train_batch(data_iter=data_iterator) - print('bbbb'*100) + assert neox_args.deepspeed + loss = model.train_batch(data_iter=data_iterator) loss_dict = {"lm_loss": loss} # Don't break Megatron's timers because we changed code paths. for t in [ From da157ea02cd6b0a2dd82a4805ad2bf7cf15e4c2f Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Wed, 6 Sep 2023 15:17:03 +0900 Subject: [PATCH 066/183] debug --- configs/19M.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configs/19M.yml b/configs/19M.yml index af111c57d..8d470be8a 100644 --- a/configs/19M.yml +++ b/configs/19M.yml @@ -88,7 +88,7 @@ "deepspeed_extra_args": { "comms_logger": { "enabled": true, - "verbose": true, + "verbose": false, "prof_all": true, "debug": false }, From 788a87124bef62d7a9d09fa46a5fcf97b8d20470 Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Wed, 6 Sep 2023 15:19:33 +0900 Subject: [PATCH 067/183] debug --- configs/19M.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/configs/19M.yml b/configs/19M.yml index 8d470be8a..94648b3a8 100644 --- a/configs/19M.yml +++ b/configs/19M.yml @@ -87,9 +87,9 @@ # additional deepspeed args not specified above "deepspeed_extra_args": { "comms_logger": { - "enabled": true, + "enabled": false, "verbose": false, - "prof_all": true, + "prof_all": false, "debug": false }, }, From 99060c291b93ac89f3ce0591a1e3d1ef7a46a360 Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Wed, 6 Sep 2023 15:20:05 +0900 Subject: [PATCH 068/183] debug --- megatron/training.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/megatron/training.py b/megatron/training.py index dd7b4a054..ab598cc5e 100644 --- a/megatron/training.py +++ b/megatron/training.py @@ -790,8 +790,7 @@ def train( # to monitor if we've skipped many iterations in a row and trigger an early exit overflow_monitor = OverflowMonitor(optimizer) - while iteration < neox_args.train_iters: - print('0000'*100) + while iteration < neox_args.train_iters: loss_dict, skipped_iter = train_step( neox_args=neox_args, timers=timers, @@ -800,7 +799,6 @@ def train( optimizer=optimizer, lr_scheduler=lr_scheduler, ) - print('1111'*100) iteration += 1 neox_args.iteration = iteration if neox_args.precision == "fp16": From ee3b182a4ae451d243238bf5ce5cdaa72a0866f6 Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Wed, 6 Sep 2023 15:23:10 +0900 Subject: [PATCH 069/183] debug --- megatron/training.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/megatron/training.py b/megatron/training.py index ab598cc5e..5c0f60cb1 100644 --- a/megatron/training.py +++ b/megatron/training.py @@ -746,8 +746,10 @@ def train_step(neox_args, timers, data_iterator, model, optimizer, lr_scheduler) def train_step_pipe(neox_args, timers, model, data_iterator): """Single training step with DeepSpeed's pipeline 
parallel engine.""" - assert neox_args.deepspeed - loss = model.train_batch(data_iter=data_iterator) + assert neox_args.deepspeed + print('0000'*100) + loss = model.train_batch(data_iter=data_iterator) + print('1111'*100) loss_dict = {"lm_loss": loss} # Don't break Megatron's timers because we changed code paths. for t in [ @@ -805,7 +807,7 @@ def train( overflow_monitor.check(skipped_iter) # check for repeated overflow if neox_args.log_gradient_noise_scale: # log noise scale if applicable noise_scale_logger.update() - + print('2222'*100) # get learning rate (if present) - if doing soft prompt tuning + pipe parallel, you # may have no tunable parameters on a specific rank if optimizer.param_groups: @@ -828,6 +830,7 @@ def train( optimizer=optimizer, noise_scale_logger=noise_scale_logger, ) + print('3333'*100) # Checkpointing if neox_args.save and iteration in neox_args.save_iters: save_checkpoint( From d3851e113e86613a752fa1d41138b0cff5c42313 Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Wed, 6 Sep 2023 15:34:15 +0900 Subject: [PATCH 070/183] debug --- megatron/training.py | 5 +---- megatron/utils.py | 3 +++ 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/megatron/training.py b/megatron/training.py index 5c0f60cb1..8a4c8cf8f 100644 --- a/megatron/training.py +++ b/megatron/training.py @@ -747,9 +747,7 @@ def train_step_pipe(neox_args, timers, model, data_iterator): """Single training step with DeepSpeed's pipeline parallel engine.""" assert neox_args.deepspeed - print('0000'*100) loss = model.train_batch(data_iter=data_iterator) - print('1111'*100) loss_dict = {"lm_loss": loss} # Don't break Megatron's timers because we changed code paths. for t in [ @@ -806,8 +804,7 @@ def train( if neox_args.precision == "fp16": overflow_monitor.check(skipped_iter) # check for repeated overflow if neox_args.log_gradient_noise_scale: # log noise scale if applicable - noise_scale_logger.update() - print('2222'*100) + noise_scale_logger.update() # get learning rate (if present) - if doing soft prompt tuning + pipe parallel, you # may have no tunable parameters on a specific rank if optimizer.param_groups: diff --git a/megatron/utils.py b/megatron/utils.py index 0071ef872..e1e835835 100644 --- a/megatron/utils.py +++ b/megatron/utils.py @@ -301,6 +301,9 @@ def log(self, names, normalizer=1.0, reset=True): for name in names: elapsed_time = self.timers[name].elapsed(reset=reset) * 1000.0 / normalizer string += " | {}: {:.2f}".format(name, elapsed_time) + print('log'*100) + print(string) + print('log'*100) if torch.distributed.is_initialized(): if torch.distributed.get_rank() == 0: print(string, flush=True) From fea9ef72f406ebf4b0137207f53289dee86d0581 Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Wed, 6 Sep 2023 15:39:30 +0900 Subject: [PATCH 071/183] debug --- megatron/__init__.py | 3 ++- megatron/training.py | 5 +++-- megatron/utils.py | 5 +---- 3 files changed, 6 insertions(+), 7 deletions(-) diff --git a/megatron/__init__.py b/megatron/__init__.py index 4a9f98a31..2b07725c3 100644 --- a/megatron/__init__.py +++ b/megatron/__init__.py @@ -16,12 +16,13 @@ def print_rank_0(*message): """If distributed is initialized print only on rank 0.""" + print('call1111'*100) if torch.distributed.is_initialized(): if torch.distributed.get_rank() == 0: print(*message, flush=True) else: print(*message, flush=True) - + print('call2222'*100) from .initialize import initialize_megatron from .neox_arguments import NeoXArgs diff --git a/megatron/training.py b/megatron/training.py index 
8a4c8cf8f..583683bbf 100644 --- a/megatron/training.py +++ b/megatron/training.py @@ -747,7 +747,9 @@ def train_step_pipe(neox_args, timers, model, data_iterator): """Single training step with DeepSpeed's pipeline parallel engine.""" assert neox_args.deepspeed + print('1'*100) loss = model.train_batch(data_iter=data_iterator) + print('2'*100) loss_dict = {"lm_loss": loss} # Don't break Megatron's timers because we changed code paths. for t in [ @@ -826,8 +828,7 @@ def train( model=model, optimizer=optimizer, noise_scale_logger=noise_scale_logger, - ) - print('3333'*100) + ) # Checkpointing if neox_args.save and iteration in neox_args.save_iters: save_checkpoint( diff --git a/megatron/utils.py b/megatron/utils.py index e1e835835..44fa98a1a 100644 --- a/megatron/utils.py +++ b/megatron/utils.py @@ -300,10 +300,7 @@ def log(self, names, normalizer=1.0, reset=True): string = "time (ms)" for name in names: elapsed_time = self.timers[name].elapsed(reset=reset) * 1000.0 / normalizer - string += " | {}: {:.2f}".format(name, elapsed_time) - print('log'*100) - print(string) - print('log'*100) + string += " | {}: {:.2f}".format(name, elapsed_time) if torch.distributed.is_initialized(): if torch.distributed.get_rank() == 0: print(string, flush=True) From 31f2ec45d91e83009bd8460e7254398a41d7cbbc Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Wed, 6 Sep 2023 15:54:22 +0900 Subject: [PATCH 072/183] debug --- megatron/__init__.py | 6 ++---- megatron/training.py | 4 +--- 2 files changed, 3 insertions(+), 7 deletions(-) diff --git a/megatron/__init__.py b/megatron/__init__.py index 2b07725c3..a2cf4df99 100644 --- a/megatron/__init__.py +++ b/megatron/__init__.py @@ -15,14 +15,12 @@ def print_rank_0(*message): - """If distributed is initialized print only on rank 0.""" - print('call1111'*100) + """If distributed is initialized print only on rank 0.""" if torch.distributed.is_initialized(): if torch.distributed.get_rank() == 0: print(*message, flush=True) else: - print(*message, flush=True) - print('call2222'*100) + print(*message, flush=True) from .initialize import initialize_megatron from .neox_arguments import NeoXArgs diff --git a/megatron/training.py b/megatron/training.py index 583683bbf..bca3057b7 100644 --- a/megatron/training.py +++ b/megatron/training.py @@ -747,9 +747,7 @@ def train_step_pipe(neox_args, timers, model, data_iterator): """Single training step with DeepSpeed's pipeline parallel engine.""" assert neox_args.deepspeed - print('1'*100) - loss = model.train_batch(data_iter=data_iterator) - print('2'*100) + loss = model.train_batch(data_iter=data_iterator) loss_dict = {"lm_loss": loss} # Don't break Megatron's timers because we changed code paths. 
for t in [ From 8a11228a33e21ba318ac30f7453ed2056d58a8c4 Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Thu, 7 Sep 2023 19:55:00 +0900 Subject: [PATCH 073/183] add convert settings --- configs/convert_settings.yml | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) create mode 100644 configs/convert_settings.yml diff --git a/configs/convert_settings.yml b/configs/convert_settings.yml new file mode 100644 index 000000000..edb2aa560 --- /dev/null +++ b/configs/convert_settings.yml @@ -0,0 +1,30 @@ +{ + "tokenizer_type": "SPMTokenizer" + + "pipe_parallel_size": 1, + "model_parallel_size": 1, + + # model settings + "num_layers": 6, + "hidden_size": 512, + "num_attention_heads": 8, + "seq_length": 2048, + "max_position_embeddings": 2048, + "pos_emb": "rotary", + "no_weight_tying": true, + "gpt_j_residual": false, + "output_layer_parallelism": "column", + + "scaled_upper_triang_masked_softmax_fusion": false, + "bias_gelu_fusion": false, + + "optimizer": { + "type": "Adam", + "params": { + "lr": 0.001, + "betas": [0.9, 0.95], + "eps": 1.0e-8, + } + }, + "min_lr": 0.0001 +} From 9a0bf455e2ebd2538997ab0dbb2f68b374698ba0 Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Thu, 7 Sep 2023 20:08:05 +0900 Subject: [PATCH 074/183] add convert settings --- configs/convert_settings.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configs/convert_settings.yml b/configs/convert_settings.yml index edb2aa560..8cf817987 100644 --- a/configs/convert_settings.yml +++ b/configs/convert_settings.yml @@ -1,5 +1,5 @@ { - "tokenizer_type": "SPMTokenizer" + "tokenizer_type": "SPMTokenizer", "pipe_parallel_size": 1, "model_parallel_size": 1, From f5cb60676d8c18db77bfad328744fa1db83c6551 Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Thu, 7 Sep 2023 20:17:47 +0900 Subject: [PATCH 075/183] add --- configs/convert_settings.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configs/convert_settings.yml b/configs/convert_settings.yml index 8cf817987..1fe676739 100644 --- a/configs/convert_settings.yml +++ b/configs/convert_settings.yml @@ -1,6 +1,6 @@ { "tokenizer_type": "SPMTokenizer", - + "vocab-file": "./novelAI/tokenizer.model" "pipe_parallel_size": 1, "model_parallel_size": 1, From fbe2e2b9816170e38c0b224553feca8faa7bb539 Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Thu, 7 Sep 2023 20:23:50 +0900 Subject: [PATCH 076/183] add --- configs/convert_settings.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/configs/convert_settings.yml b/configs/convert_settings.yml index 1fe676739..baf797385 100644 --- a/configs/convert_settings.yml +++ b/configs/convert_settings.yml @@ -1,6 +1,7 @@ { "tokenizer_type": "SPMTokenizer", - "vocab-file": "./novelAI/tokenizer.model" + "vocab-file": "./novelAI/tokenizer.model", + "pipe_parallel_size": 1, "model_parallel_size": 1, From 288175581ba72bed26a9923204528bbc3b2e086d Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Thu, 7 Sep 2023 20:26:23 +0900 Subject: [PATCH 077/183] debug --- tools/convert_module_to_hf.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/convert_module_to_hf.py b/tools/convert_module_to_hf.py index 905bdfa16..3f7ccb080 100644 --- a/tools/convert_module_to_hf.py +++ b/tools/convert_module_to_hf.py @@ -183,6 +183,7 @@ def convert(input_checkpoint_path, loaded_config, output_checkpoint_path): # get layer from hf model hf_layer = hf_model.gpt_neox.layers[layer_i] + print('state_dict: ', hf_layer.state_dict()) # + 2 bc of embed layer and a dummy _pre_transformer_block 
loaded_tp_ranks = load_partitions( From 4bede8c1181f9b16801183c39aa1f7d13dd8964e Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Thu, 7 Sep 2023 20:45:11 +0900 Subject: [PATCH 078/183] fix --- tools/convert_module_to_hf.py | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/tools/convert_module_to_hf.py b/tools/convert_module_to_hf.py index 3f7ccb080..a79e13d28 100644 --- a/tools/convert_module_to_hf.py +++ b/tools/convert_module_to_hf.py @@ -183,7 +183,9 @@ def convert(input_checkpoint_path, loaded_config, output_checkpoint_path): # get layer from hf model hf_layer = hf_model.gpt_neox.layers[layer_i] - print('state_dict: ', hf_layer.state_dict()) + for v, _ in hf_layer.state_dict(): + print('state_dict: ', v) + print('-'*200) # + 2 bc of embed layer and a dummy _pre_transformer_block loaded_tp_ranks = load_partitions( @@ -228,10 +230,16 @@ def convert(input_checkpoint_path, loaded_config, output_checkpoint_path): state_dict["attention.rotary_emb.inv_freq"] = loaded_tp_ranks[0][ "attention.rotary_emb.inv_freq" ] - state_dict["attention.bias"] = hf_layer.state_dict()["attention.bias"] - state_dict["attention.masked_bias"] = hf_layer.state_dict()[ - "attention.masked_bias" - ] + + state_dict["attention.dense.bias"] = hf_layer.state_dict()["attention.dense.bias"] + + if "attention.bias" in hf_layer.state_dict(): + state_dict["attention.bias"] = hf_layer.state_dict()["attention.bias"] + + if "attention.masked_bias" in hf_layer.state_dict(): + state_dict["attention.masked_bias"] = hf_layer.state_dict()[ + "attention.masked_bias" + ] # load state_dict into layer hf_layer.load_state_dict(state_dict) From dea764f6ac78a767c817882f170a289d2ba50283 Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Thu, 7 Sep 2023 20:49:00 +0900 Subject: [PATCH 079/183] fix --- tools/convert_module_to_hf.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/convert_module_to_hf.py b/tools/convert_module_to_hf.py index a79e13d28..c46d78402 100644 --- a/tools/convert_module_to_hf.py +++ b/tools/convert_module_to_hf.py @@ -183,7 +183,7 @@ def convert(input_checkpoint_path, loaded_config, output_checkpoint_path): # get layer from hf model hf_layer = hf_model.gpt_neox.layers[layer_i] - for v, _ in hf_layer.state_dict(): + for v in hf_layer.state_dict(): print('state_dict: ', v) print('-'*200) @@ -235,7 +235,7 @@ def convert(input_checkpoint_path, loaded_config, output_checkpoint_path): if "attention.bias" in hf_layer.state_dict(): state_dict["attention.bias"] = hf_layer.state_dict()["attention.bias"] - + if "attention.masked_bias" in hf_layer.state_dict(): state_dict["attention.masked_bias"] = hf_layer.state_dict()[ "attention.masked_bias" From 643a3d18db54417a82a1e494c3bd47431e5e870a Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Thu, 7 Sep 2023 22:06:40 +0900 Subject: [PATCH 080/183] add dataset --- tools/corpora.py | 26 +++++++++++++++++++++++--- 1 file changed, 23 insertions(+), 3 deletions(-) diff --git a/tools/corpora.py b/tools/corpora.py index 390b47954..698336d8d 100644 --- a/tools/corpora.py +++ b/tools/corpora.py @@ -379,8 +379,8 @@ def download(self): ds['train'].to_json(save_path) -class IzumiDataset(HFSnapshotDownloader): - name = "izumi_dataset" +class IzumiFullDataset(HFSnapshotDownloader): + name = "izumi_full_dataset" urls = [""] hf_repo_ids = [ "izumi-lab/wikipedia-ja-20230720", @@ -388,6 +388,26 @@ class IzumiDataset(HFSnapshotDownloader): "izumi-lab/wikinews-ja-20230728" ] +class IzumiWikiJaDataset(HFSnapshotDownloader): + name = 
"izumi_wiki_ja_dataset" + urls = [""] + hf_repo_ids = [ + "izumi-lab/wikipedia-ja-20230720", + ] + +class IzumiWikiJaDataset(HFSnapshotDownloader): + name = "izumi_wiki_en_dataset" + urls = [""] + hf_repo_ids = [ + "izumi-lab/wikipedia-en-20230720", + ] + +class IzumiWikiNewsJaDataset(HFSnapshotDownloader): + name = "izumi_wiki_news_dataset" + urls = [""] + hf_repo_ids = [ + "izumi-lab/wikinews-ja-20230728" + ] def maybe_download_gpt2_tokenizer_data(tokenizer_type, data_dir): if tokenizer_type is None or tokenizer_type == "GPT2BPETokenizer": @@ -425,7 +445,7 @@ def maybe_download_gpt2_tokenizer_data(tokenizer_type, data_dir): 'oscar_ja': OSCARJa, 'wiki_oscar_ja': WikiOSCARJa, 'aozora_ja': AozoraJa, - 'izumi_dataset': IzumiDataset + 'izumi_dataset': IzumiFullDataset } From 8d7ba794e3adc30a608bdf9c32df22ea30766b7b Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Thu, 7 Sep 2023 22:20:57 +0900 Subject: [PATCH 081/183] fix --- tools/corpora.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tools/corpora.py b/tools/corpora.py index 698336d8d..5aa783b5f 100644 --- a/tools/corpora.py +++ b/tools/corpora.py @@ -379,7 +379,7 @@ def download(self): ds['train'].to_json(save_path) -class IzumiFullDataset(HFSnapshotDownloader): +class IzumiFullDataset(HFDataDownloader): name = "izumi_full_dataset" urls = [""] hf_repo_ids = [ @@ -388,21 +388,21 @@ class IzumiFullDataset(HFSnapshotDownloader): "izumi-lab/wikinews-ja-20230728" ] -class IzumiWikiJaDataset(HFSnapshotDownloader): +class IzumiWikiJaDataset(HFDataDownloader): name = "izumi_wiki_ja_dataset" urls = [""] hf_repo_ids = [ "izumi-lab/wikipedia-ja-20230720", ] -class IzumiWikiJaDataset(HFSnapshotDownloader): +class IzumiWikiJaDataset(HFDataDownloader): name = "izumi_wiki_en_dataset" urls = [""] hf_repo_ids = [ "izumi-lab/wikipedia-en-20230720", ] -class IzumiWikiNewsJaDataset(HFSnapshotDownloader): +class IzumiWikiNewsJaDataset(HFDataDownloader): name = "izumi_wiki_news_dataset" urls = [""] hf_repo_ids = [ From 2fa07ee0142e2fa135f705b627efc91b206587ba Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Thu, 7 Sep 2023 22:34:05 +0900 Subject: [PATCH 082/183] add config --- tools/corpora.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/tools/corpora.py b/tools/corpora.py index 5aa783b5f..50141cbc5 100644 --- a/tools/corpora.py +++ b/tools/corpora.py @@ -395,7 +395,7 @@ class IzumiWikiJaDataset(HFDataDownloader): "izumi-lab/wikipedia-ja-20230720", ] -class IzumiWikiJaDataset(HFDataDownloader): +class IzumiWikiEnDataset(HFDataDownloader): name = "izumi_wiki_en_dataset" urls = [""] hf_repo_ids = [ @@ -445,7 +445,10 @@ def maybe_download_gpt2_tokenizer_data(tokenizer_type, data_dir): 'oscar_ja': OSCARJa, 'wiki_oscar_ja': WikiOSCARJa, 'aozora_ja': AozoraJa, - 'izumi_dataset': IzumiFullDataset + 'izumi_dataset': IzumiFullDataset, + 'izumi_wiki_ja_dataset': IzumiWikiJaDataset, + 'izumi_wiki_en_dataset': IzumiWikiEnDataset, + 'izumi_wiki_news_dataset': IzumiWikiNewsJaDataset } From 11d6bdb3a19c7c82201fdc0cc35586897a6cfdad Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Thu, 7 Sep 2023 22:40:54 +0900 Subject: [PATCH 083/183] fix save name --- tools/corpora.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/corpora.py b/tools/corpora.py index 50141cbc5..d12fa839a 100644 --- a/tools/corpora.py +++ b/tools/corpora.py @@ -374,7 +374,7 @@ def download(self): for repo_id in self.hf_repo_ids: ds = load_dataset(repo_id) name = repo_id.split('/')[0] - save_path = 
f'{save_dir}/{name}.json' + save_path = f'{save_dir}/{name}.jsonl' print('save to', save_path) ds['train'].to_json(save_path) From f4157e51574d6b9ee1aa3078b3358887f200ce22 Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Thu, 7 Sep 2023 22:46:50 +0900 Subject: [PATCH 084/183] fix to_json --- tools/corpora.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/corpora.py b/tools/corpora.py index d12fa839a..daeae07ed 100644 --- a/tools/corpora.py +++ b/tools/corpora.py @@ -376,7 +376,7 @@ def download(self): name = repo_id.split('/')[0] save_path = f'{save_dir}/{name}.jsonl' print('save to', save_path) - ds['train'].to_json(save_path) + ds['train'].to_json(save_path, force_ascii=False) class IzumiFullDataset(HFDataDownloader): From a8c1a318188c33fc5125e9c90810de35642d69b6 Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Thu, 7 Sep 2023 22:50:22 +0900 Subject: [PATCH 085/183] fix yielder --- tools/preprocess_data.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/tools/preprocess_data.py b/tools/preprocess_data.py index 052799fb9..fa16a6087 100644 --- a/tools/preprocess_data.py +++ b/tools/preprocess_data.py @@ -168,6 +168,13 @@ def yielder(fname, semaphore): semaphore.acquire() yield f + + def hf_yielder(fname, semaphore): + stream = filter(lambda x: x, lmd.Reader(fname).stream_data()) + for f in filter(lambda x: 'text' in x and len(json.leads(x)['text']) != 0, stream): + semaphore.acquire() + yield f['text'] + def wiki_yielder(fname, semaphore): stream = filter(lambda x: x, lmd.Reader(fname).stream_data()) for f in filter(lambda x: 'text' in x and len(x['text']) != 0, stream): @@ -182,6 +189,8 @@ def aozora_yielder(fname, semaphore): for fname in fnames: semaphore.acquire() print('fname', fname) + if 'izumi' in fname: + yield from hf_yielder(fname, semaphore) if 'wiki' in fname: yield from wiki_yielder(fname, semaphore) if 'aozora' in fname: From 2841cca50881606092f4b323b60b9e2077a69d75 Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Thu, 7 Sep 2023 22:51:10 +0900 Subject: [PATCH 086/183] fix yielder --- tools/preprocess_data.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/preprocess_data.py b/tools/preprocess_data.py index fa16a6087..ebdcfc4c2 100644 --- a/tools/preprocess_data.py +++ b/tools/preprocess_data.py @@ -171,7 +171,7 @@ def yielder(fname, semaphore): def hf_yielder(fname, semaphore): stream = filter(lambda x: x, lmd.Reader(fname).stream_data()) - for f in filter(lambda x: 'text' in x and len(json.leads(x)['text']) != 0, stream): + for f in filter(lambda x: 'text' in x and len(json.loads(x)['text']) != 0, stream): semaphore.acquire() yield f['text'] From 8b9d57beba556765a1caf4a4474e15145678318c Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Thu, 7 Sep 2023 22:54:39 +0900 Subject: [PATCH 087/183] fix yielder --- tools/preprocess_data.py | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/tools/preprocess_data.py b/tools/preprocess_data.py index ebdcfc4c2..b2a54e052 100644 --- a/tools/preprocess_data.py +++ b/tools/preprocess_data.py @@ -168,13 +168,6 @@ def yielder(fname, semaphore): semaphore.acquire() yield f - - def hf_yielder(fname, semaphore): - stream = filter(lambda x: x, lmd.Reader(fname).stream_data()) - for f in filter(lambda x: 'text' in x and len(json.loads(x)['text']) != 0, stream): - semaphore.acquire() - yield f['text'] - def wiki_yielder(fname, semaphore): stream = filter(lambda x: x, lmd.Reader(fname).stream_data()) for f in filter(lambda x: 'text' in x and 
len(x['text']) != 0, stream): @@ -190,7 +183,7 @@ def aozora_yielder(fname, semaphore): semaphore.acquire() print('fname', fname) if 'izumi' in fname: - yield from hf_yielder(fname, semaphore) + yield from aozora_yielder(fname, semaphore) if 'wiki' in fname: yield from wiki_yielder(fname, semaphore) if 'aozora' in fname: From 87ee88df1eb3fdccf8848163b1af5c078710f6cf Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Thu, 7 Sep 2023 22:56:06 +0900 Subject: [PATCH 088/183] debug --- tools/preprocess_data.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/preprocess_data.py b/tools/preprocess_data.py index b2a54e052..e6c8193a7 100644 --- a/tools/preprocess_data.py +++ b/tools/preprocess_data.py @@ -177,6 +177,7 @@ def wiki_yielder(fname, semaphore): def aozora_yielder(fname, semaphore): for f in filter(lambda x: x, lmd.Reader(fname).stream_data()): semaphore.acquire() + print('debug: ', type(f), f) yield json.loads(f)['text'] for fname in fnames: From 31e28b89c748858b8cdffa339efeef1d05331940 Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Thu, 7 Sep 2023 22:57:03 +0900 Subject: [PATCH 089/183] debug --- tools/preprocess_data.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/tools/preprocess_data.py b/tools/preprocess_data.py index e6c8193a7..876fae8a9 100644 --- a/tools/preprocess_data.py +++ b/tools/preprocess_data.py @@ -176,15 +176,14 @@ def wiki_yielder(fname, semaphore): def aozora_yielder(fname, semaphore): for f in filter(lambda x: x, lmd.Reader(fname).stream_data()): - semaphore.acquire() - print('debug: ', type(f), f) + semaphore.acquire() yield json.loads(f)['text'] for fname in fnames: semaphore.acquire() print('fname', fname) if 'izumi' in fname: - yield from aozora_yielder(fname, semaphore) + yield from yielder(fname, semaphore) if 'wiki' in fname: yield from wiki_yielder(fname, semaphore) if 'aozora' in fname: From e405194bb2d4baf8168f4b91404c879edcc932a8 Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Thu, 7 Sep 2023 23:05:01 +0900 Subject: [PATCH 090/183] fix --- tools/preprocess_data.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/preprocess_data.py b/tools/preprocess_data.py index 876fae8a9..f0c8822e3 100644 --- a/tools/preprocess_data.py +++ b/tools/preprocess_data.py @@ -184,9 +184,9 @@ def aozora_yielder(fname, semaphore): print('fname', fname) if 'izumi' in fname: yield from yielder(fname, semaphore) - if 'wiki' in fname: + elif 'wiki' in fname: yield from wiki_yielder(fname, semaphore) - if 'aozora' in fname: + elif 'aozora' in fname: yield from aozora_yielder(fname, semaphore) else: yield from yielder(fname, semaphore) From 1852b8e47be3aa7bd5c412adc843c6a530ed7837 Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Fri, 8 Sep 2023 14:32:42 +0900 Subject: [PATCH 091/183] fix config --- configs/49M.yml | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/configs/49M.yml b/configs/49M.yml index 94e08ea2d..6cfb4e4fa 100644 --- a/configs/49M.yml +++ b/configs/49M.yml @@ -80,13 +80,15 @@ "distributed_backend": "nccl", "lr_decay_style": "cosine", "warmup": 0.01, - "checkpoint_factor": 1000, + "checkpoint_factor": 10000, "eval_interval": 100000, - "eval_iters": 10, + "eval_iters": 1000, + "keep_last_n_checkpoints": 4, + "save_iters": 10000, # logging - "log_interval": 10, - "steps_per_print": 10, + "log_interval": 1000, + "steps_per_print": 1000, "keep_last_n_checkpoints": 4, "wall_clock_breakdown": true, From f561019dcf3e319f50342f169a7f31c849e3409d Mon Sep 17 00:00:00 2001 From: 
windows_on_wsl Date: Fri, 8 Sep 2023 15:13:43 +0900 Subject: [PATCH 092/183] fix config --- configs/49M.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configs/49M.yml b/configs/49M.yml index 6cfb4e4fa..9621346fa 100644 --- a/configs/49M.yml +++ b/configs/49M.yml @@ -46,7 +46,7 @@ }, # batch / data settings - "train_micro_batch_size_per_gpu": 32, + "train_micro_batch_size_per_gpu": 16, "gas": 1, "data_impl": "mmap", "num_workers": 1, From 12a545c9579bd09fac7a79da262cd9b0bf2c6da8 Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Fri, 8 Sep 2023 15:15:44 +0900 Subject: [PATCH 093/183] fix config --- configs/49M.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/configs/49M.yml b/configs/49M.yml index 9621346fa..4896c1ba0 100644 --- a/configs/49M.yml +++ b/configs/49M.yml @@ -46,7 +46,8 @@ }, # batch / data settings - "train_micro_batch_size_per_gpu": 16, + "train_micro_batch_size_per_gpu": 8, + # "train_micro_batch_size_per_gpu": 32, "gas": 1, "data_impl": "mmap", "num_workers": 1, From fe8ebcf0b9d7ee1cfbb65043faca42ae44b6158f Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Fri, 8 Sep 2023 18:40:40 +0900 Subject: [PATCH 094/183] force hide log --- megatron/utils.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/megatron/utils.py b/megatron/utils.py index 44fa98a1a..77cc38ff4 100644 --- a/megatron/utils.py +++ b/megatron/utils.py @@ -300,7 +300,11 @@ def log(self, names, normalizer=1.0, reset=True): string = "time (ms)" for name in names: elapsed_time = self.timers[name].elapsed(reset=reset) * 1000.0 / normalizer - string += " | {}: {:.2f}".format(name, elapsed_time) + string += " | {}: {:.2f}".format(name, elapsed_time) + + if "optimizer_allgather" in string: + return + if torch.distributed.is_initialized(): if torch.distributed.get_rank() == 0: print(string, flush=True) From eae78d5c387120360d82d1c800dc94eb68dccadb Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Fri, 8 Sep 2023 18:42:29 +0900 Subject: [PATCH 095/183] debug --- megatron/utils.py | 1 + 1 file changed, 1 insertion(+) diff --git a/megatron/utils.py b/megatron/utils.py index 77cc38ff4..a6aea1e90 100644 --- a/megatron/utils.py +++ b/megatron/utils.py @@ -302,6 +302,7 @@ def log(self, names, normalizer=1.0, reset=True): elapsed_time = self.timers[name].elapsed(reset=reset) * 1000.0 / normalizer string += " | {}: {:.2f}".format(name, elapsed_time) + print('--------------', string, flush=True) if "optimizer_allgather" in string: return From 1354502306d4ab67d629bffdf56d6f4284609908 Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Fri, 8 Sep 2023 19:01:10 +0900 Subject: [PATCH 096/183] set log level --- configs/49M.yml | 2 ++ megatron/utils.py | 6 +----- train.py | 3 +++ 3 files changed, 6 insertions(+), 5 deletions(-) diff --git a/configs/49M.yml b/configs/49M.yml index 4896c1ba0..eae03552f 100644 --- a/configs/49M.yml +++ b/configs/49M.yml @@ -14,6 +14,8 @@ "no_weight_tying": true, "gpt_j_residual": true, "output_layer_parallelism": "column", + "use_bnb_optimizer": true, + # these should provide some speedup but takes a while to build, set to true if desired "scaled_upper_triang_masked_softmax_fusion": false, diff --git a/megatron/utils.py b/megatron/utils.py index a6aea1e90..dced16b0c 100644 --- a/megatron/utils.py +++ b/megatron/utils.py @@ -301,11 +301,7 @@ def log(self, names, normalizer=1.0, reset=True): for name in names: elapsed_time = self.timers[name].elapsed(reset=reset) * 1000.0 / normalizer string += " | {}: {:.2f}".format(name, elapsed_time) 
- - print('--------------', string, flush=True) - if "optimizer_allgather" in string: - return - + if torch.distributed.is_initialized(): if torch.distributed.get_rank() == 0: print(string, flush=True) diff --git a/train.py b/train.py index 358ab3a81..a16887036 100644 --- a/train.py +++ b/train.py @@ -18,6 +18,9 @@ """Train""" from megatron.neox_arguments import NeoXArgs from megatron.training import pretrain +import logging + +logging.basicConfig(level=logging.WARNING) if __name__ == "__main__": neox_args = NeoXArgs.consume_neox_args() From 599829363414a8de3afe57e05bf941a8c58538c8 Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Fri, 8 Sep 2023 19:06:05 +0900 Subject: [PATCH 097/183] debug --- configs/49M.yml | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/configs/49M.yml b/configs/49M.yml index eae03552f..bde1938a8 100644 --- a/configs/49M.yml +++ b/configs/49M.yml @@ -97,4 +97,13 @@ ## tokenizer type "tokenizer_type": "SPMTokenizer" + + "deepspeed_extra_args": { + "comms_logger": { + "enabled": false, + "verbose": false, + "prof_all": false, + "debug": false + } + } } From 3a8efcba50197521a11333767786141de12f95f9 Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Fri, 8 Sep 2023 19:07:00 +0900 Subject: [PATCH 098/183] debug --- configs/49M.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/configs/49M.yml b/configs/49M.yml index bde1938a8..7084e294d 100644 --- a/configs/49M.yml +++ b/configs/49M.yml @@ -96,8 +96,8 @@ "wall_clock_breakdown": true, ## tokenizer type - "tokenizer_type": "SPMTokenizer" - + "tokenizer_type": "SPMTokenizer", + "deepspeed_extra_args": { "comms_logger": { "enabled": false, From 6b3f64f787c179d78fca5d48bb22e07a7f2c9beb Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Fri, 8 Sep 2023 19:09:38 +0900 Subject: [PATCH 099/183] debug --- configs/49M.yml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/configs/49M.yml b/configs/49M.yml index 7084e294d..5e87d5035 100644 --- a/configs/49M.yml +++ b/configs/49M.yml @@ -44,8 +44,9 @@ "overlap_comm": True, "reduce_scatter": True, "reduce_bucket_size": 500000000, - "contiguous_gradients": True, + "contiguous_gradients": True }, + "zero_allow_untested_optimizer": True # batch / data settings "train_micro_batch_size_per_gpu": 8, @@ -97,7 +98,7 @@ ## tokenizer type "tokenizer_type": "SPMTokenizer", - + "deepspeed_extra_args": { "comms_logger": { "enabled": false, From 032b708105fca436514319dfbf75a4a58236e9c4 Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Fri, 8 Sep 2023 19:10:47 +0900 Subject: [PATCH 100/183] debug --- configs/49M.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configs/49M.yml b/configs/49M.yml index 5e87d5035..b09b62f48 100644 --- a/configs/49M.yml +++ b/configs/49M.yml @@ -46,7 +46,7 @@ "reduce_bucket_size": 500000000, "contiguous_gradients": True }, - "zero_allow_untested_optimizer": True + "zero_allow_untested_optimizer": True, # batch / data settings "train_micro_batch_size_per_gpu": 8, From 0f8ac2975d80c27ac4dafed4242dc32d3c778063 Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Fri, 8 Sep 2023 19:11:23 +0900 Subject: [PATCH 101/183] debug --- configs/49M.yml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/configs/49M.yml b/configs/49M.yml index b09b62f48..da90946d0 100644 --- a/configs/49M.yml +++ b/configs/49M.yml @@ -44,9 +44,10 @@ "overlap_comm": True, "reduce_scatter": True, "reduce_bucket_size": 500000000, - "contiguous_gradients": True + "contiguous_gradients": True, + 
"zero_allow_untested_optimizer": True, }, - "zero_allow_untested_optimizer": True, + # batch / data settings "train_micro_batch_size_per_gpu": 8, From 52952c950d426cfb2e7548e346a388f3e2d0245f Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Fri, 8 Sep 2023 19:15:13 +0900 Subject: [PATCH 102/183] debug --- configs/49M.yml | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/configs/49M.yml b/configs/49M.yml index da90946d0..1d1c13926 100644 --- a/configs/49M.yml +++ b/configs/49M.yml @@ -44,10 +44,9 @@ "overlap_comm": True, "reduce_scatter": True, "reduce_bucket_size": 500000000, - "contiguous_gradients": True, - "zero_allow_untested_optimizer": True, + "contiguous_gradients": True, }, - + "zero_allow_untested_optimizer": true, # batch / data settings "train_micro_batch_size_per_gpu": 8, From 4e26eaf106979c29af0eae3636f617a19cc66527 Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Fri, 8 Sep 2023 19:17:15 +0900 Subject: [PATCH 103/183] debug --- configs/49M.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/configs/49M.yml b/configs/49M.yml index 1d1c13926..68a0f9ab1 100644 --- a/configs/49M.yml +++ b/configs/49M.yml @@ -38,7 +38,7 @@ # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training "zero_optimization": { - "stage": 1, + "stage": 0, "allgather_partitions": True, "allgather_bucket_size": 500000000, "overlap_comm": True, @@ -46,7 +46,7 @@ "reduce_bucket_size": 500000000, "contiguous_gradients": True, }, - "zero_allow_untested_optimizer": true, + "zero_allow_untested_optimizer": false, # batch / data settings "train_micro_batch_size_per_gpu": 8, From 8579e3c41944b50b38a23e2aefac560b720a2776 Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Fri, 8 Sep 2023 19:18:10 +0900 Subject: [PATCH 104/183] debug --- configs/49M.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configs/49M.yml b/configs/49M.yml index 68a0f9ab1..8e9f71002 100644 --- a/configs/49M.yml +++ b/configs/49M.yml @@ -46,7 +46,7 @@ "reduce_bucket_size": 500000000, "contiguous_gradients": True, }, - "zero_allow_untested_optimizer": false, + # "zero_allow_untested_optimizer": true, # batch / data settings "train_micro_batch_size_per_gpu": 8, From a5eb327d1c07b31cc0c06c340c463bbfa3a498a3 Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Fri, 8 Sep 2023 19:19:13 +0900 Subject: [PATCH 105/183] debug --- configs/49M.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configs/49M.yml b/configs/49M.yml index 8e9f71002..9127e6823 100644 --- a/configs/49M.yml +++ b/configs/49M.yml @@ -14,7 +14,7 @@ "no_weight_tying": true, "gpt_j_residual": true, "output_layer_parallelism": "column", - "use_bnb_optimizer": true, + # "use_bnb_optimizer": true, # these should provide some speedup but takes a while to build, set to true if desired From fc5e9076ed79cd6e07cc37a1e769c5e0442d5b68 Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Fri, 8 Sep 2023 19:36:28 +0900 Subject: [PATCH 106/183] debug --- megatron/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/megatron/utils.py b/megatron/utils.py index dced16b0c..ab51f0667 100644 --- a/megatron/utils.py +++ b/megatron/utils.py @@ -301,7 +301,7 @@ def log(self, names, normalizer=1.0, reset=True): for name in names: elapsed_time = self.timers[name].elapsed(reset=reset) * 1000.0 / normalizer string += " | {}: {:.2f}".format(name, elapsed_time) - + print("-"*10, string) if torch.distributed.is_initialized(): if 
torch.distributed.get_rank() == 0: print(string, flush=True) From b4ea1bbaeaf18bc34ff2d68f1774a5d0259ae997 Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Fri, 8 Sep 2023 19:47:44 +0900 Subject: [PATCH 107/183] debug --- configs/49M.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configs/49M.yml b/configs/49M.yml index 9127e6823..cf7a63b3c 100644 --- a/configs/49M.yml +++ b/configs/49M.yml @@ -38,7 +38,7 @@ # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training "zero_optimization": { - "stage": 0, + "stage": 2, "allgather_partitions": True, "allgather_bucket_size": 500000000, "overlap_comm": True, From fb70e4f89481fc5dae8fe1695f70ecc61ea3bce6 Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Fri, 8 Sep 2023 19:48:35 +0900 Subject: [PATCH 108/183] debug --- configs/49M.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configs/49M.yml b/configs/49M.yml index cf7a63b3c..560eb4de6 100644 --- a/configs/49M.yml +++ b/configs/49M.yml @@ -38,7 +38,7 @@ # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training "zero_optimization": { - "stage": 2, + "stage": 1, "allgather_partitions": True, "allgather_bucket_size": 500000000, "overlap_comm": True, From 12ac35236138d58643e052e95d134c02d60d9183 Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Fri, 8 Sep 2023 19:51:53 +0900 Subject: [PATCH 109/183] debug --- train.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/train.py b/train.py index a16887036..d7c065a2e 100644 --- a/train.py +++ b/train.py @@ -18,9 +18,9 @@ """Train""" from megatron.neox_arguments import NeoXArgs from megatron.training import pretrain -import logging -logging.basicConfig(level=logging.WARNING) +import logging +logging.getLogger('deepspeed').setLevel(logging.WARNING) if __name__ == "__main__": neox_args = NeoXArgs.consume_neox_args() From b94532dbeed22b3a5b51eff0910a81e1034a2079 Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Fri, 8 Sep 2023 19:53:19 +0900 Subject: [PATCH 110/183] debug --- megatron/training.py | 2 ++ train.py | 3 --- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/megatron/training.py b/megatron/training.py index bca3057b7..eed4b10d6 100644 --- a/megatron/training.py +++ b/megatron/training.py @@ -57,6 +57,8 @@ from megatron.model.gpt2_model import cross_entropy from eval_tasks import run_eval_harness +import logging +logging.getLogger('deepspeed').setLevel(logging.WARNING) def mup_weights_reinit(neox_args, model): def has_method(o, name): diff --git a/train.py b/train.py index d7c065a2e..358ab3a81 100644 --- a/train.py +++ b/train.py @@ -19,9 +19,6 @@ from megatron.neox_arguments import NeoXArgs from megatron.training import pretrain -import logging -logging.getLogger('deepspeed').setLevel(logging.WARNING) - if __name__ == "__main__": neox_args = NeoXArgs.consume_neox_args() neox_args.configure_distributed_args() From 1c232a791dd49fbb482c8d893c741bdf6daf8241 Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Fri, 8 Sep 2023 19:56:15 +0900 Subject: [PATCH 111/183] debug --- megatron/training.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/megatron/training.py b/megatron/training.py index eed4b10d6..1f2231acd 100644 --- a/megatron/training.py +++ b/megatron/training.py @@ -58,7 +58,9 @@ from eval_tasks import run_eval_harness import logging -logging.getLogger('deepspeed').setLevel(logging.WARNING) +from deepspeed.utils 
import logger as ds_logger +ds_logger.setLevel(logging.WARNING) + def mup_weights_reinit(neox_args, model): def has_method(o, name): From 0644f3c1079ea51f3281dc5a34910ed86854ee5e Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Sun, 10 Sep 2023 14:57:51 +0900 Subject: [PATCH 112/183] add swiglu --- megatron/model/activations.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/megatron/model/activations.py b/megatron/model/activations.py index 5c4ba1d5a..fc4c73445 100644 --- a/megatron/model/activations.py +++ b/megatron/model/activations.py @@ -46,7 +46,9 @@ def get_activation(neox_args): elif neox_args.activation == "mish": activation_func = mish elif neox_args.activation == "silu": - activation_func = F.silu + activation_func = F.silu + elif neox_args.activation == "swiglu": + activation_func = swiglu else: raise ValueError(f"Activation function {neox_args.activation} not recognized") return activation_func @@ -120,6 +122,10 @@ def swish(x, beta: float = 1.0): def mish(x): return x * torch.tanh(F.softplus(x)) +@torch.jit.script +def swiglu(x): + x = torch.chunk(x, 2, dim=-1) + return F.silu(x[0]) * x[1] class GEGLU(torch.nn.Module): def __init__(self, neox_args): From c6264b0fe24c893097816f50cc6a5a47a1093f74 Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Sun, 10 Sep 2023 15:00:53 +0900 Subject: [PATCH 113/183] add swiglu --- megatron/neox_arguments/neox_args.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/megatron/neox_arguments/neox_args.py b/megatron/neox_arguments/neox_args.py index e1ea16a16..6ddae6e92 100644 --- a/megatron/neox_arguments/neox_args.py +++ b/megatron/neox_arguments/neox_args.py @@ -229,10 +229,10 @@ class NeoXArgsModel(NeoXArgsTemplate): """ activation: Literal[ - "gelu", "geglu", "relu", "softsign", "swish", "mish", "silu" + "gelu", "geglu", "relu", "softsign", "swish", "mish", "silu", "swiglu" ] = "gelu" """ - Activation function to use - choose from ["gelu", "geglu", "relu", "softsign", "swish", "mish", "silu"] + Activation function to use - choose from ["gelu", "geglu", "relu", "softsign", "swish", "mish", "silu", "swiglu"] """ scaled_upper_triang_masked_softmax_fusion: bool = False From ea72bd8063903bba89438991720973d23b6388af Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Sun, 10 Sep 2023 15:05:26 +0900 Subject: [PATCH 114/183] curriculum flash activation --- configs/49M.yml | 30 +++++++++++++++++++++++++++--- 1 file changed, 27 insertions(+), 3 deletions(-) diff --git a/configs/49M.yml b/configs/49M.yml index 560eb4de6..496210217 100644 --- a/configs/49M.yml +++ b/configs/49M.yml @@ -14,12 +14,36 @@ "no_weight_tying": true, "gpt_j_residual": true, "output_layer_parallelism": "column", + # "activation": "glue", + "activation": "swiglu", + "norm": "rmsnorm", # "use_bnb_optimizer": true, - # these should provide some speedup but takes a while to build, set to true if desired - "scaled_upper_triang_masked_softmax_fusion": false, - "bias_gelu_fusion": false, + #"scaled_upper_triang_masked_softmax_fusion": false, + #"bias_gelu_fusion": false, + "scaled_upper_triang_masked_softmax_fusion": true, + "bias-gelu-fusion": true, + "attention-config": [ + [ + [ + "flash" + ], + 10 + ] + ], + "curriculum_learning": { + "enabled": true, + "curriculum_type": "seqlen", + "min_difficulty": 64, + "max_difficulty": 2048, + "schedule_type": "fixed_linear", + "schedule_config": { + "total_curriculum_step": 20000, + "difficulty_step": 8 + } + }, + # init methods "init_method": "small_init", From 
f02ab6959433a1fbf401ea565803e4c1058ff204 Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Sun, 10 Sep 2023 15:30:38 +0900 Subject: [PATCH 115/183] fix --- configs/49M.yml | 6 +++--- configs/local_setup_ja.yml | 10 +++++++--- 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/configs/49M.yml b/configs/49M.yml index 496210217..fa9fdd7fc 100644 --- a/configs/49M.yml +++ b/configs/49M.yml @@ -14,15 +14,15 @@ "no_weight_tying": true, "gpt_j_residual": true, "output_layer_parallelism": "column", - # "activation": "glue", - "activation": "swiglu", + "activation": "glue", + # "activation": "swiglu", "norm": "rmsnorm", # "use_bnb_optimizer": true, # these should provide some speedup but takes a while to build, set to true if desired #"scaled_upper_triang_masked_softmax_fusion": false, #"bias_gelu_fusion": false, - "scaled_upper_triang_masked_softmax_fusion": true, + "scaled_upper_triang_masked_softmax_fusion": true, "bias-gelu-fusion": true, "attention-config": [ [ diff --git a/configs/local_setup_ja.yml b/configs/local_setup_ja.yml index 29af25041..972fe7fc2 100644 --- a/configs/local_setup_ja.yml +++ b/configs/local_setup_ja.yml @@ -18,14 +18,18 @@ "vocab_file": "./novelAI/tokenizer.model", - "save": "checkpoints", - "load": "checkpoints", + #"save": "checkpoints", + # "load": "checkpoints", + "save": "/content/drive/MyDrive/pre_trained/49M" + "load": "/content/drive/MyDrive/pre_trained/49M" + "checkpoint_validation_with_forward_pass": False, ## logging "log_dir": "logs", - "tensorboard_dir": "tensorboard", + # "tensorboard_dir": "tensorboard", + "tensorboard_dir": "/content/drive/MyDrive/pre_trained/49M" "log_dir": "logs", "use_wandb": False } From b5a05302c17a5988c33d2aab199dc89f98e5648d Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Sun, 10 Sep 2023 15:33:39 +0900 Subject: [PATCH 116/183] fix --- configs/local_setup_ja.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/configs/local_setup_ja.yml b/configs/local_setup_ja.yml index 972fe7fc2..547f8b164 100644 --- a/configs/local_setup_ja.yml +++ b/configs/local_setup_ja.yml @@ -20,8 +20,8 @@ #"save": "checkpoints", # "load": "checkpoints", - "save": "/content/drive/MyDrive/pre_trained/49M" - "load": "/content/drive/MyDrive/pre_trained/49M" + "save": "/content/drive/MyDrive/pre_trained/49M", + "load": "/content/drive/MyDrive/pre_trained/49M", "checkpoint_validation_with_forward_pass": False, @@ -29,7 +29,7 @@ "log_dir": "logs", # "tensorboard_dir": "tensorboard", - "tensorboard_dir": "/content/drive/MyDrive/pre_trained/49M" + "tensorboard_dir": "/content/drive/MyDrive/pre_trained/49M", "log_dir": "logs", "use_wandb": False } From 4b17a49970272d1b5d186b4be26dabdfec6480db Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Sun, 10 Sep 2023 15:35:03 +0900 Subject: [PATCH 117/183] fix --- configs/49M.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configs/49M.yml b/configs/49M.yml index fa9fdd7fc..b24ee884e 100644 --- a/configs/49M.yml +++ b/configs/49M.yml @@ -14,7 +14,7 @@ "no_weight_tying": true, "gpt_j_residual": true, "output_layer_parallelism": "column", - "activation": "glue", + "activation": "gelu", # "activation": "swiglu", "norm": "rmsnorm", # "use_bnb_optimizer": true, From 401bf31cba0dfc55511018f2a71556f6e5fa60dc Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Sun, 10 Sep 2023 16:45:17 +0900 Subject: [PATCH 118/183] wsiglu --- configs/49M.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/configs/49M.yml b/configs/49M.yml index 
b24ee884e..52443e367 100644 --- a/configs/49M.yml +++ b/configs/49M.yml @@ -14,8 +14,8 @@ "no_weight_tying": true, "gpt_j_residual": true, "output_layer_parallelism": "column", - "activation": "gelu", - # "activation": "swiglu", + # "activation": "gelu", + "activation": "swiglu", "norm": "rmsnorm", # "use_bnb_optimizer": true, From 9dfe92f8bf49d3787d2412abb0ecda64caeed870 Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Sun, 10 Sep 2023 16:50:49 +0900 Subject: [PATCH 119/183] swiglu --- megatron/model/transformer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/megatron/model/transformer.py b/megatron/model/transformer.py index 4e81b70b6..890703e80 100644 --- a/megatron/model/transformer.py +++ b/megatron/model/transformer.py @@ -120,7 +120,7 @@ def forward(self, hidden_states): if ( self.activation_type == "gelu" and self.bias_gelu_fusion - ) or self.activation_type == "geglu": + ) or self.activation_type == "geglu" or self.activation_type == "swiglu": intermediate_parallel = self.activation_func( intermediate_parallel, bias_parallel ) From 322deed28914105a425887e0a92bfb5192bcd46e Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Sun, 10 Sep 2023 17:17:53 +0900 Subject: [PATCH 120/183] xpos --- configs/49M.yml | 7 +- megatron/model/positional_embeddings.py | 107 ++++++++++++++++++++++++ megatron/model/transformer.py | 18 +++- megatron/neox_arguments/neox_args.py | 2 +- 4 files changed, 129 insertions(+), 5 deletions(-) diff --git a/configs/49M.yml b/configs/49M.yml index 52443e367..4e162b9f1 100644 --- a/configs/49M.yml +++ b/configs/49M.yml @@ -9,13 +9,14 @@ "num_attention_heads": 10, "seq_length": 2048, "max_position_embeddings": 2048, - "pos_emb": "rotary", + # "pos_emb": "rotary", + "pos_emb": "xpos", "rotary_pct": 0.25, "no_weight_tying": true, "gpt_j_residual": true, "output_layer_parallelism": "column", - # "activation": "gelu", - "activation": "swiglu", + "activation": "gelu", + # "activation": "swiglu", "norm": "rmsnorm", # "use_bnb_optimizer": true, diff --git a/megatron/model/positional_embeddings.py b/megatron/model/positional_embeddings.py index 68815075a..59f1bb02e 100644 --- a/megatron/model/positional_embeddings.py +++ b/megatron/model/positional_embeddings.py @@ -221,3 +221,110 @@ def forward(self, x): ) # seq_len_k - 1 points to the last token index in the current inference batch. return x + a + + +# Original implementation adjusted from https://github.com/sunyt32/torchscale + +def fixed_pos_embedding(x, base): + seq_len, dim = x.shape + inv_freq = 1.0 / (base ** (torch.arange(0, dim) / dim)) + sinusoid_inp = ( + torch.einsum("i , j -> i j", torch.arange(0, seq_len, dtype=torch.float), inv_freq).to(x) + ) + return torch.cos(sinusoid_inp), torch.sin(sinusoid_inp) + + +class XPosEmbedding(torch.nn.Module): + """ + xPos positional embeddings from https://arxiv.org/abs/2212.10554. 
+ """ + + def __init__(self, head_dim, freq_base=10000, scale_base=512, gamma=0.4, precision=torch.half): + super().__init__() + self.scale_base = scale_base + self.register_buffer( + "scale", + ( + (torch.arange(0, head_dim, 2) + gamma * head_dim) + / ((1.0 + gamma) * head_dim) + ), + ) + self.max_seq_len_cached = None + self.precision = precision + self.freq_base = freq_base + + def forward(self, x, seq_dim=1, seq_len=None): + if seq_len is None: + seq_len = x.shape[seq_dim] + scale = ( + self.scale + ** ( + torch.arange(0, seq_len, 1) - seq_len // 2 + ).to(self.scale).div(self.scale_base)[:, None] + ) + + if ( + self.max_seq_len_cached is None + or (seq_len > self.max_seq_len_cached) + ): + self.max_seq_len_cached = seq_len + cos, sin = fixed_pos_embedding(scale, self.freq_base) + self.cos_cached = cos + self.sin_cached = sin + if self.precision == torch.bfloat16: + self.cos_cached = self.cos_cached.bfloat16() + self.sin_cached = self.sin_cached.bfloat16() + return ( + self.cos_cached[:seq_len], + self.sin_cached[:seq_len], + scale, + ) + + +def rotate_every_two(x): + x1 = x[:, :, ::2] + x2 = x[:, :, 1::2] + x = torch.stack((-x2, x1), dim=-1) + return x.flatten(-2) # in einsum notation: rearrange(x, '... d j -> ... (d j)')\ + + +def duplicate_interleave(m): + """ + A simple version of `torch.repeat_interleave` for duplicating a matrix while interleaving the copy. + """ + dim0 = m.shape[0] + m = m.view(-1, 1) # flatten the matrix + m = m.repeat(1, 2) # repeat all elements into the 2nd dimension + m = m.view(dim0, -1) # reshape into a matrix, interleaving the copy + return m.unsqueeze(1) + + +def _apply_xpos_emb(x, cos, sin, scale): + # x is assumed to be (seq_len, batch_size, dim) here. + cos = duplicate_interleave(cos * scale) + sin = duplicate_interleave(sin * scale) + # einsum notation for lambda t: repeat(t[offset:x.shape[1]+offset,:], "n d -> () n () (d j)", j=2) + return (x * cos) + (rotate_every_two(x) * sin) + + +@torch.jit.script +def apply_xpos_emb(q, k, cos, sin, scale, offset: int = 0): + # q/k are assumed to be (seq_len, batch_size, dim) here. + cos = cos[offset:q.shape[0] + offset] + sin = sin[offset:q.shape[0] + offset] + scale = scale[offset:q.shape[0] + offset] + return ( + _apply_xpos_emb(q, cos, sin, scale), + _apply_xpos_emb(k, cos, sin, 1.0 / scale), + ) + + +def apply_xpos_emb_torch(q, k, cos, sin, scale, offset: int = 0): + # q/k are assumed to be (seq_len, batch_size, dim) here. 
+ cos = cos[offset:q.shape[0] + offset] + sin = sin[offset:q.shape[0] + offset] + scale = scale[offset:q.shape[0] + offset] + return ( + _apply_xpos_emb(q, cos, sin, scale), + _apply_xpos_emb(k, cos, sin, 1.0 / scale), + ) \ No newline at end of file diff --git a/megatron/model/transformer.py b/megatron/model/transformer.py index 890703e80..1eefacae6 100644 --- a/megatron/model/transformer.py +++ b/megatron/model/transformer.py @@ -32,6 +32,9 @@ apply_rotary_pos_emb_torch, apply_rotary_pos_emb, AliBi, + XPosEmbedding, + apply_xpos_emb_torch, + apply_xpos_emb ) from megatron.model.fused_bias_dropout import ( get_bias_dropout_add, @@ -120,7 +123,7 @@ def forward(self, hidden_states): if ( self.activation_type == "gelu" and self.bias_gelu_fusion - ) or self.activation_type == "geglu" or self.activation_type == "swiglu": + ) or self.activation_type == "geglu": intermediate_parallel = self.activation_func( intermediate_parallel, bias_parallel ) @@ -332,6 +335,11 @@ def __init__( else: self.rotary_emb = None + ## xpos + if neox_args.pos_emb == "xpos": + self.xpos_emb = XPosEmbedding(self.hidden_size_per_attention_head, precision=neox_args.params_dtype) + else: + self.xpos_emb = None self.attention_type = neox_args.attention_config[layer_number] self.use_flash_attention = self.attention_type == "flash" self.sparse = self.attention_type not in ("global", "flash") @@ -665,6 +673,14 @@ def forward(self, hidden_states, attention_mask, layer_past=None): query_layer = torch.cat((query_layer, query_pass), dim=-1) key_layer = torch.cat((key_layer, key_pass), dim=-1) + ## xpos + if exists(self.xpos_emb): + apply_xpos_fn = apply_xpos_emb_torch if self.bf16 else apply_xpos_emb + cos, sin, scale = self.xpos_emb(value_layer, seq_len=seq_len) + query_layer, key_layer = apply_xpos_fn( + query_layer, key_layer, cos, sin, scale, offset=offset) + + # ================================== # Cache key and value for inference # ================================== diff --git a/megatron/neox_arguments/neox_args.py b/megatron/neox_arguments/neox_args.py index 6ddae6e92..98707e267 100644 --- a/megatron/neox_arguments/neox_args.py +++ b/megatron/neox_arguments/neox_args.py @@ -141,7 +141,7 @@ class NeoXArgsModel(NeoXArgsTemplate): """ pos_emb: Literal[ - "learned", "rotary", "sinusoidal", "rpe", "alibi", "none" + "learned", "rotary", "sinusoidal", "rpe", "alibi", "none", "xpos" ] = "learned" """ Type of positional embedding to use - choose from 'learned', 'rotary', 'sinusoidal', 'rpe', 'none' From 2e95d1dc056332ea5058d6e92035a7dc4f2f771b Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Sun, 10 Sep 2023 17:19:31 +0900 Subject: [PATCH 121/183] fix --- configs/local_setup_ja.yml | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/configs/local_setup_ja.yml b/configs/local_setup_ja.yml index 547f8b164..69ab5effb 100644 --- a/configs/local_setup_ja.yml +++ b/configs/local_setup_ja.yml @@ -20,16 +20,17 @@ #"save": "checkpoints", # "load": "checkpoints", - "save": "/content/drive/MyDrive/pre_trained/49M", - "load": "/content/drive/MyDrive/pre_trained/49M", + "save": "/content/drive/MyDrive/pre_trained/49M/checkpoints", + "load": "/content/drive/MyDrive/pre_trained/49M/checkpoints", "checkpoint_validation_with_forward_pass": False, ## logging - "log_dir": "logs", + # "log_dir": "logs", + "log_dir": "/content/drive/MyDrive/pre_trained/49M/logs", # "tensorboard_dir": "tensorboard", - "tensorboard_dir": "/content/drive/MyDrive/pre_trained/49M", - "log_dir": "logs", + "tensorboard_dir": 
"/content/drive/MyDrive/pre_trained/49M/tensorboard", + # "log_dir": "logs", "use_wandb": False } From c5fd89a188f3798afcde82142acf767fe83f10ae Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Sun, 10 Sep 2023 17:22:31 +0900 Subject: [PATCH 122/183] fix xpos --- megatron/model/transformer.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/megatron/model/transformer.py b/megatron/model/transformer.py index 1eefacae6..8b6627a3f 100644 --- a/megatron/model/transformer.py +++ b/megatron/model/transformer.py @@ -675,6 +675,12 @@ def forward(self, hidden_states, attention_mask, layer_past=None): ## xpos if exists(self.xpos_emb): + seq_len = key_layer.shape[0] + offset = 0 + if exists(layer_past) and layer_past.numel() > 0: + offset = layer_past[0].shape[0] + seq_len += offset + apply_xpos_fn = apply_xpos_emb_torch if self.bf16 else apply_xpos_emb cos, sin, scale = self.xpos_emb(value_layer, seq_len=seq_len) query_layer, key_layer = apply_xpos_fn( From db18c2dd829a5b9bef650bde443cddc3a3f86114 Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Sun, 10 Sep 2023 17:36:23 +0900 Subject: [PATCH 123/183] fix --- megatron/model/transformer.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/megatron/model/transformer.py b/megatron/model/transformer.py index 8b6627a3f..ebb40baea 100644 --- a/megatron/model/transformer.py +++ b/megatron/model/transformer.py @@ -640,6 +640,14 @@ def forward(self, hidden_states, attention_mask, layer_past=None): mixed_x_layer, 3 ) + ## for xpos + if layer_past is not None: + past_key, past_value = layer_past + key_layer = torch.cat((past_key.type_as(key_layer), + key_layer), dim=0) + value_layer = torch.cat((past_value.type_as(value_layer), + value_layer), dim=0) + if exists(self.rotary_emb): if exists(self.rotary_ndims): # partial rotary From e2ec22c7004afbac32fa8552d397e0fc64bcf61e Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Sun, 10 Sep 2023 17:38:26 +0900 Subject: [PATCH 124/183] debug --- megatron/model/transformer.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/megatron/model/transformer.py b/megatron/model/transformer.py index ebb40baea..e8dec142c 100644 --- a/megatron/model/transformer.py +++ b/megatron/model/transformer.py @@ -640,14 +640,6 @@ def forward(self, hidden_states, attention_mask, layer_past=None): mixed_x_layer, 3 ) - ## for xpos - if layer_past is not None: - past_key, past_value = layer_past - key_layer = torch.cat((past_key.type_as(key_layer), - key_layer), dim=0) - value_layer = torch.cat((past_value.type_as(value_layer), - value_layer), dim=0) - if exists(self.rotary_emb): if exists(self.rotary_ndims): # partial rotary @@ -669,9 +661,17 @@ def forward(self, hidden_states, attention_mask, layer_past=None): seq_len = key_layer.shape[0] offset = 0 + if layer_past is not None: + past_key, past_value = layer_past + key_layer = torch.cat((past_key.type_as(key_layer), + key_layer), dim=0) + value_layer = torch.cat((past_value.type_as(value_layer), + value_layer), dim=0) + if exists(layer_past) and layer_past.numel() > 0: offset = layer_past[0].shape[0] seq_len += offset + print('has layer_past', exists(layer_past)) cos, sin = self.rotary_emb(value_layer, seq_len=seq_len) query_layer, key_layer = apply_rotary_fn( query_rot, key_rot, cos, sin, offset=offset From c7aa2386bbaa10d06ae47f6303ae054a0a959e6b Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Sun, 10 Sep 2023 17:40:03 +0900 Subject: [PATCH 125/183] debug --- megatron/model/transformer.py | 18 ++++++++++-------- 1 file changed, 10 
insertions(+), 8 deletions(-) diff --git a/megatron/model/transformer.py b/megatron/model/transformer.py index e8dec142c..3ae8bbffb 100644 --- a/megatron/model/transformer.py +++ b/megatron/model/transformer.py @@ -661,17 +661,10 @@ def forward(self, hidden_states, attention_mask, layer_past=None): seq_len = key_layer.shape[0] offset = 0 - if layer_past is not None: - past_key, past_value = layer_past - key_layer = torch.cat((past_key.type_as(key_layer), - key_layer), dim=0) - value_layer = torch.cat((past_value.type_as(value_layer), - value_layer), dim=0) - if exists(layer_past) and layer_past.numel() > 0: offset = layer_past[0].shape[0] seq_len += offset - print('has layer_past', exists(layer_past)) + cos, sin = self.rotary_emb(value_layer, seq_len=seq_len) query_layer, key_layer = apply_rotary_fn( query_rot, key_rot, cos, sin, offset=offset @@ -685,6 +678,15 @@ def forward(self, hidden_states, attention_mask, layer_past=None): if exists(self.xpos_emb): seq_len = key_layer.shape[0] offset = 0 + print('has layer_past', exists(layer_past)) + if exists(layer_past): + past_key, past_value = layer_past + key_layer = torch.cat((past_key.type_as(key_layer), + key_layer), dim=0) + value_layer = torch.cat((past_value.type_as(value_layer), + value_layer), dim=0) + + if exists(layer_past) and layer_past.numel() > 0: offset = layer_past[0].shape[0] seq_len += offset From 9de41b1bd081898c5495f9ddafeef35a2ab484f3 Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Sun, 10 Sep 2023 17:43:21 +0900 Subject: [PATCH 126/183] debug --- megatron/model/transformer.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/megatron/model/transformer.py b/megatron/model/transformer.py index 3ae8bbffb..f0e786715 100644 --- a/megatron/model/transformer.py +++ b/megatron/model/transformer.py @@ -676,6 +676,23 @@ def forward(self, hidden_states, attention_mask, layer_past=None): ## xpos if exists(self.xpos_emb): + # =================================== + # Raw attention scores. 
[b, np, s, s] + # =================================== + + # [b, np, sq, sk] + output_size = (query_layer.size(1), + query_layer.size(2), + query_layer.size(0), + key_layer.size(0)) + + # [sq, b, np, hn] -> [sq, b * np, hn] + query_layer = query_layer.view(output_size[2], + output_size[0] * output_size[1], -1) + # [sk, b, np, hn] -> [sk, b * np, hn] + key_layer = key_layer.view(output_size[3], + output_size[0] * output_size[1], -1) + seq_len = key_layer.shape[0] offset = 0 print('has layer_past', exists(layer_past)) From 6478c1c7e285553d74cb4fb8d767472368e85a30 Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Sun, 10 Sep 2023 17:49:28 +0900 Subject: [PATCH 127/183] debug --- megatron/model/transformer.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/megatron/model/transformer.py b/megatron/model/transformer.py index f0e786715..293c94c8f 100644 --- a/megatron/model/transformer.py +++ b/megatron/model/transformer.py @@ -728,6 +728,12 @@ def forward(self, hidden_states, attention_mask, layer_past=None): if self.use_cache: present = torch.stack((key_layer, value_layer)) + if exists(self.xpos_emb): + # change view [sk, b * np, hn] + value_layer = value_layer.view(value_layer.size(0), + output_size[0] * output_size[1], -1) + + if self.use_flash_attention: context_layer = self.flash_attention(query_layer, key_layer, value_layer) elif not self.sparse: From 5025cb230faf905d06d27aae18cf0e300b2cb398 Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Sun, 10 Sep 2023 18:02:41 +0900 Subject: [PATCH 128/183] fix --- configs/49M.yml | 16 ++++++++-------- megatron/model/transformer.py | 7 ------- 2 files changed, 8 insertions(+), 15 deletions(-) diff --git a/configs/49M.yml b/configs/49M.yml index 4e162b9f1..a32dec0c0 100644 --- a/configs/49M.yml +++ b/configs/49M.yml @@ -25,14 +25,14 @@ #"bias_gelu_fusion": false, "scaled_upper_triang_masked_softmax_fusion": true, "bias-gelu-fusion": true, - "attention-config": [ - [ - [ - "flash" - ], - 10 - ] - ], + # "attention-config": [ + # [ + # [ + # "flash" + # ], + # 10 + # ] + # ], "curriculum_learning": { "enabled": true, "curriculum_type": "seqlen", diff --git a/megatron/model/transformer.py b/megatron/model/transformer.py index 293c94c8f..c644219b2 100644 --- a/megatron/model/transformer.py +++ b/megatron/model/transformer.py @@ -578,7 +578,6 @@ def flash_attention(self, query_layer, key_layer, value_layer): ) # [b, sq, np, hn] -> [b, np, sq, hn] matmul_result = matmul_result.transpose(1, 2) - else: # [sq, b, np, hn] -> [b, sq, np, hn] sq = query_layer.size(0) @@ -728,12 +727,6 @@ def forward(self, hidden_states, attention_mask, layer_past=None): if self.use_cache: present = torch.stack((key_layer, value_layer)) - if exists(self.xpos_emb): - # change view [sk, b * np, hn] - value_layer = value_layer.view(value_layer.size(0), - output_size[0] * output_size[1], -1) - - if self.use_flash_attention: context_layer = self.flash_attention(query_layer, key_layer, value_layer) elif not self.sparse: From f3979b8f7f1450b7208e74a6e727fa6782c32571 Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Sun, 10 Sep 2023 18:17:25 +0900 Subject: [PATCH 129/183] debug --- configs/49M.yml | 4 ++-- megatron/model/transformer.py | 8 +++++++- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/configs/49M.yml b/configs/49M.yml index a32dec0c0..dcc196cbb 100644 --- a/configs/49M.yml +++ b/configs/49M.yml @@ -9,8 +9,8 @@ "num_attention_heads": 10, "seq_length": 2048, "max_position_embeddings": 2048, - # "pos_emb": "rotary", - "pos_emb": "xpos", + "pos_emb": 
"rotary", + # "pos_emb": "xpos", "rotary_pct": 0.25, "no_weight_tying": true, "gpt_j_residual": true, diff --git a/megatron/model/transformer.py b/megatron/model/transformer.py index c644219b2..69de6ad4a 100644 --- a/megatron/model/transformer.py +++ b/megatron/model/transformer.py @@ -672,7 +672,10 @@ def forward(self, hidden_states, attention_mask, layer_past=None): if exists(self.rotary_ndims): query_layer = torch.cat((query_layer, query_pass), dim=-1) key_layer = torch.cat((key_layer, key_pass), dim=-1) - + print('query_layer', query_layer.size()) + print('key_layer', key_layer.size()) + print('value_layer', value_layer.size()) + ## xpos if exists(self.xpos_emb): # =================================== @@ -711,6 +714,9 @@ def forward(self, hidden_states, attention_mask, layer_past=None): cos, sin, scale = self.xpos_emb(value_layer, seq_len=seq_len) query_layer, key_layer = apply_xpos_fn( query_layer, key_layer, cos, sin, scale, offset=offset) + print('query_layer', query_layer.size()) + print('key_layer', key_layer.size()) + print('value_layer', value_layer.size()) # ================================== From fd9a93156e20b9695bfbaa262e44b5cc51dde730 Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Sun, 10 Sep 2023 18:19:58 +0900 Subject: [PATCH 130/183] debug --- configs/49M.yml | 4 ++-- megatron/model/transformer.py | 7 ++++--- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/configs/49M.yml b/configs/49M.yml index dcc196cbb..a32dec0c0 100644 --- a/configs/49M.yml +++ b/configs/49M.yml @@ -9,8 +9,8 @@ "num_attention_heads": 10, "seq_length": 2048, "max_position_embeddings": 2048, - "pos_emb": "rotary", - # "pos_emb": "xpos", + # "pos_emb": "rotary", + "pos_emb": "xpos", "rotary_pct": 0.25, "no_weight_tying": true, "gpt_j_residual": true, diff --git a/megatron/model/transformer.py b/megatron/model/transformer.py index 69de6ad4a..92fcd642d 100644 --- a/megatron/model/transformer.py +++ b/megatron/model/transformer.py @@ -672,9 +672,10 @@ def forward(self, hidden_states, attention_mask, layer_past=None): if exists(self.rotary_ndims): query_layer = torch.cat((query_layer, query_pass), dim=-1) key_layer = torch.cat((key_layer, key_pass), dim=-1) - print('query_layer', query_layer.size()) - print('key_layer', key_layer.size()) - print('value_layer', value_layer.size()) + + # print('query_layer', query_layer.size()) #torch.Size([64, 8, 10, 64]) + # print('key_layer', key_layer.size()) #torch.Size([64, 8, 10, 64]) + # print('value_layer', value_layer.size()) #torch.Size([64, 8, 10, 64]) ## xpos if exists(self.xpos_emb): From 31327fdd41a8df66385bebac6530b586df132705 Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Sun, 10 Sep 2023 18:22:51 +0900 Subject: [PATCH 131/183] debug --- megatron/model/transformer.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/megatron/model/transformer.py b/megatron/model/transformer.py index 92fcd642d..e8b987312 100644 --- a/megatron/model/transformer.py +++ b/megatron/model/transformer.py @@ -688,6 +688,8 @@ def forward(self, hidden_states, attention_mask, layer_past=None): query_layer.size(2), query_layer.size(0), key_layer.size(0)) + print('query_layer0', query_layer.size()) # torch.Size([64, 80, 64]) + print('key_layer0', key_layer.size()) # torch.Size([64, 80, 64]) # [sq, b, np, hn] -> [sq, b * np, hn] query_layer = query_layer.view(output_size[2], @@ -695,6 +697,9 @@ def forward(self, hidden_states, attention_mask, layer_past=None): # [sk, b, np, hn] -> [sk, b * np, hn] key_layer = 
key_layer.view(output_size[3], output_size[0] * output_size[1], -1) + + print('query_layer1', query_layer.size()) # torch.Size([64, 80, 64]) + print('key_layer1', key_layer.size()) # torch.Size([64, 80, 64]) seq_len = key_layer.shape[0] offset = 0 @@ -710,14 +715,16 @@ def forward(self, hidden_states, attention_mask, layer_past=None): if exists(layer_past) and layer_past.numel() > 0: offset = layer_past[0].shape[0] seq_len += offset + print('query_layer2', query_layer.size()) # torch.Size([64, 80, 64]) + print('key_laye2', key_layer.size()) # torch.Size([64, 80, 64]) apply_xpos_fn = apply_xpos_emb_torch if self.bf16 else apply_xpos_emb cos, sin, scale = self.xpos_emb(value_layer, seq_len=seq_len) query_layer, key_layer = apply_xpos_fn( - query_layer, key_layer, cos, sin, scale, offset=offset) - print('query_layer', query_layer.size()) - print('key_layer', key_layer.size()) - print('value_layer', value_layer.size()) + query_layer, key_layer, cos, sin, scale, offset=offset) + print('query_layer3', query_layer.size()) # torch.Size([64, 80, 64]) + print('key_layer3', key_layer.size()) # torch.Size([64, 80, 64]) + print('value_layer3', value_layer.size()) # torch.Size([64, 8, 10, 64]) # ================================== From 7d568d549e648f51d5038d3e03202231d5cb2413 Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Sun, 10 Sep 2023 18:30:01 +0900 Subject: [PATCH 132/183] fix --- megatron/model/transformer.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/megatron/model/transformer.py b/megatron/model/transformer.py index e8b987312..264ab68e8 100644 --- a/megatron/model/transformer.py +++ b/megatron/model/transformer.py @@ -722,9 +722,17 @@ def forward(self, hidden_states, attention_mask, layer_past=None): cos, sin, scale = self.xpos_emb(value_layer, seq_len=seq_len) query_layer, key_layer = apply_xpos_fn( query_layer, key_layer, cos, sin, scale, offset=offset) + print('query_layer3', query_layer.size()) # torch.Size([64, 80, 64]) print('key_layer3', key_layer.size()) # torch.Size([64, 80, 64]) - print('value_layer3', value_layer.size()) # torch.Size([64, 8, 10, 64]) + print('value_layer3', value_layer.size()) # torch.Size([64, 8, 10, 64]) + query_layer = query_layer.view(*output_size) + key_layer = key_layer.view(*output_size) + + print('query_layer4', query_layer.size()) # torch.Size([64, 80, 64]) + print('key_layer4', key_layer.size()) # torch.Size([64, 80, 64]) + print('value_layer4', value_layer.size()) # torch.Size([64, 8, 10, 64]) + # ================================== From 7bbc614b9ea7ca7fcca32cb7a59a95acb96083ff Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Sun, 10 Sep 2023 18:34:00 +0900 Subject: [PATCH 133/183] fix --- megatron/model/transformer.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/megatron/model/transformer.py b/megatron/model/transformer.py index 264ab68e8..ef79eded7 100644 --- a/megatron/model/transformer.py +++ b/megatron/model/transformer.py @@ -684,6 +684,8 @@ def forward(self, hidden_states, attention_mask, layer_past=None): # =================================== # [b, np, sq, sk] + + _b, _s, _s2, _e = query_layer.size() output_size = (query_layer.size(1), query_layer.size(2), query_layer.size(0), @@ -726,8 +728,8 @@ def forward(self, hidden_states, attention_mask, layer_past=None): print('query_layer3', query_layer.size()) # torch.Size([64, 80, 64]) print('key_layer3', key_layer.size()) # torch.Size([64, 80, 64]) print('value_layer3', value_layer.size()) # torch.Size([64, 8, 10, 64]) - query_layer = 
query_layer.view(*output_size) - key_layer = key_layer.view(*output_size) + query_layer = query_layer.view(_b, _s, _s2, _e) + key_layer = key_layer.view(_b, _s, _s2, _e) print('query_layer4', query_layer.size()) # torch.Size([64, 80, 64]) print('key_layer4', key_layer.size()) # torch.Size([64, 80, 64]) From 4b1521e1b380edd8641da4be0a063012b60c3597 Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Sun, 10 Sep 2023 18:34:16 +0900 Subject: [PATCH 134/183] fix --- megatron/model/transformer.py | 1 + 1 file changed, 1 insertion(+) diff --git a/megatron/model/transformer.py b/megatron/model/transformer.py index ef79eded7..eaff6ab6e 100644 --- a/megatron/model/transformer.py +++ b/megatron/model/transformer.py @@ -690,6 +690,7 @@ def forward(self, hidden_states, attention_mask, layer_past=None): query_layer.size(2), query_layer.size(0), key_layer.size(0)) + print('output_size', output_size) print('query_layer0', query_layer.size()) # torch.Size([64, 80, 64]) print('key_layer0', key_layer.size()) # torch.Size([64, 80, 64]) From f80ee3570db37d17121ecf9ca580b127c592210b Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Sun, 10 Sep 2023 18:48:38 +0900 Subject: [PATCH 135/183] debug --- megatron/model/transformer.py | 43 ++++++++++++----------------------- 1 file changed, 14 insertions(+), 29 deletions(-) diff --git a/megatron/model/transformer.py b/megatron/model/transformer.py index eaff6ab6e..1d2b39ea9 100644 --- a/megatron/model/transformer.py +++ b/megatron/model/transformer.py @@ -682,62 +682,47 @@ def forward(self, hidden_states, attention_mask, layer_past=None): # =================================== # Raw attention scores. [b, np, s, s] # =================================== - + _sq, _b, _np, _hn = query_layer.size() + # [b, np, sq, sk] - - _b, _s, _s2, _e = query_layer.size() output_size = (query_layer.size(1), query_layer.size(2), query_layer.size(0), key_layer.size(0)) - print('output_size', output_size) - print('query_layer0', query_layer.size()) # torch.Size([64, 80, 64]) - print('key_layer0', key_layer.size()) # torch.Size([64, 80, 64]) + print('key_layer 1' , key_layer.size()) + # [sq, b, np, hn] -> [sq, b * np, hn] query_layer = query_layer.view(output_size[2], output_size[0] * output_size[1], -1) # [sk, b, np, hn] -> [sk, b * np, hn] key_layer = key_layer.view(output_size[3], - output_size[0] * output_size[1], -1) - - print('query_layer1', query_layer.size()) # torch.Size([64, 80, 64]) - print('key_layer1', key_layer.size()) # torch.Size([64, 80, 64]) - + output_size[0] * output_size[1], -1) + print('key_layer 2' , key_layer.size()) seq_len = key_layer.shape[0] offset = 0 - print('has layer_past', exists(layer_past)) if exists(layer_past): past_key, past_value = layer_past key_layer = torch.cat((past_key.type_as(key_layer), key_layer), dim=0) value_layer = torch.cat((past_value.type_as(value_layer), value_layer), dim=0) - - + + print('key_layer 3' , key_layer.size()) if exists(layer_past) and layer_past.numel() > 0: offset = layer_past[0].shape[0] seq_len += offset - print('query_layer2', query_layer.size()) # torch.Size([64, 80, 64]) - print('key_laye2', key_layer.size()) # torch.Size([64, 80, 64]) apply_xpos_fn = apply_xpos_emb_torch if self.bf16 else apply_xpos_emb cos, sin, scale = self.xpos_emb(value_layer, seq_len=seq_len) query_layer, key_layer = apply_xpos_fn( query_layer, key_layer, cos, sin, scale, offset=offset) - - print('query_layer3', query_layer.size()) # torch.Size([64, 80, 64]) - print('key_layer3', key_layer.size()) # torch.Size([64, 80, 64]) - 
print('value_layer3', value_layer.size()) # torch.Size([64, 8, 10, 64]) - query_layer = query_layer.view(_b, _s, _s2, _e) - key_layer = key_layer.view(_b, _s, _s2, _e) - - print('query_layer4', query_layer.size()) # torch.Size([64, 80, 64]) - print('key_layer4', key_layer.size()) # torch.Size([64, 80, 64]) - print('value_layer4', value_layer.size()) # torch.Size([64, 8, 10, 64]) - - - + print('key_layer 4' , key_layer.size()) + ## [b, np*sq, hn] -> [b, np, sq, hn] + query_layer = query_layer.view(_b, _np, _sq, _hn) + key_layer = key_layer.view(_b, _np, _sq, _hn) + print('key_layer 5' , key_layer.size()) + exit(0) # ================================== # Cache key and value for inference # ================================== From e41a7f258825c400894be549a45ffb5670ae7bd9 Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Sun, 10 Sep 2023 18:55:44 +0900 Subject: [PATCH 136/183] fix --- megatron/model/transformer.py | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/megatron/model/transformer.py b/megatron/model/transformer.py index 1d2b39ea9..82817d5c2 100644 --- a/megatron/model/transformer.py +++ b/megatron/model/transformer.py @@ -689,16 +689,13 @@ def forward(self, hidden_states, attention_mask, layer_past=None): query_layer.size(2), query_layer.size(0), key_layer.size(0)) - print('key_layer 1' , key_layer.size()) - # [sq, b, np, hn] -> [sq, b * np, hn] query_layer = query_layer.view(output_size[2], output_size[0] * output_size[1], -1) # [sk, b, np, hn] -> [sk, b * np, hn] key_layer = key_layer.view(output_size[3], output_size[0] * output_size[1], -1) - print('key_layer 2' , key_layer.size()) seq_len = key_layer.shape[0] offset = 0 if exists(layer_past): @@ -708,7 +705,6 @@ def forward(self, hidden_states, attention_mask, layer_past=None): value_layer = torch.cat((past_value.type_as(value_layer), value_layer), dim=0) - print('key_layer 3' , key_layer.size()) if exists(layer_past) and layer_past.numel() > 0: offset = layer_past[0].shape[0] seq_len += offset @@ -717,12 +713,12 @@ def forward(self, hidden_states, attention_mask, layer_past=None): cos, sin, scale = self.xpos_emb(value_layer, seq_len=seq_len) query_layer, key_layer = apply_xpos_fn( query_layer, key_layer, cos, sin, scale, offset=offset) - print('key_layer 4' , key_layer.size()) - ## [b, np*sq, hn] -> [b, np, sq, hn] - query_layer = query_layer.view(_b, _np, _sq, _hn) - key_layer = key_layer.view(_b, _np, _sq, _hn) - print('key_layer 5' , key_layer.size()) - exit(0) + + ## [sq, b * np, hn] -> [sq, b, np, hn] + query_layer = query_layer.view(_sq, _b, _np, _hn) + ## [sq, b * np, hn] -> [sk, b, np, hn] + key_layer = key_layer.view(_sq, _b, _np, _hn) + # ================================== # Cache key and value for inference # ================================== From 3ce96f1fe0cb746b367bc698c2c4ea15fbc78913 Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Sun, 10 Sep 2023 19:02:15 +0900 Subject: [PATCH 137/183] debug --- configs/49M.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/configs/49M.yml b/configs/49M.yml index a32dec0c0..75cd38c16 100644 --- a/configs/49M.yml +++ b/configs/49M.yml @@ -15,8 +15,8 @@ "no_weight_tying": true, "gpt_j_residual": true, "output_layer_parallelism": "column", - "activation": "gelu", - # "activation": "swiglu", + # "activation": "gelu", + "activation": "swiglu", "norm": "rmsnorm", # "use_bnb_optimizer": true, From 19299bc2008c11ccb4c6228bc6442878ff13fda0 Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Sun, 10 Sep 2023 19:05:12 +0900 
Subject: [PATCH 138/183] debug --- megatron/mpu/layers.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/megatron/mpu/layers.py b/megatron/mpu/layers.py index 92edbd6eb..4c7698ba3 100644 --- a/megatron/mpu/layers.py +++ b/megatron/mpu/layers.py @@ -742,6 +742,10 @@ def forward(self, input_): else: input_parallel = scatter_to_model_parallel_region(input_) # Matrix multiply. + + ## (512x1280 and 2560x640) + print('debug: ', input_parallel.size(), self.weight.size()) + exit(0) output_parallel = F.linear(input_parallel, self.weight) # All-reduce across all the partitions. if not self.parallel_output: From 314ee87fce7170eabd8927141a9b1249b87d42df Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Sun, 10 Sep 2023 19:07:51 +0900 Subject: [PATCH 139/183] debug --- configs/49M.yml | 4 ++-- megatron/mpu/layers.py | 1 + 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/configs/49M.yml b/configs/49M.yml index 75cd38c16..801168cfc 100644 --- a/configs/49M.yml +++ b/configs/49M.yml @@ -9,8 +9,8 @@ "num_attention_heads": 10, "seq_length": 2048, "max_position_embeddings": 2048, - # "pos_emb": "rotary", - "pos_emb": "xpos", + "pos_emb": "rotary", + # "pos_emb": "xpos", "rotary_pct": 0.25, "no_weight_tying": true, "gpt_j_residual": true, diff --git a/megatron/mpu/layers.py b/megatron/mpu/layers.py index 4c7698ba3..12787a655 100644 --- a/megatron/mpu/layers.py +++ b/megatron/mpu/layers.py @@ -746,6 +746,7 @@ def forward(self, input_): ## (512x1280 and 2560x640) print('debug: ', input_parallel.size(), self.weight.size()) exit(0) + ## xpos debug: torch.Size([64, 8, 640]) torch.Size([640, 640]) output_parallel = F.linear(input_parallel, self.weight) # All-reduce across all the partitions. if not self.parallel_output: From d565c48b864cba8a578919f7d6774e8ad9480548 Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Sun, 10 Sep 2023 19:09:35 +0900 Subject: [PATCH 140/183] debug --- configs/49M.yml | 4 ++-- megatron/mpu/layers.py | 5 +++-- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/configs/49M.yml b/configs/49M.yml index 801168cfc..75cd38c16 100644 --- a/configs/49M.yml +++ b/configs/49M.yml @@ -9,8 +9,8 @@ "num_attention_heads": 10, "seq_length": 2048, "max_position_embeddings": 2048, - "pos_emb": "rotary", - # "pos_emb": "xpos", + # "pos_emb": "rotary", + "pos_emb": "xpos", "rotary_pct": 0.25, "no_weight_tying": true, "gpt_j_residual": true, diff --git a/megatron/mpu/layers.py b/megatron/mpu/layers.py index 12787a655..a75681c7d 100644 --- a/megatron/mpu/layers.py +++ b/megatron/mpu/layers.py @@ -744,9 +744,10 @@ def forward(self, input_): # Matrix multiply. ## (512x1280 and 2560x640) - print('debug: ', input_parallel.size(), self.weight.size()) - exit(0) + ## print('debug: ', input_parallel.size(), self.weight.size()) + ## exit(0) ## xpos debug: torch.Size([64, 8, 640]) torch.Size([640, 640]) + ## torch.Size([64, 8, 640]) torch.Size([640, 640]) output_parallel = F.linear(input_parallel, self.weight) # All-reduce across all the partitions. 
if not self.parallel_output: From b5185367358a5b2d1edc2bea7e1cb361ef1a2c81 Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Sun, 10 Sep 2023 19:30:13 +0900 Subject: [PATCH 141/183] debug --- megatron/model/activations.py | 1 + megatron/mpu/layers.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/megatron/model/activations.py b/megatron/model/activations.py index fc4c73445..40c3ff906 100644 --- a/megatron/model/activations.py +++ b/megatron/model/activations.py @@ -125,6 +125,7 @@ def mish(x): @torch.jit.script def swiglu(x): x = torch.chunk(x, 2, dim=-1) + print("x0 x1: ", x[0].size(), x[1].size()) return F.silu(x[0]) * x[1] class GEGLU(torch.nn.Module): diff --git a/megatron/mpu/layers.py b/megatron/mpu/layers.py index a75681c7d..3afe1ff3b 100644 --- a/megatron/mpu/layers.py +++ b/megatron/mpu/layers.py @@ -744,7 +744,7 @@ def forward(self, input_): # Matrix multiply. ## (512x1280 and 2560x640) - ## print('debug: ', input_parallel.size(), self.weight.size()) + print('debug: ', input_parallel.size(), self.weight.size()) ## exit(0) ## xpos debug: torch.Size([64, 8, 640]) torch.Size([640, 640]) ## torch.Size([64, 8, 640]) torch.Size([640, 640]) From 3396a71452fcceb0a04525a447502ebd0b9faac4 Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Sun, 10 Sep 2023 19:43:35 +0900 Subject: [PATCH 142/183] debug --- megatron/model/activations.py | 3 ++- megatron/mpu/layers.py | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/megatron/model/activations.py b/megatron/model/activations.py index 40c3ff906..81ab3dbab 100644 --- a/megatron/model/activations.py +++ b/megatron/model/activations.py @@ -124,8 +124,9 @@ def mish(x): @torch.jit.script def swiglu(x): + print("x0 x1 111: ", x) x = torch.chunk(x, 2, dim=-1) - print("x0 x1: ", x[0].size(), x[1].size()) + print("x0 x1 222: ", x[0].size(), x[1].size()) return F.silu(x[0]) * x[1] class GEGLU(torch.nn.Module): diff --git a/megatron/mpu/layers.py b/megatron/mpu/layers.py index 3afe1ff3b..ad1b3e44c 100644 --- a/megatron/mpu/layers.py +++ b/megatron/mpu/layers.py @@ -734,6 +734,7 @@ def set_parallel_output(self, parallel_output: bool): self.parallel_output = parallel_output def forward(self, input_): + print('debug1: ', input_.size()) if self.use_mup and self.mup_rescale_parameters: input_ /= self.width_mult() # Set up backprop all-reduce. @@ -744,7 +745,7 @@ def forward(self, input_): # Matrix multiply. 
## (512x1280 and 2560x640) - print('debug: ', input_parallel.size(), self.weight.size()) + print('debug2: ', input_parallel.size(), self.weight.size()) ## exit(0) ## xpos debug: torch.Size([64, 8, 640]) torch.Size([640, 640]) ## torch.Size([64, 8, 640]) torch.Size([640, 640]) From 6951f10661a5a80a106a96294974138add15077d Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Sun, 10 Sep 2023 19:44:55 +0900 Subject: [PATCH 143/183] debug --- megatron/model/activations.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/megatron/model/activations.py b/megatron/model/activations.py index 81ab3dbab..5f50d307a 100644 --- a/megatron/model/activations.py +++ b/megatron/model/activations.py @@ -124,7 +124,7 @@ def mish(x): @torch.jit.script def swiglu(x): - print("x0 x1 111: ", x) + print("x0 x1 111: ", x.size()) x = torch.chunk(x, 2, dim=-1) print("x0 x1 222: ", x[0].size(), x[1].size()) return F.silu(x[0]) * x[1] From 0f3dfe35251c65925e2952f180ca0db480d2c5de Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Sun, 10 Sep 2023 20:06:35 +0900 Subject: [PATCH 144/183] debug --- megatron/mpu/layers.py | 1 + 1 file changed, 1 insertion(+) diff --git a/megatron/mpu/layers.py b/megatron/mpu/layers.py index ad1b3e44c..4650aa814 100644 --- a/megatron/mpu/layers.py +++ b/megatron/mpu/layers.py @@ -734,6 +734,7 @@ def set_parallel_output(self, parallel_output: bool): self.parallel_output = parallel_output def forward(self, input_): + print('self.input_is_parallel', self.input_is_parallel) print('debug1: ', input_.size()) if self.use_mup and self.mup_rescale_parameters: input_ /= self.width_mult() From 558c2d3b41c908ec9081c88fda5a75cd49f21a39 Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Sun, 10 Sep 2023 20:09:13 +0900 Subject: [PATCH 145/183] debug --- megatron/model/activations.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/megatron/model/activations.py b/megatron/model/activations.py index 5f50d307a..057449e30 100644 --- a/megatron/model/activations.py +++ b/megatron/model/activations.py @@ -125,9 +125,9 @@ def mish(x): @torch.jit.script def swiglu(x): print("x0 x1 111: ", x.size()) - x = torch.chunk(x, 2, dim=-1) - print("x0 x1 222: ", x[0].size(), x[1].size()) - return F.silu(x[0]) * x[1] + return F.silu(x) * x + # x = torch.chunk(x, 2, dim=-1) + # return F.silu(x[0]) * x[1] class GEGLU(torch.nn.Module): def __init__(self, neox_args): From 3c8109b3da66a056fa6e076caf78e3b1f36744cd Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Sun, 10 Sep 2023 20:11:39 +0900 Subject: [PATCH 146/183] fix --- megatron/model/activations.py | 1 - megatron/mpu/layers.py | 7 ------- 2 files changed, 8 deletions(-) diff --git a/megatron/model/activations.py b/megatron/model/activations.py index 057449e30..ea7935d8a 100644 --- a/megatron/model/activations.py +++ b/megatron/model/activations.py @@ -124,7 +124,6 @@ def mish(x): @torch.jit.script def swiglu(x): - print("x0 x1 111: ", x.size()) return F.silu(x) * x # x = torch.chunk(x, 2, dim=-1) # return F.silu(x[0]) * x[1] diff --git a/megatron/mpu/layers.py b/megatron/mpu/layers.py index 4650aa814..f335df98c 100644 --- a/megatron/mpu/layers.py +++ b/megatron/mpu/layers.py @@ -734,8 +734,6 @@ def set_parallel_output(self, parallel_output: bool): self.parallel_output = parallel_output def forward(self, input_): - print('self.input_is_parallel', self.input_is_parallel) - print('debug1: ', input_.size()) if self.use_mup and self.mup_rescale_parameters: input_ /= self.width_mult() # Set up backprop all-reduce. 
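
The (512x1280 and 2560x640) note above is consistent with the chunked swiglu halving the 4h activation (2560 -> 1280 for hidden size 640) before the dense 4h-to-h projection, whose weight still expects 2560 inputs. The later patches switch to F.silu(x) * x, which keeps the width but is closer to SiLU self-gating than to canonical SwiGLU. For comparison, a minimal sketch of the conventional layout, where the first projection is doubled so the chunk into (gate, value) is well-formed; the sizes mirror the 49M config, and this is an illustration only, not this repository's ParallelMLP.

    import torch
    import torch.nn as nn
    import torch.nn.functional as F

    class SwiGLUMLP(nn.Module):
        # Conventional SwiGLU block: the first projection produces 2*ffn features so
        # that chunking still leaves ffn features for the down-projection.
        def __init__(self, hidden=640, ffn=2560):
            super().__init__()
            self.w_in = nn.Linear(hidden, 2 * ffn)
            self.w_out = nn.Linear(ffn, hidden)  # the 2560 -> 640 matmul that cannot
                                                 # consume a 1280-wide input, i.e. the
                                                 # (512x1280 and 2560x640) mismatch above

        def forward(self, x):
            gate, value = self.w_in(x).chunk(2, dim=-1)
            return self.w_out(F.silu(gate) * value)

    x = torch.randn(8, 64, 640)          # [batch, seq, hidden]
    print(SwiGLUMLP()(x).shape)          # torch.Size([8, 64, 640])
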
@@ -745,11 +743,6 @@ def forward(self, input_): input_parallel = scatter_to_model_parallel_region(input_) # Matrix multiply. - ## (512x1280 and 2560x640) - print('debug2: ', input_parallel.size(), self.weight.size()) - ## exit(0) - ## xpos debug: torch.Size([64, 8, 640]) torch.Size([640, 640]) - ## torch.Size([64, 8, 640]) torch.Size([640, 640]) output_parallel = F.linear(input_parallel, self.weight) # All-reduce across all the partitions. if not self.parallel_output: From 9579b62491e883cea57ed06eae66e11d41222de3 Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Mon, 11 Sep 2023 19:43:23 +0900 Subject: [PATCH 147/183] fix save iter --- configs/49M.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configs/49M.yml b/configs/49M.yml index 75cd38c16..fb63b7568 100644 --- a/configs/49M.yml +++ b/configs/49M.yml @@ -113,7 +113,7 @@ "eval_interval": 100000, "eval_iters": 1000, "keep_last_n_checkpoints": 4, - "save_iters": 10000, + "save_iters": 5000, # logging "log_interval": 1000, From 81e4460f57d10c520e55b699a49aacbd70d40a43 Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Tue, 12 Sep 2023 17:46:23 +0900 Subject: [PATCH 148/183] fix --- configs/49M.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configs/49M.yml b/configs/49M.yml index fb63b7568..c688a54a3 100644 --- a/configs/49M.yml +++ b/configs/49M.yml @@ -109,7 +109,7 @@ "distributed_backend": "nccl", "lr_decay_style": "cosine", "warmup": 0.01, - "checkpoint_factor": 10000, + "checkpoint_factor": 5000, "eval_interval": 100000, "eval_iters": 1000, "keep_last_n_checkpoints": 4, From c9042c57a137257a237d75487ba6780cae91f78a Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Sat, 16 Sep 2023 08:19:02 +0900 Subject: [PATCH 149/183] fix --- configs/convert_19M_settings.yml | 31 ++++++++++++++++++++++++++ configs/convert_49M_settings.yml | 37 ++++++++++++++++++++++++++++++++ 2 files changed, 68 insertions(+) create mode 100644 configs/convert_19M_settings.yml create mode 100644 configs/convert_49M_settings.yml diff --git a/configs/convert_19M_settings.yml b/configs/convert_19M_settings.yml new file mode 100644 index 000000000..baf797385 --- /dev/null +++ b/configs/convert_19M_settings.yml @@ -0,0 +1,31 @@ +{ + "tokenizer_type": "SPMTokenizer", + "vocab-file": "./novelAI/tokenizer.model", + + "pipe_parallel_size": 1, + "model_parallel_size": 1, + + # model settings + "num_layers": 6, + "hidden_size": 512, + "num_attention_heads": 8, + "seq_length": 2048, + "max_position_embeddings": 2048, + "pos_emb": "rotary", + "no_weight_tying": true, + "gpt_j_residual": false, + "output_layer_parallelism": "column", + + "scaled_upper_triang_masked_softmax_fusion": false, + "bias_gelu_fusion": false, + + "optimizer": { + "type": "Adam", + "params": { + "lr": 0.001, + "betas": [0.9, 0.95], + "eps": 1.0e-8, + } + }, + "min_lr": 0.0001 +} diff --git a/configs/convert_49M_settings.yml b/configs/convert_49M_settings.yml new file mode 100644 index 000000000..9287a60f1 --- /dev/null +++ b/configs/convert_49M_settings.yml @@ -0,0 +1,37 @@ +{ + "tokenizer_type": "SPMTokenizer", + "vocab-file": "./novelAI/tokenizer.model", + + "pipe_parallel_size": 1, + "model_parallel_size": 1, + + # model settings + "num_layers": 10, + "hidden_size": 640, + "num_attention_heads": 10, + "seq_length": 2048, + "max_position_embeddings": 2048, + + "activation": "swiglu", + "norm": "rmsnorm", + "pos_emb": "xpos", + + ## ------------------- + "pos_emb": "rotary", + "no_weight_tying": true, + "gpt_j_residual": false, + "output_layer_parallelism": 
"column", + + "scaled_upper_triang_masked_softmax_fusion": false, + "bias_gelu_fusion": false, + + "optimizer": { + "type": "Adam", + "params": { + "lr": 0.001, + "betas": [0.9, 0.95], + "eps": 1.0e-8, + } + }, + "min_lr": 0.0001 +} From b62cf6f52955517b14ddafbe5d864b75b7b780b1 Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Sat, 16 Sep 2023 08:43:38 +0900 Subject: [PATCH 150/183] add for hf gptneox --- hf_gptneox.py | 54 +++++++++++++++++++++++++++++++++++ tools/convert_module_to_hf.py | 5 +++- 2 files changed, 58 insertions(+), 1 deletion(-) create mode 100644 hf_gptneox.py diff --git a/hf_gptneox.py b/hf_gptneox.py new file mode 100644 index 000000000..33e95c8cb --- /dev/null +++ b/hf_gptneox.py @@ -0,0 +1,54 @@ +from transformers.models.gpt_neox import GPTNeoXPreTrainedModel, GPTNeoXModel, GPTNeoXLayer, GPTNeoXMLP +from transformers.activations import ClassInstantier, ACT2CLS +from torch import Tensor, nn + +from typing import Callable, Optional +import torch.nn.functional as F + + +class SwiGLUFFN(nn.Module): + def __init__( + self, + in_features: int, + hidden_features: Optional[int] = None, + out_features: Optional[int] = None, + act_layer: Callable[..., nn.Module] = None, + drop: float = 0.0, + bias: bool = True, + ) -> None: + super().__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features + self.w12 = nn.Linear(in_features, 2 * hidden_features, bias=bias) + self.w3 = nn.Linear(hidden_features, out_features, bias=bias) + + def forward(self, x: Tensor) -> Tensor: + x12 = self.w12(x) + x1, x2 = x12.chunk(2, dim=-1) + hidden = F.silu(x1) * x2 + return self.w3(hidden) + +ACT2CLS['swiglu'] = SwiGLUFFN +ACT2FN = ClassInstantier(ACT2CLS) + +class GPTNeoX2MLP(GPTNeoXMLP): + def __init__(self, config): + super().__init__() + self.act = ACT2FN[config.hidden_act] + +class GPTNeoX2Layer(GPTNeoXModel): + def __init__(self, config): + super().__init__() + self.mlp = GPTNeoX2MLP(config) + +class GPTNeoX2Model(GPTNeoXModel): + def __init__(self, config): + super().__init__(config) + self.layers = nn.ModuleList([GPTNeoX2Layer(config) for _ in range(config.num_hidden_layers)]) + +class GPTNeoX2ForCausalLM(GPTNeoXPreTrainedModel): + _tied_weights_keys = ["embed_out.weight"] + + def __init__(self, config): + super().__init__(config) + self.gpt_neox = GPTNeoX2Model(config) \ No newline at end of file diff --git a/tools/convert_module_to_hf.py b/tools/convert_module_to_hf.py index c46d78402..130c81675 100644 --- a/tools/convert_module_to_hf.py +++ b/tools/convert_module_to_hf.py @@ -23,6 +23,7 @@ import torch from transformers import GPTNeoXConfig, GPTNeoXForCausalLM +from ..hf_gptneox import GPTNeoX2ForCausalLM sys.path.append( os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir)) @@ -145,7 +146,9 @@ def convert(input_checkpoint_path, loaded_config, output_checkpoint_path): hf_config = create_config(loaded_config) - hf_model = GPTNeoXForCausalLM(hf_config) + # hf_model = GPTNeoXForCausalLM(hf_config) + ## for swiglu + hf_model = GPTNeoX2ForCausalLM(hf_config) # save model in fp16/bf16 if Deepspeed fp16 or bf16 mixed precision was used in config, else 32 bit weights fp16 = get_key(loaded_config, "fp16") From ba18cd300ee1f1a4b6f46c8f69427188ba34564b Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Sat, 16 Sep 2023 08:44:45 +0900 Subject: [PATCH 151/183] add --- hf_gptneox.py | 54 ----------------------------------- tools/convert_module_to_hf.py | 2 +- 2 files changed, 1 insertion(+), 55 deletions(-) delete mode 100644 
hf_gptneox.py diff --git a/hf_gptneox.py b/hf_gptneox.py deleted file mode 100644 index 33e95c8cb..000000000 --- a/hf_gptneox.py +++ /dev/null @@ -1,54 +0,0 @@ -from transformers.models.gpt_neox import GPTNeoXPreTrainedModel, GPTNeoXModel, GPTNeoXLayer, GPTNeoXMLP -from transformers.activations import ClassInstantier, ACT2CLS -from torch import Tensor, nn - -from typing import Callable, Optional -import torch.nn.functional as F - - -class SwiGLUFFN(nn.Module): - def __init__( - self, - in_features: int, - hidden_features: Optional[int] = None, - out_features: Optional[int] = None, - act_layer: Callable[..., nn.Module] = None, - drop: float = 0.0, - bias: bool = True, - ) -> None: - super().__init__() - out_features = out_features or in_features - hidden_features = hidden_features or in_features - self.w12 = nn.Linear(in_features, 2 * hidden_features, bias=bias) - self.w3 = nn.Linear(hidden_features, out_features, bias=bias) - - def forward(self, x: Tensor) -> Tensor: - x12 = self.w12(x) - x1, x2 = x12.chunk(2, dim=-1) - hidden = F.silu(x1) * x2 - return self.w3(hidden) - -ACT2CLS['swiglu'] = SwiGLUFFN -ACT2FN = ClassInstantier(ACT2CLS) - -class GPTNeoX2MLP(GPTNeoXMLP): - def __init__(self, config): - super().__init__() - self.act = ACT2FN[config.hidden_act] - -class GPTNeoX2Layer(GPTNeoXModel): - def __init__(self, config): - super().__init__() - self.mlp = GPTNeoX2MLP(config) - -class GPTNeoX2Model(GPTNeoXModel): - def __init__(self, config): - super().__init__(config) - self.layers = nn.ModuleList([GPTNeoX2Layer(config) for _ in range(config.num_hidden_layers)]) - -class GPTNeoX2ForCausalLM(GPTNeoXPreTrainedModel): - _tied_weights_keys = ["embed_out.weight"] - - def __init__(self, config): - super().__init__(config) - self.gpt_neox = GPTNeoX2Model(config) \ No newline at end of file diff --git a/tools/convert_module_to_hf.py b/tools/convert_module_to_hf.py index 130c81675..4b790d04c 100644 --- a/tools/convert_module_to_hf.py +++ b/tools/convert_module_to_hf.py @@ -23,7 +23,7 @@ import torch from transformers import GPTNeoXConfig, GPTNeoXForCausalLM -from ..hf_gptneox import GPTNeoX2ForCausalLM +from hf_gptneox import GPTNeoX2ForCausalLM sys.path.append( os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir)) From 0542bbd35da5949b53c42d55f5c39e4259eef7a5 Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Sat, 16 Sep 2023 08:45:06 +0900 Subject: [PATCH 152/183] fix --- tools/hf_gptneox.py | 54 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 54 insertions(+) create mode 100644 tools/hf_gptneox.py diff --git a/tools/hf_gptneox.py b/tools/hf_gptneox.py new file mode 100644 index 000000000..33e95c8cb --- /dev/null +++ b/tools/hf_gptneox.py @@ -0,0 +1,54 @@ +from transformers.models.gpt_neox import GPTNeoXPreTrainedModel, GPTNeoXModel, GPTNeoXLayer, GPTNeoXMLP +from transformers.activations import ClassInstantier, ACT2CLS +from torch import Tensor, nn + +from typing import Callable, Optional +import torch.nn.functional as F + + +class SwiGLUFFN(nn.Module): + def __init__( + self, + in_features: int, + hidden_features: Optional[int] = None, + out_features: Optional[int] = None, + act_layer: Callable[..., nn.Module] = None, + drop: float = 0.0, + bias: bool = True, + ) -> None: + super().__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features + self.w12 = nn.Linear(in_features, 2 * hidden_features, bias=bias) + self.w3 = nn.Linear(hidden_features, out_features, bias=bias) + + def forward(self, x: Tensor) -> Tensor: 
+ x12 = self.w12(x) + x1, x2 = x12.chunk(2, dim=-1) + hidden = F.silu(x1) * x2 + return self.w3(hidden) + +ACT2CLS['swiglu'] = SwiGLUFFN +ACT2FN = ClassInstantier(ACT2CLS) + +class GPTNeoX2MLP(GPTNeoXMLP): + def __init__(self, config): + super().__init__() + self.act = ACT2FN[config.hidden_act] + +class GPTNeoX2Layer(GPTNeoXModel): + def __init__(self, config): + super().__init__() + self.mlp = GPTNeoX2MLP(config) + +class GPTNeoX2Model(GPTNeoXModel): + def __init__(self, config): + super().__init__(config) + self.layers = nn.ModuleList([GPTNeoX2Layer(config) for _ in range(config.num_hidden_layers)]) + +class GPTNeoX2ForCausalLM(GPTNeoXPreTrainedModel): + _tied_weights_keys = ["embed_out.weight"] + + def __init__(self, config): + super().__init__(config) + self.gpt_neox = GPTNeoX2Model(config) \ No newline at end of file From 6c956e00f0c8c52d184b5324d149f6e11b15e0c6 Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Sat, 16 Sep 2023 08:46:52 +0900 Subject: [PATCH 153/183] fix --- tools/hf_gptneox.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tools/hf_gptneox.py b/tools/hf_gptneox.py index 33e95c8cb..efbf9fe84 100644 --- a/tools/hf_gptneox.py +++ b/tools/hf_gptneox.py @@ -1,4 +1,5 @@ -from transformers.models.gpt_neox import GPTNeoXPreTrainedModel, GPTNeoXModel, GPTNeoXLayer, GPTNeoXMLP +from transformers.models.gpt_neox import GPTNeoXPreTrainedModel, GPTNeoXModel, GPTNeoXLayer +from transformers.models.gpt_neox.modeling_gpt_neox import GPTNeoXMLP from transformers.activations import ClassInstantier, ACT2CLS from torch import Tensor, nn @@ -36,7 +37,7 @@ def __init__(self, config): super().__init__() self.act = ACT2FN[config.hidden_act] -class GPTNeoX2Layer(GPTNeoXModel): +class GPTNeoX2Layer(GPTNeoXLayer): def __init__(self, config): super().__init__() self.mlp = GPTNeoX2MLP(config) From c21da19eed32d5c6eb5fcff6cf91ec5999be8744 Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Sat, 16 Sep 2023 08:50:18 +0900 Subject: [PATCH 154/183] ix --- tools/hf_gptneox.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tools/hf_gptneox.py b/tools/hf_gptneox.py index efbf9fe84..38b9a92e2 100644 --- a/tools/hf_gptneox.py +++ b/tools/hf_gptneox.py @@ -44,7 +44,9 @@ def __init__(self, config): class GPTNeoX2Model(GPTNeoXModel): def __init__(self, config): - super().__init__(config) + _config = config.deepcopy() + _config.hidden_act = "gelu" + super().__init__(_config) self.layers = nn.ModuleList([GPTNeoX2Layer(config) for _ in range(config.num_hidden_layers)]) class GPTNeoX2ForCausalLM(GPTNeoXPreTrainedModel): From 955a943dd0ba4d933bbbc98460cfef6630b50eae Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Sat, 16 Sep 2023 08:55:24 +0900 Subject: [PATCH 155/183] copy --- tools/hf_gptneox.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/tools/hf_gptneox.py b/tools/hf_gptneox.py index 38b9a92e2..54e337650 100644 --- a/tools/hf_gptneox.py +++ b/tools/hf_gptneox.py @@ -44,9 +44,11 @@ def __init__(self, config): class GPTNeoX2Model(GPTNeoXModel): def __init__(self, config): - _config = config.deepcopy() - _config.hidden_act = "gelu" - super().__init__(_config) + _copy_hidden_act = config.hidden_act + config.hidden_act = "gelu" + super().__init__(config) + + config.hidden_act = _copy_hidden_act self.layers = nn.ModuleList([GPTNeoX2Layer(config) for _ in range(config.num_hidden_layers)]) class GPTNeoX2ForCausalLM(GPTNeoXPreTrainedModel): From 289f49600b34502b75c8614beb6ffa514ad60826 Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: 
Sat, 16 Sep 2023 09:03:48 +0900 Subject: [PATCH 156/183] fix --- tools/hf_gptneox.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/hf_gptneox.py b/tools/hf_gptneox.py index 54e337650..b700f2d06 100644 --- a/tools/hf_gptneox.py +++ b/tools/hf_gptneox.py @@ -34,12 +34,12 @@ def forward(self, x: Tensor) -> Tensor: class GPTNeoX2MLP(GPTNeoXMLP): def __init__(self, config): - super().__init__() + super().__init__(config) self.act = ACT2FN[config.hidden_act] class GPTNeoX2Layer(GPTNeoXLayer): def __init__(self, config): - super().__init__() + super().__init__(config) self.mlp = GPTNeoX2MLP(config) class GPTNeoX2Model(GPTNeoXModel): From a417387dce88c2ba0a6d54f71f1525028ed3e79d Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Sat, 16 Sep 2023 09:07:32 +0900 Subject: [PATCH 157/183] fix --- tools/hf_gptneox.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/tools/hf_gptneox.py b/tools/hf_gptneox.py index b700f2d06..75a3201ee 100644 --- a/tools/hf_gptneox.py +++ b/tools/hf_gptneox.py @@ -34,17 +34,25 @@ def forward(self, x: Tensor) -> Tensor: class GPTNeoX2MLP(GPTNeoXMLP): def __init__(self, config): + _copy_hidden_act = config.hidden_act + config.hidden_act = "gelu" super().__init__(config) + + config.hidden_act = _copy_hidden_act self.act = ACT2FN[config.hidden_act] class GPTNeoX2Layer(GPTNeoXLayer): def __init__(self, config): + _copy_hidden_act = config.hidden_act + config.hidden_act = "gelu" super().__init__(config) + + config.hidden_act = _copy_hidden_act self.mlp = GPTNeoX2MLP(config) class GPTNeoX2Model(GPTNeoXModel): def __init__(self, config): - _copy_hidden_act = config.hidden_act + _copy_hidden_act = config.hidden_act config.hidden_act = "gelu" super().__init__(config) From 068c6aea6ce7e1198a2f2a7016535d3721d39d15 Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Sat, 16 Sep 2023 09:12:51 +0900 Subject: [PATCH 158/183] fix act --- tools/hf_gptneox.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/tools/hf_gptneox.py b/tools/hf_gptneox.py index 75a3201ee..b4eb0ae3c 100644 --- a/tools/hf_gptneox.py +++ b/tools/hf_gptneox.py @@ -28,8 +28,16 @@ def forward(self, x: Tensor) -> Tensor: x1, x2 = x12.chunk(2, dim=-1) hidden = F.silu(x1) * x2 return self.w3(hidden) + +class SwiGLU(nn.Module): + def __init__(self) -> None: + super().__init__() + + def forward(self, x: Tensor) -> Tensor: + return F.silu(x) * x -ACT2CLS['swiglu'] = SwiGLUFFN +# ACT2CLS['swiglu'] = SwiGLUFFN +ACT2CLS['swiglu'] = SwiGLU ACT2FN = ClassInstantier(ACT2CLS) class GPTNeoX2MLP(GPTNeoXMLP): @@ -37,7 +45,7 @@ def __init__(self, config): _copy_hidden_act = config.hidden_act config.hidden_act = "gelu" super().__init__(config) - + config.hidden_act = _copy_hidden_act self.act = ACT2FN[config.hidden_act] From d4a77a4a0f4a0d539dbbf504cfe1319b97636f9b Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Sat, 16 Sep 2023 09:17:37 +0900 Subject: [PATCH 159/183] fix --- tools/convert_module_to_hf.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/tools/convert_module_to_hf.py b/tools/convert_module_to_hf.py index 4b790d04c..ab97b1361 100644 --- a/tools/convert_module_to_hf.py +++ b/tools/convert_module_to_hf.py @@ -203,12 +203,18 @@ def convert(input_checkpoint_path, loaded_config, output_checkpoint_path): state_dict[key] = torch.cat([t[key] for t in loaded_tp_ranks], dim=1) # average layernorm stats over mp ranks - for key in [ + keysForOriginGPTNeoX=[ "input_layernorm.weight", "input_layernorm.bias", 
"post_attention_layernorm.weight", "post_attention_layernorm.bias", - ]: + ] + keysForSwiglu = [ + "input_layernorm.bias", + "post_attention_layernorm.weight", + "post_attention_layernorm.bias", + ] + for key in keysForSwiglu: state_dict[key] = (sum([t[key] for t in loaded_tp_ranks])) / len( loaded_tp_ranks ) From 0db8946ab8bebca45163ed41f3643b7ae21ddd65 Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Sat, 16 Sep 2023 09:18:49 +0900 Subject: [PATCH 160/183] fix --- tools/convert_module_to_hf.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tools/convert_module_to_hf.py b/tools/convert_module_to_hf.py index ab97b1361..d46dcec66 100644 --- a/tools/convert_module_to_hf.py +++ b/tools/convert_module_to_hf.py @@ -209,8 +209,7 @@ def convert(input_checkpoint_path, loaded_config, output_checkpoint_path): "post_attention_layernorm.weight", "post_attention_layernorm.bias", ] - keysForSwiglu = [ - "input_layernorm.bias", + keysForSwiglu = [ "post_attention_layernorm.weight", "post_attention_layernorm.bias", ] From d2e315934b64878e722fe766e2571f3d6376a646 Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Sat, 16 Sep 2023 09:20:34 +0900 Subject: [PATCH 161/183] fix --- tools/convert_module_to_hf.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tools/convert_module_to_hf.py b/tools/convert_module_to_hf.py index d46dcec66..94a09b8f1 100644 --- a/tools/convert_module_to_hf.py +++ b/tools/convert_module_to_hf.py @@ -195,6 +195,9 @@ def convert(input_checkpoint_path, loaded_config, output_checkpoint_path): input_checkpoint_path, mp_partitions, layer_i + 2 ) + for t in loaded_tp_ranks: + print('t', t.keys()) + state_dict = {} for key in [ "attention.dense.weight", From 98b5968f68ee84bb5e717218c394c9dd2f8fce3f Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Sat, 16 Sep 2023 09:22:07 +0900 Subject: [PATCH 162/183] debug --- tools/convert_module_to_hf.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/tools/convert_module_to_hf.py b/tools/convert_module_to_hf.py index 94a09b8f1..7244e76f1 100644 --- a/tools/convert_module_to_hf.py +++ b/tools/convert_module_to_hf.py @@ -187,7 +187,7 @@ def convert(input_checkpoint_path, loaded_config, output_checkpoint_path): # get layer from hf model hf_layer = hf_model.gpt_neox.layers[layer_i] for v in hf_layer.state_dict(): - print('state_dict: ', v) + print('debug state_dict: ', v) print('-'*200) # + 2 bc of embed layer and a dummy _pre_transformer_block @@ -196,7 +196,7 @@ def convert(input_checkpoint_path, loaded_config, output_checkpoint_path): ) for t in loaded_tp_ranks: - print('t', t.keys()) + print('debug loaded_tp_ranks: ', t.keys()) state_dict = {} for key in [ @@ -212,10 +212,7 @@ def convert(input_checkpoint_path, loaded_config, output_checkpoint_path): "post_attention_layernorm.weight", "post_attention_layernorm.bias", ] - keysForSwiglu = [ - "post_attention_layernorm.weight", - "post_attention_layernorm.bias", - ] + keysForSwiglu = [] for key in keysForSwiglu: state_dict[key] = (sum([t[key] for t in loaded_tp_ranks])) / len( loaded_tp_ranks From e4c1208fa760d3fc55ae4ba461b0222757d7cde9 Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Sat, 16 Sep 2023 09:27:12 +0900 Subject: [PATCH 163/183] debug --- tools/convert_module_to_hf.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tools/convert_module_to_hf.py b/tools/convert_module_to_hf.py index 7244e76f1..033639a95 100644 --- a/tools/convert_module_to_hf.py +++ b/tools/convert_module_to_hf.py @@ -235,9 +235,11 @@ def 
convert(input_checkpoint_path, loaded_config, output_checkpoint_path): state_dict[key] = sum([t[key] for t in loaded_tp_ranks]) # Just take one + print('debug: ', loaded_config) state_dict["attention.rotary_emb.inv_freq"] = loaded_tp_ranks[0][ "attention.rotary_emb.inv_freq" ] + state_dict["attention.dense.bias"] = hf_layer.state_dict()["attention.dense.bias"] From 7994b05fc78462f1d9276d4e855f724d480f8dee Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Sat, 16 Sep 2023 09:29:32 +0900 Subject: [PATCH 164/183] debug --- tools/convert_module_to_hf.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tools/convert_module_to_hf.py b/tools/convert_module_to_hf.py index 033639a95..138bef1ac 100644 --- a/tools/convert_module_to_hf.py +++ b/tools/convert_module_to_hf.py @@ -141,7 +141,7 @@ def convert(input_checkpoint_path, loaded_config, output_checkpoint_path): should perform model-parallel merging correctly but only supports features allowed by HF GPT-NeoX implementation (e.g. rotary embeddings) """ - + print('debug: ', loaded_config) hf_config = GPTNeoXConfig() hf_config = create_config(loaded_config) @@ -235,10 +235,10 @@ def convert(input_checkpoint_path, loaded_config, output_checkpoint_path): state_dict[key] = sum([t[key] for t in loaded_tp_ranks]) # Just take one - print('debug: ', loaded_config) - state_dict["attention.rotary_emb.inv_freq"] = loaded_tp_ranks[0][ - "attention.rotary_emb.inv_freq" - ] + if loaded_config['pos_emb'] == 'rotary': + state_dict["attention.rotary_emb.inv_freq"] = loaded_tp_ranks[0][ + "attention.rotary_emb.inv_freq" + ] state_dict["attention.dense.bias"] = hf_layer.state_dict()["attention.dense.bias"] From 86e23b0c08ef3c907192fcd57f7f1355934812ab Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Sat, 16 Sep 2023 09:30:05 +0900 Subject: [PATCH 165/183] fix config --- configs/convert_49M_settings.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configs/convert_49M_settings.yml b/configs/convert_49M_settings.yml index 9287a60f1..9dd0301bd 100644 --- a/configs/convert_49M_settings.yml +++ b/configs/convert_49M_settings.yml @@ -17,7 +17,7 @@ "pos_emb": "xpos", ## ------------------- - "pos_emb": "rotary", + # "pos_emb": "rotary", "no_weight_tying": true, "gpt_j_residual": false, "output_layer_parallelism": "column", From 6f0e4566a802476a4e0c5485d61f2c61d16b79b5 Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Sun, 24 Sep 2023 18:40:27 +0900 Subject: [PATCH 166/183] add text gen --- configs/text_generation.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/configs/text_generation.yml b/configs/text_generation.yml index 5a49d61e5..cae624c5e 100644 --- a/configs/text_generation.yml +++ b/configs/text_generation.yml @@ -2,7 +2,7 @@ # Make sure `load` is specified somewhere else { # Text gen type: `input-file`, `unconditional` or `interactive` - "text_gen_type": "unconditional", + "text_gen_type": "interactive", # Params for all "maximum_tokens": 102, @@ -13,9 +13,9 @@ "recompute": false, # `unconditional`: samples - "num_samples": 10, + # "num_samples": 10, # input/output file - "sample_input_file": "sample_input.txt", - "sample_output_file": "sample_output.txt", + #"sample_input_file": "sample_input.txt", + #"sample_output_file": "sample_output.txt", } From d8b4a1a0b81425740f03c968f63eaed0205f6cfd Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Sun, 24 Sep 2023 18:50:59 +0900 Subject: [PATCH 167/183] debug --- eval_tasks/eval_adapter.py | 2 +- 1 file changed, 1 insertion(+), 1 
deletion(-) diff --git a/eval_tasks/eval_adapter.py b/eval_tasks/eval_adapter.py index e0a32797d..66a0dd1a6 100644 --- a/eval_tasks/eval_adapter.py +++ b/eval_tasks/eval_adapter.py @@ -24,7 +24,7 @@ def _download_file(*args, **kwargs): fn(*args, **kwargs) -best_download.download_file = _download_file +# best_download.download_file = _download_file import os import sys From 7f82f08344cec1fdd7491b4b9920fc989d997e16 Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Sun, 24 Sep 2023 18:52:10 +0900 Subject: [PATCH 168/183] debug --- eval_tasks/eval_adapter.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/eval_tasks/eval_adapter.py b/eval_tasks/eval_adapter.py index 66a0dd1a6..cede4f93b 100644 --- a/eval_tasks/eval_adapter.py +++ b/eval_tasks/eval_adapter.py @@ -13,15 +13,15 @@ # limitations under the License. from megatron.utils import is_local_main, print_rank_0 -import best_download +# import best_download -# patch best_download (eval harness downloader) to only happen on the first local rank -fn = best_download.download_file +# # patch best_download (eval harness downloader) to only happen on the first local rank +# fn = best_download.download_file -def _download_file(*args, **kwargs): - if is_local_main(): - fn(*args, **kwargs) +# def _download_file(*args, **kwargs): +# if is_local_main(): +# fn(*args, **kwargs) # best_download.download_file = _download_file From c2f6214afe029b6625fdbebb44bda4cbda4cc718 Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Sun, 24 Sep 2023 19:20:41 +0900 Subject: [PATCH 169/183] debug --- configs/text_generation.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/configs/text_generation.yml b/configs/text_generation.yml index cae624c5e..871e3412c 100644 --- a/configs/text_generation.yml +++ b/configs/text_generation.yml @@ -18,4 +18,5 @@ # input/output file #"sample_input_file": "sample_input.txt", #"sample_output_file": "sample_output.txt", + "deepspeed": False } From 6e682f5957e076e81f6aebe2976d577e72dc3c98 Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Sun, 24 Sep 2023 19:26:37 +0900 Subject: [PATCH 170/183] debug --- configs/text_generation.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configs/text_generation.yml b/configs/text_generation.yml index 871e3412c..68b6ff3ee 100644 --- a/configs/text_generation.yml +++ b/configs/text_generation.yml @@ -18,5 +18,5 @@ # input/output file #"sample_input_file": "sample_input.txt", #"sample_output_file": "sample_output.txt", - "deepspeed": False + "deepspeed": false } From 12800e9c8256a09a73d3ef154193bfec941f27c1 Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Sun, 24 Sep 2023 19:27:08 +0900 Subject: [PATCH 171/183] debug --- configs/text_generation.yml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/configs/text_generation.yml b/configs/text_generation.yml index 68b6ff3ee..637105563 100644 --- a/configs/text_generation.yml +++ b/configs/text_generation.yml @@ -17,6 +17,5 @@ # input/output file #"sample_input_file": "sample_input.txt", - #"sample_output_file": "sample_output.txt", - "deepspeed": false + #"sample_output_file": "sample_output.txt", } From 72409875f6f40aa3d514642b3a2cc382f87f820f Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Sun, 24 Sep 2023 19:28:08 +0900 Subject: [PATCH 172/183] debug --- megatron/training.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/megatron/training.py b/megatron/training.py index 1f2231acd..08a611ae6 100644 --- a/megatron/training.py +++ b/megatron/training.py @@ -620,6 
+620,9 @@ def setup_model_and_optimizer(neox_args, use_cache=False, iteration=None): optimizer, param_groups = get_optimizer(model=model, neox_args=neox_args) lr_scheduler = get_learning_rate_scheduler(optimizer=optimizer, neox_args=neox_args) + ## force enable + neox_args.deepspeed = False + if neox_args.deepspeed: print_rank_0("DeepSpeed is enabled.") if neox_args.no_load_optim: From 2ce7091fac65dc4862324c48ceb709022dc60025 Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Sun, 24 Sep 2023 19:29:35 +0900 Subject: [PATCH 173/183] debug --- megatron/training.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/megatron/training.py b/megatron/training.py index 08a611ae6..1f2231acd 100644 --- a/megatron/training.py +++ b/megatron/training.py @@ -620,9 +620,6 @@ def setup_model_and_optimizer(neox_args, use_cache=False, iteration=None): optimizer, param_groups = get_optimizer(model=model, neox_args=neox_args) lr_scheduler = get_learning_rate_scheduler(optimizer=optimizer, neox_args=neox_args) - ## force enable - neox_args.deepspeed = False - if neox_args.deepspeed: print_rank_0("DeepSpeed is enabled.") if neox_args.no_load_optim: From 7c1a46faefcde45dc2512aa6d9fa0485615ca3b7 Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Sun, 24 Sep 2023 19:35:31 +0900 Subject: [PATCH 174/183] for gptneox2 --- tools/convert_module_to_hf_gptneox2.py | 359 +++++++++++++++++++++++++ 1 file changed, 359 insertions(+) create mode 100644 tools/convert_module_to_hf_gptneox2.py diff --git a/tools/convert_module_to_hf_gptneox2.py b/tools/convert_module_to_hf_gptneox2.py new file mode 100644 index 000000000..7ea38a4e3 --- /dev/null +++ b/tools/convert_module_to_hf_gptneox2.py @@ -0,0 +1,359 @@ +# Copyright (c) 2023, EleutherAI +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import sys + +import yaml +import argparse +from tqdm import tqdm +from typing import List + +import torch +from transformers import GPTNeoXConfig, GPTNeoXForCausalLM + +from hf_gptneox import GPTNeoX2ForCausalLM + +sys.path.append( + os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir)) +) +from megatron.tokenizer import build_tokenizer + + +""" +A script for converting saved NeoX Checkpoints to Huggingface (HF) compatible GPT-NeoX type models. + +Note that this script does not support all NeoX features. +Please investigate carefully whether your model is compatible with all architectures supported by the GPTNeoXForCausalLM class in HF. + +(e.g. position embeddings such as AliBi may not be supported by Huggingface's GPT-NeoX architecture. 
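Checkpoints are expected in NeoX's per-layer layout, one file per layer and model-parallel rank named layer_{idx:02}-model_{rank:02}-model_states.pt: index 0 holds the word embeddings, the first transformer layer sits at index 2 (hence the `layer_i + 2` offset below), the final layer norm at num-layers + 3, and the output embedding at num-layers + 4. See load_partitions() below.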
+""" + + +def load_partitions( + input_checkpoint_path, mp_partitions, layer_idx +) -> List[torch.Tensor]: + """Returns a list containing all weights in a given layer from a model (across MP partitions)""" + + loaded_tp_ranks = [ + torch.load( + os.path.join( + input_checkpoint_path, + f"layer_{layer_idx:02}-model_{i:02}-model_states.pt", + ), + map_location=torch.device("cuda" if torch.cuda.is_available() else "cpu"), + ) + for i in range(mp_partitions) + ] + + return loaded_tp_ranks + + +def get_key(loaded_config, key, default=None): + """ + Search for a given key in a NeoX yaml. normalizes underscores -> hyphens + """ + key = key.replace("_", "-") + try: + return loaded_config[key] + except KeyError: + key = key.replace("-", "_") + try: + return loaded_config[key] + except KeyError: + return default + + +def create_config(neox_config): + """take in a loaded yaml from NeoX and assign relevant values to HF config. + Returns: GPTNeoXConfig() object + """ + + class TokenizerArgs: + # kinda hacky. + # this is to get something with the same interface as is used in build_tokenizer() + # without diving into loading a neox_args object or using argparse etc. + def __init__(self, neox_config): + self.make_vocab_size_divisible_by = get_key( + neox_config, "make-vocab-size-divisible-by", default=128 + ) + self.model_parallel_size = get_key(neox_config, "model-parallel-size") + self.vocab_file = get_key(neox_config, "vocab-file") + self.merge_file = get_key(neox_config, "merge-file") + self.tokenizer_type = get_key(neox_config, "tokenizer-type") + + self.rank = 0 + + args = TokenizerArgs(neox_config) + tokenizer = build_tokenizer(args) + try: # GPT2TokenizerFast raises NotImplementedError + pad_token = tokenizer.pad + except: + pad_token = ( + 1 # pad defaulting to 1. follows convention from GPT-NeoX-20b tokenizer + ) + + # TODO: change the default value here based on discussion regarding `gpt_j_tied` config parameter's default + use_tied_lns = get_key(neox_config, "gpt-j-tied", False) + + if use_tied_lns: + raise NotImplementedError( + """ERROR: Huggingface Transformers does not yet support a single shared layernorm + per transformer block for GPT-NeoX models trained w/ GPT-J parallel residuals. + See https://github.com/EleutherAI/gpt-neox/pull/481 for further details.""" + ) + + # set all config values. + hf_config = GPTNeoXConfig( + vocab_size=args.padded_vocab_size, + hidden_size=get_key(neox_config, "hidden-size"), + num_hidden_layers=get_key(neox_config, "num-layers"), + num_attention_heads=get_key(neox_config, "num-attention-heads"), + intermediate_size=(get_key(neox_config, "hidden-size") * 4), + hidden_act=get_key(neox_config, "activation", default="gelu"), + rotary_pct=get_key(neox_config, "rotary-pct", default=1.0), + rotary_emb_base=get_key(neox_config, "rotary-emb-base", default=10000), + max_position_embeddings=get_key(neox_config, "max-position-embeddings"), + initializer_range=get_key(neox_config, "init-method-std", 0.02), + layer_norm_eps=get_key(neox_config, "layernorm-epsilon", 1e-5), + use_cache=True, + bos_token_id=tokenizer.eod, + eos_token_id=tokenizer.eod, + tie_word_embeddings=(not get_key(neox_config, "no-weight-tying", False)), + use_parallel_residual=get_key(neox_config, "gpt-j-residual", False), + ) + return hf_config + + +def convert(input_checkpoint_path, loaded_config, output_checkpoint_path): + """convert a NeoX checkpoint to a HF model format. + should perform model-parallel merging correctly + but only supports features allowed by HF GPT-NeoX implementation (e.g. 
rotary embeddings) + """ + print('debug: ', loaded_config) + hf_config = GPTNeoXConfig() + + hf_config = create_config(loaded_config) + + # hf_model = GPTNeoXForCausalLM(hf_config) + ## for swiglu + hf_model = GPTNeoX2ForCausalLM(hf_config) + + # save model in fp16/bf16 if Deepspeed fp16 or bf16 mixed precision was used in config, else 32 bit weights + fp16 = get_key(loaded_config, "fp16") + if fp16: + try: + # this conditional is quite messy because there were a number of ways to specify bf16 or fp16 training + # in DeeperSpeed v1.0 . + if (fp16.get("fp16", None) or fp16["enabled"]) and not (fp16.get("type", None) == "bfloat16"): + hf_model.half() + print("Saving weights in fp16 precision...") + elif fp16.get("type", None) == "bfloat16": + hf_model.to(dtype=torch.bfloat16) + print("Saving weights in bf16 precision...") + except: + print("Model not trained in fp16 / bf16 mixed precision, saving weights in fp32...") + + mp_partitions = get_key(loaded_config, "model-parallel-size") + + ### Embedding layer ### + loaded_tp_ranks = load_partitions(input_checkpoint_path, mp_partitions, 0) + hf_model.gpt_neox.embed_in.load_state_dict( + { + "weight": torch.cat( + [t["word_embeddings.weight"] for t in loaded_tp_ranks], dim=0 + ) + } + ) + + assert ( + hf_config.vocab_size == hf_model.gpt_neox.embed_in.weight.shape[0] + ), f"ERROR: calculated vocab size {hf_config.vocab_size} != embed param size {hf_model.gpt_neox.embed_in.shape[0]}" + ### End Embedding Layer ### + + for layer_i in tqdm(range(get_key(loaded_config, "num-layers"))): + + # get layer from hf model + hf_layer = hf_model.gpt_neox.layers[layer_i] + for v in hf_layer.state_dict(): + print('debug state_dict: ', v) + print('-'*200) + + # + 2 bc of embed layer and a dummy _pre_transformer_block + loaded_tp_ranks = load_partitions( + input_checkpoint_path, mp_partitions, layer_i + 2 + ) + + for t in loaded_tp_ranks: + print('debug loaded_tp_ranks: ', t.keys()) + + state_dict = {} + + + + for key in [ + "attention.dense.weight", + "mlp.dense_4h_to_h.weight", + ]: + state_dict[key] = torch.cat([t[key] for t in loaded_tp_ranks], dim=1) + + # average layernorm stats over mp ranks + keysForOriginGPTNeoX=[ + "input_layernorm.weight", + "input_layernorm.bias", + "post_attention_layernorm.weight", + "post_attention_layernorm.bias", + ] + keysForSwiglu = [ + 'input_layernorm.scale', + 'post_attention_layernorm.scale' + ] + for key in keysForSwiglu: + state_dict[key] = (sum([t[key] for t in loaded_tp_ranks])) / len( + loaded_tp_ranks + ) + + # LinearWithTPMerge + for key in [ + "mlp.dense_h_to_4h.weight", + "mlp.dense_h_to_4h.bias", + "attention.query_key_value.weight", + "attention.query_key_value.bias", + ]: + state_dict[key] = torch.cat([t[key] for t in loaded_tp_ranks], dim=0) + + # LinearWithTPSplitBias + for key in [ + "mlp.dense_4h_to_h.bias", + "attention.dense.bias", + ]: + state_dict[key] = sum([t[key] for t in loaded_tp_ranks]) + + # Just take one + if loaded_config['pos_emb'] == 'rotary': + state_dict["attention.rotary_emb.inv_freq"] = loaded_tp_ranks[0][ + "attention.rotary_emb.inv_freq" + ] + + + state_dict["attention.dense.bias"] = hf_layer.state_dict()["attention.dense.bias"] + + if "attention.bias" in hf_layer.state_dict(): + state_dict["attention.bias"] = hf_layer.state_dict()["attention.bias"] + + if "attention.masked_bias" in hf_layer.state_dict(): + state_dict["attention.masked_bias"] = hf_layer.state_dict()[ + "attention.masked_bias" + ] + + # load state_dict into layer + hf_layer.load_state_dict(state_dict) + + # Load final layer 
norm + loaded_tp_ranks = load_partitions( + input_checkpoint_path, mp_partitions, get_key(loaded_config, "num-layers") + 3 + ) + + hf_model.gpt_neox.final_layer_norm.load_state_dict( + { + "weight": (sum([t["norm.weight"] for t in loaded_tp_ranks])) + / len(loaded_tp_ranks), + "bias": (sum([t["norm.bias"] for t in loaded_tp_ranks])) + / len(loaded_tp_ranks), + } + ) + del loaded_tp_ranks + + # Load output embedding + loaded_tp_ranks = load_partitions( + input_checkpoint_path, mp_partitions, get_key(loaded_config, "num-layers") + 4 + ) + + hf_model.embed_out.load_state_dict( + { + "weight": torch.cat( + [t["final_linear.weight"] for t in loaded_tp_ranks], dim=0 + ), + } + ) + + del loaded_tp_ranks + + return hf_model + + +if __name__ == "__main__": + + # before running script: + # `pip install --upgrade transformers` + # `huggingface-cli login` + # + from huggingface_hub import create_repo, HfApi + + parser = argparse.ArgumentParser( + description="Merge MP partitions and convert to HF Model." + ) + parser.add_argument( + "--input_dir", + type=str, + help="Path to NeoX checkpoint, e.g. /path/to/model/global_step143000", + ) + parser.add_argument( + "--config_file", + type=str, + help="Path to config file for the input NeoX checkpoint.", + ) + parser.add_argument( + "--output_dir", + type=str, + help="Output dir, where to save the HF Model, tokenizer, and configs", + ) + parser.add_argument( + "--upload", + action="store_true", + help="Set to true in order to upload to the HF Hub directly.", + ) + args = parser.parse_args() + + with open(args.config_file) as f: + loaded_config = yaml.full_load(f) + + hf_model = convert(args.input_dir, loaded_config, args.output_dir) + + hf_model.save_pretrained(args.output_dir) + + # save tokenizer to directory as well, for easy loading of model as a HF model + tokenizer_type = get_key(loaded_config, "tokenizer-type") + + if tokenizer_type == "HFTokenizer": + print(f"saving tokenizer from file {get_key(loaded_config, 'vocab-file')}") + from transformers import PreTrainedTokenizerFast + + tokenizer = PreTrainedTokenizerFast( + tokenizer_file=get_key(loaded_config, "vocab-file") + ) + print("loaded tokenizer: ", tokenizer) + tokenizer.save_pretrained(args.output_dir) + print("tokenizer saved!") + + if args.upload: + repo_name = input("Provide a repository name for the HF Hub: ") + create_repo(repo_name, repo_type="model", private=False, use_auth_token=True) + + api = HfApi() + api.upload_folder( + folder_path=args.output_dir, + repo_id=repo_name, + repo_type="model", + ) From 3e5283ed43eeab313cf1b10757794c16c0f60021 Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Sun, 24 Sep 2023 19:41:10 +0900 Subject: [PATCH 175/183] debug --- tools/convert_module_to_hf_gptneox2.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tools/convert_module_to_hf_gptneox2.py b/tools/convert_module_to_hf_gptneox2.py index 7ea38a4e3..8c875de20 100644 --- a/tools/convert_module_to_hf_gptneox2.py +++ b/tools/convert_module_to_hf_gptneox2.py @@ -21,7 +21,7 @@ from typing import List import torch -from transformers import GPTNeoXConfig, GPTNeoXForCausalLM +from transformers import GPTNeoXConfig, GPTNeoXForCausalLM, AutoModelForCausalLM from hf_gptneox import GPTNeoX2ForCausalLM @@ -148,7 +148,8 @@ def convert(input_checkpoint_path, loaded_config, output_checkpoint_path): # hf_model = GPTNeoXForCausalLM(hf_config) ## for swiglu - hf_model = GPTNeoX2ForCausalLM(hf_config) + # hf_model = GPTNeoX2ForCausalLM(hf_config) + hf_model = AutoModelForCausalLM(hf_config) # 
save model in fp16/bf16 if Deepspeed fp16 or bf16 mixed precision was used in config, else 32 bit weights fp16 = get_key(loaded_config, "fp16") From 7c00e854bb3bbc66ef513846cf459aeaa399e5a3 Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Sun, 24 Sep 2023 19:42:20 +0900 Subject: [PATCH 176/183] debug --- tools/convert_module_to_hf_gptneox2.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/convert_module_to_hf_gptneox2.py b/tools/convert_module_to_hf_gptneox2.py index 8c875de20..5e90b5985 100644 --- a/tools/convert_module_to_hf_gptneox2.py +++ b/tools/convert_module_to_hf_gptneox2.py @@ -149,7 +149,7 @@ def convert(input_checkpoint_path, loaded_config, output_checkpoint_path): # hf_model = GPTNeoXForCausalLM(hf_config) ## for swiglu # hf_model = GPTNeoX2ForCausalLM(hf_config) - hf_model = AutoModelForCausalLM(hf_config) + hf_model = AutoModelForCausalLM.from_config(hf_config) # save model in fp16/bf16 if Deepspeed fp16 or bf16 mixed precision was used in config, else 32 bit weights fp16 = get_key(loaded_config, "fp16") From 0be6712fbea8ee142f15f971b6652e388e4a511d Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Sun, 24 Sep 2023 19:56:35 +0900 Subject: [PATCH 177/183] debug --- tools/convert_module_to_hf_gptneox2.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/tools/convert_module_to_hf_gptneox2.py b/tools/convert_module_to_hf_gptneox2.py index 5e90b5985..07807672c 100644 --- a/tools/convert_module_to_hf_gptneox2.py +++ b/tools/convert_module_to_hf_gptneox2.py @@ -146,10 +146,9 @@ def convert(input_checkpoint_path, loaded_config, output_checkpoint_path): hf_config = create_config(loaded_config) - # hf_model = GPTNeoXForCausalLM(hf_config) + # hf_model = GPTNeoXForCausalLM(hf_config) ## for swiglu - # hf_model = GPTNeoX2ForCausalLM(hf_config) - hf_model = AutoModelForCausalLM.from_config(hf_config) + hf_model = GPTNeoX2ForCausalLM(hf_config) # save model in fp16/bf16 if Deepspeed fp16 or bf16 mixed precision was used in config, else 32 bit weights fp16 = get_key(loaded_config, "fp16") From 02651236265db44725223b542f903efbe7096008 Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Sun, 24 Sep 2023 20:54:50 +0900 Subject: [PATCH 178/183] fix model --- tools/hf_gptneox.py | 345 +++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 343 insertions(+), 2 deletions(-) diff --git a/tools/hf_gptneox.py b/tools/hf_gptneox.py index b4eb0ae3c..3ee257e10 100644 --- a/tools/hf_gptneox.py +++ b/tools/hf_gptneox.py @@ -1,9 +1,10 @@ from transformers.models.gpt_neox import GPTNeoXPreTrainedModel, GPTNeoXModel, GPTNeoXLayer -from transformers.models.gpt_neox.modeling_gpt_neox import GPTNeoXMLP +from transformers.models.gpt_neox.modeling_gpt_neox import GPTNeoXMLP, GPTNeoXAttention from transformers.activations import ClassInstantier, ACT2CLS from torch import Tensor, nn +import torch -from typing import Callable, Optional +from typing import Callable, Optional, Tuple import torch.nn.functional as F @@ -49,6 +50,297 @@ def __init__(self, config): config.hidden_act = _copy_hidden_act self.act = ACT2FN[config.hidden_act] + +def rotate_half(x): + x1, x2 = x.chunk(2, dim=-1) + return torch.cat((-x2, x1), dim=-1) + +def apply_rotary_pos_emb(q, k, cos, sin, position_ids, cos_k=None, sin_k=None): + """ + q, k: [bs, num_heads, seq_len, rot_dim] + cos, sin: [seq_len, rot_dim / 2] + position_ids: [bs, seq_len] + """ + # print(f"q: {q.shape}, k: {k.shape}, cos: {cos.shape}, sin: {sin.shape}, position_ids: {position_ids.shape}") + import einops + cos = 
einops.repeat(cos, 's r -> s (2 r)') + sin = einops.repeat(sin, 's r -> s (2 r)') + cos_k = einops.repeat(cos_k, 's r -> s (2 r)') + sin_k = einops.repeat(sin_k, 's r -> s (2 r)') + cos = cos[position_ids].unsqueeze(1) # [bs, 1, seq_len, rot_dim] + sin = sin[position_ids].unsqueeze(1) # [bs, 1, seq_len, rot_dim] + cos_k = cos_k[position_ids].unsqueeze(1) # [bs, 1, seq_len, rot_dim] + sin_k = sin_k[position_ids].unsqueeze(1) # [bs, 1, seq_len, rot_dim] + + q_embed = (q * cos) + (rotate_half(q) * sin) + k_embed = (k * cos_k) + (rotate_half(k) * sin_k) + return q_embed, k_embed + +class RotaryEmbedding(torch.nn.Module): + """Based on Tri Dao's XPos: https://github.com/HazyResearch/flash-attention/blob/main/flash_attn/layers/rotary.py""" + def __init__( + self, + dim: int, + max_position_embeddings: int, + base: int = 10_000, + scale_base: int = 512, + device: str = None + ): + super().__init__() + self.dim = dim + self.seq_len_cached = max_position_embeddings + + # Set up `inv_freq` term + inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2, device=device, dtype=torch.float32) / dim)) + self.register_buffer("inv_freq", inv_freq) + + # Set up `scale` term + self.scale_base = scale_base + scale = ( + (torch.arange(0, dim, 2, device=device, dtype=torch.float32) + 0.4 * dim) / (1.4 * dim) + if scale_base is not None else None + ) + self.register_buffer("scale", scale) + + # Seet up `cos..` and `sin...` cache terms + t = torch.arange(self.seq_len_cached, device=device, dtype=torch.float32) + freqs = torch.outer(t, self.inv_freq) + # freqs = torch.cat((freqs, freqs), dim=-1) + seq_range = torch.arange(self.seq_len_cached, dtype=self.scale.dtype, device=self.scale.device) + power = (seq_range - self.seq_len_cached // 2) / self.scale_base + scale_cached = self.scale.to(device=power.device) ** power.unsqueeze(-1) + # scale_cached = torch.cat((scale_cached, scale_cached), dim=-1) + self.register_buffer("cos_cached", torch.cos(freqs) * scale_cached, persistent=False) + self.register_buffer("sin_cached", torch.sin(freqs) * scale_cached, persistent=False) + self.register_buffer("cos_k_cached", torch.cos(freqs) / scale_cached, persistent=False) + self.register_buffer("sin_k_cached", torch.sin(freqs) / scale_cached, persistent=False) + + def forward(self, x, seq_len=None): + if seq_len > self.seq_len_cached: + self.seq_len_cached = seq_len + t = torch.arange(seq_len, device=x.device, dtype=torch.float32) + freqs = torch.outer(t, self.inv_freq) + freqs = torch.cat((freqs, freqs), dim=-1) + seq_range = torch.arange(self.seq_len_cached, dtype=self.scale.dtype, device=self.scale.device) + power = (seq_range - self.seq_len_cached // 2) / self.scale_base + scale_cached = self.scale.to(device=power.device) ** power.unsqueeze(-1) + scale_cached = torch.cat((scale_cached, scale_cached), dim=-1) + self.register_buffer("cos_cached", torch.cos(freqs) * scale_cached, persistent=False) + self.register_buffer("sin_cached", torch.sin(freqs) * scale_cached, persistent=False) + self.register_buffer("cos_k_cached", torch.cos(freqs) / scale_cached, persistent=False) + self.register_buffer("sin_k_cached", torch.sin(freqs) / scale_cached, persistent=False) + return ( + self.cos_cached[:seq_len, ...], + self.sin_cached[:seq_len, ...], + self.cos_k_cached[:seq_len, ...], + self.sin_k_cached[:seq_len, ...], + ) + +class GPTNeoX2Attention(nn.Module): + def __init__(self, config): + super().__init__() + self.num_attention_heads = config.num_attention_heads + self.hidden_size = config.hidden_size + if self.hidden_size % 
self.num_attention_heads != 0: + raise ValueError( + "The hidden size is not divisble by the number of attention heads! Make sure to update them" + ) + self.head_size = self.hidden_size // self.num_attention_heads + + max_positions = config.max_position_embeddings + self.register_buffer( + "bias", + torch.tril(torch.ones((max_positions, max_positions), dtype=torch.bool)).view( + 1, 1, max_positions, max_positions + ), + persistent=False, + ) + self.register_buffer("masked_bias", torch.tensor(-1e9), persistent=False) + + self.rotary_ndims = int(self.head_size * config.rotary_pct) + self.rotary_emb = RotaryEmbedding( + self.rotary_ndims, + max_position_embeddings=config.max_position_embeddings, + base=config.rotary_emb_base, + scale_base=config.rotary_scale_base, + ) + + self.register_buffer( + "norm_factor", + torch.sqrt(torch.tensor(self.head_size, dtype=torch.float32)).to(torch.get_default_dtype()), + persistent=False, + ) + + self.query_key_value = nn.Linear(self.hidden_size, 3 * self.hidden_size, bias=False) + self.dense = nn.Linear(self.hidden_size, self.hidden_size, bias=False) + + def forward( + self, + hidden_states: torch.FloatTensor, + attention_mask: torch.FloatTensor, + position_ids: torch.LongTensor, + head_mask: Optional[torch.FloatTensor] = None, + layer_past: Optional[Tuple[torch.Tensor]] = None, + use_cache: Optional[bool] = False, + output_attentions: Optional[bool] = False, + ): + has_layer_past = layer_past is not None + + # Compute QKV + # Attention heads [batch, seq_len, hidden_size] + # --> [batch, seq_len, (np * 3 * head_size)] + qkv = self.query_key_value(hidden_states) + + # [batch, seq_len, (num_heads * 3 * head_size)] + # --> [batch, seq_len, num_heads, 3 * head_size] + new_qkv_shape = qkv.size()[:-1] + (self.num_attention_heads, 3 * self.head_size) + qkv = qkv.view(*new_qkv_shape) + + # [batch, seq_len, num_attention_heads, 3 * head_size] --> 3 [batch, num_attention_heads, seq_len, head_size] + query = qkv[..., : self.head_size].permute(0, 2, 1, 3) + key = qkv[..., self.head_size : 2 * self.head_size].permute(0, 2, 1, 3) + value = qkv[..., 2 * self.head_size :].permute(0, 2, 1, 3) + + # Compute rotary embeddings on rotary_ndims + query_rot = query[..., : self.rotary_ndims] + query_pass = query[..., self.rotary_ndims :] + key_rot = key[..., : self.rotary_ndims] + key_pass = key[..., self.rotary_ndims :] + + # Compute token offset for rotary embeddings (when decoding) + kv_seq_len = key.shape[-2] + if has_layer_past: + kv_seq_len += layer_past[0].shape[-2] + + # Add rotary embeddings to query and key + # TODO: Check if using xpos + cos, sin, cos_k, sin_k = self.rotary_emb(value, seq_len=kv_seq_len) + query, key = apply_rotary_pos_emb( + query_rot, key_rot, cos, sin, position_ids, cos_k=cos_k, sin_k=sin_k) + + query = torch.cat((query, query_pass), dim=-1) + key = torch.cat((key, key_pass), dim=-1) + + # Cache QKV values + if has_layer_past: + past_key = layer_past[0] + past_value = layer_past[1] + key = torch.cat((past_key, key), dim=-2) + value = torch.cat((past_value, value), dim=-2) + present = (key, value) if use_cache else None + + # Compute attention + attn_output, attn_weights = self._attn(query, key, value, attention_mask, head_mask) + + # Merge attn_head_size dim and num_attn_heads dim into hidden dim + # [bs, seq_len, num_attention_heads, attn_head_size] + attn_output = attn_output.permute(0, 2, 1, 3).contiguous() + attn_output = attn_output.view(attn_output.size(0), attn_output.size(1), self.num_attention_heads * self.head_size) + + attn_output = 
self.dense(attn_output) + + outputs = (attn_output, present) + if output_attentions: + outputs += (attn_weights,) + + return outputs + + def _attn(self, query, key, value, attention_mask=None, head_mask=None): + # q, k, v: [bs, num_attention_heads, seq_len, attn_head_size] + # compute causal mask from causal mask buffer + + batch_size, num_attention_heads, query_length, attn_head_size = query.size() + key_length = key.size(-2) + + causal_mask = self.bias[:, :, key_length - query_length : key_length, :key_length] + + query = query.view(batch_size * num_attention_heads, query_length, attn_head_size) + key = key.view(batch_size * num_attention_heads, key_length, attn_head_size) + attn_scores = torch.zeros( + batch_size * num_attention_heads, + query_length, + key_length, + dtype=query.dtype, + device=key.device, + ) + attn_scores = torch.baddbmm( + attn_scores, + query, + key.transpose(1, 2), + beta=1.0, + alpha=(torch.tensor(1.0, dtype=self.norm_factor.dtype, device=self.norm_factor.device) / self.norm_factor), + ) + attn_scores = attn_scores.view(batch_size, num_attention_heads, query_length, key_length) + + mask_value = torch.finfo(attn_scores.dtype).min + # Need to be a tensor, otherwise we get error: `RuntimeError: expected scalar type float but found double`. + # Need to be on the same device, otherwise `RuntimeError: ..., x and y to be on the same device` + mask_value = torch.tensor(mask_value, dtype=attn_scores.dtype, device=attn_scores.device) + attn_scores = torch.where(causal_mask, attn_scores, mask_value) + + if attention_mask is not None: + # Apply the attention mask + attn_scores = attn_scores + attention_mask + + # NOTE: Upcast to float32 + attn_weights = nn.functional.softmax(attn_scores, dim=-1, dtype=torch.float32).type_as(value) + + # Mask heads if we want to + if head_mask is not None: + attn_weights = attn_weights * head_mask + + attn_output = torch.matmul(attn_weights, value) + return attn_output, attn_weights + + +def attention_mask_func(attention_scores, ltor_mask): + attention_scores.masked_fill_(~ltor_mask, torch.finfo(attention_scores.dtype).min) + return attention_scores + + +class RMSNorm(torch.nn.Module): + def __init__(self, dim, p=-1.0, eps=1e-8, bias=False): + """ + Root Mean Square Layer Normalization + :param dim: model size + :param p: partial RMSNorm, valid value [0, 1], default -1.0 (disabled) + :param eps: epsilon value, default 1e-8 + :param bias: whether use bias term for RMSNorm, disabled by + default because RMSNorm doesn't enforce re-centering invariance. 
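Computes scale * x / (rms(x) + eps), where rms(x) = ||x||_2 / sqrt(d). With 0 <= p <= 1, only the first int(d * p) components are used to estimate the RMS (partial RMSNorm); with bias=True a learned offset is added back after scaling.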
+ """ + super(RMSNorm, self).__init__() + + self.eps = eps + self.d = dim + self.p = p + self.bias = bias + + self.scale = torch.nn.Parameter(torch.ones(dim)) + self.register_parameter("scale", self.scale) + + if self.bias: + self.offset = torch.nn.Parameter(torch.zeros(dim)) + self.register_parameter("offset", self.offset) + + def forward(self, x): + if self.p < 0.0 or self.p > 1.0: + norm_x = x.norm(2, dim=-1, keepdim=True) + d_x = self.d + else: + partial_size = int(self.d * self.p) + partial_x, _ = torch.split(x, [partial_size, self.d - partial_size], dim=-1) + + norm_x = partial_x.norm(2, dim=-1, keepdim=True) + d_x = partial_size + + rms_x = norm_x * d_x ** (-1.0 / 2) + x_normed = x / (rms_x + self.eps) + + if self.bias: + return self.scale * x_normed + self.offset + + return self.scale * x_normed + class GPTNeoX2Layer(GPTNeoXLayer): def __init__(self, config): _copy_hidden_act = config.hidden_act @@ -56,8 +348,57 @@ def __init__(self, config): super().__init__(config) config.hidden_act = _copy_hidden_act + self.use_parallel_residual = config.use_parallel_residual + # self.input_layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.input_layernorm = RMSNorm(config.hidden_size, eps=config.layer_norm_eps) + + # self.post_attention_layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + # self.attention = GPTNeoXAttention(config) + self.attention = GPTNeoX2Attention(config) self.mlp = GPTNeoX2MLP(config) + def forward( + self, + hidden_states: Optional[torch.FloatTensor], + attention_mask: Optional[torch.FloatTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + use_cache: Optional[bool] = False, + layer_past: Optional[Tuple[torch.Tensor]] = None, + output_attentions: Optional[bool] = False, + ): + attention_layer_outputs = self.attention( + self.input_layernorm(hidden_states), + attention_mask=attention_mask, + position_ids=position_ids, + layer_past=layer_past, + head_mask=head_mask, + use_cache=use_cache, + output_attentions=output_attentions, + ) + attn_output = attention_layer_outputs[0] # output_attn: attn_output, present, (attn_weights) + outputs = attention_layer_outputs[1:] + + if self.use_parallel_residual: + # pseudocode: + # x = x + attn(ln1(x)) + mlp(ln2(x)) + mlp_output = self.mlp(self.post_attention_layernorm(hidden_states)) + hidden_states = mlp_output + attn_output + hidden_states + else: + # pseudocode: + # x = x + attn(ln1(x)) + # x = x + mlp(ln2(x)) + attn_output = attn_output + hidden_states + mlp_output = self.mlp(self.post_attention_layernorm(attn_output)) + hidden_states = mlp_output + attn_output + + if use_cache: + outputs = (hidden_states,) + outputs # hidden_states, present, (attn_weights) + else: + outputs = (hidden_states,) + outputs[1:] # hidden_states, (attn_weights) + + return outputs + class GPTNeoX2Model(GPTNeoXModel): def __init__(self, config): _copy_hidden_act = config.hidden_act From 0855a52d54e7aa1142ca0e3004eed43aaa833e1e Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Sun, 24 Sep 2023 21:00:41 +0900 Subject: [PATCH 179/183] fix model --- tools/hf_gptneox.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/hf_gptneox.py b/tools/hf_gptneox.py index 3ee257e10..f82213082 100644 --- a/tools/hf_gptneox.py +++ b/tools/hf_gptneox.py @@ -158,6 +158,7 @@ def __init__(self, config): self.register_buffer("masked_bias", torch.tensor(-1e9), persistent=False) self.rotary_ndims = int(self.head_size * config.rotary_pct) + print('config', config) 
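# Note: the RotaryEmbedding used below is the XPos variant defined above: queries are scaled by
# scale**power and keys by scale**(-power), so the factors cancel into a function of the relative
# offset only. Only the first rotary_ndims = head_size * rotary_pct dimensions are rotated; the
# remaining dimensions (query_pass / key_pass) are passed through unchanged.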
self.rotary_emb = RotaryEmbedding( self.rotary_ndims, max_position_embeddings=config.max_position_embeddings, From e4ce875c8dac0e2c81b62517fbd70bd6df076a2a Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Sun, 24 Sep 2023 21:01:43 +0900 Subject: [PATCH 180/183] fix model --- tools/hf_gptneox.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tools/hf_gptneox.py b/tools/hf_gptneox.py index f82213082..41d507a5b 100644 --- a/tools/hf_gptneox.py +++ b/tools/hf_gptneox.py @@ -162,8 +162,7 @@ def __init__(self, config): self.rotary_emb = RotaryEmbedding( self.rotary_ndims, max_position_embeddings=config.max_position_embeddings, - base=config.rotary_emb_base, - scale_base=config.rotary_scale_base, + base=config.rotary_emb_base ) self.register_buffer( From d063655edca84cf9e1da161719924e14bbd84db8 Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Sun, 24 Sep 2023 21:02:13 +0900 Subject: [PATCH 181/183] fix model --- tools/hf_gptneox.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/hf_gptneox.py b/tools/hf_gptneox.py index 41d507a5b..5d6767207 100644 --- a/tools/hf_gptneox.py +++ b/tools/hf_gptneox.py @@ -158,7 +158,7 @@ def __init__(self, config): self.register_buffer("masked_bias", torch.tensor(-1e9), persistent=False) self.rotary_ndims = int(self.head_size * config.rotary_pct) - print('config', config) + self.rotary_emb = RotaryEmbedding( self.rotary_ndims, max_position_embeddings=config.max_position_embeddings, From 45ad8383400f7cfacf48dbb821fbca91b092f763 Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Sun, 24 Sep 2023 21:26:28 +0900 Subject: [PATCH 182/183] fix model --- tools/convert_module_to_hf_gptneox2.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/convert_module_to_hf_gptneox2.py b/tools/convert_module_to_hf_gptneox2.py index 07807672c..ce77a48b4 100644 --- a/tools/convert_module_to_hf_gptneox2.py +++ b/tools/convert_module_to_hf_gptneox2.py @@ -236,7 +236,7 @@ def convert(input_checkpoint_path, loaded_config, output_checkpoint_path): # LinearWithTPSplitBias for key in [ "mlp.dense_4h_to_h.bias", - "attention.dense.bias", + # "attention.dense.bias", ]: state_dict[key] = sum([t[key] for t in loaded_tp_ranks]) From d6cf7deb41bee042b5c8cfd0618e24a10b43a1dc Mon Sep 17 00:00:00 2001 From: windows_on_wsl Date: Sun, 24 Sep 2023 21:27:46 +0900 Subject: [PATCH 183/183] fix model --- tools/convert_module_to_hf_gptneox2.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/convert_module_to_hf_gptneox2.py b/tools/convert_module_to_hf_gptneox2.py index ce77a48b4..8b099f04a 100644 --- a/tools/convert_module_to_hf_gptneox2.py +++ b/tools/convert_module_to_hf_gptneox2.py @@ -247,7 +247,7 @@ def convert(input_checkpoint_path, loaded_config, output_checkpoint_path): ] - state_dict["attention.dense.bias"] = hf_layer.state_dict()["attention.dense.bias"] + # state_dict["attention.dense.bias"] = hf_layer.state_dict()["attention.dense.bias"] if "attention.bias" in hf_layer.state_dict(): state_dict["attention.bias"] = hf_layer.state_dict()["attention.bias"]
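# A toy sketch (not part of the patch series; shapes invented for illustration) of the
# tensor-parallel merge rules that convert() applies above, shown for two ranks:
#  - "LinearWithTPMerge" keys (mlp.dense_h_to_4h.*, attention.query_key_value.*) are
#    concatenated along dim 0,
#  - attention.dense.weight and mlp.dense_4h_to_h.weight are concatenated along dim 1,
#    and mlp.dense_4h_to_h.bias is summed across ranks,
#  - norm parameters (e.g. input_layernorm.scale for RMSNorm) are averaged over ranks.
import torch

tp_ranks = [
    {
        "mlp.dense_h_to_4h.weight": torch.randn(8, 4),   # output dim sharded across ranks
        "mlp.dense_4h_to_h.weight": torch.randn(4, 8),   # input dim sharded across ranks
        "input_layernorm.scale": torch.ones(4),
    }
    for _ in range(2)
]

merged = {
    "mlp.dense_h_to_4h.weight": torch.cat([t["mlp.dense_h_to_4h.weight"] for t in tp_ranks], dim=0),
    "mlp.dense_4h_to_h.weight": torch.cat([t["mlp.dense_4h_to_h.weight"] for t in tp_ranks], dim=1),
    "input_layernorm.scale": sum(t["input_layernorm.scale"] for t in tp_ranks) / len(tp_ranks),
}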