Commit f86b161

Merge branch 'dev-v1' of github.com:google-research/deduplicate-text-datasets into dev-v1

carlini committed Mar 11, 2022
2 parents 67351ad + e0d9aef commit f86b161
Showing 8 changed files with 176 additions and 85 deletions.
1 change: 0 additions & 1 deletion Cargo.toml
@@ -11,6 +11,5 @@ overflow-checks = false # Go FAAASSTTT!
[dependencies]
zstd = "0.5"
crossbeam = "0.3"
fasthash = "0.4"
filebuffer = "0.4"
clap = { version = "3.1.1", features = ["derive"] }
157 changes: 97 additions & 60 deletions README.md

Large diffs are not rendered by default.

4 changes: 4 additions & 0 deletions scripts/deduplicate_single_file.sh
@@ -0,0 +1,4 @@
python3 scripts/make_suffix_array.py $1
cargo run self-similar --data-file $1 --length-threshold $3 --cache-dir /tmp/cache --num-threads $4
cargo run collect --data-file $1 --cache-dir /tmp/cache --length-threshold $3 > /tmp/drop_tokens_file
python3 scripts/finish_single_file.py $1 /tmp/drop_tokens_file $2
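
For reference, a minimal Python sketch (not part of the commit) of the same four-stage pipeline driven through `subprocess`; the paths, threshold, and thread count stand in for the script's `$1`-`$4` positional arguments and are assumptions for illustration:

```python
import subprocess

data_file = "/tmp/data/corpus.txt"           # $1: raw input (hypothetical path)
deduped_file = "/tmp/data/corpus.txt.dedup"  # $2: deduplicated output
threshold = "100"                            # $3: duplicate length threshold
threads = "8"                                # $4: worker threads

# 1. Build the suffix array over the raw bytes.
subprocess.run(["python3", "scripts/make_suffix_array.py", data_file], check=True)

# 2. Find duplicated spans within the file.
subprocess.run(["cargo", "run", "self-similar", "--data-file", data_file,
                "--length-threshold", threshold, "--cache-dir", "/tmp/cache",
                "--num-threads", threads], check=True)

# 3. Collect the byte ranges to drop, capturing stdout like the shell redirect.
with open("/tmp/drop_tokens_file", "wb") as out:
    subprocess.run(["cargo", "run", "collect", "--data-file", data_file,
                    "--cache-dir", "/tmp/cache", "--length-threshold", threshold],
                   stdout=out, check=True)

# 4. Cut those ranges out of the original file.
subprocess.run(["python3", "scripts/finish_single_file.py", data_file,
                "/tmp/drop_tokens_file", deduped_file], check=True)
```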
30 changes: 23 additions & 7 deletions scripts/finish_dedup_lm1b.py → scripts/finish_dedup_wiki40b.py
@@ -1,3 +1,16 @@
# Copyright 2022 Google LLC
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import numpy as np
import os
import shutil
@@ -38,13 +51,13 @@ def serialize_example(**feature):

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

#print(cumsum[:5])

remove = defaultdict(list)

def run(args):
    this_idx, row = args
    new_row = {'text': row,
               'version_id': '',
               'wikidata_id': '',
               'timestamp': '',
               'url': '',
               'content-length': '',
@@ -72,6 +85,8 @@ def _info(self):
        builder=self,
        features=tfds.features.FeaturesDict({
            'text': tfds.features.Text(),
            'version_id': tfds.features.Text(),
            'wikidata_id': tfds.features.Text(),
            'timestamp': tfds.features.Text(),
            'url': tfds.features.Text(),
            'content-length': tfds.features.Text(),
@@ -132,7 +147,6 @@ def _generate_examples(self, split):

sizes = np.frombuffer(open(os.path.join(args.suffixarray_dir, args.name+"."+args.split+".size"), "rb").read(), dtype=np.uint64)

#print(np.max(sizes))
remove_ex = defaultdict(list)
ptr = 0
for i,byte_start in enumerate(sizes[:-1]):
@@ -146,15 +160,17 @@ def _generate_examples(self, split):
                             min(int(remove[ptr][1] - byte_start), byte_end-byte_start)))
        ptr += 1

#print(remove_ex)
tfds.load("my_dataset", data_dir=where+"_dedup")


if dataset == "lm1b":
    en = os.path.join(where+"_dedup", "lm1b")
if dataset == "wiki40b":
    en = os.path.join(where+"_dedup", "wiki40b")
if not os.path.exists(en):
    os.mkdir(en)
en = os.path.join(en, "en")
if not os.path.exists(en):
    os.mkdir(en)
en = os.path.join(en, "1.1.0")
en = os.path.join(en, "1.3.0")
if not os.path.exists(en):
    os.mkdir(en)
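
To make the remapping loop above concrete, here is a self-contained toy version (not from the commit): `sizes` holds cumulative byte offsets, so example `i` occupies `[sizes[i], sizes[i+1])` in the concatenated file, and each global removal range is re-expressed relative to the example(s) it overlaps. The handling of ranges that spill across an example boundary is an assumption, since the diff collapses part of the loop body.

```python
from collections import defaultdict

sizes = [0, 10, 25, 40]      # cumulative offsets: three examples of 10, 15, 15 bytes
remove = [(3, 7), (12, 30)]  # global byte ranges to drop, sorted

remove_ex = defaultdict(list)
ptr = 0
for i, byte_start in enumerate(sizes[:-1]):
    byte_end = sizes[i + 1]
    while ptr < len(remove) and remove[ptr][0] < byte_end:
        # Clamp the global range into this example's local coordinates.
        remove_ex[i].append((max(remove[ptr][0] - byte_start, 0),
                             min(remove[ptr][1] - byte_start,
                                 byte_end - byte_start)))
        if remove[ptr][1] > byte_end:
            break  # range continues into the next example (assumed behavior)
        ptr += 1

print(dict(remove_ex))  # {0: [(3, 7)], 1: [(2, 15)], 2: [(0, 5)]}
```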

37 changes: 37 additions & 0 deletions scripts/finish_single_file.py
@@ -0,0 +1,37 @@
# Copyright 2022 Google LLC
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import sys

original = sys.argv[1]
remove_file = sys.argv[2]
deduped = sys.argv[3]

remove = []
fin = open(remove_file)
for line in fin:
    if 'out' in line: break
for line in fin:
    remove.append(list(map(int,line.split())))
remove = remove[::-1]

ds = open(original,"rb")
new_ds = open(deduped,"wb")

start = 0
while len(remove) > 0:
    a,b = remove.pop()
    new_ds.write(ds.read(a-start))
    ds.seek(b)
    start = b
new_ds.write(ds.read())
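
The read/seek loop above copies every byte that falls outside the sorted removal ranges. A toy equivalent on an in-memory buffer (illustrative only; the data and ranges are made up):

```python
data = b"aaaa DUPLICATE bbbb DUPLICATE cccc"
remove = [(5, 15), (20, 30)]  # sorted, non-overlapping [start, end) byte ranges

out, start = bytearray(), 0
for a, b in remove:
    out += data[start:a]  # keep everything before the duplicate span
    start = b             # then skip over it
out += data[start:]       # keep the tail

print(bytes(out))  # b'aaaa bbbb cccc'
```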
27 changes: 13 additions & 14 deletions scripts/load_dataset.py
@@ -74,20 +74,19 @@ def tok(x):

fout = open(os.path.join(save_dir, dataset_name+"."+split), "wb")

p = mp.Pool(mp.cpu_count())

i = 0
sizes = [0]
for b in ds:
    print(i)

    text = b['text'].numpy()
    text = p.map(tok,text)

    for x in text:
        next_line = sep()+x
        fout.write(next_line)
        sizes.append(sizes[-1]+len(next_line))
    i += 1

with mp.Pool(mp.cpu_count()) as p:
    i = 0
    sizes = [0]
    for b in ds:
        print(i)

        text = b['text'].numpy()
        text = p.map(tok,text)

        for x in text:
            next_line = sep()+x
            fout.write(next_line)
            sizes.append(sizes[-1]+len(next_line))
        i += 1

open(os.path.join(save_dir,dataset_name+"."+split+".size"), "wb").write(np.array(sizes,dtype=np.uint64).tobytes())
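
The `.size` file written on the last line above is just these cumulative offsets serialized as uint64 values, matching the `np.frombuffer(..., dtype=np.uint64)` read in `finish_dedup_wiki40b.py`. A short sketch of reading it back (the path is a placeholder):

```python
import numpy as np

# Hypothetical path; produced by load_dataset.py as <save_dir>/<name>.<split>.size
sizes = np.frombuffer(open("/tmp/data/wiki40b.test.size", "rb").read(),
                      dtype=np.uint64)

# Entry i is the byte offset where example i starts, so example i spans
# [sizes[i], sizes[i+1]) in the concatenated <name>.<split> file.
print(len(sizes) - 1, "examples; example 0 spans bytes",
      int(sizes[0]), "to", int(sizes[1]))
```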
4 changes: 2 additions & 2 deletions scripts/run_pipeline.sh
@@ -1,6 +1,6 @@
TFDS_DIR=/tmp/tensorflow_datasets/
DATA_DIR=/tmp/data/
DATASET=lm1b
DATASET=wiki40b
SPLIT=test
THRESHOLD=100
CACHE=/tmp/cache/
@@ -11,7 +11,7 @@ python3 scripts/load_dataset.py --data_dir $TFDS_DIR --save_dir $DATA_DIR --name

python3 scripts/make_suffix_array.py $DATA_DIR$DATASET.$SPLIT

cargo run self-similar --data-file $DATA_DIR$DATASET.$SPLIT --length-threshold $THRESHOLD --cache-dir $CACHE --num-threads 96
cargo run self-similar --data-file $DATA_DIR$DATASET.$SPLIT --length-threshold $THRESHOLD --cache-dir $CACHE

cargo run collect --data-name $DATASET.$SPLIT --cache-dir $CACHE > /tmp/$DATASET.$SPLIT.remove.byterange

1 change: 0 additions & 1 deletion src/main.rs
@@ -59,7 +59,6 @@ extern crate filebuffer;
extern crate zstd;
extern crate crossbeam;
extern crate clap;
extern crate fasthash;

use std::cmp::Ordering;
use std::collections::BinaryHeap;
