Commit f86b161

Merge branch 'dev-v1' of github.com:google-research/deduplicate-text-datasets into dev-v1

carlini committed Mar 11, 2022
2 parents 67351ad + e0d9aef commit f86b161
Showing 8 changed files with 176 additions and 85 deletions.
1 change: 0 additions & 1 deletion Cargo.toml
@@ -11,6 +11,5 @@ overflow-checks = false # Go FAAASSTTT!
[dependencies]
zstd = "0.5"
crossbeam = "0.3"
fasthash = "0.4"
filebuffer = "0.4"
clap = { version = "3.1.1", features = ["derive"] }
157 changes: 97 additions & 60 deletions README.md

Large diffs are not rendered by default.

4 changes: 4 additions & 0 deletions scripts/deduplicate_single_file.sh
@@ -0,0 +1,4 @@
python3 scripts/make_suffix_array.py $1
cargo run self-similar --data-file $1 --length-threshold $3 --cache-dir /tmp/cache --num-threads $4
cargo run collect --data-file $1 --cache-dir /tmp/cache --length-threshold $3 > /tmp/drop_tokens_file
python3 scripts/finish_single_file.py $1 /tmp/drop_tokens_file $2
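
For reference, a minimal Python sketch (not part of the commit) of the same four-stage pipeline driven through `subprocess`; the paths, threshold, and thread count stand in for the script's `$1`-`$4` positional arguments and are assumptions for illustration:

```python
import subprocess

data_file = "/tmp/data/corpus.txt"           # $1: raw input (hypothetical path)
deduped_file = "/tmp/data/corpus.txt.dedup"  # $2: deduplicated output
threshold = "100"                            # $3: duplicate length threshold
threads = "8"                                # $4: worker threads

# 1. Build the suffix array over the raw bytes.
subprocess.run(["python3", "scripts/make_suffix_array.py", data_file], check=True)

# 2. Find duplicated spans within the file.
subprocess.run(["cargo", "run", "self-similar", "--data-file", data_file,
                "--length-threshold", threshold, "--cache-dir", "/tmp/cache",
                "--num-threads", threads], check=True)

# 3. Collect the byte ranges to drop, capturing stdout like the shell redirect.
with open("/tmp/drop_tokens_file", "wb") as out:
    subprocess.run(["cargo", "run", "collect", "--data-file", data_file,
                    "--cache-dir", "/tmp/cache", "--length-threshold", threshold],
                   stdout=out, check=True)

# 4. Cut those ranges out of the original file.
subprocess.run(["python3", "scripts/finish_single_file.py", data_file,
                "/tmp/drop_tokens_file", deduped_file], check=True)
```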
30 changes: 23 additions & 7 deletions scripts/finish_dedup_lm1b.py → scripts/finish_dedup_wiki40b.py
@@ -1,3 +1,16 @@
# Copyright 2022 Google LLC
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import numpy as np
import os
import shutil
@@ -38,13 +51,13 @@ def serialize_example(**feature):

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

#print(cumsum[:5])

remove = defaultdict(list)

def run(args):
    this_idx, row = args
    new_row = {'text': row,
               'version_id': '',
               'wikidata_id': '',
               'timestamp': '',
               'url': '',
               'content-length': '',
@@ -72,6 +85,8 @@ def _info(self):
        builder=self,
        features=tfds.features.FeaturesDict({
            'text': tfds.features.Text(),
            'version_id': tfds.features.Text(),
            'wikidata_id': tfds.features.Text(),
            'timestamp': tfds.features.Text(),
            'url': tfds.features.Text(),
            'content-length': tfds.features.Text(),
@@ -132,7 +147,6 @@ def _generate_examples(self, split):

sizes = np.frombuffer(open(os.path.join(args.suffixarray_dir, args.name+"."+args.split+".size"), "rb").read(), dtype=np.uint64)

#print(np.max(sizes))
remove_ex = defaultdict(list)
ptr = 0
for i,byte_start in enumerate(sizes[:-1]):
@@ -146,15 +160,17 @@ def _generate_examples(self, split):
                             min(int(remove[ptr][1] - byte_start), byte_end-byte_start)))
        ptr += 1

#print(remove_ex)
tfds.load("my_dataset", data_dir=where+"_dedup")


if dataset == "lm1b":
    en = os.path.join(where+"_dedup", "lm1b")
if dataset == "wiki40b":
    en = os.path.join(where+"_dedup", "wiki40b")
if not os.path.exists(en):
    os.mkdir(en)
en = os.path.join(en, "en")
if not os.path.exists(en):
    os.mkdir(en)
en = os.path.join(en, "1.1.0")
en = os.path.join(en, "1.3.0")
if not os.path.exists(en):
    os.mkdir(en)
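
To make the remapping loop above concrete, here is a self-contained toy version (not from the commit): `sizes` holds cumulative byte offsets, so example `i` occupies `[sizes[i], sizes[i+1])` in the concatenated file, and each global removal range is re-expressed relative to the example(s) it overlaps. The handling of ranges that spill across an example boundary is an assumption, since the diff collapses part of the loop body.

```python
from collections import defaultdict

sizes = [0, 10, 25, 40]      # cumulative offsets: three examples of 10, 15, 15 bytes
remove = [(3, 7), (12, 30)]  # global byte ranges to drop, sorted

remove_ex = defaultdict(list)
ptr = 0
for i, byte_start in enumerate(sizes[:-1]):
    byte_end = sizes[i + 1]
    while ptr < len(remove) and remove[ptr][0] < byte_end:
        # Clamp the global range into this example's local coordinates.
        remove_ex[i].append((max(remove[ptr][0] - byte_start, 0),
                             min(remove[ptr][1] - byte_start,
                                 byte_end - byte_start)))
        if remove[ptr][1] > byte_end:
            break  # range continues into the next example (assumed behavior)
        ptr += 1

print(dict(remove_ex))  # {0: [(3, 7)], 1: [(2, 15)], 2: [(0, 5)]}
```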

37 changes: 37 additions & 0 deletions scripts/finish_single_file.py
@@ -0,0 +1,37 @@
# Copyright 2022 Google LLC
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import sys

original = sys.argv[1]
remove_file = sys.argv[2]
deduped = sys.argv[3]

remove = []
fin = open(remove_file)
for line in fin:
    if 'out' in line: break
for line in fin:
    remove.append(list(map(int,line.split())))
remove = remove[::-1]

ds = open(original,"rb")
new_ds = open(deduped,"wb")

start = 0
while len(remove) > 0:
    a,b = remove.pop()
    new_ds.write(ds.read(a-start))
    ds.seek(b)
    start = b
new_ds.write(ds.read())
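
The read/seek loop above copies every byte that falls outside the sorted removal ranges. A toy equivalent on an in-memory buffer (illustrative only; the data and ranges are made up):

```python
data = b"aaaa DUPLICATE bbbb DUPLICATE cccc"
remove = [(5, 15), (20, 30)]  # sorted, non-overlapping [start, end) byte ranges

out, start = bytearray(), 0
for a, b in remove:
    out += data[start:a]  # keep everything before the duplicate span
    start = b             # then skip over it
out += data[start:]       # keep the tail

print(bytes(out))  # b'aaaa bbbb cccc'
```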
27 changes: 13 additions & 14 deletions scripts/load_dataset.py
@@ -74,20 +74,19 @@ def tok(x):

fout = open(os.path.join(save_dir, dataset_name+"."+split), "wb")

p = mp.Pool(mp.cpu_count())

i = 0
sizes = [0]
for b in ds:
    print(i)

    text = b['text'].numpy()
    text = p.map(tok,text)

    for x in text:
        next_line = sep()+x
        fout.write(next_line)
        sizes.append(sizes[-1]+len(next_line))
    i += 1

with mp.Pool(mp.cpu_count()) as p:
    i = 0
    sizes = [0]
    for b in ds:
        print(i)

        text = b['text'].numpy()
        text = p.map(tok,text)

        for x in text:
            next_line = sep()+x
            fout.write(next_line)
            sizes.append(sizes[-1]+len(next_line))
        i += 1

open(os.path.join(save_dir,dataset_name+"."+split+".size"), "wb").write(np.array(sizes,dtype=np.uint64).tobytes())
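
The `.size` file written on the last line above is just these cumulative offsets serialized as uint64 values, matching the `np.frombuffer(..., dtype=np.uint64)` read in `finish_dedup_wiki40b.py`. A short sketch of reading it back (the path is a placeholder):

```python
import numpy as np

# Hypothetical path; produced by load_dataset.py as <save_dir>/<name>.<split>.size
sizes = np.frombuffer(open("/tmp/data/wiki40b.test.size", "rb").read(),
                      dtype=np.uint64)

# Entry i is the byte offset where example i starts, so example i spans
# [sizes[i], sizes[i+1]) in the concatenated <name>.<split> file.
print(len(sizes) - 1, "examples; example 0 spans bytes",
      int(sizes[0]), "to", int(sizes[1]))
```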
4 changes: 2 additions & 2 deletions scripts/run_pipeline.sh
@@ -1,6 +1,6 @@
TFDS_DIR=/tmp/tensorflow_datasets/
DATA_DIR=/tmp/data/
DATASET=lm1b
DATASET=wiki40b
SPLIT=test
THRESHOLD=100
CACHE=/tmp/cache/
@@ -11,7 +11,7 @@ python3 scripts/load_dataset.py --data_dir $TFDS_DIR --save_dir $DATA_DIR --name

python3 scripts/make_suffix_array.py $DATA_DIR$DATASET.$SPLIT

cargo run self-similar --data-file $DATA_DIR$DATASET.$SPLIT --length-threshold $THRESHOLD --cache-dir $CACHE --num-threads 96
cargo run self-similar --data-file $DATA_DIR$DATASET.$SPLIT --length-threshold $THRESHOLD --cache-dir $CACHE

cargo run collect --data-name $DATASET.$SPLIT --cache-dir $CACHE > /tmp/$DATASET.$SPLIT.remove.byterange

1 change: 0 additions & 1 deletion src/main.rs
@@ -59,7 +59,6 @@ extern crate filebuffer;
extern crate zstd;
extern crate crossbeam;
extern crate clap;
extern crate fasthash;

use std::cmp::Ordering;
use std::collections::BinaryHeap;
