Skip to content

Commit

Permalink
Fail early on crashes; purge cache directory on rerun (Fixes #11)
Browse files: browse the repository at this point in the history
  • Loading branch information
carlini committed Apr 15, 2022
1 parent 8a172b0 commit d584d0c
Show file tree
Hide file tree
Showing 3 changed files with 15 additions and 3 deletions.
1 change: 1 addition & 0 deletions scripts/deduplicate_single_file.sh
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
set -e
python3 scripts/make_suffix_array.py $1
cargo run self-similar --data-file $1 --length-threshold $3 --cache-dir /tmp/cache --num-threads $4
cargo run collect --data-file $1 --cache-dir /tmp/cache --length-threshold $3 > /tmp/drop_tokens_file
Expand Down
11 changes: 9 additions & 2 deletions scripts/make_suffix_array.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,6 @@
for x,(s,e) in zip(files,started):
size_data = os.path.getsize(x)
FACT = np.ceil(np.log(size_data)/np.log(2)/8)
print("FACT", FACT)
size_table = os.path.getsize(x+".table.bin")
if not os.path.exists(x) or not os.path.exists(x+".table.bin") or size_table == 0 or size_data*FACT != size_table:
cmd = "./target/debug/dedup_dataset make-part --data-file %s --start-byte %d --end-byte %d"%(sys.argv[1], s, e)
Expand All @@ -80,13 +79,21 @@

print("Merging suffix trees")

os.popen("rm tmp/out.table.bin.*").read()

torun = " --suffix-path ".join(files)
print("./target/debug/dedup_dataset merge --output-file %s --suffix-path %s --num-threads %d"%("tmp/out.table.bin", torun, mp.cpu_count()))
os.popen("./target/debug/dedup_dataset merge --output-file %s --suffix-path %s --num-threads %d"%("tmp/out.table.bin", torun, mp.cpu_count())).read()
#exit(0)
print("Now merging individual tables")
os.popen("cat tmp/out.table.bin.* > tmp/out.table.bin").read()
print("Cleaning up")
#os.popen("rm tmp/out.table.bin.*").read()
os.popen("mv tmp/out.table.bin %s.table.bin"%sys.argv[1]).read()

if os.path.exists(sys.argv[1]+".table.bin"):
if os.path.getsize(sys.argv[1]+".table.bin")%os.path.getsize(sys.argv[1]) != 0:
print("File size is wrong")
exit(1)
else:
print("Failed to create table")
exit(1)
6 changes: 5 additions & 1 deletion src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -511,8 +511,12 @@ fn cmd_self_similar(data_file: &String, length_threshold: &usize, frequency_thre
let mut first = true;

loop {
cur_location = get_next_pointer_from_table(&mut table);
cur_location = get_next_pointer_from_table_canfail(&mut table);
i += 1;
if cur_location == std::u64::MAX {
// The last two items in the file matched
break;
}

let suf2 = &text[cur_location as usize..];
let does_match = suf2.len() >= length_threshold && suf1.len() >= length_threshold && suf1[..length_threshold] == suf2[..length_threshold];
Expand Down

0 comments on commit d584d0c

Please sign in to comment.