From 77156ceca314d3e22010f23cbbdcdaf6c39e46fd Mon Sep 17 00:00:00 2001 From: Alistair Johnson Date: Thu, 28 Apr 2022 13:34:57 -0400 Subject: [PATCH 1/2] explicitly set multiprocess start method to fork for cross-OS consistency The method for launching a process can be "spawn", "fork", or "forkserver". The default on Unix is fork, and the resulting process inherits all resources from the parent process. Conversely, the default on macOS/Windows is spawn, which results in a minimal number of resources inherited by the child process. This change also sizes the pool in finish_dedup_wiki40b.py with mp.cpu_count() instead of a hard-coded 96 workers. https://docs.python.org/3/library/multiprocessing.html#contexts-and-start-methods --- scripts/finish_dedup_wiki40b.py | 2 +- scripts/load_dataset.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/finish_dedup_wiki40b.py b/scripts/finish_dedup_wiki40b.py index 2782627..3a33661 100644 --- a/scripts/finish_dedup_wiki40b.py +++ b/scripts/finish_dedup_wiki40b.py @@ -110,7 +110,7 @@ def _generate_examples(self, split): data_dir=args.data_dir) - p = mp.Pool(96) + p = mp.get_context("fork").Pool(mp.cpu_count()) i = -1 for batch in ds: i += 1 diff --git a/scripts/load_dataset.py b/scripts/load_dataset.py index 6cca9b8..51ad1ad 100644 --- a/scripts/load_dataset.py +++ b/scripts/load_dataset.py @@ -74,7 +74,7 @@ def tok(x): fout = open(os.path.join(save_dir, dataset_name+"."+split), "wb") -with mp.Pool(mp.cpu_count()) as p: +with mp.get_context("fork").Pool(mp.cpu_count()) as p: i = 0 sizes = [0] for b in ds: From 2c1360bff7b023439fa89f7bdff8efc92b1b9a48 Mon Sep 17 00:00:00 2001 From: Alistair Johnson Date: Thu, 28 Apr 2022 13:38:15 -0400 Subject: [PATCH 2/2] fix typo in bash call --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 3f79ba4..61362f4 100644 --- a/README.md +++ b/README.md @@ -249,7 +249,7 @@ Okay so maybe you don't like reading. You skipped the entire section above. 
(Hon Then just do this ``` -bash scripts/scripts/run_pipeline.sh +bash scripts/run_pipeline.sh python3 scripts/finish_dedup_wiki40b.py --data_dir ~/tensorflow_datasets/ --save_dir /tmp/dedup --name wiki40b --split test --suffixarray_dir data --remove /tmp/wiki40b.test.remove.byterange ```