-
Notifications
You must be signed in to change notification settings - Fork 8
/
nlp_pipeline.py
68 lines (55 loc) · 1.66 KB
/
nlp_pipeline.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
# -*- coding: utf-8 -*-
# Entry-point script: batch-runs the NLP processor over corpus files.
import re
from os import listdir
from os.path import isfile, join
from time import time
import options
from utils import init_logging
# Apply command-line overrides BEFORE the `from options import ...` below,
# so the imported names reflect the (possibly updated) settings rather than
# the module defaults.
options.update_from_args()
from options import CORPUS_PREFIXES, DE, STORE, START, BATCH_SIZE, BATCHES
from constants import FULL_PATH
from nlp_processor import NLPProcessor
if __name__ == "__main__":
    t0 = time()

    ### --- run ---

    logger = init_logging("NLP")

    def logg(msg):
        """Shorthand logger used throughout the run."""
        logger.info(msg)

    logg("##### START #####")

    # Filter files for the configured corpus prefixes.
    # BUGFIX: the pattern previously ended in an unescaped "." which matched
    # ANY character after the prefix; "\." restricts it to the intended
    # "<prefix>.<extension>" naming scheme (cf. name.split(".") below).
    prefixes = r"^(" + "|".join(CORPUS_PREFIXES) + r")\."
    pattern = re.compile(prefixes)
    files = sorted(
        f
        for f in listdir(FULL_PATH)
        if isfile(join(FULL_PATH, f)) and pattern.match(f)
    )

    processor = NLPProcessor(spacy_path=DE, logg=logg)

    batch_size = BATCH_SIZE  # documents per batch, e.g. 50_000; falsy = all at once
    batches = BATCHES        # number of batches to run per file

    for name in files:
        corpus = name.split(".")[0]
        fname = join(FULL_PATH, name)
        # BUGFIX: reset the read offset for every file. Previously `start`
        # was initialized once before this loop and kept growing across
        # files, so every file after the first was processed from the
        # offset where the previous file left off.
        start = START  # initial document offset, e.g. 550_000 when resuming
        for i in range(1, batches + 1):
            logg(">>> batch: {:d} >>>".format(i))
            processor.read_process_store(
                fname,
                corpus,
                start=start,
                stop=(start + batch_size) if batch_size else None,
                store=STORE,
            )
            if batch_size:
                start += batch_size
            else:
                # A falsy batch_size means "process the whole file in one
                # pass" — no point in running further batches.
                break

    t1 = int(time() - t0)
    logg(
        "all done in {:02d}:{:02d}:{:02d}".format(t1 // 3600, (t1 // 60) % 60, t1 % 60)
    )