Skip to content

Commit

Permalink
BENCH Make benchmarks/bench_text_vectorizers.py run faster (scikit-le…
Browse files Browse the repository at this point in the history
  • Loading branch information
rth authored and qinhanmin2014 committed Dec 23, 2018
1 parent 19a7c08 commit 8d7e849
Showing 1 changed file with 3 additions and 4 deletions.
7 changes: 3 additions & 4 deletions benchmarks/bench_text_vectorizers.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,20 +32,19 @@ def f():
return f


- text = fetch_20newsgroups(subset='train').data
+ text = fetch_20newsgroups(subset='train').data[:1000]

print("="*80 + '\n#' + " Text vectorizers benchmark" + '\n' + '='*80 + '\n')
print("Using a subset of the 20 newsrgoups dataset ({} documents)."
.format(len(text)))
- print("This benchmarks runs in ~20 min ...")
+ print("This benchmarks runs in ~1 min ...")

res = []

for Vectorizer, (analyzer, ngram_range) in itertools.product(
[CountVectorizer, TfidfVectorizer, HashingVectorizer],
[('word', (1, 1)),
('word', (1, 2)),
- ('word', (1, 4)),
('char', (4, 4)),
('char_wb', (4, 4))
]):
Expand All @@ -56,7 +55,7 @@ def f():
dt = timeit.repeat(run_vectorizer(Vectorizer, text, **params),
number=1,
repeat=n_repeat)
- bench['time'] = "{:.2f} (+-{:.2f})".format(np.mean(dt), np.std(dt))
+ bench['time'] = "{:.3f} (+-{:.3f})".format(np.mean(dt), np.std(dt))

mem_usage = memory_usage(run_vectorizer(Vectorizer, text, **params))

Expand Down

0 comments on commit 8d7e849

Please sign in to comment.