From c481c8a7bb41a7027b7a55137302c690b2b5ba01 Mon Sep 17 00:00:00 2001 From: Osma Suominen Date: Fri, 6 Apr 2018 13:32:38 +0300 Subject: [PATCH 1/5] Initial implementation of analyzedir command. Fixes #94 --- annif/cli.py | 46 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 46 insertions(+) diff --git a/annif/cli.py b/annif/cli.py index 83da44e76..cad6bd022 100644 --- a/annif/cli.py +++ b/annif/cli.py @@ -4,6 +4,8 @@ import collections import logging +import os.path +import re import sys import click import click_log @@ -151,6 +153,50 @@ def run_analyze(project_id, limit, threshold, backend_param): click.echo("{}\t<{}>\t{}".format(hit.score, hit.uri, hit.label)) +@cli.command('analyzedir') +@click_log.simple_verbosity_option(logger) +@click.argument('project_id') +@click.argument('directory') +@click.option('--suffix', default='.annif') +@click.option('--force', default=False) +@click.option('--limit', default=10) +@click.option('--threshold', default=0.0) +@click.option('--backend-param', '-b', multiple=True) +def run_analyzedir( + project_id, + directory, + suffix, + force, + limit, + threshold, + backend_param): + """" + Analyze a directory with documents. Write the results in TSV files + with the given suffix. 
+ + USAGE: annif analyzedir <project_id> <directory> [--suffix=SUFFIX] + [--force=FORCE] [--limit=N] [--threshold=N] + """ + project = get_project(project_id) + backend_params = parse_backend_params(backend_param) + hit_filter = HitFilter(limit, threshold) + + for docfilename, dummy_subjectfn in annif.corpus.DocumentDirectory( + directory, require_subjects=False): + with open(docfilename) as docfile: + text = docfile.read() + subjectfilename = re.sub(r'\.txt$', suffix, docfilename) + if os.path.exists(subjectfilename) and not force: + click.echo( + "Not overwriting {} (use --force to override)".format( + subjectfilename)) + continue + with open(subjectfilename, 'w') as subjfile: + for hit in hit_filter(project.analyze(text, backend_params)): + line = "<{}>\t{}\t{}".format(hit.uri, hit.label, hit.score) + click.echo(line, file=subjfile) + + @cli.command('eval') @click_log.simple_verbosity_option(logger) @click.argument('project_id') From a0f72082025a16c44af2254a7bb632a923ce368d Mon Sep 17 00:00:00 2001 From: Osma Suominen Date: Fri, 6 Apr 2018 17:09:33 +0300 Subject: [PATCH 2/5] add simple unit test for analyzedir --- tests/test_cli.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/tests/test_cli.py b/tests/test_cli.py index a323054db..a9a6e71a5 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -109,6 +109,19 @@ def test_analyze_param(): assert result.exit_code == 0 +def test_analyzedir(tmpdir): + tmpdir.join('doc1.txt').write('nothing special') + + result = runner.invoke( + annif.cli.cli, ['analyzedir', 'dummy-en', str(tmpdir)]) + assert not result.exception + assert result.exit_code == 0 + + assert tmpdir.join('doc1.annif').exists() + assert tmpdir.join('doc1.annif').read_text( + 'utf-8') == "<http://example.org/dummy>\tdummy\t0.5\n" + + def test_eval_label(tmpdir): keyfile = tmpdir.join('dummy.key') keyfile.write("dummy\nanother\n") From b08aed5446a066d60e0bf2aaaf1a61f1cfb2448f Mon Sep 17 00:00:00 2001 From: Osma Suominen Date: Fri, 6 Apr 2018 17:15:54 +0300 Subject: [PATCH 3/5] reformat 
function args --- annif/cli.py | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/annif/cli.py b/annif/cli.py index cad6bd022..da009a9ca 100644 --- a/annif/cli.py +++ b/annif/cli.py @@ -162,14 +162,8 @@ def run_analyze(project_id, limit, threshold, backend_param): @click.option('--limit', default=10) @click.option('--threshold', default=0.0) @click.option('--backend-param', '-b', multiple=True) -def run_analyzedir( - project_id, - directory, - suffix, - force, - limit, - threshold, - backend_param): +def run_analyzedir(project_id, directory, suffix, force, + limit, threshold, backend_param): """" Analyze a directory with documents. Write the results in TSV files with the given suffix. From 37d56b823cdd289047d1f158709d0e3e6e48b738 Mon Sep 17 00:00:00 2001 From: Osma Suominen Date: Fri, 6 Apr 2018 17:17:37 +0300 Subject: [PATCH 4/5] extend analyzedir unit test to check for overwriting --- tests/test_cli.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/tests/test_cli.py b/tests/test_cli.py index a9a6e71a5..788d0ccd5 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -121,6 +121,13 @@ def test_analyzedir(tmpdir): assert tmpdir.join('doc1.annif').read_text( 'utf-8') == "<http://example.org/dummy>\tdummy\t0.5\n" + # make sure that preexisting subject files are not overwritten + result = runner.invoke( + annif.cli.cli, ['analyzedir', 'dummy-en', str(tmpdir)]) + assert not result.exception + assert result.exit_code == 0 + assert "Not overwriting" in result.output + def test_eval_label(tmpdir): keyfile = tmpdir.join('dummy.key') From 995071ace004ecc7c6e0eccfc7db855be4382c6f Mon Sep 17 00:00:00 2001 From: Osma Suominen Date: Fri, 6 Apr 2018 17:23:32 +0300 Subject: [PATCH 5/5] test for analyzedir --force parameter (and make it a proper boolean flag) --- annif/cli.py | 2 +- tests/test_cli.py | 8 ++++++++ 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/annif/cli.py b/annif/cli.py index da009a9ca..98bd1ea99 100644 --- a/annif/cli.py +++ 
b/annif/cli.py @@ -158,7 +158,7 @@ def run_analyze(project_id, limit, threshold, backend_param): @click.argument('project_id') @click.argument('directory') @click.option('--suffix', default='.annif') -@click.option('--force', default=False) +@click.option('--force/--no-force', default=False) @click.option('--limit', default=10) @click.option('--threshold', default=0.0) @click.option('--backend-param', '-b', multiple=True) diff --git a/tests/test_cli.py b/tests/test_cli.py index 788d0ccd5..4e2e83a1d 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -128,6 +128,14 @@ def test_analyzedir(tmpdir): assert result.exit_code == 0 assert "Not overwriting" in result.output + # check that the --force parameter forces overwriting + result = runner.invoke( + annif.cli.cli, ['analyzedir', 'dummy-fi', '--force', str(tmpdir)]) + assert tmpdir.join('doc1.annif').exists() + assert "Not overwriting" not in result.output + assert tmpdir.join('doc1.annif').read_text( + 'utf-8') == "<http://example.org/dummy>\tdummy\t1.0\n" + def test_eval_label(tmpdir): keyfile = tmpdir.join('dummy.key')