Skip to content

Commit

Permalink
Merge pull request #95 from NatLibFi/analyzedir-command
Browse files Browse the repository at this point in the history
Analyzedir command
  • Loading branch information
osma committed Apr 6, 2018
2 parents 4a82078 + 995071a commit ec91638
Show file tree
Hide file tree
Showing 2 changed files with 68 additions and 0 deletions.
40 changes: 40 additions & 0 deletions annif/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@

import collections
import logging
import os.path
import re
import sys
import click
import click_log
Expand Down Expand Up @@ -151,6 +153,44 @@ def run_analyze(project_id, limit, threshold, backend_param):
click.echo("{}\t<{}>\t{}".format(hit.score, hit.uri, hit.label))


@cli.command('analyzedir')
@click_log.simple_verbosity_option(logger)
@click.argument('project_id')
@click.argument('directory')
@click.option('--suffix', default='.annif')
@click.option('--force/--no-force', default=False)
@click.option('--limit', default=10)
@click.option('--threshold', default=0.0)
@click.option('--backend-param', '-b', multiple=True)
def run_analyzedir(project_id, directory, suffix, force,
                   limit, threshold, backend_param):
    """
    Analyze a directory with documents. Write the results in TSV files
    with the given suffix.
    USAGE: annif analyzedir <project_id> <directory> [--suffix=SUFFIX]
                            [--force] [--limit=N] [--threshold=N]
    """
    # Parameters (all supplied by click from the command line):
    #   project_id    -- identifier passed to get_project()
    #   directory     -- directory handed to DocumentDirectory
    #   suffix        -- replaces the trailing ".txt" of each document name
    #   force         -- when true, existing result files are overwritten
    #   limit, threshold -- passed to HitFilter to select the hits written
    #   backend_param -- raw "-b" values, parsed by parse_backend_params()
    project = get_project(project_id)
    backend_params = parse_backend_params(backend_param)
    hit_filter = HitFilter(limit, threshold)

    txt_ext = '.txt'
    for docfilename, _subjectfn in annif.corpus.DocumentDirectory(
            directory, require_subjects=False):
        # Build the output name with plain string operations so that the
        # user-supplied suffix is always taken literally (as a regex
        # replacement template, backslashes in it would be interpreted).
        if docfilename.endswith(txt_ext):
            subjectfilename = docfilename[:-len(txt_ext)] + suffix
        else:
            # Never map a document onto itself: with --force that would
            # overwrite the input document with its own analysis results.
            subjectfilename = docfilename + suffix

        if os.path.exists(subjectfilename) and not force:
            click.echo(
                "Not overwriting {} (use --force to override)".format(
                    subjectfilename))
            continue

        # Only read the document once we know its results will be written.
        # Explicit UTF-8 keeps input and output independent of the locale.
        with open(docfilename, encoding='utf-8') as docfile:
            text = docfile.read()

        with open(subjectfilename, 'w', encoding='utf-8') as subjfile:
            for hit in hit_filter(project.analyze(text, backend_params)):
                # One TSV row per hit: <uri>, label, score.
                line = "<{}>\t{}\t{}".format(hit.uri, hit.label, hit.score)
                click.echo(line, file=subjfile)


@cli.command('eval')
@click_log.simple_verbosity_option(logger)
@click.argument('project_id')
Expand Down
28 changes: 28 additions & 0 deletions tests/test_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,34 @@ def test_analyze_param():
assert result.exit_code == 0


def test_analyzedir(tmpdir):
    """Exercise the analyzedir command: first write, no-overwrite, --force."""
    target_dir = str(tmpdir)
    tmpdir.join('doc1.txt').write('nothing special')
    subject_file = tmpdir.join('doc1.annif')

    # First run: a result file is created next to the document.
    first_run = runner.invoke(
        annif.cli.cli, ['analyzedir', 'dummy-en', target_dir])
    assert not first_run.exception
    assert first_run.exit_code == 0

    assert subject_file.exists()
    # NOTE(review): the "http:https://" prefix in the expected URIs below
    # looks garbled -- confirm against what the dummy backend really emits.
    first_content = subject_file.read_text('utf-8')
    assert first_content == "<http:https://example.org/dummy>\tdummy\t0.5\n"

    # Second run: make sure that preexisting subject files are not
    # overwritten
    second_run = runner.invoke(
        annif.cli.cli, ['analyzedir', 'dummy-en', target_dir])
    assert not second_run.exception
    assert second_run.exit_code == 0
    assert "Not overwriting" in second_run.output

    # Third run: check that the --force parameter forces overwriting
    forced_run = runner.invoke(
        annif.cli.cli, ['analyzedir', 'dummy-fi', '--force', target_dir])
    assert subject_file.exists()
    assert "Not overwriting" not in forced_run.output
    forced_content = subject_file.read_text('utf-8')
    assert forced_content == "<http:https://example.org/dummy>\tdummy\t1.0\n"


def test_eval_label(tmpdir):
keyfile = tmpdir.join('dummy.key')
keyfile.write("dummy\nanother\n")
Expand Down

0 comments on commit ec91638

Please sign in to comment.