Skip to content

Commit

Permalink
Add Subject and SubjectDirectory classes for representing subject cor…
Browse files Browse the repository at this point in the history
…pus. Fixes #57
  • Loading branch information
osma committed Mar 19, 2018
1 parent b35871b commit d441897
Show file tree
Hide file tree
Showing 3 changed files with 53 additions and 0 deletions.
1 change: 1 addition & 0 deletions annif/corpus/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@


from .docdir import DocumentDirectory
from .subject import Subject, SubjectDirectory


class SubjectSet:
Expand Down
27 changes: 27 additions & 0 deletions annif/corpus/subject.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
"""A directory of files as a subject corpus"""


import glob
import os.path
import re


class Subject:
    """A single subject of a subject corpus.

    The three constructor arguments are stored unchanged as the instance
    attributes ``uri``, ``label`` and ``text``.
    """

    def __init__(self, uri, label, text):
        self.uri, self.label, self.text = uri, label, text


class SubjectDirectory:
    """A subject corpus stored as a directory of ``*.txt`` files.

    Each file describes one subject. Its first line holds the subject URI
    and label, separated by a single space (the label itself may contain
    further spaces); all remaining lines make up the subject text.
    """

    def __init__(self, path):
        # Directory that is searched (non-recursively) for '*.txt' files.
        self.path = path

    def __iter__(self):
        """Iterate through the directory, yielding Subject objects.

        Files are visited in sorted filename order so that repeated
        iterations produce the subjects in the same sequence.

        Raises ValueError if the first line of a file does not contain
        a space separating the URI from the label.
        """

        pattern = os.path.join(self.path, '*.txt')
        # glob.glob() returns matches in arbitrary order; sort them to make
        # the iteration order reproducible across runs and platforms.
        for filename in sorted(glob.glob(pattern)):
            # Read with an explicit encoding instead of the locale default.
            # 'utf-8-sig' decodes plain UTF-8 and additionally drops a
            # leading byte order mark, which would otherwise end up as the
            # first character of the URI.
            with open(filename, encoding='utf-8-sig') as subjfile:
                uri, label = subjfile.readline().strip().split(' ', 1)
                # Lines keep their trailing newline and are joined with an
                # extra space between them.
                text = ' '.join(subjfile.readlines())
            # Yield only after the file has been closed, so no file handle
            # stays open while the generator is suspended.
            yield Subject(uri, label, text)
25 changes: 25 additions & 0 deletions tests/test_corpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,3 +93,28 @@ def test_docdir_tsv_require_subjects(tmpdir):
assert files[0][1] == str(tmpdir.join('doc1.tsv'))
assert files[1][0] == str(tmpdir.join('doc2.txt'))
assert files[1][1] == str(tmpdir.join('doc2.tsv'))


def test_subjdir(tmpdir):
    """SubjectDirectory yields one Subject per .txt file in the directory."""
    fixtures = [
        ('subj1.txt', """https://example.org/subj1 subject one
first subject
this is the first thing we know about"""),
        ('subj2.txt', """https://example.org/subj2 subject two
second subject
this is the second thing we know about"""),
        ('subj3.txt', """https://example.org/subj3 subject three
third subject
this is the third thing we know about"""),
    ]
    for name, content in fixtures:
        tmpdir.join(name).write(content)

    subjdir = annif.corpus.SubjectDirectory(str(tmpdir))
    # Directory listing order is not guaranteed, so order by URI first.
    found = list(subjdir)
    found.sort(key=lambda subj: subj.uri)
    assert len(found) == 3

    expected = [
        ('https://example.org/subj1', 'subject one', 'first'),
        ('https://example.org/subj2', 'subject two', 'second'),
        ('https://example.org/subj3', 'subject three', 'third'),
    ]
    for subject, (uri, label, keyword) in zip(found, expected):
        assert subject.uri == uri
        assert subject.label == label
        assert keyword in subject.text

0 comments on commit d441897

Please sign in to comment.