Skip to content

Commit

Permalink
Add tests of io.py
Browse files Browse the repository at this point in the history
  • Loading branch information
frreiss committed Mar 25, 2020
1 parent 0f60fdc commit b82b905
Show file tree
Hide file tree
Showing 2 changed files with 181 additions and 0 deletions.
9 changes: 9 additions & 0 deletions test_data/test_io/test.dict
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# This is a comment.
Dictionary Entry
Entry
# This is also a comment.
#
Help me! I am trapped
In a Haiku factory!
Save me before they

172 changes: 172 additions & 0 deletions text_extensions_for_pandas/test_io.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,172 @@
#
# Copyright (c) 2020 IBM Corp.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import numpy as np
import unittest
import textwrap

from text_extensions_for_pandas.io import *

import spacy

# Load the "en_core_web_sm" spaCy pipeline once, at import time, so that the
# tests below that need a full language model can share a single instance.
# NOTE(review): presumably requires the en_core_web_sm model package to be
# installed in the test environment -- confirm against the project's setup docs.
_SPACY_LANGUAGE_MODEL = spacy.load("en_core_web_sm")


class IOTest(unittest.TestCase):
    """Tests for the functions exported by ``text_extensions_for_pandas.io``.

    Every test calls one function on a small fixed input and compares the
    printed form of the result (or, for the parse tree, the returned
    dictionary) against a hard-coded expected value.
    """

    @staticmethod
    def _make_tokenizer():
        """Return the tokenizer of a blank spaCy English pipeline.

        ``nlp.tokenizer`` is used instead of
        ``nlp.Defaults.create_tokenizer(nlp)``: the latter was removed in
        spaCy 3, whereas the attribute holds the same default tokenizer and is
        available in both spaCy 2 and spaCy 3.
        """
        from spacy.lang.en import English

        return English().tokenizer

    @staticmethod
    def _test_data_path(relative_path):
        """Resolve a test-data path without depending on the working directory.

        The path is first looked up relative to the parent of the directory
        that contains this module (assumes the ``test_data`` tree sits next to
        the package directory -- TODO confirm if the layout changes). When
        nothing exists at that location, ``relative_path`` is returned
        unchanged so that it is resolved against the current working
        directory, exactly as before.

        :param relative_path: Path of the data file, relative to the
            repository root.
        :returns: A path string suitable for passing to ``load_dict``.
        """
        import os

        module_dir = os.path.dirname(os.path.abspath(__file__))
        candidate = os.path.join(module_dir, os.pardir, relative_path)
        return candidate if os.path.exists(candidate) else relative_path

    def test_make_tokens(self):
        """``make_tokens`` yields one span per token of the input text."""
        series = make_tokens(
            "The quick, brown fox jumped over the hazy bog...",
            self._make_tokenizer(),
        )
        self.assertEqual(
            repr(series),
            textwrap.dedent(
                """\
                0 [0, 3): 'The'
                1 [4, 9): 'quick'
                2 [9, 10): ','
                3 [11, 16): 'brown'
                4 [17, 20): 'fox'
                5 [21, 27): 'jumped'
                6 [28, 32): 'over'
                7 [33, 36): 'the'
                8 [37, 41): 'hazy'
                9 [42, 45): 'bog'
                10 [45, 48): '...'
                dtype: CharSpan"""
            ),
        )

    def test_make_tokens_and_features(self):
        """``make_tokens_and_features`` with and without ``add_left_and_right``."""
        text = "She sold c shills by the Sith Lord."

        # Default call: one record per token with its linguistic features.
        df = make_tokens_and_features(text, _SPACY_LANGUAGE_MODEL)
        self.assertEqual(
            str(df.to_records()),
            textwrap.dedent(
                """\
                [(0, 0, [0, 3): 'She', [0, 3): 'She', '-PRON-', 'PRON', 'PRP', 'nsubj', 1, 'Xxx', 'O', '', True, True, [0, 35): 'She sold c shills by the Sith Lord.')
                (1, 1, [4, 8): 'sold', [4, 8): 'sold', 'sell', 'VERB', 'VBD', 'ROOT', 1, 'xxxx', 'O', '', True, False, [0, 35): 'She sold c shills by the Sith Lord.')
                (2, 2, [9, 10): 'c', [9, 10): 'c', 'c', 'NOUN', 'NN', 'compound', 3, 'x', 'O', '', True, False, [0, 35): 'She sold c shills by the Sith Lord.')
                (3, 3, [11, 17): 'shills', [11, 17): 'shills', 'shill', 'NOUN', 'NNS', 'dobj', 1, 'xxxx', 'O', '', True, False, [0, 35): 'She sold c shills by the Sith Lord.')
                (4, 4, [18, 20): 'by', [18, 20): 'by', 'by', 'ADP', 'IN', 'prep', 3, 'xx', 'O', '', True, True, [0, 35): 'She sold c shills by the Sith Lord.')
                (5, 5, [21, 24): 'the', [21, 24): 'the', 'the', 'DET', 'DT', 'det', 7, 'xxx', 'O', '', True, True, [0, 35): 'She sold c shills by the Sith Lord.')
                (6, 6, [25, 29): 'Sith', [25, 29): 'Sith', 'Sith', 'PROPN', 'NNP', 'compound', 7, 'Xxxx', 'O', '', True, False, [0, 35): 'She sold c shills by the Sith Lord.')
                (7, 7, [30, 34): 'Lord', [30, 34): 'Lord', 'Lord', 'PROPN', 'NNP', 'pobj', 4, 'Xxxx', 'O', '', True, False, [0, 35): 'She sold c shills by the Sith Lord.')
                (8, 8, [34, 35): '.', [34, 35): '.', '.', 'PUNCT', '.', 'punct', 1, '.', 'O', '', False, False, [0, 35): 'She sold c shills by the Sith Lord.')]"""
            ),
        )

        # With add_left_and_right=True each record gains two trailing fields
        # holding the neighboring token ids (<NA> at either end of the text).
        df2 = make_tokens_and_features(
            text,
            _SPACY_LANGUAGE_MODEL,
            add_left_and_right=True,
        )
        self.assertEqual(
            str(df2.to_records()),
            textwrap.dedent(
                """\
                [(0, 0, [0, 3): 'She', [0, 3): 'She', '-PRON-', 'PRON', 'PRP', 'nsubj', 1, 'Xxx', 'O', '', True, True, [0, 35): 'She sold c shills by the Sith Lord.', <NA>, 1)
                (1, 1, [4, 8): 'sold', [4, 8): 'sold', 'sell', 'VERB', 'VBD', 'ROOT', 1, 'xxxx', 'O', '', True, False, [0, 35): 'She sold c shills by the Sith Lord.', 0, 2)
                (2, 2, [9, 10): 'c', [9, 10): 'c', 'c', 'NOUN', 'NN', 'compound', 3, 'x', 'O', '', True, False, [0, 35): 'She sold c shills by the Sith Lord.', 1, 3)
                (3, 3, [11, 17): 'shills', [11, 17): 'shills', 'shill', 'NOUN', 'NNS', 'dobj', 1, 'xxxx', 'O', '', True, False, [0, 35): 'She sold c shills by the Sith Lord.', 2, 4)
                (4, 4, [18, 20): 'by', [18, 20): 'by', 'by', 'ADP', 'IN', 'prep', 3, 'xx', 'O', '', True, True, [0, 35): 'She sold c shills by the Sith Lord.', 3, 5)
                (5, 5, [21, 24): 'the', [21, 24): 'the', 'the', 'DET', 'DT', 'det', 7, 'xxx', 'O', '', True, True, [0, 35): 'She sold c shills by the Sith Lord.', 4, 6)
                (6, 6, [25, 29): 'Sith', [25, 29): 'Sith', 'Sith', 'PROPN', 'NNP', 'compound', 7, 'Xxxx', 'O', '', True, False, [0, 35): 'She sold c shills by the Sith Lord.', 5, 7)
                (7, 7, [30, 34): 'Lord', [30, 34): 'Lord', 'Lord', 'PROPN', 'NNP', 'pobj', 4, 'Xxxx', 'O', '', True, False, [0, 35): 'She sold c shills by the Sith Lord.', 6, 8)
                (8, 8, [34, 35): '.', [34, 35): '.', '.', 'PUNCT', '.', 'punct', 1, '.', 'O', '', False, False, [0, 35): 'She sold c shills by the Sith Lord.', 7, <NA>)]"""
            ),
        )

    def test_token_features_to_tree(self):
        """``token_features_to_tree`` returns a dict of words and arcs."""
        df = make_tokens_and_features(
            "Peter Peeper packed a puck of liquid flubber.", _SPACY_LANGUAGE_MODEL
        )
        # Named ``tree`` rather than ``json`` to avoid shadowing the standard
        # library module of that name.
        tree = token_features_to_tree(df)
        expected = {
            "words": [
                {"text": "Peter", "tag": "NNP"},
                {"text": "Peeper", "tag": "NNP"},
                {"text": "packed", "tag": "VBD"},
                {"text": "a", "tag": "DT"},
                {"text": "puck", "tag": "NN"},
                {"text": "of", "tag": "IN"},
                {"text": "liquid", "tag": "JJ"},
                {"text": "flubber", "tag": "NN"},
                {"text": ".", "tag": "."},
            ],
            "arcs": [
                {"start": 0, "end": 1, "label": "compound", "dir": "left"},
                {"start": 1, "end": 2, "label": "nsubj", "dir": "left"},
                {"start": 3, "end": 4, "label": "det", "dir": "left"},
                {"start": 2, "end": 4, "label": "dobj", "dir": "right"},
                {"start": 4, "end": 5, "label": "prep", "dir": "right"},
                {"start": 6, "end": 7, "label": "amod", "dir": "left"},
                {"start": 5, "end": 7, "label": "pobj", "dir": "right"},
                {"start": 2, "end": 8, "label": "punct", "dir": "right"},
            ],
        }
        self.assertDictEqual(tree, expected)

    def test_iob_to_spans(self):
        """``iob_to_spans`` returns one row per entity span with its type."""
        df = make_tokens_and_features(
            textwrap.dedent(
                """\
                The Bermuda Triangle got tired of warm weather.
                It moved to Alaska. Now Santa Claus is missing.
                -- Steven Wright"""
            ),
            _SPACY_LANGUAGE_MODEL,
        )
        spans = iob_to_spans(df)
        self.assertEqual(
            str(spans),
            textwrap.dedent(
                """\
                token_span ent_type
                0 [61, 67): 'Alaska' GPE
                1 [73, 84): 'Santa Claus' GPE
                2 [100, 113): 'Steven Wright' PERSON"""
            ),
        )

    def test_load_dict(self):
        """``load_dict`` tokenizes each dictionary entry into ``toks_N`` columns."""
        df = load_dict(
            self._test_data_path("test_data/test_io/test.dict"),
            self._make_tokenizer(),
        )
        self.assertEqual(
            str(df),
            textwrap.dedent(
                """\
                toks_0 toks_1 toks_2 toks_3 toks_4 toks_5 toks_6
                0 dictionary entry None None None None None
                1 entry None None None None None None
                2 help me ! i am trapped None
                3 in a haiku factory ! None None
                4 save me before they None None None
                5 None None None None None None None"""
            ),
        )


# Run every test in this module when the file is executed as a script.
if __name__ == "__main__":
    unittest.main()

0 comments on commit b82b905

Please sign in to comment.