From b82b9055f5161a5483f76ecf60f4b233fa04323c Mon Sep 17 00:00:00 2001
From: frreiss
Date: Wed, 25 Mar 2020 14:52:30 -0700
Subject: [PATCH] Add tests of io.py

---
 test_data/test_io/test.dict           |   9 ++
 text_extensions_for_pandas/test_io.py | 172 ++++++++++++++++++++++++++
 2 files changed, 181 insertions(+)
 create mode 100644 test_data/test_io/test.dict
 create mode 100644 text_extensions_for_pandas/test_io.py

diff --git a/test_data/test_io/test.dict b/test_data/test_io/test.dict
new file mode 100644
index 00000000..204dd72e
--- /dev/null
+++ b/test_data/test_io/test.dict
@@ -0,0 +1,9 @@
+# This is a comment.
+Dictionary Entry
+Entry
+# This is also a comment.
+#
+Help me! I am trapped
+In a Haiku factory!
+Save me before they
+
diff --git a/text_extensions_for_pandas/test_io.py b/text_extensions_for_pandas/test_io.py
new file mode 100644
index 00000000..3bed5711
--- /dev/null
+++ b/text_extensions_for_pandas/test_io.py
@@ -0,0 +1,172 @@
+#
+# Copyright (c) 2020 IBM Corp.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+import numpy as np
+import unittest
+import textwrap
+
+from text_extensions_for_pandas.io import *
+
+import spacy
+
+_SPACY_LANGUAGE_MODEL = spacy.load("en_core_web_sm")
+
+
+class IOTest(unittest.TestCase):
+    def test_make_tokens(self):
+        from spacy.lang.en import English
+        nlp = English()
+        tokenizer = nlp.Defaults.create_tokenizer(nlp)
+        series = make_tokens(
+            "The quick, brown fox jumped over the hazy bog...", tokenizer
+        )
+        self.assertEqual(
+            repr(series),
+            textwrap.dedent(
+                """\
+                0        [0, 3): 'The'
+                1      [4, 9): 'quick'
+                2         [9, 10): ','
+                3    [11, 16): 'brown'
+                4      [17, 20): 'fox'
+                5   [21, 27): 'jumped'
+                6     [28, 32): 'over'
+                7      [33, 36): 'the'
+                8     [37, 41): 'hazy'
+                9      [42, 45): 'bog'
+                10    [45, 48): '...'
+                dtype: CharSpan"""
+            ),
+        )
+
+    def test_make_tokens_and_features(self):
+        df = make_tokens_and_features(
+            "She sold c shills by the Sith Lord.", _SPACY_LANGUAGE_MODEL
+        )
+        # print(f"****{str(df.to_records())}****")
+        self.assertEqual(
+            str(df.to_records()),
+            textwrap.dedent(
+                """\
+                [(0, 0, [0, 3): 'She', [0, 3): 'She', '-PRON-', 'PRON', 'PRP', 'nsubj', 1, 'Xxx', 'O', '', True, True, [0, 35): 'She sold c shills by the Sith Lord.')
+                 (1, 1, [4, 8): 'sold', [4, 8): 'sold', 'sell', 'VERB', 'VBD', 'ROOT', 1, 'xxxx', 'O', '', True, False, [0, 35): 'She sold c shills by the Sith Lord.')
+                 (2, 2, [9, 10): 'c', [9, 10): 'c', 'c', 'NOUN', 'NN', 'compound', 3, 'x', 'O', '', True, False, [0, 35): 'She sold c shills by the Sith Lord.')
+                 (3, 3, [11, 17): 'shills', [11, 17): 'shills', 'shill', 'NOUN', 'NNS', 'dobj', 1, 'xxxx', 'O', '', True, False, [0, 35): 'She sold c shills by the Sith Lord.')
+                 (4, 4, [18, 20): 'by', [18, 20): 'by', 'by', 'ADP', 'IN', 'prep', 3, 'xx', 'O', '', True, True, [0, 35): 'She sold c shills by the Sith Lord.')
+                 (5, 5, [21, 24): 'the', [21, 24): 'the', 'the', 'DET', 'DT', 'det', 7, 'xxx', 'O', '', True, True, [0, 35): 'She sold c shills by the Sith Lord.')
+                 (6, 6, [25, 29): 'Sith', [25, 29): 'Sith', 'Sith', 'PROPN', 'NNP', 'compound', 7, 'Xxxx', 'O', '', True, False, [0, 35): 'She sold c shills by the Sith Lord.')
+                 (7, 7, [30, 34): 'Lord', [30, 34): 'Lord', 'Lord', 'PROPN', 'NNP', 'pobj', 4, 'Xxxx', 'O', '', True, False, [0, 35): 'She sold c shills by the Sith Lord.')
+                 (8, 8, [34, 35): '.', [34, 35): '.', '.', 'PUNCT', '.', 'punct', 1, '.', 'O', '', False, False, [0, 35): 'She sold c shills by the Sith Lord.')]"""
+            ),
+        )
+        df2 = make_tokens_and_features(
+            "She sold c shills by the Sith Lord.",
+            _SPACY_LANGUAGE_MODEL,
+            add_left_and_right=True,
+        )
+        # print(f"****{str(df2.to_records())}****")
+        self.assertEqual(
+            str(df2.to_records()),
+            textwrap.dedent(
+                """\
+                [(0, 0, [0, 3): 'She', [0, 3): 'She', '-PRON-', 'PRON', 'PRP', 'nsubj', 1, 'Xxx', 'O', '', True, True, [0, 35): 'She sold c shills by the Sith Lord.', , 1)
+                 (1, 1, [4, 8): 'sold', [4, 8): 'sold', 'sell', 'VERB', 'VBD', 'ROOT', 1, 'xxxx', 'O', '', True, False, [0, 35): 'She sold c shills by the Sith Lord.', 0, 2)
+                 (2, 2, [9, 10): 'c', [9, 10): 'c', 'c', 'NOUN', 'NN', 'compound', 3, 'x', 'O', '', True, False, [0, 35): 'She sold c shills by the Sith Lord.', 1, 3)
+                 (3, 3, [11, 17): 'shills', [11, 17): 'shills', 'shill', 'NOUN', 'NNS', 'dobj', 1, 'xxxx', 'O', '', True, False, [0, 35): 'She sold c shills by the Sith Lord.', 2, 4)
+                 (4, 4, [18, 20): 'by', [18, 20): 'by', 'by', 'ADP', 'IN', 'prep', 3, 'xx', 'O', '', True, True, [0, 35): 'She sold c shills by the Sith Lord.', 3, 5)
+                 (5, 5, [21, 24): 'the', [21, 24): 'the', 'the', 'DET', 'DT', 'det', 7, 'xxx', 'O', '', True, True, [0, 35): 'She sold c shills by the Sith Lord.', 4, 6)
+                 (6, 6, [25, 29): 'Sith', [25, 29): 'Sith', 'Sith', 'PROPN', 'NNP', 'compound', 7, 'Xxxx', 'O', '', True, False, [0, 35): 'She sold c shills by the Sith Lord.', 5, 7)
+                 (7, 7, [30, 34): 'Lord', [30, 34): 'Lord', 'Lord', 'PROPN', 'NNP', 'pobj', 4, 'Xxxx', 'O', '', True, False, [0, 35): 'She sold c shills by the Sith Lord.', 6, 8)
+                 (8, 8, [34, 35): '.', [34, 35): '.', '.', 'PUNCT', '.', 'punct', 1, '.', 'O', '', False, False, [0, 35): 'She sold c shills by the Sith Lord.', 7, )]"""
+            ),
+        )
+
+    def test_token_features_to_tree(self):
+        df = make_tokens_and_features(
+            "Peter Peeper packed a puck of liquid flubber.", _SPACY_LANGUAGE_MODEL
+        )
+        json = token_features_to_tree(df)
+        # print(f"****{json}****")
+        expected = {
+            "words": [
+                {"text": "Peter", "tag": "NNP"},
+                {"text": "Peeper", "tag": "NNP"},
+                {"text": "packed", "tag": "VBD"},
+                {"text": "a", "tag": "DT"},
+                {"text": "puck", "tag": "NN"},
+                {"text": "of", "tag": "IN"},
+                {"text": "liquid", "tag": "JJ"},
+                {"text": "flubber", "tag": "NN"},
+                {"text": ".", "tag": "."},
+            ],
+            "arcs": [
+                {"start": 0, "end": 1, "label": "compound", "dir": "left"},
+                {"start": 1, "end": 2, "label": "nsubj", "dir": "left"},
+                {"start": 3, "end": 4, "label": "det", "dir": "left"},
+                {"start": 2, "end": 4, "label": "dobj", "dir": "right"},
+                {"start": 4, "end": 5, "label": "prep", "dir": "right"},
+                {"start": 6, "end": 7, "label": "amod", "dir": "left"},
+                {"start": 5, "end": 7, "label": "pobj", "dir": "right"},
+                {"start": 2, "end": 8, "label": "punct", "dir": "right"},
+            ],
+        }
+        self.assertDictEqual(json, expected)
+
+    def test_iob_to_spans(self):
+        df = make_tokens_and_features(
+            textwrap.dedent(
+                """\
+                The Bermuda Triangle got tired of warm weather.
+                It moved to Alaska. Now Santa Claus is missing.
+                -- Steven Wright"""
+            ),
+            _SPACY_LANGUAGE_MODEL,
+        )
+        spans = iob_to_spans(df)
+        # print(f"****{spans}****")
+        self.assertEqual(
+            str(spans),
+            textwrap.dedent(
+                """\
+                                    token_span ent_type
+                0           [61, 67): 'Alaska'      GPE
+                1      [73, 84): 'Santa Claus'      GPE
+                2  [100, 113): 'Steven Wright'   PERSON"""
+            ),
+        )
+
+    def test_load_dict(self):
+        from spacy.lang.en import English
+        nlp = English()
+        tokenizer = nlp.Defaults.create_tokenizer(nlp)
+        df = load_dict("test_data/test_io/test.dict", tokenizer)
+        # print(f"***{df}***")
+        self.assertEqual(
+            str(df),
+            textwrap.dedent(
+                """\
+                       toks_0 toks_1  toks_2   toks_3 toks_4   toks_5 toks_6
+                0  dictionary  entry    None     None   None     None   None
+                1       entry   None    None     None   None     None   None
+                2        help     me       !        i     am  trapped   None
+                3          in      a   haiku  factory      !     None   None
+                4        save     me  before     they   None     None   None
+                5        None   None    None     None   None     None   None"""
+            )
+        )
+
+
+if __name__ == "__main__":
+    unittest.main()
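Note for reviewers: test_io.py loads the "en_core_web_sm" spaCy model at import
time, and the nlp.Defaults.create_tokenizer(nlp) calls use the spaCy 2.x
tokenizer API, so the tests assume a spaCy 2.x install with that model
downloaded. A minimal way to run the new tests locally from the repository
root (these exact commands are a suggestion, not part of the patch):

    python -m spacy download en_core_web_sm
    python -m unittest text_extensions_for_pandas.test_io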