Add tests of io.py

CODAIT · Mar 25, 2020 · b82b905 · b82b905
1 parent 0f60fdc
commit b82b905
Show file tree

Hide file tree

Showing 2 changed files with 181 additions and 0 deletions.
diff --git a/test_data/test_io/test.dict b/test_data/test_io/test.dict
@@ -0,0 +1,9 @@
+# This is a comment.
+Dictionary Entry
+Entry
+# This is also a comment.
+#
+Help me! I am trapped
+In a Haiku factory!
+Save me before they
+
diff --git a/text_extensions_for_pandas/test_io.py b/text_extensions_for_pandas/test_io.py
@@ -0,0 +1,172 @@
+#
+# Copyright (c) 2020 IBM Corp.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http:https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+import numpy as np
+import unittest
+import textwrap
+
+from text_extensions_for_pandas.io import *
+
+import spacy
+
+_SPACY_LANGUAGE_MODEL = spacy.load("en_core_web_sm")
+
+
+class IOTest(unittest.TestCase):
+ def test_make_tokens(self):
+ from spacy.lang.en import English
+ nlp = English()
+ tokenizer = nlp.Defaults.create_tokenizer(nlp)
+ series = make_tokens(
+ "The quick, brown fox jumped over the hazy bog...", tokenizer
+ )
+ self.assertEqual(
+ repr(series),
+ textwrap.dedent(
+ """\
+ 0 [0, 3): 'The'
+ 1 [4, 9): 'quick'
+ 2 [9, 10): ','
+ 3 [11, 16): 'brown'
+ 4 [17, 20): 'fox'
+ 5 [21, 27): 'jumped'
+ 6 [28, 32): 'over'
+ 7 [33, 36): 'the'
+ 8 [37, 41): 'hazy'
+ 9 [42, 45): 'bog'
+ 10 [45, 48): '...'
+ dtype: CharSpan"""
+ ),
+ )
+
+ def test_make_tokens_and_features(self):
+ df = make_tokens_and_features(
+ "She sold c shills by the Sith Lord.", _SPACY_LANGUAGE_MODEL
+ )
+ # print(f"****{str(df.to_records())}****")
+ self.assertEqual(
+ str(df.to_records()),
+ textwrap.dedent(
+ """\
+ [(0, 0, [0, 3): 'She', [0, 3): 'She', '-PRON-', 'PRON', 'PRP', 'nsubj', 1, 'Xxx', 'O', '', True, True, [0, 35): 'She sold c shills by the Sith Lord.')
+ (1, 1, [4, 8): 'sold', [4, 8): 'sold', 'sell', 'VERB', 'VBD', 'ROOT', 1, 'xxxx', 'O', '', True, False, [0, 35): 'She sold c shills by the Sith Lord.')
+ (2, 2, [9, 10): 'c', [9, 10): 'c', 'c', 'NOUN', 'NN', 'compound', 3, 'x', 'O', '', True, False, [0, 35): 'She sold c shills by the Sith Lord.')
+ (3, 3, [11, 17): 'shills', [11, 17): 'shills', 'shill', 'NOUN', 'NNS', 'dobj', 1, 'xxxx', 'O', '', True, False, [0, 35): 'She sold c shills by the Sith Lord.')
+ (4, 4, [18, 20): 'by', [18, 20): 'by', 'by', 'ADP', 'IN', 'prep', 3, 'xx', 'O', '', True, True, [0, 35): 'She sold c shills by the Sith Lord.')
+ (5, 5, [21, 24): 'the', [21, 24): 'the', 'the', 'DET', 'DT', 'det', 7, 'xxx', 'O', '', True, True, [0, 35): 'She sold c shills by the Sith Lord.')
+ (6, 6, [25, 29): 'Sith', [25, 29): 'Sith', 'Sith', 'PROPN', 'NNP', 'compound', 7, 'Xxxx', 'O', '', True, False, [0, 35): 'She sold c shills by the Sith Lord.')
+ (7, 7, [30, 34): 'Lord', [30, 34): 'Lord', 'Lord', 'PROPN', 'NNP', 'pobj', 4, 'Xxxx', 'O', '', True, False, [0, 35): 'She sold c shills by the Sith Lord.')
+ (8, 8, [34, 35): '.', [34, 35): '.', '.', 'PUNCT', '.', 'punct', 1, '.', 'O', '', False, False, [0, 35): 'She sold c shills by the Sith Lord.')]"""
+ ),
+ )
+ df2 = make_tokens_and_features(
+ "She sold c shills by the Sith Lord.",
+ _SPACY_LANGUAGE_MODEL,
+ add_left_and_right=True,
+ )
+ # print(f"****{str(df2.to_records())}****")
+ self.assertEqual(
+ str(df2.to_records()),
+ textwrap.dedent(
+ """\
+ [(0, 0, [0, 3): 'She', [0, 3): 'She', '-PRON-', 'PRON', 'PRP', 'nsubj', 1, 'Xxx', 'O', '', True, True, [0, 35): 'She sold c shills by the Sith Lord.', <NA>, 1)
+ (1, 1, [4, 8): 'sold', [4, 8): 'sold', 'sell', 'VERB', 'VBD', 'ROOT', 1, 'xxxx', 'O', '', True, False, [0, 35): 'She sold c shills by the Sith Lord.', 0, 2)
+ (2, 2, [9, 10): 'c', [9, 10): 'c', 'c', 'NOUN', 'NN', 'compound', 3, 'x', 'O', '', True, False, [0, 35): 'She sold c shills by the Sith Lord.', 1, 3)
+ (3, 3, [11, 17): 'shills', [11, 17): 'shills', 'shill', 'NOUN', 'NNS', 'dobj', 1, 'xxxx', 'O', '', True, False, [0, 35): 'She sold c shills by the Sith Lord.', 2, 4)
+ (4, 4, [18, 20): 'by', [18, 20): 'by', 'by', 'ADP', 'IN', 'prep', 3, 'xx', 'O', '', True, True, [0, 35): 'She sold c shills by the Sith Lord.', 3, 5)
+ (5, 5, [21, 24): 'the', [21, 24): 'the', 'the', 'DET', 'DT', 'det', 7, 'xxx', 'O', '', True, True, [0, 35): 'She sold c shills by the Sith Lord.', 4, 6)
+ (6, 6, [25, 29): 'Sith', [25, 29): 'Sith', 'Sith', 'PROPN', 'NNP', 'compound', 7, 'Xxxx', 'O', '', True, False, [0, 35): 'She sold c shills by the Sith Lord.', 5, 7)
+ (7, 7, [30, 34): 'Lord', [30, 34): 'Lord', 'Lord', 'PROPN', 'NNP', 'pobj', 4, 'Xxxx', 'O', '', True, False, [0, 35): 'She sold c shills by the Sith Lord.', 6, 8)
+ (8, 8, [34, 35): '.', [34, 35): '.', '.', 'PUNCT', '.', 'punct', 1, '.', 'O', '', False, False, [0, 35): 'She sold c shills by the Sith Lord.', 7, <NA>)]"""
+ ),
+ )
+
+ def test_token_features_to_tree(self):
+ df = make_tokens_and_features(
+ "Peter Peeper packed a puck of liquid flubber.", _SPACY_LANGUAGE_MODEL
+ )
+ json = token_features_to_tree(df)
+ # print(f"****{json}****")
+ expected = {
+ "words": [
+ {"text": "Peter", "tag": "NNP"},
+ {"text": "Peeper", "tag": "NNP"},
+ {"text": "packed", "tag": "VBD"},
+ {"text": "a", "tag": "DT"},
+ {"text": "puck", "tag": "NN"},
+ {"text": "of", "tag": "IN"},
+ {"text": "liquid", "tag": "JJ"},
+ {"text": "flubber", "tag": "NN"},
+ {"text": ".", "tag": "."},
+ ],
+ "arcs": [
+ {"start": 0, "end": 1, "label": "compound", "dir": "left"},
+ {"start": 1, "end": 2, "label": "nsubj", "dir": "left"},
+ {"start": 3, "end": 4, "label": "det", "dir": "left"},
+ {"start": 2, "end": 4, "label": "dobj", "dir": "right"},
+ {"start": 4, "end": 5, "label": "prep", "dir": "right"},
+ {"start": 6, "end": 7, "label": "amod", "dir": "left"},
+ {"start": 5, "end": 7, "label": "pobj", "dir": "right"},
+ {"start": 2, "end": 8, "label": "punct", "dir": "right"},
+ ],
+ }
+ self.assertDictEqual(json, expected)
+
+ def test_iob_to_spans(self):
+ df = make_tokens_and_features(
+ textwrap.dedent(
+ """\
+ The Bermuda Triangle got tired of warm weather. 
+ It moved to Alaska. Now Santa Claus is missing.
+ -- Steven Wright"""
+ ),
+ _SPACY_LANGUAGE_MODEL,
+ )
+ spans = iob_to_spans(df)
+ # print(f"****{spans}****")
+ self.assertEqual(
+ str(spans),
+ textwrap.dedent(
+ """\
+ token_span ent_type
+ 0 [61, 67): 'Alaska' GPE
+ 1 [73, 84): 'Santa Claus' GPE
+ 2 [100, 113): 'Steven Wright' PERSON"""
+ ),
+ )
+
+ def test_load_dict(self):
+ from spacy.lang.en import English
+ nlp = English()
+ tokenizer = nlp.Defaults.create_tokenizer(nlp)
+ df = load_dict("test_data/test_io/test.dict", tokenizer)
+ # print(f"***{df}***")
+ self.assertEqual(
+ str(df),
+ textwrap.dedent(
+ """\
+ toks_0 toks_1 toks_2 toks_3 toks_4 toks_5 toks_6
+ 0 dictionary entry None None None None None
+ 1 entry None None None None None None
+ 2 help me ! i am trapped None
+ 3 in a haiku factory ! None None
+ 4 save me before they None None None
+ 5 None None None None None None None"""
+ )
+ )
+
+
+if __name__ == "__main__":
+ unittest.main()