From b82b9055f5161a5483f76ecf60f4b233fa04323c Mon Sep 17 00:00:00 2001
From: frreiss
Date: Wed, 25 Mar 2020 14:52:30 -0700
Subject: [PATCH] Add tests of io.py

---
 test_data/test_io/test.dict           |   9 ++
 text_extensions_for_pandas/test_io.py | 172 ++++++++++++++++++++++++++
 2 files changed, 181 insertions(+)
 create mode 100644 test_data/test_io/test.dict
 create mode 100644 text_extensions_for_pandas/test_io.py

diff --git a/test_data/test_io/test.dict b/test_data/test_io/test.dict
new file mode 100644
index 00000000..204dd72e
--- /dev/null
+++ b/test_data/test_io/test.dict
@@ -0,0 +1,9 @@
+# This is a comment.
+Dictionary Entry
+Entry
+# This is also a comment.
+#
+Help me! I am trapped
+In a Haiku factory!
+Save me before they
+
diff --git a/text_extensions_for_pandas/test_io.py b/text_extensions_for_pandas/test_io.py
new file mode 100644
index 00000000..3bed5711
--- /dev/null
+++ b/text_extensions_for_pandas/test_io.py
@@ -0,0 +1,172 @@
+#
+# Copyright (c) 2020 IBM Corp.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+import numpy as np
+import unittest
+import textwrap
+
+from text_extensions_for_pandas.io import *
+
+import spacy
+
+_SPACY_LANGUAGE_MODEL = spacy.load("en_core_web_sm")
+
+
+class IOTest(unittest.TestCase):
+    def test_make_tokens(self):
+        from spacy.lang.en import English
+        nlp = English()
+        tokenizer = nlp.Defaults.create_tokenizer(nlp)
+        series = make_tokens(
+            "The quick, brown fox jumped over the hazy bog...", tokenizer
+        )
+        self.assertEqual(
+            repr(series),
+            textwrap.dedent(
+                """\
+                0        [0, 3): 'The'
+                1      [4, 9): 'quick'
+                2         [9, 10): ','
+                3    [11, 16): 'brown'
+                4      [17, 20): 'fox'
+                5   [21, 27): 'jumped'
+                6     [28, 32): 'over'
+                7      [33, 36): 'the'
+                8     [37, 41): 'hazy'
+                9      [42, 45): 'bog'
+                10    [45, 48): '...'
+                dtype: CharSpan"""
+            ),
+        )
+
+    def test_make_tokens_and_features(self):
+        df = make_tokens_and_features(
+            "She sold c shills by the Sith Lord.", _SPACY_LANGUAGE_MODEL
+        )
+        # print(f"****{str(df.to_records())}****")
+        self.assertEqual(
+            str(df.to_records()),
+            textwrap.dedent(
+                """\
+                [(0, 0, [0, 3): 'She', [0, 3): 'She', '-PRON-', 'PRON', 'PRP', 'nsubj', 1, 'Xxx', 'O', '', True, True, [0, 35): 'She sold c shills by the Sith Lord.')
+                 (1, 1, [4, 8): 'sold', [4, 8): 'sold', 'sell', 'VERB', 'VBD', 'ROOT', 1, 'xxxx', 'O', '', True, False, [0, 35): 'She sold c shills by the Sith Lord.')
+                 (2, 2, [9, 10): 'c', [9, 10): 'c', 'c', 'NOUN', 'NN', 'compound', 3, 'x', 'O', '', True, False, [0, 35): 'She sold c shills by the Sith Lord.')
+                 (3, 3, [11, 17): 'shills', [11, 17): 'shills', 'shill', 'NOUN', 'NNS', 'dobj', 1, 'xxxx', 'O', '', True, False, [0, 35): 'She sold c shills by the Sith Lord.')
+                 (4, 4, [18, 20): 'by', [18, 20): 'by', 'by', 'ADP', 'IN', 'prep', 3, 'xx', 'O', '', True, True, [0, 35): 'She sold c shills by the Sith Lord.')
+                 (5, 5, [21, 24): 'the', [21, 24): 'the', 'the', 'DET', 'DT', 'det', 7, 'xxx', 'O', '', True, True, [0, 35): 'She sold c shills by the Sith Lord.')
+                 (6, 6, [25, 29): 'Sith', [25, 29): 'Sith', 'Sith', 'PROPN', 'NNP', 'compound', 7, 'Xxxx', 'O', '', True, False, [0, 35): 'She sold c shills by the Sith Lord.')
+                 (7, 7, [30, 34): 'Lord', [30, 34): 'Lord', 'Lord', 'PROPN', 'NNP', 'pobj', 4, 'Xxxx', 'O', '', True, False, [0, 35): 'She sold c shills by the Sith Lord.')
+                 (8, 8, [34, 35): '.', [34, 35): '.', '.', 'PUNCT', '.', 'punct', 1, '.', 'O', '', False, False, [0, 35): 'She sold c shills by the Sith Lord.')]"""
+            ),
+        )
+        df2 = make_tokens_and_features(
+            "She sold c shills by the Sith Lord.",
+            _SPACY_LANGUAGE_MODEL,
+            add_left_and_right=True,
+        )
+        # print(f"****{str(df2.to_records())}****")
+        self.assertEqual(
+            str(df2.to_records()),
+            textwrap.dedent(
+                """\
+                [(0, 0, [0, 3): 'She', [0, 3): 'She', '-PRON-', 'PRON', 'PRP', 'nsubj', 1, 'Xxx', 'O', '', True, True, [0, 35): 'She sold c shills by the Sith Lord.', , 1)
+                 (1, 1, [4, 8): 'sold', [4, 8): 'sold', 'sell', 'VERB', 'VBD', 'ROOT', 1, 'xxxx', 'O', '', True, False, [0, 35): 'She sold c shills by the Sith Lord.', 0, 2)
+                 (2, 2, [9, 10): 'c', [9, 10): 'c', 'c', 'NOUN', 'NN', 'compound', 3, 'x', 'O', '', True, False, [0, 35): 'She sold c shills by the Sith Lord.', 1, 3)
+                 (3, 3, [11, 17): 'shills', [11, 17): 'shills', 'shill', 'NOUN', 'NNS', 'dobj', 1, 'xxxx', 'O', '', True, False, [0, 35): 'She sold c shills by the Sith Lord.', 2, 4)
+                 (4, 4, [18, 20): 'by', [18, 20): 'by', 'by', 'ADP', 'IN', 'prep', 3, 'xx', 'O', '', True, True, [0, 35): 'She sold c shills by the Sith Lord.', 3, 5)
+                 (5, 5, [21, 24): 'the', [21, 24): 'the', 'the', 'DET', 'DT', 'det', 7, 'xxx', 'O', '', True, True, [0, 35): 'She sold c shills by the Sith Lord.', 4, 6)
+                 (6, 6, [25, 29): 'Sith', [25, 29): 'Sith', 'Sith', 'PROPN', 'NNP', 'compound', 7, 'Xxxx', 'O', '', True, False, [0, 35): 'She sold c shills by the Sith Lord.', 5, 7)
+                 (7, 7, [30, 34): 'Lord', [30, 34): 'Lord', 'Lord', 'PROPN', 'NNP', 'pobj', 4, 'Xxxx', 'O', '', True, False, [0, 35): 'She sold c shills by the Sith Lord.', 6, 8)
+                 (8, 8, [34, 35): '.', [34, 35): '.', '.', 'PUNCT', '.', 'punct', 1, '.', 'O', '', False, False, [0, 35): 'She sold c shills by the Sith Lord.', 7, )]"""
+            ),
+        )
+
+    def test_token_features_to_tree(self):
+        df = make_tokens_and_features(
+            "Peter Peeper packed a puck of liquid flubber.", _SPACY_LANGUAGE_MODEL
+        )
+        json = token_features_to_tree(df)
+        # print(f"****{json}****")
+        expected = {
+            "words": [
+                {"text": "Peter", "tag": "NNP"},
+                {"text": "Peeper", "tag": "NNP"},
+                {"text": "packed", "tag": "VBD"},
+                {"text": "a", "tag": "DT"},
+                {"text": "puck", "tag": "NN"},
+                {"text": "of", "tag": "IN"},
+                {"text": "liquid", "tag": "JJ"},
+                {"text": "flubber", "tag": "NN"},
+                {"text": ".", "tag": "."},
+            ],
+            "arcs": [
+                {"start": 0, "end": 1, "label": "compound", "dir": "left"},
+                {"start": 1, "end": 2, "label": "nsubj", "dir": "left"},
+                {"start": 3, "end": 4, "label": "det", "dir": "left"},
+                {"start": 2, "end": 4, "label": "dobj", "dir": "right"},
+                {"start": 4, "end": 5, "label": "prep", "dir": "right"},
+                {"start": 6, "end": 7, "label": "amod", "dir": "left"},
+                {"start": 5, "end": 7, "label": "pobj", "dir": "right"},
+                {"start": 2, "end": 8, "label": "punct", "dir": "right"},
+            ],
+        }
+        self.assertDictEqual(json, expected)
+
+    def test_iob_to_spans(self):
+        df = make_tokens_and_features(
+            textwrap.dedent(
+                """\
+                The Bermuda Triangle got tired of warm weather.
+                It moved to Alaska. Now Santa Claus is missing.
+                -- Steven Wright"""
+            ),
+            _SPACY_LANGUAGE_MODEL,
+        )
+        spans = iob_to_spans(df)
+        # print(f"****{spans}****")
+        self.assertEqual(
+            str(spans),
+            textwrap.dedent(
+                """\
+                                    token_span ent_type
+                0           [61, 67): 'Alaska'      GPE
+                1      [73, 84): 'Santa Claus'      GPE
+                2  [100, 113): 'Steven Wright'   PERSON"""
+            ),
+        )
+
+    def test_load_dict(self):
+        from spacy.lang.en import English
+        nlp = English()
+        tokenizer = nlp.Defaults.create_tokenizer(nlp)
+        df = load_dict("test_data/test_io/test.dict", tokenizer)
+        # print(f"***{df}***")
+        self.assertEqual(
+            str(df),
+            textwrap.dedent(
+                """\
+                       toks_0 toks_1  toks_2   toks_3 toks_4   toks_5 toks_6
+                0  dictionary  entry    None     None   None     None   None
+                1       entry   None    None     None   None     None   None
+                2        help     me       !        i     am  trapped   None
+                3          in      a   haiku  factory      !     None   None
+                4        save     me  before     they   None     None   None
+                5        None   None    None     None   None     None   None"""
+            )
+        )
+
+
+if __name__ == "__main__":
+    unittest.main()
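Note for reviewers: test_io.py loads the "en_core_web_sm" spaCy model at import
time, and the nlp.Defaults.create_tokenizer(nlp) calls use the spaCy 2.x
tokenizer API, so the tests assume a spaCy 2.x install with that model
downloaded. A minimal way to run the new tests locally from the repository
root (these exact commands are a suggestion, not part of the patch):

    python -m spacy download en_core_web_sm
    python -m unittest text_extensions_for_pandas.test_io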