From 060346f8165183c12f9b5f96180254eb9c6a4909 Mon Sep 17 00:00:00 2001 From: supersonic1999 Date: Sat, 17 Apr 2021 12:04:15 +0100 Subject: [PATCH 1/2] implemented hapax legomena index --- src/TRUNAJOD/ttr.py | 28 +++++++++++++++++++++++++++- tests/ttr_test.py | 16 ++++++++++++++++ 2 files changed, 43 insertions(+), 1 deletion(-) diff --git a/src/TRUNAJOD/ttr.py b/src/TRUNAJOD/ttr.py index 593db54..90f4cbc 100644 --- a/src/TRUNAJOD/ttr.py +++ b/src/TRUNAJOD/ttr.py @@ -140,7 +140,6 @@ def yule_k(doc: Doc) -> float: return 1e-4 * sum(r ** 2 * vr - N for r, vr in rs.items()) / N ** 2 - def d_estimate( doc: Doc, min_range: int = 35, max_range: int = 50, trials: int = 5 ) -> float: @@ -200,3 +199,30 @@ def d_estimate( y = ttrs ** 2 d = np.linalg.lstsq(A, y, rcond=None)[0] return d[0] + +def hapax_legomena_index(doc: Doc) -> int: + """Hapax Legomena Index from a text. + + Hapax Legomena Index is the number of words occurring once in a text. + + :param doc: Processed spaCy Doc + :type doc: Doc + :return: Text's Hapax Legomena Index + :rtype: int + """ + word_counter = 0 + word_dupe_counter = 0 + words = {} + for token in doc: + if is_word(token.pos_): + word_counter += 1 + if str(token.pos_) not in words: + words[str(token.pos_)] = 1 + else: + words[str(token.pos_)] += 1 + + for key, value in words.items(): + if int(value) > 1: + word_dupe_counter += int(value) + return word_counter - word_dupe_counter + diff --git a/tests/ttr_test.py b/tests/ttr_test.py index cff37bf..300984b 100644 --- a/tests/ttr_test.py +++ b/tests/ttr_test.py @@ -90,3 +90,19 @@ def test_d_estimate(): np.random.seed(0) assert ttr.d_estimate(doc) == 119.4468681409897 + + +def test_hapax_legomena_index(): + """Test hapax_legomena_index.""" + Token = namedtuple("Token", "lemma_ pos_") + doc = [ + Token("hola", "hola"), + Token("hola", "hola"), + Token("chao", "chao"), + Token("hola", "hola"), + Token("perro", "perro"), + Token("hola", "hola"), + ] + + answer = 2 + assert ttr.hapax_legomena_index(doc) == 
answer From ee2276c5aafb27772bcbc63712d9ff5b5bdd575d Mon Sep 17 00:00:00 2001 From: supersonic1999 Date: Sat, 17 Apr 2021 18:26:03 +0100 Subject: [PATCH 2/2] ran pre-commit again --- src/TRUNAJOD/ttr.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/TRUNAJOD/ttr.py b/src/TRUNAJOD/ttr.py index 90f4cbc..38380d1 100644 --- a/src/TRUNAJOD/ttr.py +++ b/src/TRUNAJOD/ttr.py @@ -140,6 +140,7 @@ def yule_k(doc: Doc) -> float: return 1e-4 * sum(r ** 2 * vr - N for r, vr in rs.items()) / N ** 2 + def d_estimate( doc: Doc, min_range: int = 35, max_range: int = 50, trials: int = 5 ) -> float: @@ -200,6 +201,7 @@ def d_estimate( d = np.linalg.lstsq(A, y, rcond=None)[0] return d[0] + def hapax_legomena_index(doc: Doc) -> int: """Hapax Legomena Index from a text. @@ -225,4 +227,3 @@ def hapax_legomena_index(doc: Doc) -> int: if int(value) > 1: word_dupe_counter += int(value) return word_counter - word_dupe_counter -