Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Adding Stanza option to TTR, Entity Grid #13

Merged
merged 2 commits into from
Apr 3, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
41 changes: 29 additions & 12 deletions src/TRUNAJOD/entity_grid.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,9 @@
sequence and the API currently does not provide any hyper-parameter tuning to
change this.
"""
from TRUNAJOD.utils import SupportedModels

SPACY_UNIVERSAL_NOUN_TAGS = set([u'NOUN', u'PRON', u'PROPN'])
UNIVERSAL_NOUN_TAGS = set([u'NOUN', u'PRON', u'PROPN'])
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nice


ordered_transitions = [
u'SS', u'SO', u'SX', u'S-', u'OS', u'OO', u'OX', u'O-', u'XS', u'XO',
Expand Down Expand Up @@ -63,7 +64,7 @@ class EntityGrid(object):
module. It only supports 2-transitions entity grid.
"""

def __init__(self, doc):
def __init__(self, doc, model_name="spacy"):
"""Construct EntityGrid object."""
# Initialization
entity_map = dict()
Expand All @@ -88,9 +89,14 @@ def __init__(self, doc):
u'-X': 0,
u'--': 0
}
# check model
model = SupportedModels(model_name)

# Get number of sentences in the text
n_sent = len(list(doc.sents))
if model == SupportedModels.SPACY:
n_sent = len(list(doc.sents))
elif model == SupportedModels.STANZA:
n_sent = len(list(doc.sentences))

# To get coherence measurements we need at least 2 sentences
if n_sent < 2:
Expand All @@ -99,15 +105,26 @@ def __init__(self, doc):
.format(n_sent))

# For each sentence, get dependencies and its grammatical role
for sent in doc.sents:
for token in sent:
if token.pos_ in SPACY_UNIVERSAL_NOUN_TAGS:
entity_map['s%d' % i].append((token.text.upper(),
token.dep_))
if token.text.upper() not in entity_grid:
entity_grid[token.text.upper()] = [u'-'] * n_sent
i += 1
entity_map['s%d' % i] = []
if model == SupportedModels.SPACY:
for sent in doc.sents:
for token in sent:
if token.pos_ in UNIVERSAL_NOUN_TAGS:
entity_map['s%d' % i].append((token.text.upper(),
token.dep_))
if token.text.upper() not in entity_grid:
entity_grid[token.text.upper()] = [u'-'] * n_sent
i += 1
entity_map['s%d' % i] = []
elif model == SupportedModels.STANZA:
for sent in doc.sentences:
for word in sent.words:
if word.upos in UNIVERSAL_NOUN_TAGS:
entity_map['s%d' % i].append((word.text.upper(),
word.deprel))
if word.text.upper() not in entity_grid:
entity_grid[word.text.upper()] = ['-'] * n_sent
i += 1
entity_map['s%d' % i] = []

# Last iteration will create an extra element, so I remove it.
entity_map.pop('s%d' % i)
Expand Down
62 changes: 44 additions & 18 deletions src/TRUNAJOD/ttr.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,10 @@
tend to 0. This measurement is not recommended if analyzing texts of different
lengths, as when the number of tokens increases, the TTR tends to flatten.
"""
from TRUNAJOD.utils import is_word
from TRUNAJOD.utils import SupportedModels,is_word

# dev import
# from src.TRUNAJOD.utils import is_word


def type_token_ratio(word_list):
Expand All @@ -22,27 +25,36 @@ def type_token_ratio(word_list):
return len(set(word_list)) / len(word_list)


def lexical_diversity_mtld(doc, ttr_segment=0.72):
def lexical_diversity_mtld(doc, model_name="spacy", ttr_segment=0.72):
"""Compute MTLD lexical diversity in a bi-directional fashion.

:param doc: Processed text
:type doc: Spacy Doc
:type doc: NLP Doc
:return: Bi-directional lexical diversity MTLD
:rtype: float
"""
# check model
model = SupportedModels(model_name)

word_list = []
for token in doc:
if is_word(token.pos_):
word_list.append(token.lemma_)
return (one_side_lexical_diversity_mtld(word_list, ttr_segment) +
one_side_lexical_diversity_mtld(word_list[::-1], ttr_segment)) / 2
if model == SupportedModels.SPACY:
for token in doc:
if is_word(token.pos_):
word_list.append(token.lemma_)
elif model == SupportedModels.STANZA:
for sent in doc.sentences:
for word in sent.words:
if is_word(word.upos):
word_list.append(word.lemma)
return (one_side_lexical_diversity_mtld(word_list, model, ttr_segment) +
one_side_lexical_diversity_mtld(word_list[::-1], model, ttr_segment)) / 2


def one_side_lexical_diversity_mtld(doc, ttr_segment=0.72):
def one_side_lexical_diversity_mtld(doc, model_name="spacy", ttr_segment=0.72):
"""Lexical diversity per MTLD.

:param doc: Tokenized text
:type doc: Spacy Doc
:type doc: NLP Doc
:param ttr_segment: Threshold for TTR mean computation
:type ttr_segment: float
:return: MLTD lexical diversity
Expand All @@ -52,17 +64,31 @@ def one_side_lexical_diversity_mtld(doc, ttr_segment=0.72):
total_words = 0
non_ttr_segment = 1 - ttr_segment
word_list = []
for token in doc:
word_list.append(token.lower())
total_words += 1
ttr = type_token_ratio(word_list)
if ttr < ttr_segment:
word_list = []
factor += 1

# check model
model = SupportedModels(model_name)

if model == SupportedModels.SPACY or type(doc) == list:
for token in doc:
word_list.append(token.lower())
total_words += 1
ttr = type_token_ratio(word_list)
if ttr < ttr_segment:
word_list = []
factor += 1
elif model == SupportedModels.STANZA:
if type(doc) != list:
for sent in doc.sentences:
for word in sent.words:
word_list.append(word.text.lower())
total_words += 1
ttr = type_token_ratio(word_list)
if ttr < ttr_segment:
word_list = []
factor += 1

if word_list:
factor += 1 - (
type_token_ratio(word_list) - ttr_segment) / non_ttr_segment
total_words += 1

return total_words / factor
5 changes: 5 additions & 0 deletions src/TRUNAJOD/utils.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,12 @@
#!/usr/bin/env python
"""Utility functions for TRUNAJOD library."""
from enum import Enum


class SupportedModels(str, Enum):
    """Enumeration of the NLP backends TRUNAJOD can consume.

    Mixes in ``str`` so members compare equal to their plain string
    values (e.g. ``SupportedModels.SPACY == "spacy"``), which lets
    callers pass either the enum member or the raw string name.
    """

    SPACY = "spacy"
    STANZA = "stanza"

def flatten(list_of_lists):
"""Flatten a list of list.

Expand Down
43 changes: 43 additions & 0 deletions stanza_example.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
"""Example script comparing the spaCy and Stanza backends of TRUNAJOD."""
from TRUNAJOD.entity_grid import EntityGrid
from TRUNAJOD.ttr import lexical_diversity_mtld, one_side_lexical_diversity_mtld
import spacy
import stanza

# NOTE: MUST CHANGE TTR IMPORT TO WORK

# Build one Spanish pipeline per backend.
spacy_nlp = spacy.load("es_core_news_sm")
stanza_nlp = stanza.Pipeline('es', use_gpu=False)

# Sample Spanish text fed to both pipelines.
example_text = (
    "El espectáculo del cielo nocturno cautiva la mirada y suscita preguntas"
    "sobre el universo, su origen y su funcionamiento. No es sorprendente que "
    "todas las civilizaciones y culturas hayan formado sus propias "
    "cosmologías. Unas relatan, por ejemplo, que el universo ha"
    "sido siempre tal como es, con ciclos que inmutablemente se repiten; "
    "otras explican que este universo ha tenido un principio, "
    "que ha aparecido por obra creadora de una divinidad."
)

# Process the text once with each backend.
doc_spacy = spacy_nlp(example_text)
doc_stanza = stanza_nlp(example_text)

# Lexical diversity (MTLD) with each backend - change TTR import to test.
print("spacy result: ", lexical_diversity_mtld(doc_spacy))
# or
# print("spacy result: ", lexical_diversity_mtld(doc_spacy, model_name="spacy"))
print("stanza result: ", lexical_diversity_mtld(doc_stanza, model_name="stanza"))

# Entity grid with each backend.
grid_spacy = EntityGrid(doc_spacy)
grid_stanza = EntityGrid(doc_stanza, model_name="stanza")

print("spacy Entity grid:")
print(grid_spacy.get_egrid())

print("stanza Entity grid:")
print(grid_stanza.get_egrid())
68 changes: 68 additions & 0 deletions tester.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
#!/usr/bin/env python
"""Type Token Ratios module.

Type token ratios (TTR) are a measurement of lexical diversity. They are
defined as the ratio of unique tokens divided by the total number of tokens.
This measurement is bounded between 0 and 1. If there is no repetition in
the text this measurement is 1, and if there is infinite repetition, it will
tend to 0. This measurement is not recommended if analyzing texts of different
lengths, as when the number of tokens increases, the TTR tends to flatten.
"""
from TRUNAJOD.utils import is_word


def type_token_ratio(word_list):
    """Return Type Token Ratio of a word list.

    TTR is the number of unique words divided by the total number of
    words, bounded in [0, 1].

    :param word_list: List of words
    :type word_list: List of strings
    :return: TTR of the word list (0.0 for an empty list)
    :rtype: float
    """
    # Guard against ZeroDivisionError: an empty text has no lexical
    # diversity, so 0.0 is the natural value.
    if not word_list:
        return 0.0
    return len(set(word_list)) / len(word_list)


def lexical_diversity_mtld(doc, ttr_segment=0.72):
    """Compute MTLD lexical diversity in a bi-directional fashion.

    Collects the lemmas of the word tokens in *doc*, computes the
    one-sided MTLD over them forward and backward, and averages the two.

    :param doc: Processed text
    :type doc: Spacy Doc
    :param ttr_segment: Threshold for TTR mean computation
    :type ttr_segment: float
    :return: Bi-directional lexical diversity MTLD
    :rtype: float
    """
    lemmas = [token.lemma_ for token in doc if is_word(token.pos_)]
    forward = one_side_lexical_diversity_mtld(lemmas, ttr_segment)
    backward = one_side_lexical_diversity_mtld(lemmas[::-1], ttr_segment)
    return (forward + backward) / 2


def one_side_lexical_diversity_mtld(doc, ttr_segment=0.72):
    """Lexical diversity per MTLD.

    Walks through the tokens keeping a running type-token ratio.  Each
    time the running TTR drops below ``ttr_segment`` one "factor" is
    completed and the window restarts; a leftover partial window counts
    as a fractional factor.  The score is total words over factors.

    :param doc: Tokenized text
    :type doc: iterable of strings
    :param ttr_segment: Threshold for TTR mean computation
    :type ttr_segment: float
    :return: MLTD lexical diversity
    :rtype: float
    """
    factor = 0
    total_words = 0
    non_ttr_segment = 1 - ttr_segment
    word_list = []
    for token in doc:
        word_list.append(token.lower())
        total_words += 1
        # Running type-token ratio of the current segment.
        ttr = len(set(word_list)) / len(word_list)
        if ttr < ttr_segment:
            word_list = []
            factor += 1

    if word_list:
        if non_ttr_segment != 0:
            # Fractional factor: how far the remaining segment's TTR
            # progressed from 1 toward the threshold.
            remainder_ttr = len(set(word_list)) / len(word_list)
            factor += 1 - (remainder_ttr - ttr_segment) / non_ttr_segment
        else:
            # ttr_segment == 1 would make the formula 0/0; the limit is
            # one whole factor, and this avoids a ZeroDivisionError.
            factor += 1
        # NOTE(review): the leftover segment also adds one extra word in
        # the original implementation; kept for backward compatibility.
        total_words += 1

    if factor == 0:
        # Happens for empty input, or when every segment stayed at or
        # above the threshold and the tail's fractional factor is 0
        # (e.g. a short all-unique text).  Treat the whole text as one
        # factor instead of dividing by zero.
        return float(total_words)

    return total_words / factor
6 changes: 3 additions & 3 deletions tests/ttr_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,13 +7,13 @@
def test_type_token_ratio():
"""Test type_token_ratio func."""
assert ttr.type_token_ratio(
['hola', 'hola', 'chao', 'hola', 'perro', 'hola'], ) == 0.5
['hola', 'hola', 'chao', 'hola', 'perro', 'hola']) == 0.5


def test_one_side_lexical_diversity_mtld():
"""Test one_side_lexical_diversity_mtld."""
assert ttr.one_side_lexical_diversity_mtld(
['hola', 'hola', 'chao', 'hola', 'perro', 'hola'], 1) == 3
['hola', 'hola', 'chao', 'hola', 'perro', 'hola'], ttr_segment=1) == 3


def test_lexical_diversity_mtld():
Expand All @@ -27,4 +27,4 @@ def test_lexical_diversity_mtld():
Token('perro', 'perro'),
Token('hola', 'hola'),
]
assert ttr.lexical_diversity_mtld(doc, 1) == 3
assert ttr.lexical_diversity_mtld(doc, ttr_segment=1) == 3