-
Notifications
You must be signed in to change notification settings - Fork 8
/
nlp_lemmatizer_plus.py
126 lines (113 loc) · 3.71 KB
/
nlp_lemmatizer_plus.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
# -*- coding: utf-8 -*-
from spacy.tokens import Token
from iwnlp.iwnlp_wrapper import IWNLPWrapper
from constants import (
ADJ,
ADV,
INTJ,
NOUN,
PROPN,
VERB,
ADP,
AUX,
CCONJ,
CONJ,
DET,
NUM,
PART,
PRON,
SCONJ,
PUNCT,
SYM,
X,
SPACE,
PHRASE,
NPHRASE,
)
class LemmatizerPlus(object):
    """spaCy pipeline component that attaches IWNLP-based lemmas to tokens.

    Registers the ``iwnlp_lemmas`` token extension whose getter is
    :meth:`lemmatize`, and memoizes lemmatization results per
    ``(text, pos)`` pair in ``self.lookup`` (both hits and misses, so the
    expensive fallback searches run at most once per distinct pair).
    """

    def __init__(self, lemmatizer_path, nlp):
        """
        :param lemmatizer_path: path to the IWNLP processed-dictionary file.
        :param nlp: a loaded spaCy pipeline; its vocab string store is reused.
        """
        self.lemmatizer = IWNLPWrapper(lemmatizer_path=lemmatizer_path)
        # Shared string store so derived lemmas are interned in the vocab.
        self.stringstore = nlp.vocab.strings
        # self.matcher = PhraseMatcher(nlp.vocab)
        Token.set_extension("iwnlp_lemmas", getter=self.lemmatize, force=True)
        # Memoization cache, pre-seeded with known irregular cases.
        self.lookup = {
            ("fast", ADV): "fast",
        }

    def __call__(self, doc):
        """Pipeline hook: eagerly lemmatize every token in ``doc``."""
        # NOTE(review): "iwnlp_lemmas" is registered with a getter only;
        # recent spaCy versions raise on assignment to such an extension —
        # confirm the targeted spaCy version permits this write (the getter
        # alone already makes this loop redundant).
        for token in doc:
            token._.iwnlp_lemmas = self.lemmatize(token)
        return doc

    def lemmatize(self, token):
        """Derive a lemma for a single spaCy token via IWNLP.

        Uses the IWNLP lemmatizer with a few enhancements for compound nouns
        and nouns with uncommon capitalization; other POS tags fall back to a
        plain case-insensitive lookup. Do not use this to lemmatize phrases.

        :param token: a spaCy ``Token`` (its text is stripped before lookup)
        :return: the lemma (str) if one can be derived, else None.
            Phrase-/punctuation-like tokens are returned verbatim.
        """
        text = token.text.strip()
        pos = token.pos_
        # Nothing to lemmatize here.
        if pos in {PHRASE, NPHRASE, PUNCT, SPACE, SYM}:
            return text
        # Lemmatizations are odd on DET and NUM, so better leave them alone.
        if pos in {DET, NUM}:
            return None
        # Wiktionary has no POS PROPN; treat proper nouns as common nouns.
        if pos == PROPN:
            pos = NOUN
        # Cached result for this (text, POS) pair? Misses are cached as None
        # too, so the fallback searches below run at most once per pair.
        if (text, pos) in self.lookup:
            return self.lookup[(text, pos)]
        value = self._derive_lemma(text, pos)
        # Only nouns keep their capitalization; everything else is lowercased.
        if value and pos in {
            ADJ,
            ADP,
            ADV,
            AUX,
            CCONJ,
            CONJ,
            INTJ,
            PART,
            PRON,
            SCONJ,
            VERB,
        }:
            value = value.lower()
        if value:
            self.stringstore.add(value)
        # Cache hits and misses alike (miss == None).
        self.lookup[(text, pos)] = value
        return value

    def _derive_lemma(self, text, pos):
        """Uncached IWNLP lookup with POS-specific fallbacks; str or None."""
        # Default IWNLP lemmatization.
        lemm = self.lemmatizer.lemmatize(text, pos)
        if lemm:
            return lemm[0]
        # Default lemmatization missed: nouns get rule-based fallbacks,
        # all remaining POS tags get one last plain, case-insensitive try.
        if pos == NOUN:
            return self._derive_noun_lemma(text)
        lemm = self.lemmatizer.lemmatize_plain(text, ignore_case=True)
        return lemm[0] if lemm else None

    def _derive_noun_lemma(self, text):
        """Noun fallbacks: default capitalization, then compound suffixes."""
        # First try default noun capitalization.
        lemm = self.lemmatizer.lemmatize(text.title(), NOUN)
        if lemm:
            return lemm[0]
        # Still no result: try all noun suffixes (compound splitting).
        # TODO: search for a more efficient implementation.
        text_low = text.lower()
        tolerance = 3
        for i in range(1, len(text) - tolerance):
            # Looks ugly, but avoids full capitalization.
            text_edit = text_low[i].upper() + text_low[i + 1:]
            lemm = self.lemmatizer.lemmatize(text_edit, NOUN)
            if lemm:
                return (text[:i] + lemm[0]).title()
        return None