-
Notifications
You must be signed in to change notification settings - Fork 9
/
entity_extractors.py
32 lines (22 loc) · 941 Bytes
/
entity_extractors.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
from typing import Generator
from legal_ner.preprocessing import HtmlConcealer
class EntityExtractor:
    """Run a pipeline over text and expose the entities of the resulting document.

    The pipeline object must provide ``run(text)``; whatever it returns is kept
    in ``self.doc`` and is expected to expose an iterable ``ents`` attribute.
    """

    def __init__(self, pipeline):
        """Remember *pipeline*; ``doc`` stays ``None`` until ``run`` is called."""
        self.pipeline = pipeline
        self.doc = None

    def run(self, text):
        """Feed *text* to the pipeline and store the document it returns."""
        processed = self.pipeline.run(text)
        self.doc = processed

    def get_entities(self, entity_type) -> Generator:
        """Lazily yield ``(text, start_char, end_char)`` for matching entities.

        Only entities whose ``label_`` equals *entity_type* are produced, in
        the order they appear in ``self.doc.ents``. ``run`` has to be called
        first, because the stored document is read on first iteration.
        """
        # The filter is itself lazy, so attributes are still read one entity
        # at a time while the caller consumes the generator.
        matching = (
            span for span in self.doc.ents if entity_type == span.label_
        )
        for span in matching:
            yield (span.text, span.start_char, span.end_char)
class HtmlEntityExtractor(EntityExtractor):
    """Entity extractor for HTML input.

    The input is passed through an ``HtmlConcealer`` before the pipeline runs,
    and the character offsets of the entities found are converted back with
    the concealer's ``concealed_to_html_pos``.
    """

    # HtmlConcealer built by the latest run() call; None until run() is called.
    html_concealer = None

    def run(self, text):
        """Conceal *text* and run the pipeline on the concealer's content."""
        concealer = HtmlConcealer(text)
        self.html_concealer = concealer
        concealer.conceal()
        concealed_content = concealer.get_content()
        super().run(concealed_content)

    def get_entities(self, entity_type) -> Generator:
        """Yield ``(text, start, end)`` for entities labelled *entity_type*.

        The entity text is passed through as produced by the base class; only
        the two offsets are converted via ``concealed_to_html_pos``.
        """
        for value, concealed_start, concealed_end in super().get_entities(
            entity_type
        ):
            html_start, html_end = self.html_concealer.concealed_to_html_pos(
                concealed_start, concealed_end
            )
            yield value, html_start, html_end