-
Notifications
You must be signed in to change notification settings - Fork 19
/
illegal_check.py
61 lines (48 loc) · 1.57 KB
/
illegal_check.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
"""
@Project :illegal_context_recognition
@File :illegal_check.py
@IDE :PyCharm
@Author :wujx
"""
from config import SystemConfig, FastTextConfig
from preprocess import TradToSimple
from ac import AhocorasickNer
from preprocess import word_segment
from models.fasttext_model import FastText
from train_roberta import predict
# Shared module-level resources: created once when this module is imported
# and then reused by every call to check().
config = SystemConfig()
# Traditional-to-simplified Chinese converter, built from the file named by
# config.trad2simple_file; check() runs every input through it first.
f2s = TradToSimple(config.trad2simple_file)
# Keyword matcher loaded from config.illegal_dicts_file. A hit from this
# matcher makes check() return True immediately.
illegal_match = AhocorasickNer()
illegal_match.add_keywords_by_file(config.illegal_dicts_file)
# Keyword matcher loaded from config.suspected_illegal_dicts_file. A hit here
# is only one signal in check(); it is not conclusive by itself.
suspected_illegal_match = AhocorasickNer()
suspected_illegal_match.add_keywords_by_file(config.suspected_illegal_dicts_file)
# FastText classifier used by check(); constructed with train=False
# (presumably inference-only mode -- confirm in models.fasttext_model).
ft_config = FastTextConfig()
fasttext_model = FastText(ft_config, train=False)
def check(text):
    """
    Detect whether a text contains prohibited content.

    The text is converted from traditional to simplified Chinese and then
    examined in stages:

    1. A hit from ``illegal_match`` is conclusive and returns True.
    2. Two weaker signals are collected: a hit from
       ``suspected_illegal_match`` and a ``'__label__1'`` prediction from
       ``fasttext_model`` on the space-joined segmented words.
    3. Both signals present -> True; neither present -> False; exactly one
       present -> the decision is delegated to ``predict`` (imported from
       ``train_roberta``), and the result of ``predict(...) == 1`` is returned.

    :param text: str, the text to inspect
    :return: bool, True: prohibited content found, False: none found
    """
    # Traditional -> simplified conversion, so all later stages see one script.
    simplified = f2s.transform_sentence(text)

    # Definite-violation keyword match: no further checks needed on a hit.
    if illegal_match.match_results(simplified):
        return True

    # Suspected-violation keyword match (a weak signal, not a verdict).
    keyword_suspect = bool(suspected_illegal_match.match_results(simplified))

    # fasttext classification over the segmented, space-joined text.
    tokens = word_segment(simplified)
    label = fasttext_model.predict(' '.join(tokens))
    classifier_suspect = bool(label == '__label__1')

    # No signal at all: the text is treated as clean.
    if not (keyword_suspect or classifier_suspect):
        return False

    # Both signals agree: treat as a violation without consulting predict().
    if keyword_suspect and classifier_suspect:
        return True

    # Exactly one signal fired: let predict() break the tie.
    return predict(simplified) == 1
if __name__ == '__main__':
    # Manual smoke test: run one sample sentence through check() and print the
    # verdict; previously the result was computed and silently discarded, so
    # running the script produced no visible output.
    print(check('你这杂种肯定是个愚蠢的小学生'))