Commit 64a3028: extractive
yongzhuo committed Oct 28, 2019
1 parent 3c432bf commit 64a3028
Showing 26 changed files with 172 additions and 42 deletions.
15 changes: 15 additions & 0 deletions nlg_yongzhuo/conf/path_config.py
@@ -15,4 +15,19 @@
sys.path.append(projectdir)
print(projectdir)

# path of embedding
path_embedding_random_char = path_root + '/data/embeddings/term_char.txt'
path_embedding_random_word = path_root + '/data/embeddings/term_word.txt'
path_embedding_bert = path_root + '/data/embeddings/chinese_L-12_H-768_A-12/'
path_embedding_xlnet = path_root + '/data/embeddings/chinese_xlnet_mid_L-24_H-768_A-12/'
path_embedding_vector_word2vec_char = path_root + '/data/embeddings/w2v_model_wiki_char.vec'
path_embedding_vector_word2vec_word = path_root + '/data/embeddings/w2v_model_merge_short.vec'

# model directory
path_model_dir = path_root + "/data/model/text_summarization/"
# path of model
path_model = path_root + '/data/model/text_summarization/model_fast_text.h5'
# path of hyper-parameters
path_hyper_parameters = path_root + '/data/model/text_summarization/hyper_parameters.json'
# path of fine-tuned embedding
path_fineture = path_root + "/data/model/text_summarization/embedding_trainable.h5"
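
# A hypothetical consumer of these paths (an illustrative sketch, not in the repo):
# import json
# with open(path_hyper_parameters, "r", encoding="utf-8") as f:
#     hyper_parameters = json.load(f)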
File renamed without changes.
@@ -5,6 +5,7 @@
# @function :data utils of nlg-yongzhuo


from sklearn.feature_extraction.text import TfidfVectorizer
import jieba.posseg as pseg
import pandas as pd
import logging
@@ -183,6 +184,26 @@ def gram_uni_bi_tri(text):
return gram_uni, gram_bi, gram_tri


def tfidf_fit(sentences):
    """
    TF-IDF features for sentence similarity
    :param sentences: list of str, whitespace-tokenized sentences
    :return: scipy sparse matrix of TF-IDF weights
    """
    # TF-IDF computation
    model = TfidfVectorizer(ngram_range=(1, 2),  # (3, 5) is another option
                            stop_words=[' ', '\t', '\n'],  # stop words
                            max_features=10000,
                            token_pattern=r"(?u)\b\w+\b",  # keep single-character tokens
                            min_df=1,
                            max_df=0.9,
                            use_idf=1,  # enable IDF weighting
                            smooth_idf=1,  # smooth IDF to avoid division by zero
                            sublinear_tf=1, )  # sublinear TF scaling: 1 + log(tf)
    matrix = model.fit_transform(sentences)
    return matrix
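
# A minimal usage sketch (illustrative, not part of the module): the input must
# be whitespace-tokenized strings, and since TfidfVectorizer L2-normalizes rows,
# the matrix product below yields pairwise cosine similarities.
# docs = ["我 喜欢 你", "我 讨厌 你"]
# tfidf = tfidf_fit(docs)
# sim = (tfidf * tfidf.T).toarray()  # sim[0, 1] is the similarity of the two sentences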



if __name__ == '__main__':
    text = "你喜欢谁,小老弟,你好烦哇。"
5 changes: 5 additions & 0 deletions nlg_yongzhuo/text_summarization/extractive_sum/__init__.py
@@ -0,0 +1,5 @@
# -*- coding: UTF-8 -*-
# !/usr/bin/python
# @time :2019/9/24 10:26
# @author :Mo
# @function :
93 changes: 93 additions & 0 deletions nlg_yongzhuo/text_summarization/extractive_sum/feature_base/mmr.py
@@ -0,0 +1,93 @@
# -*- coding: UTF-8 -*-
# !/usr/bin/python
# @time :2019/10/28 10:16
# @author :Mo
# @function :mmr (maximal marginal relevance)


from nlg_yongzhuo.data_preprocess.text_preprocess import extract_chinese, cut_sentence
from nlg_yongzhuo.data_preprocess.text_preprocess import jieba_cut, tfidf_fit
from nlg_yongzhuo.data.stop_words.stop_words import stop_words
import copy


class MMRSum:
    def __init__(self):
        self.stop_words = stop_words.values()
        self.algorithm = 'mmr'

    def summarize(self, text, num=8, alpha=0.6):
        """
        :param text: str
        :param num: int, max number of sentences to return
        :param alpha: float in (0, 1), trade-off between relevance and redundancy
        :return: list
        """
        # split into sentences
        if isinstance(text, str):
            self.sentences = cut_sentence(text)
        elif isinstance(text, list):
            self.sentences = text
        else:
            raise RuntimeError("text type must be list or str")
        # tokenize
        sentences_cut = [[word for word in jieba_cut(extract_chinese(sentence))
                          if word.strip()] for sentence in self.sentences]
        # remove stop words
        self.sentences_cut = [list(filter(lambda x: x not in self.stop_words, sc)) for sc in sentences_cut]
        self.sentences_cut = [" ".join(sc) for sc in self.sentences_cut]
        # # count the words of every sentence
        # sen_word_len = [len(sc)+1 for sc in sentences_cut]
        # TF-IDF of every sentence
        sen_tfidf = tfidf_fit(self.sentences_cut)
        # pairwise sentence similarity matrix; e.g. SimMatrix[1, 3] is the similarity of the 2nd and the 4th sentence
        SimMatrix = (sen_tfidf * sen_tfidf.T).A
        # number of input sentences
        len_sen = len(self.sentences)
        # sentence indices
        sen_idx = [i for i in range(len_sen)]
        summary_set = []
        mmr = {}
        for i in range(len_sen):
            if self.sentences[i] not in summary_set:
                sen_idx_pop = copy.deepcopy(sen_idx)
                sen_idx_pop.pop(i)
                # similarity of sentence i to every other sentence
                sim_i_j = [SimMatrix[i, j] for j in sen_idx_pop]
                score_tfidf = sen_tfidf[i].toarray()[0].sum()  # / sen_word_len[i]; dividing by word count proved inaccurate
                mmr[self.sentences[i]] = alpha * score_tfidf - (1 - alpha) * max(sim_i_j)
                summary_set.append(self.sentences[i])
        score_sen = [(rc[1], rc[0]) for rc in sorted(mmr.items(), key=lambda d: d[1], reverse=True)]
        if len(mmr) > num:
            score_sen = score_sen[0:num]
        return score_sen
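
# MMR recap (a sketch of the score computed above):
#     score(s_i) = alpha * rel(s_i) - (1 - alpha) * max_{j != i} sim(s_i, s_j)
# where rel(s_i) is the sentence's summed TF-IDF weight and sim is TF-IDF cosine
# similarity. Toy numbers with alpha = 0.6: relevance 0.8 with worst-case overlap
# 0.9 scores 0.6*0.8 - 0.4*0.9 = 0.12, while relevance 0.6 with overlap only 0.2
# scores 0.6*0.6 - 0.4*0.2 = 0.28, so the more novel sentence ranks higher.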


if __name__ == '__main__':
    mmr_sum = MMRSum()
    doc = "PageRank算法简介。" \
          "是上世纪90年代末提出的一种计算网页权重的算法! " \
          "当时,互联网技术突飞猛进,各种网页网站爆炸式增长。 " \
          "业界急需一种相对比较准确的网页重要性计算方法。 " \
          "使人们能够从海量互联网世界中找出自己需要的信息。 " \
          "百度百科如是介绍他的思想:PageRank通过网络浩瀚的超链接关系来确定一个页面的等级。 " \
          "Google把从A页面到B页面的链接解释为A页面给B页面投票。 " \
          "Google根据投票来源甚至来源的来源,即链接到A页面的页面。 " \
          "和投票目标的等级来决定新的等级。简单的说, " \
          "一个高等级的页面可以使其他低等级页面的等级提升。 " \
          "具体说来就是,PageRank有两个基本思想,也可以说是假设。 " \
          "即数量假设:一个网页被越多的其他页面链接,就越重要。 " \
          "质量假设:一个网页越是被高质量的网页链接,就越重要。 " \
          "总的来说就是一句话,从全局角度考虑,获取重要的信息。 "
    summary = mmr_sum.summarize(doc)  # avoid shadowing the builtin sum
    for score_sentence in summary:
        print(score_sentence)






@@ -7,10 +7,10 @@
# @evaluate :bad, it is for english, and that's not clearly explain of formula


from nlg_yongzhuo.data_proprecess.text_preprocess import jieba_cut, jieba_tag_cut
from nlg_yongzhuo.data_proprecess.text_preprocess import gram_uni_bi_tri
from nlg_yongzhuo.data_proprecess.text_preprocess import extract_chinese
from nlg_yongzhuo.data_proprecess.text_preprocess import cut_sentence
from nlg_yongzhuo.data_preprocess.text_preprocess import jieba_cut, jieba_tag_cut
from nlg_yongzhuo.data_preprocess.text_preprocess import gram_uni_bi_tri
from nlg_yongzhuo.data_preprocess.text_preprocess import extract_chinese
from nlg_yongzhuo.data_preprocess.text_preprocess import cut_sentence
from nlg_yongzhuo.data.stop_words.stop_words import stop_words
import jieba.analyse as analyse
from collections import Counter
@@ -221,3 +221,14 @@ def summarize(self, text, title=None):
for sum_ in sums:
print(sum_)

# ad-hoc sanity checks (leftover debug code in the demo)
ran_20 = range(20)
print(type(ran_20))
print(ran_20)
idx = [1, 2, 3]
idx.pop(1)
print(idx)
print(max([1, 2, 3, 4]))




@@ -7,9 +7,9 @@
# @url :using Google Scholar


from nlg_yongzhuo.data_proprecess.text_preprocess import extract_chinese
from nlg_yongzhuo.data_proprecess.text_preprocess import cut_sentence
from nlg_yongzhuo.data_proprecess.text_preprocess import jieba_cut
from nlg_yongzhuo.data_preprocess.text_preprocess import extract_chinese
from nlg_yongzhuo.data_preprocess.text_preprocess import cut_sentence
from nlg_yongzhuo.data_preprocess.text_preprocess import jieba_cut
from nlg_yongzhuo.data.stop_words.stop_words import stop_words
from collections import Counter

@@ -7,9 +7,9 @@
# @url :http:https://courses.ischool.berkeley.edu/i256/f06/papers/luhn58.pdf


from nlg_yongzhuo.data_proprecess.text_preprocess import extract_chinese
from nlg_yongzhuo.data_proprecess.text_preprocess import cut_sentence
from nlg_yongzhuo.data_proprecess.text_preprocess import jieba_cut
from nlg_yongzhuo.data_preprocess.text_preprocess import extract_chinese
from nlg_yongzhuo.data_preprocess.text_preprocess import cut_sentence
from nlg_yongzhuo.data_preprocess.text_preprocess import jieba_cut
from nlg_yongzhuo.data.stop_words.stop_words import stop_words
from collections import Counter

@@ -104,3 +104,8 @@ def summarize(self, text, num=8):
for r in res:
print(r)
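
# Luhn's heuristic in brief: words above a frequency threshold count as
# "significant", and a sentence scores (significant words in its densest
# cluster)**2 divided by the cluster's span in words (Luhn, 1958).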


"多知网. 多知网5月26日消息,今日,方直科技发公告,拟用自有资金人民币1.2亿元,与深圳嘉道谷投资管理有限公司、深圳嘉道功程股权投资基金(有限合伙)共同发起设立嘉道方直教育产业投资基金(暂定名)。该基金认缴出资总规模为人民币3.01亿元。基金的出资方式具体如下:出资进度方面,基金合伙人的出资" \
"应于基金成立之日起四年内分四期缴足,每期缴付7525万元;各基金合伙人每期按其出资比例缴付。" \
"合伙期限为11年,投资目标为教育领域初创期或成长期企业。截止公告披露日,深圳嘉道谷投资管理有限公司股权结构如下:截止公告披露日,深圳嘉道功程股权投资基金产权结构如下:公告还披露,方直科技将探索在中小学教育、在线教育、非学历教育、学前教育、留学咨询等教育行业其他分支领域的投资。方直科技2016年营业收入9691万元,营业利润1432万元,归属于普通股股东的净利润1847万元。(多知网 黎珊)}}"

@@ -70,6 +70,7 @@ def predict_proba(oword, iword):
k = pd.Series(keywords(jieba.cut(ques)))
print(k)


if __name__ == '__main__':
# train first, then predict
train_word2vec_by_word()
@@ -5,7 +5,7 @@
# @function :text_summary with lead-3


from nlg_yongzhuo.data_proprecess.text_preprocess import cut_sentence
from nlg_yongzhuo.data_preprocess.text_preprocess import cut_sentence


class Lead3:
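    # Lead-3 in a nutshell (an illustrative sketch; the class body is not shown
    # in this hunk): the baseline simply returns the leading sentences, which in
    # news text usually carry the main facts, e.g.:
    #     def summarize(self, text, num=3):
    #         return cut_sentence(text)[:num]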
@@ -6,9 +6,9 @@
# @paper :Latent Dirichlet Allocation


from nlg_yongzhuo.data_proprecess.text_preprocess import extract_chinese
from nlg_yongzhuo.data_proprecess.text_preprocess import cut_sentence
from nlg_yongzhuo.data_proprecess.text_preprocess import jieba_cut
from nlg_yongzhuo.data_preprocess.text_preprocess import extract_chinese
from nlg_yongzhuo.data_preprocess.text_preprocess import cut_sentence
from nlg_yongzhuo.data_preprocess.text_preprocess import jieba_cut
from nlg_yongzhuo.data.stop_words.stop_words import stop_words
# sklearn
from sklearn.feature_extraction.text import CountVectorizer
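
# A sketch of LDA-based scoring (an assumption about the body hidden below):
# fit LatentDirichletAllocation on the CountVectorizer term counts, then rank
# each sentence by its weight in the learned topic distributions, e.g.:
# from sklearn.decomposition import LatentDirichletAllocation
# lda = LatentDirichletAllocation(n_components=topic_min)
# sent_topics = lda.fit_transform(count_matrix)  # shape: (n_sentences, topic_min)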
@@ -6,36 +6,13 @@
# @paper :Text summarization using Latent Semantic Analysis


#
from nlg_yongzhuo.data_proprecess.text_preprocess import extract_chinese
from nlg_yongzhuo.data_proprecess.text_preprocess import cut_sentence
from nlg_yongzhuo.data_proprecess.text_preprocess import jieba_cut
from nlg_yongzhuo.data_preprocess.text_preprocess import extract_chinese, tfidf_fit
from nlg_yongzhuo.data_preprocess.text_preprocess import cut_sentence, jieba_cut
from nlg_yongzhuo.data.stop_words.stop_words import stop_words
# sklearn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD


def tfidf_fit(sentences):
    """
    TF-IDF features for sentence similarity
    :param sentences: list of str, whitespace-tokenized sentences
    :return: scipy sparse matrix of TF-IDF weights
    """
    # TF-IDF computation
    model = TfidfVectorizer(ngram_range=(1, 2),  # (3, 5) is another option
                            stop_words=[' ', '\t', '\n'],  # stop words
                            max_features=10000,
                            token_pattern=r"(?u)\b\w+\b",  # keep single-character tokens
                            min_df=1,
                            max_df=0.9,
                            use_idf=1,  # enable IDF weighting
                            smooth_idf=1,  # smooth IDF to avoid division by zero
                            sublinear_tf=1, )  # sublinear TF scaling: 1 + log(tf)
    matrix = model.fit_transform(sentences)
    return matrix


class LSISum:
    def __init__(self):
        self.stop_words = stop_words.values()
@@ -83,6 +60,7 @@ def summarize(self, text, num=8, topic_min=3):

return score_sen
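
# LSI scoring in brief (an illustrative sketch): project the TF-IDF matrix onto
# latent topics with TruncatedSVD and score each sentence by its topic weights.
# svd = TruncatedSVD(n_components=topic_min)
# topic_weights = svd.fit_transform(tfidf_matrix)  # shape: (n_sentences, topic_min)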


if __name__ == '__main__':
    lsi = LSISum()
    doc = "多知网5月26日消息,今日,方直科技发公告,拟用自有资金人民币1.2亿元," \
@@ -5,9 +5,9 @@
# @function :topic model of NMF


from nlg_yongzhuo.data_proprecess.text_preprocess import extract_chinese
from nlg_yongzhuo.data_proprecess.text_preprocess import cut_sentence
from nlg_yongzhuo.data_proprecess.text_preprocess import jieba_cut
from nlg_yongzhuo.data_preprocess.text_preprocess import extract_chinese
from nlg_yongzhuo.data_preprocess.text_preprocess import cut_sentence
from nlg_yongzhuo.data_preprocess.text_preprocess import jieba_cut
from nlg_yongzhuo.data.stop_words.stop_words import stop_words
# sklearn
from sklearn.feature_extraction.text import TfidfVectorizer
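
# A sketch of NMF-based scoring (illustrative): factorize the TF-IDF matrix into
# W (sentence-topic) and H (topic-term) and rank sentences by their rows in W.
# from sklearn.decomposition import NMF
# W = NMF(n_components=topic_min).fit_transform(tfidf_matrix)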
1 change: 1 addition & 0 deletions nlg_yongzhuo/text_summarization/readme_summary.md
@@ -9,6 +9,7 @@
- graph_base
- textrank(textrank4zh/gensim/sklearn)
- feature_base
- mmr
- text_pronouns
- text_teaser
- word_significance
