Commit 64a3028: extractive
yongzhuo committed Oct 28, 2019
1 parent 3c432bf commit 64a3028
Showing 26 changed files with 172 additions and 42 deletions.
15 changes: 15 additions & 0 deletions nlg_yongzhuo/conf/path_config.py
@@ -15,4 +15,19 @@
sys.path.append(projectdir)
print(projectdir)

# path of embedding
path_embedding_random_char = path_root + '/data/embeddings/term_char.txt'
path_embedding_random_word = path_root + '/data/embeddings/term_word.txt'
path_embedding_bert = path_root + '/data/embeddings/chinese_L-12_H-768_A-12/'
path_embedding_xlnet = path_root + '/data/embeddings/chinese_xlnet_mid_L-24_H-768_A-12/'
path_embedding_vector_word2vec_char = path_root + '/data/embeddings/w2v_model_wiki_char.vec'
path_embedding_vector_word2vec_word = path_root + '/data/embeddings/w2v_model_merge_short.vec'

# model directory
path_model_dir = path_root + "/data/model/text_summarization/"
# path of model
path_model = path_root + '/data/model/text_summarization/model_fast_text.h5'
# path of hyper-parameters
path_hyper_parameters = path_root + '/data/model/text_summarization/hyper_parameters.json'
# path of fine-tuned embedding
path_fineture = path_root + "/data/model/text_summarization/embedding_trainable.h5"
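
# A hypothetical consumer of these paths (an illustrative sketch, not in the repo):
# import json
# with open(path_hyper_parameters, "r", encoding="utf-8") as f:
#     hyper_parameters = json.load(f)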
File renamed without changes.
@@ -5,6 +5,7 @@
# @function :data utils of nlg-yongzhuo


from sklearn.feature_extraction.text import TfidfVectorizer
import jieba.posseg as pseg
import pandas as pd
import logging
@@ -183,6 +184,26 @@ def gram_uni_bi_tri(text):
return gram_uni, gram_bi, gram_tri


def tfidf_fit(sentences):
    """
    TF-IDF features for sentence similarity
    :param sentences: list of str, whitespace-tokenized sentences
    :return: scipy sparse matrix of TF-IDF weights
    """
    # TF-IDF computation
    model = TfidfVectorizer(ngram_range=(1, 2),  # (3, 5) is another option
                            stop_words=[' ', '\t', '\n'],  # stop words
                            max_features=10000,
                            token_pattern=r"(?u)\b\w+\b",  # keep single-character tokens
                            min_df=1,
                            max_df=0.9,
                            use_idf=1,  # enable IDF weighting
                            smooth_idf=1,  # smooth IDF to avoid division by zero
                            sublinear_tf=1, )  # sublinear TF scaling: 1 + log(tf)
    matrix = model.fit_transform(sentences)
    return matrix
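
# A minimal usage sketch (illustrative, not part of the module): the input must
# be whitespace-tokenized strings, and since TfidfVectorizer L2-normalizes rows,
# the matrix product below yields pairwise cosine similarities.
# docs = ["我 喜欢 你", "我 讨厌 你"]
# tfidf = tfidf_fit(docs)
# sim = (tfidf * tfidf.T).toarray()  # sim[0, 1] is the similarity of the two sentences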



if __name__ == '__main__':
    text = "你喜欢谁,小老弟,你好烦哇。"
5 changes: 5 additions & 0 deletions nlg_yongzhuo/text_summarization/extractive_sum/__init__.py
@@ -0,0 +1,5 @@
# -*- coding: UTF-8 -*-
# !/usr/bin/python
# @time :2019/9/24 10:26
# @author :Mo
# @function :
93 changes: 93 additions & 0 deletions nlg_yongzhuo/text_summarization/extractive_sum/feature_base/mmr.py
@@ -0,0 +1,93 @@
# -*- coding: UTF-8 -*-
# !/usr/bin/python
# @time :2019/10/28 10:16
# @author :Mo
# @function :mmr (maximal marginal relevance)


from nlg_yongzhuo.data_preprocess.text_preprocess import extract_chinese, cut_sentence
from nlg_yongzhuo.data_preprocess.text_preprocess import jieba_cut, tfidf_fit
from nlg_yongzhuo.data.stop_words.stop_words import stop_words
import copy


class MMRSum:
    def __init__(self):
        self.stop_words = stop_words.values()
        self.algorithm = 'mmr'

    def summarize(self, text, num=8, alpha=0.6):
        """
        :param text: str
        :param num: int, max number of sentences to return
        :param alpha: float in (0, 1), trade-off between relevance and redundancy
        :return: list
        """
        # split into sentences
        if isinstance(text, str):
            self.sentences = cut_sentence(text)
        elif isinstance(text, list):
            self.sentences = text
        else:
            raise RuntimeError("text type must be list or str")
        # tokenize
        sentences_cut = [[word for word in jieba_cut(extract_chinese(sentence))
                          if word.strip()] for sentence in self.sentences]
        # remove stop words
        self.sentences_cut = [list(filter(lambda x: x not in self.stop_words, sc)) for sc in sentences_cut]
        self.sentences_cut = [" ".join(sc) for sc in self.sentences_cut]
        # # count the words of every sentence
        # sen_word_len = [len(sc)+1 for sc in sentences_cut]
        # TF-IDF of every sentence
        sen_tfidf = tfidf_fit(self.sentences_cut)
        # pairwise sentence similarity matrix; e.g. SimMatrix[1, 3] is the similarity of the 2nd and the 4th sentence
        SimMatrix = (sen_tfidf * sen_tfidf.T).A
        # number of input sentences
        len_sen = len(self.sentences)
        # sentence indices
        sen_idx = [i for i in range(len_sen)]
        summary_set = []
        mmr = {}
        for i in range(len_sen):
            if self.sentences[i] not in summary_set:
                sen_idx_pop = copy.deepcopy(sen_idx)
                sen_idx_pop.pop(i)
                # similarity of sentence i to every other sentence
                sim_i_j = [SimMatrix[i, j] for j in sen_idx_pop]
                score_tfidf = sen_tfidf[i].toarray()[0].sum()  # / sen_word_len[i]; dividing by word count proved inaccurate
                mmr[self.sentences[i]] = alpha * score_tfidf - (1 - alpha) * max(sim_i_j)
                summary_set.append(self.sentences[i])
        score_sen = [(rc[1], rc[0]) for rc in sorted(mmr.items(), key=lambda d: d[1], reverse=True)]
        if len(mmr) > num:
            score_sen = score_sen[0:num]
        return score_sen
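
# MMR recap (a sketch of the score computed above):
#     score(s_i) = alpha * rel(s_i) - (1 - alpha) * max_{j != i} sim(s_i, s_j)
# where rel(s_i) is the sentence's summed TF-IDF weight and sim is TF-IDF cosine
# similarity. Toy numbers with alpha = 0.6: relevance 0.8 with worst-case overlap
# 0.9 scores 0.6*0.8 - 0.4*0.9 = 0.12, while relevance 0.6 with overlap only 0.2
# scores 0.6*0.6 - 0.4*0.2 = 0.28, so the more novel sentence ranks higher.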


if __name__ == '__main__':
    mmr_sum = MMRSum()
    doc = "PageRank算法简介。" \
          "是上世纪90年代末提出的一种计算网页权重的算法! " \
          "当时,互联网技术突飞猛进,各种网页网站爆炸式增长。 " \
          "业界急需一种相对比较准确的网页重要性计算方法。 " \
          "使人们能够从海量互联网世界中找出自己需要的信息。 " \
          "百度百科如是介绍他的思想:PageRank通过网络浩瀚的超链接关系来确定一个页面的等级。 " \
          "Google把从A页面到B页面的链接解释为A页面给B页面投票。 " \
          "Google根据投票来源甚至来源的来源,即链接到A页面的页面。 " \
          "和投票目标的等级来决定新的等级。简单的说, " \
          "一个高等级的页面可以使其他低等级页面的等级提升。 " \
          "具体说来就是,PageRank有两个基本思想,也可以说是假设。 " \
          "即数量假设:一个网页被越多的其他页面链接,就越重要。 " \
          "质量假设:一个网页越是被高质量的网页链接,就越重要。 " \
          "总的来说就是一句话,从全局角度考虑,获取重要的信息。 "
    summary = mmr_sum.summarize(doc)  # avoid shadowing the builtin sum
    for score_sentence in summary:
        print(score_sentence)






@@ -7,10 +7,10 @@
# @evaluate :bad, it is for english, and that's not clearly explain of formula


from nlg_yongzhuo.data_proprecess.text_preprocess import jieba_cut, jieba_tag_cut
from nlg_yongzhuo.data_proprecess.text_preprocess import gram_uni_bi_tri
from nlg_yongzhuo.data_proprecess.text_preprocess import extract_chinese
from nlg_yongzhuo.data_proprecess.text_preprocess import cut_sentence
from nlg_yongzhuo.data_preprocess.text_preprocess import jieba_cut, jieba_tag_cut
from nlg_yongzhuo.data_preprocess.text_preprocess import gram_uni_bi_tri
from nlg_yongzhuo.data_preprocess.text_preprocess import extract_chinese
from nlg_yongzhuo.data_preprocess.text_preprocess import cut_sentence
from nlg_yongzhuo.data.stop_words.stop_words import stop_words
import jieba.analyse as analyse
from collections import Counter
@@ -221,3 +221,14 @@ def summarize(self, text, title=None):
for sum_ in sums:
print(sum_)

# ad-hoc sanity checks (leftover debug code in the demo)
ran_20 = range(20)
print(type(ran_20))
print(ran_20)
idx = [1, 2, 3]
idx.pop(1)
print(idx)
print(max([1, 2, 3, 4]))




@@ -7,9 +7,9 @@
# @url :using Google Scholar


from nlg_yongzhuo.data_proprecess.text_preprocess import extract_chinese
from nlg_yongzhuo.data_proprecess.text_preprocess import cut_sentence
from nlg_yongzhuo.data_proprecess.text_preprocess import jieba_cut
from nlg_yongzhuo.data_preprocess.text_preprocess import extract_chinese
from nlg_yongzhuo.data_preprocess.text_preprocess import cut_sentence
from nlg_yongzhuo.data_preprocess.text_preprocess import jieba_cut
from nlg_yongzhuo.data.stop_words.stop_words import stop_words
from collections import Counter

@@ -7,9 +7,9 @@
# @url :http:https://courses.ischool.berkeley.edu/i256/f06/papers/luhn58.pdf


from nlg_yongzhuo.data_proprecess.text_preprocess import extract_chinese
from nlg_yongzhuo.data_proprecess.text_preprocess import cut_sentence
from nlg_yongzhuo.data_proprecess.text_preprocess import jieba_cut
from nlg_yongzhuo.data_preprocess.text_preprocess import extract_chinese
from nlg_yongzhuo.data_preprocess.text_preprocess import cut_sentence
from nlg_yongzhuo.data_preprocess.text_preprocess import jieba_cut
from nlg_yongzhuo.data.stop_words.stop_words import stop_words
from collections import Counter

@@ -104,3 +104,8 @@ def summarize(self, text, num=8):
for r in res:
print(r)
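
# Luhn's heuristic in brief: words above a frequency threshold count as
# "significant", and a sentence scores (significant words in its densest
# cluster)**2 divided by the cluster's span in words (Luhn, 1958).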


"多知网. 多知网5月26日消息,今日,方直科技发公告,拟用自有资金人民币1.2亿元,与深圳嘉道谷投资管理有限公司、深圳嘉道功程股权投资基金(有限合伙)共同发起设立嘉道方直教育产业投资基金(暂定名)。该基金认缴出资总规模为人民币3.01亿元。基金的出资方式具体如下:出资进度方面,基金合伙人的出资" \
"应于基金成立之日起四年内分四期缴足,每期缴付7525万元;各基金合伙人每期按其出资比例缴付。" \
"合伙期限为11年,投资目标为教育领域初创期或成长期企业。截止公告披露日,深圳嘉道谷投资管理有限公司股权结构如下:截止公告披露日,深圳嘉道功程股权投资基金产权结构如下:公告还披露,方直科技将探索在中小学教育、在线教育、非学历教育、学前教育、留学咨询等教育行业其他分支领域的投资。方直科技2016年营业收入9691万元,营业利润1432万元,归属于普通股股东的净利润1847万元。(多知网 黎珊)}}"

@@ -70,6 +70,7 @@ def predict_proba(oword, iword):
k = pd.Series(keywords(jieba.cut(ques)))
print(k)


if __name__ == '__main__':
# train first, then predict
train_word2vec_by_word()
@@ -5,7 +5,7 @@
# @function :text_summary with lead-3


from nlg_yongzhuo.data_proprecess.text_preprocess import cut_sentence
from nlg_yongzhuo.data_preprocess.text_preprocess import cut_sentence


class Lead3:
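    # Lead-3 in a nutshell (an illustrative sketch; the class body is not shown
    # in this hunk): the baseline simply returns the leading sentences, which in
    # news text usually carry the main facts, e.g.:
    #     def summarize(self, text, num=3):
    #         return cut_sentence(text)[:num]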
@@ -6,9 +6,9 @@
# @paper :Latent Dirichlet Allocation


from nlg_yongzhuo.data_proprecess.text_preprocess import extract_chinese
from nlg_yongzhuo.data_proprecess.text_preprocess import cut_sentence
from nlg_yongzhuo.data_proprecess.text_preprocess import jieba_cut
from nlg_yongzhuo.data_preprocess.text_preprocess import extract_chinese
from nlg_yongzhuo.data_preprocess.text_preprocess import cut_sentence
from nlg_yongzhuo.data_preprocess.text_preprocess import jieba_cut
from nlg_yongzhuo.data.stop_words.stop_words import stop_words
# sklearn
from sklearn.feature_extraction.text import CountVectorizer
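
# A sketch of LDA-based scoring (an assumption about the body hidden below):
# fit LatentDirichletAllocation on the CountVectorizer term counts, then rank
# each sentence by its weight in the learned topic distributions, e.g.:
# from sklearn.decomposition import LatentDirichletAllocation
# lda = LatentDirichletAllocation(n_components=topic_min)
# sent_topics = lda.fit_transform(count_matrix)  # shape: (n_sentences, topic_min)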
@@ -6,36 +6,13 @@
# @paper :Text summarization using Latent Semantic Analysis


#
from nlg_yongzhuo.data_proprecess.text_preprocess import extract_chinese
from nlg_yongzhuo.data_proprecess.text_preprocess import cut_sentence
from nlg_yongzhuo.data_proprecess.text_preprocess import jieba_cut
from nlg_yongzhuo.data_preprocess.text_preprocess import extract_chinese, tfidf_fit
from nlg_yongzhuo.data_preprocess.text_preprocess import cut_sentence, jieba_cut
from nlg_yongzhuo.data.stop_words.stop_words import stop_words
# sklearn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD


def tfidf_fit(sentences):
    """
    TF-IDF features for sentence similarity
    :param sentences: list of str, whitespace-tokenized sentences
    :return: scipy sparse matrix of TF-IDF weights
    """
    # TF-IDF computation
    model = TfidfVectorizer(ngram_range=(1, 2),  # (3, 5) is another option
                            stop_words=[' ', '\t', '\n'],  # stop words
                            max_features=10000,
                            token_pattern=r"(?u)\b\w+\b",  # keep single-character tokens
                            min_df=1,
                            max_df=0.9,
                            use_idf=1,  # enable IDF weighting
                            smooth_idf=1,  # smooth IDF to avoid division by zero
                            sublinear_tf=1, )  # sublinear TF scaling: 1 + log(tf)
    matrix = model.fit_transform(sentences)
    return matrix


class LSISum:
    def __init__(self):
        self.stop_words = stop_words.values()
@@ -83,6 +60,7 @@ def summarize(self, text, num=8, topic_min=3):

return score_sen
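
# LSI scoring in brief (an illustrative sketch): project the TF-IDF matrix onto
# latent topics with TruncatedSVD and score each sentence by its topic weights.
# svd = TruncatedSVD(n_components=topic_min)
# topic_weights = svd.fit_transform(tfidf_matrix)  # shape: (n_sentences, topic_min)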


if __name__ == '__main__':
    lsi = LSISum()
    doc = "多知网5月26日消息,今日,方直科技发公告,拟用自有资金人民币1.2亿元," \
@@ -5,9 +5,9 @@
# @function :topic model of NMF


from nlg_yongzhuo.data_proprecess.text_preprocess import extract_chinese
from nlg_yongzhuo.data_proprecess.text_preprocess import cut_sentence
from nlg_yongzhuo.data_proprecess.text_preprocess import jieba_cut
from nlg_yongzhuo.data_preprocess.text_preprocess import extract_chinese
from nlg_yongzhuo.data_preprocess.text_preprocess import cut_sentence
from nlg_yongzhuo.data_preprocess.text_preprocess import jieba_cut
from nlg_yongzhuo.data.stop_words.stop_words import stop_words
# sklearn
from sklearn.feature_extraction.text import TfidfVectorizer
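
# A sketch of NMF-based scoring (illustrative): factorize the TF-IDF matrix into
# W (sentence-topic) and H (topic-term) and rank sentences by their rows in W.
# from sklearn.decomposition import NMF
# W = NMF(n_components=topic_min).fit_transform(tfidf_matrix)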
1 change: 1 addition & 0 deletions nlg_yongzhuo/text_summarization/readme_summary.md
@@ -9,6 +9,7 @@
- graph_base
- textrank(textrank4zh/gensim/sklearn)
- feature_base
- mmr
- text_pronouns
- text_teaser
- word_significance
