import pandas as pd
import numpy as np
import jieba
import jieba.analyse  # required for extract_tags() in text2word()
import collections
import re
import os
import json
class ReadData(object):
    """
    Args:
        sentence_lenth: length of each comment, measured in tokens after segmentation
        mode: usage mode; 'train' is for training the network, and saves the
            vocabulary dictionary and related state to disk
        batch_size: must match the batch size passed to the NextBatch() method below
        vocabulary_size: number of words in the constructed vocabulary
        load_size: number of corpus entries to load
    """
    def __init__(self, sentence_lenth, mode='train', batch_size=50, vocabulary_size=2000, load_size=1840):
        self.stopword = ",。! \n*的了是也"  # common Chinese punctuation and stopwords
        self.mode = mode
        self.offset = 0
        self.epochs = 0
        self.load_size = load_size
        self.vocabulary_size = vocabulary_size
        self.sentence_lenth = sentence_lenth
        self.file = "./log/word_dict.json"
        self.batchsize = batch_size
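    # A minimal usage sketch (assuming the JD CSV files exist under .\Data):
    #   reader = ReadData(sentence_lenth=20, mode='train', batch_size=50, load_size=100)
    #   reader.Train_Init()
    #   batch_x, batch_y = reader.NextBatch(50)  # batch_x: [50, 20] int32, batch_y: [50]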
    def Test(self, text):
        """Encode a single raw comment with the saved vocabulary and pad it to a full batch."""
        textword = [list(jieba.cut(text))]
        textword = self.normalize(textword, self.stopword)
        if os.path.exists(self.file):
            # the dictionary was saved as UTF-8 encoded JSON
            with open(self.file, 'r', encoding='utf-8') as f:
                word_dict = json.load(f)
            word_list = list(word_dict.keys())  # the vocabulary words
            textnum = self.text2num(textword, word_dict, word_list, self.sentence_lenth)
            # zero-pad to a full batch so the shape matches what the network expects
            zero = np.zeros([self.batchsize - 1, self.sentence_lenth], dtype=np.int32)
            textnum = np.append(textnum, zero, axis=0)
            textnum = np.int32(textnum)
            return textnum
        else:
            print("Vocabulary file not found; call Train_Init() first to build it.")
            return 0
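    # Example (hedged): once a prior Train_Init() has written ./log/word_dict.json,
    # Test() encodes one comment and zero-pads it to a full batch:
    #   reader = ReadData(sentence_lenth=20, batch_size=50)
    #   x = reader.Test("很好喝")  # x.shape == (50, 20); row 0 is the comment, the rest are zeros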
    def Train_Init(self):
        """
        Initialize the corpus before training: segmentation, vocabulary
        construction, and conversion of the text to index sequences.
        """
        data, self.label = self.Readcsv(self.load_size)
        textword = self.text2word(text=data, mode=False)
        textword = self.normalize(textword, self.stopword)
        word_dict, self.word_list = self.make_dictionary(textword, self.vocabulary_size)
        self.textnum = self.text2num(textword, word_dict, self.word_list, self.sentence_lenth)
        # shuffle samples and labels with the same permutation
        num = np.arange(self.load_size)
        np.random.shuffle(num)
        self.textnum = self.textnum[num]
        self.label = self.label[num]
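    # After Train_Init() the instance holds self.textnum, an int array of shape
    # [load_size, sentence_lenth], and self.label, [load_size] class ids, both
    # shuffled with the same permutation so rows still correspond.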
    def Readcsv(self, load_size):
        """
        Read the corpus and its labels.
        Args:
            load_size: number of rows to load; at most 4985
        """
        # map the Chinese rating labels (good / neutral / bad review) to class ids
        Assessdic = {'好评': 0, '中评': 1, '差评': 2}
        if self.mode == 'train':
            data_average = pd.read_csv(".\\Data\\JD_Crawler_Average.csv", encoding='gbk')
            data_bad = pd.read_csv(".\\Data\\JD_Crawler_Bad.csv", encoding='gbk')
            data_good = pd.read_csv(".\\Data\\JD_Crawler_Now.csv", encoding='gbk')
            # DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent
            data_all = pd.concat([data_average, data_bad, data_good])
            for filename in os.listdir('.\\Data'):
                if '2' not in filename:
                    continue
                if 'csv' not in filename:
                    continue
                path = os.path.join('.\\Data', filename)
                data = pd.read_csv(path, encoding='gbk')
                data_all = pd.concat([data_all, data])
            data_label = data_all['Assess'].values
            label = [Assessdic.get(Assess) for Assess in data_label]
            label = label[0:load_size]
            label = np.array(label)
            data = list(data_all['Comment'].values)  # a list of raw comment strings
            data = data[0:load_size]
        else:
            data_all = pd.read_csv(".\\Data\\JD_Crawler_Former.csv", encoding='gbk')
            data_label = data_all['Assess'].values
            label = [Assessdic.get(Assess) for Assess in data_label]
            self.load_size = len(data_label)
            label = np.array(label)
            data = list(data_all['Comment'].values)  # a list of raw comment strings
        return data, label
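    # The CSV files are assumed (not checked here) to be GBK-encoded with at least
    # an 'Assess' column holding one of 好评/中评/差评 and a 'Comment' column holding
    # the raw review text, e.g.:
    #   Assess,Comment
    #   好评,奶粉很棒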
    def text2word(self, text, mode):
        """
        Args:
            text: input text as a 2-D list, [batch, sentence]
            mode: True extracts keywords only; False segments the full sentence
        """
        if mode:
            # keyword extraction; extract_tags() returns a list directly
            words = [jieba.analyse.extract_tags(sentence) for sentence in text]
        else:
            # plain segmentation; jieba.cut() returns an iterator, so materialize it
            words = [list(jieba.cut(sentence)) for sentence in text]
        return words
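    # For illustration, a sentence like "这个奶粉很好" comes out roughly as
    #   mode=False -> ['这个', '奶粉', '很', '好']  (full segmentation)
    #   mode=True  -> ['奶粉']  (keywords only; exact output depends on jieba's TF-IDF model)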
    def normalize(self, words, stopword):
        """
        Filter out stopwords, punctuation, and numbers.
        """
        # drop numeric tokens such as "666"
        words = [[word for word in sentence if not re.match(r'\d{1,4}', word)] for sentence in words]
        words = [[word for word in sentence if word not in "0123456789"] for sentence in words]
        word_nor = [[word for word in sentence if word not in stopword] for sentence in words]
        return word_nor
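    # For example, with the default stopword string:
    #   normalize([['很', '好', '666', '的', '!']], ",。! \n*的了是也")
    # keeps [['很', '好']]: '666' matches the digit pattern, '的' and '!' are stopwords.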
    def make_dictionary(self, words, vocabulary_size):
        """
        Args:
            words: matrix with one sentence per row and one token per column
            vocabulary_size: number of words to keep in the vocabulary
        Construction rule: sort words by frequency, high to low, and keep the top
        vocabulary_size; any word outside the vocabulary is replaced by 'NaN'.
        """
        wordlist = [word for sentence in words for word in sentence]  # flatten to 1-D for counting
        # most_common() returns (word, count) tuples sorted by frequency
        word_dict = collections.Counter(wordlist).most_common(vocabulary_size - 1)
        word_list = [word[0] for word in word_dict]
        word_list.insert(0, 'NaN')  # placeholder for out-of-vocabulary words
        # enumerate() yields (index, word) pairs
        word_dict = {word: index for index, word in enumerate(word_list)}
        # persist the vocabulary so later runs encode text with the same mapping
        os.makedirs('./log', exist_ok=True)
        with open(self.file, 'wb') as file:
            data = json.dumps(word_dict).encode()
            file.write(data)
        return word_dict, word_list
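    # For example, if '好' is the most frequent token, the saved mapping looks like
    #   {'NaN': 0, '好': 1, ...}
    # Index 0 is reserved for out-of-vocabulary words and doubles as the padding id.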
    def text2num(self, words, word_dict, word_list, sentence_lenth):
        """
        Args:
            words: segmented text, shape [batch, words]
            word_dict: the frequency-sorted vocabulary mapping word -> index
            word_list: the words contained in the vocabulary
            sentence_lenth: target sentence length (zero-pad or truncate)
        Returns:
            output: the length-normalized index matrix, shape [batch, sentence_lenth]
        """
        # convert tokens to vocabulary indices
        textnum = [[word_dict.get(word, 0) for word in sentence if word in word_list] for sentence in words]
        output = []
        # normalize lengths: truncate long sentences, zero-pad short ones
        for sentence in textnum:
            if len(sentence) >= sentence_lenth:
                word = sentence[0:sentence_lenth]
            else:
                zeros = [0] * (sentence_lenth - len(sentence))
                word = sentence + zeros
            output.append(word)
        output = np.array(output)
        return output
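    # For example, with sentence_lenth=5 and word_dict={'NaN': 0, '好': 1, '喝': 2}:
    #   text2num([['好', '喝']], word_dict, ['NaN', '好', '喝'], 5)
    # returns array([[1, 2, 0, 0, 0]]): vocabulary indices first, then zero padding.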
    def NextBatch(self, batchsize):
        """
        Fetch the next batch of data; returns arrays of shape [batch, sentence_lenth].
        Args:
            batchsize: number of samples in the returned batch
        """
        start = self.offset
        self.offset += batchsize
        if self.offset > self.load_size:
            # one pass over the data is complete: reshuffle and start over
            self.epochs += 1
            print("finish training " + str(self.epochs) + " times")
            num = np.arange(self.load_size)
            np.random.shuffle(num)  # random permutation of the sample order
            self.textnum = self.textnum[num]  # reorder samples by the permutation
            self.label = self.label[num]
            start = 0
            self.offset = batchsize
        end = self.offset
        return self.textnum[start:end], self.label[start:end]
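    # For example, with load_size=100 and batchsize=30 successive calls return
    # samples [0:30], [30:60], [60:90]; the fourth call overruns 100, so the data
    # is reshuffled and that batch is [0:30] of the new ordering.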
if __name__ == '__main__':
    test = ReadData(20, mode='train', load_size=100)
    test.Train_Init()
    for i in range(0, 20):
        a, b = test.NextBatch(20)
    # print(a.shape)
    # print(a[0])
    # print(b[0])
    # Single-comment inference example (requires a saved vocabulary):
    # test = ReadData(20, mode='train', load_size=100)
    # num = test.Test("给二宝买的奶粉,真的很棒,性价比高,宝宝爱喝,我也安心")
    # print(num.shape)
    # print(num)
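# A rough sketch of how this feeds a training loop (train_step is a hypothetical
# model-update function, not defined in this file):
#   reader = ReadData(sentence_lenth=20, mode='train', batch_size=50)
#   reader.Train_Init()
#   for step in range(1000):
#       x, y = reader.NextBatch(50)
#       train_step(x, y)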