-
Notifications
You must be signed in to change notification settings - Fork 1
/
A_Gen_UISet_MoiveLens.py
93 lines (72 loc) · 2.8 KB
/
A_Gen_UISet_MoiveLens.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
import sys ;sys.path.append('../')
import DataLinkSet as DLSet
from tools import *
from collections import defaultdict
def Handle(src_path=None, dst_path=None):
    """Convert the raw '::'-delimited records to a space-separated table and
    count how often each user and each item occurs.

    Every non-blank input line is split on '::'. Its first four fields are
    written to the output file joined by single spaces, one record per line.
    Field 0 is treated as the user ID and field 1 as the item ID (fields 2
    and 3 are presumably rating and timestamp -- confirm against the data).

    Args:
        src_path: Raw input file. Defaults to ``DLSet.orgData_reviews_link``.
        dst_path: Output file, overwritten if it exists. Defaults to
            ``DLSet.orgData_reviewsParse_link``.

    Returns:
        ``(count_U, count_I)``: two ``defaultdict`` objects mapping raw user
        ID / raw item ID (strings) to their number of occurrences.

    Raises:
        IndexError: if a non-blank line has fewer than four '::' fields.
    """
    if src_path is None:
        src_path = DLSet.orgData_reviews_link
    if dst_path is None:
        dst_path = DLSet.orgData_reviewsParse_link

    count_U = defaultdict(int)  # occurrences per user
    count_I = defaultdict(int)  # occurrences per item
    total_lines = 0

    # Open the input first so a missing source file does not leave behind a
    # freshly truncated output file.
    with open(src_path, 'r') as fData, open(dst_path, 'w') as f:
        # Stream the file instead of loading every line into memory.
        for aLine in fData:
            total_lines += 1
            # Strip only the newline: slicing off the last character would
            # drop a digit of the final field when the file's last line has
            # no trailing newline.
            record = aLine.rstrip('\n')
            if not record.strip():
                continue  # tolerate blank lines (e.g. at end of file)
            each = record.split('::')
            userID, itemID = each[0], each[1]
            # Store the record
            f.write(" ".join((userID, itemID, each[2], each[3])) + '\n')
            # Frequency statistics
            count_U[userID] += 1
            count_I[itemID] += 1

    # Report how many lines the input file contained.
    print(total_lines)
    return count_U, count_I
def MapID(keyID, keyMapSet, count):
    """Translate a raw key into its dense integer ID, registering it if new.

    Args:
        keyID: Raw (hashable) key to look up.
        keyMapSet: Dict from raw key to assigned ID; updated in place when
            ``keyID`` has not been seen before.
        count: ID to hand out to the next unseen key (the caller is expected
            to pass back the value returned by the previous call).

    Returns:
        ``(mapped_id, count, flag)`` where ``flag`` is True when the key was
        already registered. For a new key, ``mapped_id`` is the incoming
        ``count`` and the returned ``count`` is one larger.
    """
    # Unseen key: register it under the next free ID.
    if keyID not in keyMapSet:
        keyMapSet[keyID] = count
        return count, count + 1, False
    # Known key: reuse its ID and leave the counter untouched.
    return keyMapSet[keyID], count, True
def Fliter(count_U, count_I, data_path=None, min_count=None):
    """Drop infrequent users/items, remap IDs to 0..n-1 and build per-user
    interaction lists ordered by time.

    Each non-blank line of the input is split on '::'; field 0 is the raw
    user ID, field 1 the raw item ID and field 3 the timestamp. A record is
    skipped when its user or its item occurs fewer than ``min_count`` times
    according to ``count_U`` / ``count_I``. Surviving users and items receive
    consecutive integer IDs in order of first appearance.

    Args:
        count_U: Mapping raw user ID -> occurrence count.
        count_I: Mapping raw item ID -> occurrence count.
        data_path: Raw input file. Defaults to ``DLSet.orgData_reviews_link``.
        min_count: Minimum occurrence count to keep a user/item. Defaults to
            ``DLSet.LIMIT_EXIST_TIMES``.

    Returns:
        ``(userMapSet, itemMapSet, userAction, count_user, count_item)``:
        the raw->mapped ID dicts for users and items, a dict from mapped user
        ID to a list of ``[timeStamp, mappedItemID]`` pairs sorted by
        ascending time (``timeStamp`` stays a string), and the number of
        distinct users and items kept.

    Raises:
        IndexError: if a non-blank line has fewer than four '::' fields.
        ValueError: if a kept record's timestamp is not an integer literal.
    """
    if data_path is None:
        data_path = DLSet.orgData_reviews_link
    if min_count is None:
        min_count = DLSet.LIMIT_EXIST_TIMES

    userMapSet = {}  # raw user ID -> id in [0, 1, ..]
    itemMapSet = {}  # raw item ID -> id in [0, 1, ..]
    userAction = {}  # mapped user ID -> list of [timeStamp, mapped item ID]

    with open(data_path, 'r') as fData:
        for aLine in fData:
            # Strip only the newline so an unterminated last line keeps the
            # final digit of its timestamp.
            record = aLine.rstrip('\n')
            if not record.strip():
                continue  # tolerate blank lines
            each = record.split('::')
            rawUser, rawItem, timeStamp = each[0], each[1], each[3]
            # Skip records of rarely-seen users or items. ``get`` avoids
            # inserting keys into (or raising on) the caller's count maps.
            if (count_U.get(rawUser, 0) < min_count
                    or count_I.get(rawItem, 0) < min_count):
                continue
            # The next free ID always equals the number of IDs handed out.
            userID = userMapSet.setdefault(rawUser, len(userMapSet))
            itemID = itemMapSet.setdefault(rawItem, len(itemMapSet))
            userAction.setdefault(userID, []).append([timeStamp, itemID])

    # Order each user's history by time. Compare numerically: as strings,
    # "1000000000" would sort before "999999999".
    for actions in userAction.values():
        actions.sort(key=lambda rec: int(rec[0]))

    return userMapSet, itemMapSet, userAction, len(userMapSet), len(itemMapSet)
if __name__ == '__main__':
    # Pass 1: rewrite the raw records and gather per-user / per-item counts.
    user_freq, item_freq = Handle()
    # Pass 2: filter by frequency, remap IDs and build per-user histories.
    filtered = Fliter(user_freq, item_freq)
    # The last element of the result is the number of distinct items kept.
    print(filtered[4])