-
Notifications
You must be signed in to change notification settings - Fork 1
/
DataLinkSet.py
69 lines (58 loc) · 3.98 KB
/
DataLinkSet.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
import os
import time
# Names of the supported datasets; dataSetChoice is the one currently in use.
dataSet = [
    'Amazon',
    'Google',
    'MovieLens',
    'YooChoose',
    'LastFM',
]
dataSetChoice = dataSet[4]

# ---------------------------------- Folder paths ----------------------------------
# Root folder of the data warehouse, relative to the working directory.
DataSetLink = '../DataSet'
# Alternative root when running from one directory level deeper:
# DataSetLink = '../../DataSet'

# Sub-folders of the root, derived in one pass (order matches the names on the left).
RawSetLink, OrgSetLink, CleanSetLink, ModelSetLink, SASRecSetLink = (
    DataSetLink + subFolder
    for subFolder in (
        '/RawSet',     # raw (unprocessed) data
        '/OrgSet',     # original data
        '/CleanSet',   # cleaned data
        '/ModelSet',   # saved models
        '/SASRecSet',  # SASRec-formatted data
    )
)

# ---------------------------------- SASRec data warehouse ----------------------------------
# Template; the single %s is filled in by the caller.
SASData_link = SASRecSetLink + '/data_SAS_%s.txt'

# ---------------------------------- Original data warehouse ----------------------------------
# Each template below keeps one %s placeholder for the caller to fill in.
# Original review data.
orgData_reviews_link = RawSetLink + '/Raw_%s'
# Parsed original review data.
orgData_reviewsParse_link = OrgSetLink + '/reviewsParse_%s'
# Original metadata.
orgData_meta_link = OrgSetLink + '/meta_%s.json.gz'

# ---------------------------------- Cleaned data warehouse ----------------------------------
# Generated data: [userAction, IIG, userMapSet, itemMapSet, count_user, count_item]
mainData_link = CleanSetLink + '/mainData_%s.npy'
TVJ_link = CleanSetLink + '/TVJ_%s.npy'

# ---------------------------------- Model warehouse ----------------------------------
# Template with two %s placeholders.
model_link = ModelSetLink + '/model_%s_%s'

# ---------------------------------- Constants ----------------------------------
# Raw records are kept only when their userID / itemID occur at least this many times.
LIMIT_EXIST_TIMES = 5
# Total sequence length.
LEN_SEQUENCE_LEN = 3

# --------------------------------------------------------------------------------
tempLink = RawSetLink + '/Raw_Google'
if __name__ == '__main__':
    # Show the current working directory so the relative paths can be checked by eye.
    print(os.getcwd())

    # Path smoke test: try to open the raw review file of the selected dataset.
    # The template's %s placeholder must be filled in first; otherwise the test
    # looks for a file literally named 'Raw_%s'. The leading '../' is dropped
    # ([3:]), so the path is resolved from the current working directory.
    reviews_path = (orgData_reviews_link % dataSetChoice)[3:]
    with open(reviews_path, 'r') as f:
        # print(f.readline())
        pass  # the with-statement closes the file; opening it is the whole test
""" ---------------------------------- 生数据仓库 ------------------------------------ """
# ------------- Related constants --------------
# Left bound of the selected time window, written as a 'Y-M-D' date string.
STR_TIME_LIMIT = '2014-1-1'
# Alternative left bound:
# STR_TIME_LIMIT = '2000-1-1'

# The same bound as an integer timestamp (time.mktime treats the parsed
# struct_time as local time).
TIME_LIMIT = int(
    time.mktime(
        time.strptime(STR_TIME_LIMIT, "%Y-%m-%d")
    )
)

# Occurrence thresholds — presumably the minimum number of appearances an
# item / a user needs in order to be kept; confirm against the filtering code.
ITEM_TIME_LIMIT = 5
USER_TIME_LIMIT = 2

# ------------- Related files --------------
strTemp = 'Google'
# (dataset tag, time bound) pair shared by the file names below.
_tagAndBound = (strTemp, STR_TIME_LIMIT)

# Raw ratings source; earlier choices are kept for reference:
# rawRatings_link = RawSetLink + '/ratings_%s.csv' % strTemp
# rawRatings_link = RawSetLink + '/reviews.clean.json.gz'
rawRatings_link = RawSetLink + '/data_Amazon'

# Ratings split by time.
rawRatings_Sample_link = RawSetLink + '/rawRatings_Sample_%s_After_%s' % _tagAndBound
# Low-frequency items filtered out.
filter_item_link = RawSetLink + '/filter_item_%s_After_%s' % _tagAndBound
# Low-frequency users filtered out.
filter_user_link = RawSetLink + '/filter_user_%s_After_%s' % _tagAndBound
# Down-sampled data; the result keeps one %s for the caller to fill in.
sample_afterFilter_link = (
    RawSetLink + '/sample_%s_afterFilter_%s_After_%s' % (('%s',) + _tagAndBound)
)
# Temporary table.
tempTable_link = RawSetLink + '/tempTable'
# K-Core table; the result keeps one %d for the caller to fill in.
KCore_link = RawSetLink + '/KCore_%s_%s_After_%s' % (('%d',) + _tagAndBound)
# Item-ID mapping for SASRec, which needs item IDs remapped to start from 0.
mapItemID_SAS_link = RawSetLink + '/mapItemID_SAS_%s'
# Data after that remapping.
data_SAS_link = RawSetLink + '/data_SAS_%s.txt'