Skip to content

Commit

Permalink
爬虫更新
Browse files Browse the repository at this point in the history
允许爬虫生成 用户的模拟浏览数据 对系统的写一些逻辑进行修改
  • Loading branch information
SmacUL committed May 18, 2020
1 parent c8007a7 commit 7ef3538
Show file tree
Hide file tree
Showing 6 changed files with 110 additions and 8 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@ public List<String> getArtTypes() {
* 按照类别获取一页文章
* 20-04-19 创建方法
* 20-05-02 添加老用户推荐逻辑
* 20-05-19 修改逻辑, 在新用户切换成老用户时添加计算相似用户的逻辑
* @param artType
* @param page
* @param pageSize
Expand All @@ -57,13 +58,19 @@ public List<String> getArtTypes() {
public List<ArtFullMod> getTinyArtOnePageByType(
@RequestParam String artType, @RequestParam Integer page, @RequestParam Integer pageSize) {
Customer customer = (Customer) session.getAttribute("customer");
List<Integer> relativeCusList = (List<Integer>) session.getAttribute("relative");
if (customer == null || relativeCusList == null) {
if (customer == null) {
return null;
}
if (selfService.checkIsNewUser(customer.getCusId())) {
return loadService.getTinyArtOnePageByTypeForNew(customer.getCusId(), artType, page, pageSize);
} else {
// TODO 删除
System.out.println("============ 老用户推按 =============");
List<Integer> relativeCusList = (List<Integer>) session.getAttribute("relative");
if (relativeCusList == null || relativeCusList.size() == 0) {
relativeCusList = selfService.getRelativeCusList(customer.getCusId(), 10);
session.setAttribute("relative", relativeCusList);
}
return loadService.getTinyArtOnePageByTypeForOld(customer.getCusId(), relativeCusList, artType, page, pageSize);
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,12 @@ public String cusLogin(@RequestParam String cusName, @RequestParam String cusPas
customer.setCusPass(null);
session.setAttribute("customer", customer);
List<Integer> cusList = selfService.getRelativeCusList(customer.getCusId(), 10);
// TODO 删除
System.out.println("=========== 相似用户 =========");
for (Integer cus: cusList) {
System.out.println(cus);
}
System.out.println("=========== 相似用户 end =========");
session.setAttribute("relative", cusList);
return "登录成功";
}
Expand Down Expand Up @@ -132,6 +138,7 @@ public String setCusBasicInfo(@RequestBody Customer customer) {
* 处理用户关注与取消关注
* 20-04-18 创建方法
* 20-04-26 修改逻辑, 防止用户关注自己
* 20-05-17 补上用户关注后, 更新后台三个表信息.
* @param cusId 关注或取消关注的用户的 ID
* @return
*/
Expand All @@ -146,6 +153,7 @@ public String setCusFollow(@RequestParam Integer cusId) {
return "不能关注自己";
}
if (selfService.setCusFollow(cusIdFrom, cusId)) {
shapeService.setCusBehaviorCusFollow(cusIdFrom, cusId);
return "关注成功";
} else {
return "关注失败";
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -115,7 +115,7 @@ public interface ShapeService {
Boolean setCusBehaviorRepDislike(Integer cusId, Integer repCusId, Integer artId, Integer repId);

/**
* 用户 follow 10
* 用户 follow 11
* cbrCusIdFrom | cbrCusIdTo | cbrBehavior | cbrTime | cbrArtId | cbrType | cbrTargetId
* cusIdFrom | cusIdTo | 11 | followTime | null | 0 | null
* 20-04-19 创建方法
Expand Down
55 changes: 55 additions & 0 deletions spider/Main.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
import util.Time as Time
import os.path
import logging
import random

log_file_name = os.path.join('log', '%s.txt' % Time.Time.get_local_time())
logger = logging.getLogger()
Expand Down Expand Up @@ -49,6 +50,7 @@ def major(self):
categories = ['news_society', 'news_entertainment', 'news_tech', 'news_military', 'news_sports', 'news_car',
'news_finance', 'news_world', 'news_fashion', 'news_travel', 'news_discovery', 'news_baby',
'news_regimen', 'news_story', 'news_essay', 'news_game', 'news_history', 'news_food']
# categories = ['news_society']

for category in categories:
print("当前类别: %s" % category)
Expand Down Expand Up @@ -125,6 +127,8 @@ def major(self):
"""
try:
coms_json = self.__com_pro.get_coms_json(art_brief_json)
if coms_json is None:
continue
logging.info("%s-%d coms_json 获取 成功" % (category, art_i))
except:
print("\t%s-%d coms_json 获取 失败" % (category, art_i))
Expand All @@ -141,6 +145,7 @@ def major(self):
self.__cus_pro.set_com_cus(com_json, com_cus_mod)
self.__cus_dao.insert_then_get_cus(com_cus_mod)
self.__cus_dao.update_cus_feature(category, com_cus_mod.cus_id, flag=True)
# self.__cus_dao.cus_watch_other_same_category_art(com_cus_mod.cus_id, art_mod.art_id, category)
logging.info("%s-%d-%d com_cus 处理 错误" % (category, art_i, com_i))
except:
print("\t%s-%d-%d com_cus 处理 错误" % (category, art_i, com_i))
Expand Down Expand Up @@ -187,10 +192,36 @@ def major(self):
logging.exception("%s-%d-%d art-cus 行为 4 数据库操作 失败" % (category, art_i, com_i))
continue

""" 评论用户 模拟浏览
"""
# Simulated browsing for the comment author: pick a random subset of the
# categories and record views of the articles returned for each of them.
try:
    # Running total over all sampled categories. Starting from 0 (instead of
    # calling len() on the last query result) keeps the summary below from
    # raising TypeError when no category was sampled or a query returned None.
    viewed_total = 0
    # Bound the sample size by the list itself so that shortening
    # `categories` cannot make random.sample raise ValueError.
    rand_category_num = random.randint(0, len(categories))
    rand_cates = random.sample(categories, rand_category_num)
    for rand_cate in rand_cates:
        result_list = self.__art_dao.get_same_category_art(art_mod.art_id, rand_cate)
        if not result_list:
            continue
        viewed_total += len(result_list)
        # Each row is (art_id, art_cus_id), as selected by get_same_category_art.
        for back_art in result_list:
            try:
                self.__cus_dao.insert_cus_behavior(
                    com_cus_mod.cus_id, back_art[1], 2, back_art[0], 1, back_art[0]
                )
                self.__cus_dao.update_cus_feature(rand_cate, com_cus_mod.cus_id, update_num=1)
                self.__art_dao.update_art_feature(6, back_art[0])
            except Exception:
                # Best effort: one failing article must not stop the remaining ones.
                continue
    print("\t%d 用户模拟浏览操作 数量 %d 完成" % (com_cus_mod.cus_id, viewed_total))
    logging.info("%d 模拟浏览操作 数量 %d 完成" % (com_cus_mod.cus_id, viewed_total))
except Exception:
    # `Exception` rather than a bare except so Ctrl-C can still stop the crawler.
    print("\t%d 用户模拟浏览操作 失败" % com_cus_mod.cus_id)
    logging.exception("%d 用户模拟浏览操作 失败" % com_cus_mod.cus_id)

""" 回复处理
"""
try:
reps_json = self.__rep_pro.get_reps_json(com_json)
if reps_json is None:
continue
logging.info("%s-%d-%d reps_json 获取 成功" % (category, art_i, com_i))
except:
print("\t\t%s-%d-%d reps_json 获取 失败" % (category, art_i, com_i))
Expand Down Expand Up @@ -253,6 +284,30 @@ def major(self):
logging.exception("%s-%d-%d-%d art-cus 行为 5 数据库操作 失败" % (category, art_i, com_i, rep_i))
continue

""" 回复用户 模拟浏览
"""
# Simulated browsing for the reply author: pick a random subset of the
# categories and record views of the articles returned for each of them.
try:
    # Running total over all sampled categories. Starting from 0 (instead of
    # calling len() on the last query result) keeps the summary below from
    # raising TypeError when no category was sampled or a query returned None.
    viewed_total = 0
    # Bound the sample size by the list itself so that shortening
    # `categories` cannot make random.sample raise ValueError.
    rand_category_num = random.randint(0, len(categories))
    rand_cates = random.sample(categories, rand_category_num)
    for rand_cate in rand_cates:
        result_list = self.__art_dao.get_same_category_art(art_mod.art_id, rand_cate)
        if not result_list:
            continue
        viewed_total += len(result_list)
        # Each row is (art_id, art_cus_id), as selected by get_same_category_art.
        for back_art in result_list:
            try:
                self.__cus_dao.insert_cus_behavior(
                    rep_cus_mod.cus_id, back_art[1], 2, back_art[0], 1, back_art[0]
                )
                self.__cus_dao.update_cus_feature(rand_cate, rep_cus_mod.cus_id, update_num=1)
                self.__art_dao.update_art_feature(6, back_art[0])
            except Exception:
                # Best effort: one failing article must not stop the remaining ones.
                continue
    print("\t\t%d 用户模拟浏览操作 数量 %d 完成" % (rep_cus_mod.cus_id, viewed_total))
    logging.info("%d 用户模拟浏览操作 数量 %d 完成" % (rep_cus_mod.cus_id, viewed_total))
except Exception:
    # `Exception` rather than a bare except so Ctrl-C can still stop the crawler.
    print("\t\t%d 用户模拟浏览操作 失败" % rep_cus_mod.cus_id)
    logging.exception("%d 用户模拟浏览操作 失败" % rep_cus_mod.cus_id)


if __name__ == '__main__':
Major(os.path.join('properties', 'database.json')).major()
Expand Down
28 changes: 27 additions & 1 deletion spider/dao/ArticleDao.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import util.MySql as MySql
import model.ArticleModel as ArtMod
import random

import logging

Expand Down Expand Up @@ -150,6 +151,7 @@ def update_art_feature(self, behavior, art_id, art_time=''):
# 20-04-17 修改完成
# 20-04-23 Rollback BUG Fix
# 20-05-15 修改方法逻辑, 当行为为 6 时, 只将新闻阅读数量加 1
:param behavior: 这个行为和那个用户行为是两个东西.
:param art_id:
Expand All @@ -168,10 +170,14 @@ def update_art_feature(self, behavior, art_id, art_time=''):

# Pick the SQL statement for this behavior:
#   1     -> insert a fresh ArtFeatureCount row for the article
#   6     -> only increment the read counter
#   other -> increment the behavior's own column together with the read counter
if behavior == 1:
    update_sql = "insert into ArtFeatureCount(afc_art_id, afc_art_time) values(%d, '%s')" % (art_id, art_time)
elif behavior != 6:
    # The previous test `behavior != 1 or behavior != 6` is true for every
    # value, which made the read-count-only branch below unreachable.
    feature_column = behavior_dict[behavior]
    update_sql = (
        "update ArtFeatureCount set {0}={1}+1, afc_read_num=afc_read_num+1"
        " where afc_art_id=%d"
    ).format(feature_column, feature_column) % art_id
else:
    update_sql = (
        "update ArtFeatureCount set afc_read_num=afc_read_num+1"
        " where afc_art_id=%d"
    ) % art_id

self.__base.execute_sql(update_sql)
# logging.info("新闻 art_id=%s 特征 %s 数据库插入 成功" % (art_id, behavior))
Expand All @@ -181,4 +187,24 @@ def update_art_feature(self, behavior, art_id, art_time=''):
raise


def get_same_category_art(self, cur_art_id, category):
    """Select a random number of randomly chosen articles from one category.

    The method exists to help generate additional user behavior data.
    Two things are random: how many articles are returned (between 1 and
    40) and which articles of the category are picked.

    20-05-15 method created

    :param cur_art_id: id of the current article; it is excluded from the result
    :param category: article category, compared with ``Article.art_type``
    :return: the rows produced by ``get_result_all()`` for the query; each
        row selects ``(art_id, art_cus_id)``
    """
    rand_num = random.randint(1, 40)
    # `order by rand()` makes the choice of articles random; with a bare
    # `limit` only the row count varied, not which rows came back.
    # NOTE(review): `category` is interpolated into the SQL text. The caller
    # in Main.py passes values from a fixed list; switch to a parameterized
    # query if it can ever come from external input.
    select_sql = (
        "select art_id, art_cus_id from Article"
        " where art_type = '%s' and art_id != %d"
        " order by rand() limit %d"
    ) % (category, cur_art_id, rand_num)
    # Errors propagate to the caller unchanged (the former try/except only re-raised).
    self.__base.execute_sql(select_sql)
    return self.__base.get_result_all()

14 changes: 10 additions & 4 deletions spider/dao/CustomerDao.py
Original file line number Diff line number Diff line change
Expand Up @@ -149,16 +149,20 @@ def insert_cus_behavior(self, cbr_cus_id_from, cbr_cus_id_to, cbr_behavior, cbr_
raise


def update_cus_feature(self, category, cus_id, flag=False):
def update_cus_feature(self, category, cus_id, update_num=2, flag=False):
""" 更新用户统计数据
这将是一个非常操蛋的方法.
# 20-04-17 修改完成
# 20-04-18 BUG 修改: 每调用一次此方法, 用户特征的增加应该与文章特征的增加保持一致, 即增加 2, 而非 1.
# 20-04-23 接口修改, 添加 flag 字段
# 20-04-23 Rollback BUG Fix
# 20-05-15 修改方法, 允许设置特征更新数量
:param category:
:param cus_id:
:param update_num: 每调用一次此方法, 需要用户特征的增加应该与文章特征的增加保持一致, 有些时候是 1, 有些时候是 2, 默认为 2
:param flag:
当 flag 为 True 且 cus_id 指向的用户已存在时, category 参数将失效,
用于插入一个仅有 cfc_cus_id 的记录, 即初始化.
Expand All @@ -171,14 +175,16 @@ def update_cus_feature(self, category, cus_id, flag=False):
if result[0] == 0:
# logging.info("特征 用户 cus_id=%s 数据库查询 不存在" % (cus_id))
if flag:
# 发现用户不存在, 只是想单纯地创建用户
update_sql = "insert into CusFeatureCount(cfc_cus_id) value (%d)" % cus_id
else:
# 发现用户不存在, 在创建用户的基础上, 还想更新一些数据.
update_sql = "insert into CusFeatureCount(cfc_cus_id, {0}) values(%d, %d)" \
.format('cfc_' + category) % (cus_id, 2)
.format('cfc_' + category) % (cus_id, update_num)
else:
# logging.info("特征 用户 cus_id=%s 数据库查询 存在" % (cus_id))
update_sql = "update CusFeatureCount set {0}={1}+2 where cfc_cus_id=%d" \
.format('cfc_' + category, 'cfc_' + category) % cus_id
update_sql = "update CusFeatureCount set {0}={1}+{2} where cfc_cus_id=%d" \
.format('cfc_' + category, 'cfc_' + category, update_num) % cus_id

self.__base.execute_sql(update_sql)
# logging.info("用户 cus_id=%s 类别 %s 特征 数据库插入 成功" % (cus_id, category))
Expand Down

0 comments on commit 7ef3538

Please sign in to comment.