add second level comment crawl #302

nghuyong · Dec 12, 2023 · c152e24 · c152e24
1 parent aef6a58
commit c152e24
Show file tree

Hide file tree

Showing 2 changed files with 14 additions and 2 deletions.
diff --git a/README.md b/README.md
@@ -372,7 +372,8 @@ python run_spider.py tweet_by_keyword
 
 ## 更新日志
 
-- 2023.11: 支持采集指定时间段的用户推文 [#308](https://github.com/nghuyong/WeiboSpider/issues/308)
+- 2023.12: 支持采集推文的二级评论 [#302](https://github.com/nghuyong/WeiboSpider/issues/302)
+- 2023.12: 支持采集指定时间段的用户推文 [#308](https://github.com/nghuyong/WeiboSpider/issues/308)
 - 2023.04: 支持针对推文id的推文采集 [#272](https://github.com/nghuyong/WeiboSpider/issues/272)
 - 2022.11: 支持针对单个关键词获取单天超过1200页的检索结果 [#257](https://github.com/nghuyong/WeiboSpider/issues/257)
 - 2022.11: 支持长微博全文的获取

diff --git a/weibospider/spiders/comment.py b/weibospider/spiders/comment.py
@@ -37,7 +37,12 @@ def parse(self, response, **kwargs):
  for comment_info in data['data']:
  item = self.parse_comment(comment_info)
  yield item
- if data.get('max_id', 0) != 0:
+ # 解析二级评论
+ if 'more_info' in comment_info:
+ url = f"https://weibo.com/ajax/statuses/buildComments?is_reload=1&id={comment_info['id']}" \
+ f"&is_show_bulletin=2&is_mix=1&fetch_level=1&max_id=0&count=100"
+ yield Request(url, callback=self.parse, priority=20)
+ if data.get('max_id', 0) != 0 and 'fetch_level=1' not in response.url:
  url = response.meta['source_url'] + '&max_id=' + str(data['max_id'])
  yield Request(url, callback=self.parse, meta=response.meta)
 
@@ -53,4 +58,10 @@ def parse_comment(data):
  item['ip_location'] = data.get('source', '')
  item['content'] = data['text_raw']
  item['comment_user'] = parse_user_info(data['user'])
+ if 'reply_comment' in data:
+ item['reply_comment'] = {
+ '_id': data['reply_comment']['id'],
+ 'text': data['reply_comment']['text'],
+ 'user': parse_user_info(data['reply_comment']['user']),
+ }
  return item