Skip to content

Commit

Permalink
增加微信公众号文章爬虫
Browse files Browse the repository at this point in the history
  • Loading branch information
PKUJohnson committed Oct 30, 2019
1 parent 5a1b66a commit e948820
Show file tree
Hide file tree
Showing 6 changed files with 174 additions and 2 deletions.
15 changes: 15 additions & 0 deletions example/wechat_pubaccount.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@

from opendatatools import wechat

def main():
    """Log in, search for one official account and print all of its articles.

    Stops early (printing the reason when one is available) if the login or
    the account search does not succeed.
    """
    if not wechat.login("[email protected]", "密码"):
        print("login failed")
        return

    df, msg = wechat.search_pub("饭桶戴老板")
    if df is None:
        # search_pub returns (None, error message) when the server reports an error.
        print(msg)
        return

    for _, row in df.iterrows():
        fakeid = row["fakeid"]
        total_msg_cnt = wechat.get_total_msg_count(fakeid)
        articles = wechat.get_all_articles(fakeid, total_msg_cnt)
        print(articles)


if __name__ == "__main__":
    main()


2 changes: 1 addition & 1 deletion opendatatools/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,6 @@

import os

__version__ = '0.9.9'
__version__ = '1.0.0'
SOURCE_ROOT_DIR = os.path.dirname(os.path.abspath(__file__))

2 changes: 1 addition & 1 deletion opendatatools/common/rest_agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ def do_request(self, url, param = None, method="GET", type="text", encoding = No
if json is not None:
res = self.session.post(url, json=json, **kwargs)
else:
res = self.session.post(url, data=param **kwargs)
res = self.session.post(url, data=param, **kwargs)
else:
if method == "GET":
res = self.session.get(url, params=param, proxies=self.proxies, **kwargs)
Expand Down
1 change: 1 addition & 0 deletions opendatatools/wechat/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
from .wechat_interface import *
129 changes: 129 additions & 0 deletions opendatatools/wechat/wechat_agent.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,129 @@
from opendatatools.common import RestAgent
import hashlib
import json
from PIL import Image
from io import BytesIO
import random
import time
import urllib
import pandas as pd
import math
import datetime
import threading
import functools


# Request header values. Host and xrw are sent by WechatMPAgent.login;
# agent and refer are defined here but not referenced by the agent class.
Host = "mp.weixin.qq.com"
agent = "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36"
refer = "https://mp.weixin.qq.com"
xrw = "XMLHttpRequest"

# Endpoint URLs used by WechatMPAgent.
# The bare words needToken / needRandom / needQuery / needBegin / needCount /
# needFakeid inside searchUrl and appmsg are placeholders: search_pub and
# get_article_list substitute them with "name=value" pairs via str.replace
# before sending the request.
loginUrl = "https://mp.weixin.qq.com/cgi-bin/bizlogin?action=startlogin"
qrcodeUrl = "https://mp.weixin.qq.com/cgi-bin/loginqrcode?action=getqrcode&param=4300&rd=120"
# checkLogin ends with "random=" so a random value can be appended per request.
checkLogin = "https://mp.weixin.qq.com/cgi-bin/loginqrcode?action=ask&f=json&ajax=1&random="
doLogin = "https://mp.weixin.qq.com/cgi-bin/bizlogin?action=login"
searchUrl = "https://mp.weixin.qq.com/cgi-bin/searchbiz?action=search_biz&needToken&lang=zh_CN&f=json&ajax=1&needRandom&needQuery&begin=0&count=5"
appmsg = "https://mp.weixin.qq.com/cgi-bin/appmsg?needToken&lang=zh_CN&f=json&ajax=1&needRandom&action=list_ex&needBegin&needCount&query=&needFakeid&type=9" # originally 9; author's note: changing it to 1 returns more entries (the URL currently uses type=9)

def md5(data):
    """Return the hexadecimal MD5 digest of *data*.

    Args:
        data: Text, which is encoded as UTF-8 before hashing, or an
            already-encoded ``bytes``/``bytearray`` value hashed as-is.

    Returns:
        The 32-character lowercase hexadecimal digest.
    """
    if isinstance(data, str):
        data = data.encode(encoding='UTF-8')
    return hashlib.md5(data).hexdigest()


def ReqRandom():
    """Return a random decimal string of the form ``0.<digits>``.

    The value is appended to request URLs as the ``random`` query parameter.

    The digits are drawn as an integer so the result contains only "0.",
    followed by decimal digits. Formatting a large float instead would
    produce text such as "0.1.23e+16" (a second decimal point and an
    exponent whose "+" is not URL-safe).

    Returns:
        A string starting with "0." followed by up to 17 decimal digits.
    """
    digits = random.randrange(10 ** 17)
    return "0." + str(digits)

class WechatMPAgent(RestAgent):
    """REST client for the mp.weixin.qq.com endpoints defined at module level.

    Call :meth:`login` first; it stores the session token that
    :meth:`search_pub` and :meth:`get_article_list` put into their URLs.
    """

    def __init__(self):
        RestAgent.__init__(self)
        # Session token parsed from the login redirect URL; empty until
        # login() succeeds.
        self.token = ""

    def login(self, username, password):
        """Log in with account credentials plus an interactive QR-code scan.

        Posts the credentials, displays the QR code image returned by the
        server, polls until the scan is reported with status 1, then completes
        the login and stores the token found in the returned redirect URL.

        Args:
            username: Account name sent as ``username``.
            password: Plain-text password; only its MD5 hex digest is sent.

        Returns:
            True when a token was obtained and stored in ``self.token``;
            False when the final redirect URL carries no ``token=`` value.

        Raises:
            KeyError: If a response lacks the ``redirect_url`` or ``status``
                field that this flow reads.
        """
        param = {
            "username": username,
            "pwd": md5(password),
            "imgcode": "",
            "f": "json"
        }

        self.add_headers({
            # NOTE(review): the account value in this Referer looks like a
            # redacted placeholder - confirm the intended value.
            "Referer" : "https://mp.weixin.qq.com/cgi-bin/bizlogin?action=validate&lang=zh_CN&[email protected]",
            "Host" : Host,
            "X-Requested-With" : xrw
        })

        resp = self.do_request(loginUrl, method="POST", param=param)
        print(resp)
        # The value is not used below; the lookup raises KeyError when the
        # response carries no redirect_url.
        json.loads(resp)["redirect_url"]

        response = self.do_request(qrcodeUrl, method="GET", type="binary")
        qrcode = Image.open(BytesIO(response))
        qrcode.show()

        # Poll every 2 seconds until the server reports status 1 for the QR code.
        while True:
            response = self.do_request(checkLogin + ReqRandom(), method="GET")
            rsp = json.loads(response)

            if rsp["status"] == 1:
                qrcode.close()
                break
            time.sleep(2)

        param = {
            "userlang": "zh_CN",
            "token": "",
            "lang": "zh_CN",
            "f" : "json",
            "ajax" : "1",
        }

        response = self.do_request(doLogin, method="POST", param=param)
        rsp = json.loads(response)
        print(rsp)
        token_url = rsp["redirect_url"]

        marker = "token="
        pos = token_url.find(marker)
        if pos < 0:
            # Without the marker there is nothing to slice; report failure
            # instead of storing an arbitrary URL fragment as the token.
            return False
        # Keep only the token's own value, not any query parameters after it.
        self.token = token_url[pos + len(marker):].split("&", 1)[0]
        return True

    def get_retcode(self, rsp):
        """Extract the status code and message from a decoded response.

        Args:
            rsp: Decoded JSON response (a dict).

        Returns:
            ``(ret, err_msg)`` taken from ``rsp["base_resp"]`` when that entry
            holds both fields, otherwise ``(0, "ok")``.
        """
        base_resp = rsp.get("base_resp") if "base_resp" in rsp else None
        if base_resp is not None and "ret" in base_resp and "err_msg" in base_resp:
            return base_resp["ret"], base_resp["err_msg"]
        return 0, "ok"

    def search_pub(self, pubno):
        """Search for official accounts matching *pubno*.

        Args:
            pubno: Search text (for example an account name).

        Returns:
            ``(DataFrame, "")`` built from the response's ``list`` entry on
            success, or ``(None, err_msg)`` when the response's return code is
            non-zero.
        """
        from urllib.parse import quote

        # Percent-encode the query so characters such as "&", "#" or spaces
        # cannot break the hand-assembled query string.
        _searchUrl = (
            searchUrl
            .replace("needToken", "token=" + self.token)
            .replace("needRandom", "random=" + ReqRandom())
            .replace("needQuery", "query=" + quote(pubno, safe=""))
        )
        response = self.do_request(_searchUrl, method="GET")
        rsp = json.loads(response)
        ret, err_msg = self.get_retcode(rsp)
        if ret != 0:
            return None, err_msg
        df = pd.DataFrame(rsp["list"])
        return df, ""

    def get_article_list(self, fakeid, begin, count=5):
        """Fetch one page of an account's article list.

        While the response message is not "ok", the request is repeated every
        60 seconds until it is.

        Args:
            fakeid: Account identifier (the ``fakeid`` column of the
                ``search_pub`` result).
            begin: Offset of the first article of the page.
            count: Number of articles requested per page (default 5).

        Returns:
            ``(app_msg_cnt, DataFrame)``: the ``app_msg_cnt`` value from the
            response and a DataFrame built from its ``app_msg_list`` entry.
        """
        _appmsg = (
            appmsg
            .replace("needToken", "token=" + self.token)
            .replace("needRandom", "random=" + ReqRandom())
            .replace("needFakeid", "fakeid=" + str(fakeid))
        )
        appmsgTemp = (
            _appmsg
            .replace("needBegin", "begin=" + str(begin))
            .replace("needCount", "count=" + str(count))
        )

        response = self.do_request(appmsgTemp, method="GET")
        rsp = json.loads(response)
        ret, msg = self.get_retcode(rsp)
        # On failure, wait 60 seconds and try again.
        while msg != "ok":
            # do_request returns the body text itself (it is passed straight
            # to json.loads above), so there is no ``.text`` attribute here.
            print(response)
            time.sleep(60)
            response = self.do_request(appmsgTemp, method="GET")
            rsp = json.loads(response)
            ret, msg = self.get_retcode(rsp)

        app_msg_cnt = rsp["app_msg_cnt"]
        app_msg_list = rsp["app_msg_list"]
        df = pd.DataFrame(app_msg_list)
        return app_msg_cnt, df

27 changes: 27 additions & 0 deletions opendatatools/wechat/wechat_interface.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
from .wechat_agent import *

agent = WechatMPAgent()


def login(mp_username, mp_password):
    """Log the module-level agent in and return whatever its login() returns."""
    outcome = agent.login(mp_username, mp_password)
    return outcome

def search_pub(pubaccount):
    """Search for *pubaccount* through the module-level agent.

    Returns the agent's search_pub() result unchanged.
    """
    found = agent.search_pub(pubaccount)
    return found

def get_total_msg_count(fakeid):
    """Return the message count reported for *fakeid*.

    Requests the article page starting at offset 0 and returns only the count
    part of the result; the page's DataFrame is discarded.
    """
    first_page = agent.get_article_list(fakeid, 0)
    total, _ = first_page
    return total

def get_all_articles(fakeid, app_msg_cnt):
    """Download every article page of an account and combine them.

    Pages are requested through the module-level agent at offsets 0, 5, 10, ...
    while the offset is below *app_msg_cnt*, pausing 5 seconds between
    requests. Progress (the next offset) is printed after each page.

    Args:
        fakeid: Account identifier passed to ``agent.get_article_list``.
        app_msg_cnt: Total number of articles to cover.

    Returns:
        A DataFrame with the rows of all fetched pages concatenated, or an
        empty DataFrame when *app_msg_cnt* is zero or negative.
    """
    page_size = 5  # step between page offsets
    pause_seconds = 5

    df_list = []
    count = 0
    while count < app_msg_cnt:
        cnt, df = agent.get_article_list(fakeid, count)
        df_list.append(df)
        count = count + page_size
        print(count)
        # Only pause when another request follows.
        if count < app_msg_cnt:
            time.sleep(pause_seconds)

    if not df_list:
        # pd.concat raises ValueError on an empty list.
        return pd.DataFrame()
    return pd.concat(df_list)

0 comments on commit e948820

Please sign in to comment.