今天爬取的是微博评论。
可以发现其特点是下一页评论的max_id在上一页中。
于是代码如下:
import requests
import json
import re
import time
headers = {
'User-Agent': '',
"Cookie": "",
"Referer": "https://m.weibo.cn/detail/4991918748471161"
}
url="https://m.weibo.cn/comments/hotflow?id=4991918748471161&mid=4991918748471161&max_id_type=0"
def get_page(url):
response = requests.get(url, headers=headers)
maxid=response.json()['data']["max_id"]
if response.status_code == 200:
return response, maxid
else:
print("请求失败")
def parse_page(datas):
for data in datas:
item=re.compile(r'<[^>]+>',re.S).sub('',data["text"])
print(item)
print("----------")
def get_url(max_id):
urls=[]
for i in range(1,14):
print("第"+str(i)+"页")
url="https://m.weibo.cn/comments/hotflow?id=4991918748471161&mid=4991918748471161&max_id="+str(max_id)
r=requests.get(url,headers=headers)
max_id=r.json()['data']["max_id"]
datas=r.json()['data']["data"]
parse_page(datas)
time.sleep(1)
print("第"+str(i)+"爬取完毕")
if __name__ == '__main__':
html,max_id = get_page(url)
get_url(max_id)
运行效果:
最近新开了公众号,请大家关注一下。