python更多源码/资料/解答/教程等 点击此处跳转文末名片免费获取
爬虫基本流程
nodejs的使用
AES加密还原
解释器: python 3.8
编辑器: pycharm 2022.3
crypto-js>>> npm install crypto-js
requests >>> pip install requests pyexecjs
模块安装:
win + R 输入cmd 输入安装命令 pip install 模块名 (如果你觉得安装速度比较慢, 你可以切换国内镜像源)
采集数据 / 模拟用户行为
(可见即可爬, 爬虫不是破解)
模拟成 客户端 向 服务器 发送网络请求
批量采集数据 / 将重复性的操作自动化
一、思路分析
找到数据的来源
先分析单章的小说情况
https://read.zongheng.com/chapter/1215341/68208370.html
小说内容 都在网页源代码当中
二、实现代码
发送请求
获取数据
解析数据
保存数据
'''
遇到问题没人解答?小编创建了一个Python学习交流QQ群:926207505
寻找有志同道合的小伙伴,互帮互助,群里还有不错的视频学习教程和PDF电子书!
'''
import re
import requests
# Cookies sent along with every request through the `cookies=` argument.
# NOTE(review): these look like values copied from a logged-in browser
# session (they include 'logon' and 'loginphone' entries) -- presumably they
# expire and must be refreshed; confirm before relying on them.
cookies = {
    # Analytics / tracking identifiers.
    'sensorsdata2015jssdkcross': '%7B%22distinct_id%22%3A%2218a55c160d8979-0721567b14502b-26031f51-1764000-18a55c160d91161%22%2C%22%24device_id%22%3A%2218a55c160d8979-0721567b14502b-26031f51-1764000-18a55c160d91161%22%2C%22props%22%3A%7B%22%24latest_traffic_source_type%22%3A%22%E8%87%AA%E7%84%B6%E6%90%9C%E7%B4%A2%E6%B5%81%E9%87%8F%22%2C%22%24latest_referrer%22%3A%22https%3A%2F%2Fwww.baidu.com%2Flink%22%2C%22%24latest_referrer_host%22%3A%22www.baidu.com%22%2C%22%24latest_search_keyword%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E5%80%BC%22%7D%7D',
    'ZHID': '6B8B1AA3F580487ED41316E742908662',
    'zhffr': 'www.baidu.com',
    # 'logon' and '__logon__' carry the same value.
    'logon': 'NTU1MjUwMDM%3D%7CMA%3D%3D%7C%7C5Lmm5Y%2BLNjE2NjMzMjY%3D%7CdHJ1ZQ%3D%3D%7CMjEzNjIwOTA5MA%3D%3D%7C8592118A0059F35F304A37C75A03EBFA',
    '__logon__': 'NTU1MjUwMDM%3D%7CMA%3D%3D%7C%7C5Lmm5Y%2BLNjE2NjMzMjY%3D%7CdHJ1ZQ%3D%3D%7CMjEzNjIwOTA5MA%3D%3D%7C8592118A0059F35F304A37C75A03EBFA',
    '__zhs__': '973b86c348fb9e61d9d2d4d0b547163185771221da00ef66e39fda1bb69f1cb3b9fdbf6a9d4951cace27c091188d5bf0523696eec79fda879fde21d1c7f7a04c4f74760316032fe19f4b22607ba71697fe11db20672a9dc00b547639aa6302516f2bed9e51e93c11fc9ca0c6fb9d6c501503ca3588543a4c2c8a03b4dd9d4433cd7a339653b26b8d9f2c612227f501f4e55fa5987245bdf4b8e61a05984c286f53e6f6850b1101338b1517b5c2de5aeea5dc0e993b7a36212a22c1933fa40c6f33033ad83c8d4bd3e357f6de6df90c8d9a3930b43efc853d650c1424e067262e9522284830db113629c22fc63847000f24a276b63eef37b895f70b22c78db2f8',
    '__zhc__': '30820122300d06092a864886f70d01010105000382010f003082010a0282010100d2dc7166074e33bbf091a23856b0e3e56888b6060c154a102a401b2a88f60f60610fb20df48d03c51c14441987f59edb3dd73dba0e2fbe0bf0074986ef38dd8f907f1312f06ba93ceef9ec18b9ae4abab4b439490062eb152fb01bbc331fd15fa2c4e1ce370ce555df528b71ca2e12d5cdcc138232b745bbccc3568e39802350d46b1f08925e14127bc0a67b50c74674ca67e42b0c396f92bbee6b38b550a022c32897b6369c2f5def7eaa01d667b3a70953ac2152d2777e91d67a31a4d8c159c8d4a0de1602f73492991c1a88e42157c7c513fde391dd31763664ef75d532262d5b8b72fc4bf38ce361bc17f53b940cd5ce4a37ab69f1a3c9549fd985db8a870203010001',
    'loginphone': '19973017649',
    'zh_visitTime': '1700136858520',
    'Hm_lvt_c202865d524849216eea846069349eb9': '1700136859',
    'PassportCaptchaId': '79cceff0bf14300835cd3bad4db019f1',
    'Hm_lpvt_c202865d524849216eea846069349eb9': '1700136972',
}
# HTTP request headers sent with every request; the values imitate a desktop
# Chrome 119 browser on Windows.
# 'Origin' and 'Referer' are masked placeholders in this listing and must be
# replaced with the real site values before the script can work.
headers = {
    'Accept': 'application/json, text/javascript, */*; q=0.01',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Cache-Control': 'no-cache',
    'Connection': 'keep-alive',
    # No raw 'Cookie' header here on purpose: cookies are supplied separately
    # through the `cookies=` argument of the requests call (requests sorts
    # them alphabetically). The previous commented-out copy of the cookie
    # string wrapped onto an uncommented line and broke the dict literal.
    'Origin': '******',
    'Pragma': 'no-cache',
    'Referer': '*****/',
    'Sec-Fetch-Dest': 'empty',
    'Sec-Fetch-Mode': 'cors',
    'Sec-Fetch-Site': 'same-site',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36',
    'sec-ch-ua': '"Google Chrome";v="119", "Chromium";v="119", "Not?A_Brand";v="24"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"Windows"',
}
# 批量采集版本(已注释): 先请求章节列表接口, 再逐章下载正文并追加写入 1.txt
# data = {
#     'bookId': '1215341',
# }
# resp = requests.post('******/api/chapter/getChapterList', cookies=cookies, headers=headers, data=data)
# json_data = resp.json()
# chapterViewList = json_data['result']['chapterList'][0]['chapterViewList']
# for chapterView in chapterViewList:
#     url = f'https://read.zongheng.com/chapter/{chapterView["bookId"]}/{chapterView["chapterId"]}.html'
#     # 发送请求
#     response = requests.get(url=url, headers=headers, cookies=cookies)
#     # 获取数据
#     html_data = response.text
#     # 解析数据
#     # .*?
#     # .: 可以代替任意一个字符 (默认不包括换行符)
#     # *: 重复前面的字符零次或多次, .* 即可代替任意多个字符
#     # ?: 非贪婪匹配符 (尽可能少地匹配)
#     # 例如文本: aaaaacccccbbbbbeeeeebbbbb
#     # 正则 aaaaa.*?bbbbb 只会匹配到第一个 bbbbb 为止
#     # <div class="content" .*?>(.*?)</div>
#     text = re.findall('<div class="content" .*?>(.*?)</div>', html_data)[0]
#     text = chapterView['chapterName']+'\n\n'+text.replace('<p>', '\n').replace('</p>', '\n') + '\n\n'
#     print(text)
#     # 保存数据
#     with open('1.txt', mode='a', encoding='utf-8') as f:
#         f.write(text)
def parse_chapter_title(html_data, fallback):
    """Return the text of the page's ``<title>`` element, or *fallback*.

    The single-chapter flow has no chapter-list record to take the chapter
    name from (that only exists in the commented-out batch version), so the
    heading is derived from the downloaded page itself.
    NOTE(review): assumes the page ``<title>`` carries the chapter name --
    confirm against the real page.

    Args:
        html_data: HTML source of the chapter page.
        fallback: Heading to use when no non-empty ``<title>`` is found.

    Returns:
        The stripped title text, or *fallback*.
    """
    found = re.search(r'<title>(.*?)</title>', html_data, re.S)
    if found is None:
        return fallback
    return found.group(1).strip() or fallback


def parse_chapter_text(html_data, chapter_name):
    """Build the plain-text form of one chapter from its page HTML.

    Regex notes: ``.`` matches any character, ``*`` repeats it, and the
    trailing ``?`` makes the match non-greedy, so ``(.*?)</div>`` stops at the
    first closing ``</div>``. ``re.S`` lets ``.`` also match newlines, so a
    content block spread over several lines is still captured.

    Args:
        html_data: HTML source of the chapter page.
        chapter_name: Heading written above the chapter body.

    Returns:
        ``chapter_name``, a blank line, the body with ``<p>``/``</p>`` tags
        turned into newlines, and a trailing blank line.

    Raises:
        ValueError: If the page has no ``<div class="content" ...>`` element.
    """
    bodies = re.findall('<div class="content" .*?>(.*?)</div>', html_data, re.S)
    if not bodies:
        raise ValueError('no <div class="content" ...> element found in the page')
    paragraphs = bodies[0].replace('<p>', '\n').replace('</p>', '\n')
    return chapter_name + '\n\n' + paragraphs + '\n\n'


def main():
    """Download one chapter, print it, and append it to ``1.txt``."""
    # The host part is masked in this listing; fill in the real site address
    # before running.
    url = '****/chapter/1215341/68311496.html'

    # Send the request. A timeout keeps the script from hanging forever, and
    # raise_for_status() surfaces HTTP errors instead of parsing an error page.
    response = requests.get(url=url, headers=headers, cookies=cookies, timeout=10)
    response.raise_for_status()

    # Get the data.
    html_data = response.text

    # Parse the data; fall back to the chapter id taken from the URL when the
    # page offers no usable title.
    chapter_id = url.rsplit('/', 1)[-1].split('.')[0]
    chapter_name = parse_chapter_title(html_data, fallback=chapter_id)
    text = parse_chapter_text(html_data, chapter_name)
    print(text)

    # Save the data (append mode, so repeated runs accumulate chapters).
    with open('1.txt', mode='a', encoding='utf-8') as f:
        f.write(text)


if __name__ == '__main__':
    main()
最后感谢你观看我的文章呐~本次航班到这里就结束啦 🛬
希望本篇文章对你有所帮助 🎉,能让你学到一点知识~
躲起来的星星🍥也在努力发光,你也要努力加油(让我们一起努力叭)。