requests 高级
1. 会话维持
cookies字段形式
import requests
url = 'https://fanyi.baidu.com/v2transapi?from=zh&to=en'
headers = {
'Acs-Token': '1687354513445_1687354530324_U0QUCd0KA/F7BYp74tXcgnoFsNtOOxo4iufv+Hk5xXqn2+frnr0XUBVQuvTA3dfcUNYPfwpE/Y/JKtFNRsVrPchR4jO1sLxlmyw0hvh3usx51exIBKNRgH4NQXBDqAt3YJadXkNDjTR67nCTZiw+RJk7dF5HUYF5tJQ2b6P7MOd74rkMTn+xiwSraonXITV1rfLX6Pljrf7BCAACg8KuPEJplI1HlqnRHpoq54OKlcGiWXm2ZWfAcq4EVmqb1nVSge61u6U85j/n7R3JJ4LA96Vw0kcKtFi5X8GAw2SHCZ1fAZREBFeYdhG6fXVEZP+e6mkHJn/yUmb3IUb+GxtEhS1alaQMFv9QQZSBx6tXbfW6ncLHgcZfcDTqoKWSe3tdX39s1qnOEoWGvwLFFe/XMszJzUdMuOhndbQPdjkofy58aIlMTJErOTeELJ+21UOigR2VuwxiD/k9oI7vmMH0UUYzjVqojZcGNU2GrWMcfto=',
'Host': 'fanyi.baidu.com',
'Origin': 'https://fanyi.baidu.com',
'Referer': 'https://fanyi.baidu.com/?aldtype=16047',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36'
}
data = {
'from': 'zh',
'to': 'en',
'query': '你好',
'transtype': 'realtime',
'simple_means_flag': '3',
'sign': '232427.485594',
'token': '351b986af9e8a703056ff2f022cdf830',
'domain': 'common',
'ts': '1687354530307',
}
cookies = {
'BAIDUID': 'A8D9EA340531252BDEF2C13A73AFA5E7:FG=1',
'BAIDUID_BFESS': 'A8D9EA340531252BDEF2C13A73AFA5E7:FG=1',
'ZFY': 'CMMricp5SfogOfi1RswFaP4NBZN6t5zy:Axurblw8al4:C',
'BIDUPSID': 'A8D9EA340531252B16551CBD43A8D395',
'PSTM': '1681976911',
'APPGUIDE_10_0_2': '1',
'REALTIME_TRANS_SWITCH': '1',
'FANYI_WORD_SWITCH': '1',
'HISTORY_SWITCH': '1',
'SOUND_SPD_SWITCH': '1',
'SOUND_PREFER_SWITCH': '1',
'H_WISE_SIDS': '131862_114552_216844_213346_214803_219942_110085_243887_244712_249892_256348_256447_256739_254317_257586_257996_258372_258375_230288_259102_259287_258772_234207_234295_253022_260335_260806_259299_253631_261575_261718_261459_261983_259782_260440_261793_259629_236312_262490_262452_261869_262607_262677_262597_262604_249411_259519_259948_262743_262746_262913_263190_256998_263221_263306_263279_243615_263343_261683_263434_254299_261411_263584_257289_262439_262533_263644_262408_262910_257169_262289_263906_263363_256419_264175_264089_264228_257442_256225_262260_255224_264018_264368_259558_256083_264383_264423_264452_264285_256152_264626_264246_258698_264749_261934_264820_264136_261035_261663',
'H_PS_PSSID': '38516_36550_38686_38860_38793_38841_38581_38802_38828_38840_38640_26350',
'BDORZ': 'B490B5EBF6F3CD402E515D22BCDA1598',
'delPer': '0',
'PSINO': '6',
'BA_HECTOR': 'ah2g2ka48k0g8h2h00a4842h1i95r4t1p',
'BCLID': '8504214758825411246',
'BCLID_BFESS': '8504214758825411246',
'BDSFRCVID': 'TO8OJexroG0ZmSbfuwStIGta5LweG7bTDYrEOwXPsp3LGJLVFakFEG0Pts1-dEu-S2OOogKKLmOTHpKF_2uxOjjg8UtVJeC6EG0Ptf8g0M5',
'BDSFRCVID_BFESS': 'TO8OJexroG0ZmSbfuwStIGta5LweG7bTDYrEOwXPsp3LGJLVFakFEG0Pts1-dEu-S2OOogKKLmOTHpKF_2uxOjjg8UtVJeC6EG0Ptf8g0M5',
'H_BDCLCKID_SF': 'tRAOoC_-tDvDqTrP-trf5DCShUFsLU4OB2Q-XPoO3KJADfOPbRob0n0PQpOKtx7f5mkf3fbgy4op8P3y0bb2DUA1y4vp0toW3eTxoUJ2-KDVeh5Gqq-KXU4ebPRiWPr9QgbjahQ7tt5W8ncFbT7l5hKpbt-q0x-jLTnhVn0MBCK0HPonHjDMDTJy3j',
'H_BDCLCKID_SF_BFESS': 'tRAOoC_-tDvDqTrP-trf5DCShUFsLU4OB2Q-XPoO3KJADfOPbRob0n0PQpOKtx7f5mkf3fbgy4op8P3y0bb2DUA1y4vp0toW3eTxoUJ2-KDVeh5Gqq-KXU4ebPRiWPr9QgbjahQ7tt5W8ncFbT7l5hKpbt-q0x-jLTnhVn0MBCK0HPonHjDMDTJy3j',
'Hm_lvt_64ecd82404c51e03dc91cb9e8c025574': '1687354513',
'Hm_lpvt_64ecd82404c51e03dc91cb9e8c025574': '1687354513',
'ab_sr': '1.0.1_NDJjZTI3ZGQwZjJhN2I2YThjMWMxNDgxNWJhOGM5YTEwMjYwYWM0NDU3Mjk0Y2MxZTAyNWE0MzIyNDlhYjJmYjQ5NDUxNzEzOTI4YmIyZjUyNDFiNjFkM2Q0ZTYyMjZjMGU1ZTU3MDFiMTNhMWU5NTY2NDdlYWExZWEyZDZiMWNkNGVjMTExNTQyM2MyNzYxYThiNzAzYmUxNTAxZGI2NA==',
}
# Replay the captured translate request. The browser cookies are handed over
# through the dedicated ``cookies=`` argument rather than a raw Cookie header.
response = requests.post(url, headers=headers, cookies=cookies, data=data)
json_data = response.json()  # body is parsed as JSON
print(json_data)
"""
cookies怎么来的?
服务器生成: 请求与响应间生成cookies的片段信息 Set-cookie
浏览器生成: 一般在我们模拟请求的时候服务器不会校验
js生成: JavaScript, 逆向分析
代码怎么维持用户的cookies状态?
"""
案例 - 某青网
cookies 保证同一个用户
"""
时间戳: 从格林威治时间1970年1月1日0时0分0秒开始, 到目前为止所经过的时间
秒级时间戳: 10位数字
毫秒级时间戳: 13位数字
微秒级时间戳: 16位数字
"""
import time
import requests
def get_time():
    """Return the current Unix time in milliseconds as a string.

    The value is ``time.time()`` multiplied by 1000 and truncated to an
    integer (a 13-digit number at present).  The timestamp is also echoed
    to stdout.

    Returns:
        str: Millisecond timestamp, e.g. ``'1687354530307'``.
    """
    now_time = str(int(time.time() * 1000))
    print('当前时间戳为:', now_time)
    return now_time
# Cookies must be a dict mapping name -> value.  A comma between the two
# strings would build a set instead, which requests cannot send as cookies.
# NOTE(review): the cookie name/value are sample data - confirm against the
# real site before relying on them.
cookies = {'seesion': 'vnrasebgvi'}

# --- Request the captcha image and save it to disk ---
img_time = get_time()
img_url = 'http://118.126.88.143:5000/login/captcha?image_code=' + img_time
print('图片地址:', img_url)
# The same cookies go out with both requests so that, presumably, the server
# can attribute the captcha and the login to one visitor.
img_response = requests.get(url=img_url, cookies=cookies).content
with open('yzm.png', mode='wb') as f:
    f.write(img_response)

# The captcha is read by a human from yzm.png and typed in.
img_code = input('请输入验证码:')
print('您输入的验证码为:', img_code)

# --- Build and send the login request ---
login_url = 'http://118.126.88.143:5000/api/private/v1/login'
json_data = {
    # Reuse the timestamp the captcha was requested with; a fresh timestamp
    # would no longer match the image that was just downloaded.
    "image_code": img_time,
    "username": "admin",
    "password": "123456",
    "captcha_code": img_code,
}
login_response = requests.post(url=login_url, json=json_data, cookies=cookies)
print(login_response.json())
"""
一般情况下要使用requests模块维持用户状态
1. 需要在指定网站抓登录包, 用代码模拟登录 --> 难点
2. 使用session会话维持, 维持用户的登录状态抓取数据
"""
会话维持
"""
时间戳: 从格林威治时间1970年1月1日0时0分0秒开始, 到目前为止所经过的时间
秒级时间戳: 10位数字
毫秒级时间戳: 13位数字
微秒级时间戳: 16位数字
"""
import time
import requests
def get_time():
    """Return the current Unix time in milliseconds as a string.

    The value is ``time.time()`` multiplied by 1000 and truncated to an
    integer (a 13-digit number at present).  The timestamp is also echoed
    to stdout.

    Returns:
        str: Millisecond timestamp, e.g. ``'1687354530307'``.
    """
    now_time = str(int(time.time() * 1000))
    print('当前时间戳为:', now_time)
    return now_time
# Cookie passed explicitly on the captcha request only.
cookies = {'seesion': 'vnrasebgvi'}
# One Session object is used for both requests, so cookies the server sets
# on the captcha response are replayed automatically on the login request.
session = requests.Session()

# --- Request the captcha image and save it to disk ---
img_time = get_time()
img_url = 'http://118.126.88.143:5000/login/captcha?image_code=' + img_time
print('图片地址:', img_url)
img_response = session.get(url=img_url, cookies=cookies).content
with open('yzm.png', mode='wb') as f:
    f.write(img_response)

# The captcha is read by a human from yzm.png and typed in.
img_code = input('请输入验证码:')
print('您输入的验证码为:', img_code)

# --- Build and send the login request ---
login_url = 'http://118.126.88.143:5000/api/private/v1/login'
json_data = {
    # Same timestamp that was used to request the captcha image.
    "image_code": img_time,
    "username": "admin",
    "password": "123456",
    "captcha_code": img_code,
}
login_response = session.post(url=login_url, json=json_data)
# Cookies set by the login response itself, then the JSON body.
print(login_response.cookies.get_dict())
print(login_response.json())
案例 - 太平洋亲子网
无会话维持
"""
官网地址: https://www.pcbaby.com.cn/
个人中心页面地址: https://my.pcbaby.com.cn/user/adminIndex.jsp
登录页面地址: http://my.pcbaby.com.cn/login.jsp?return=http%3A%2F%2Fmy.pcbaby.com.cn%2Fuser%2FadminIndex.jsp
账号: mb51222353
密码: 123456..
"""
import requests
# Request the personal-centre page directly, without logging in first.
my_home_url = 'https://my.pcbaby.com.cn/user/adminIndex.jsp'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36',
}
response = requests.get(url=my_home_url, headers=headers)
print(response.text)
print(response.status_code)
# Keep a copy of the returned page for inspection.
# NOTE(review): the file is written as gb2312; characters outside that
# charset in response.text would raise UnicodeEncodeError - confirm the
# page encoding.
with open('my_home.html', mode='w', encoding='gb2312') as f:
    f.write(response.text)
"""
当我们没有登录的时候, 请求个人中心页面的数据会自动的重定向
"""
有会话维持
"""
官网地址: https://www.pcbaby.com.cn/
个人中心页面地址: https://my.pcbaby.com.cn/user/adminIndex.jsp
登录页面地址: http://my.pcbaby.com.cn/login.jsp?return=http%3A%2F%2Fmy.pcbaby.com.cn%2Fuser%2FadminIndex.jsp
账号: mb51222353
密码: 123456..
"""
import requests
# A single Session carries the login cookies over to later requests.
session = requests.Session()

# Headers sent with the login POST (Host/Origin/Referer describe the
# passport login endpoint and the page the form is submitted from).
headers = {
    'Host': 'passport3.pcbaby.com.cn',
    'Origin': 'http://my.pcbaby.com.cn',
    'Referer': 'http://my.pcbaby.com.cn/login.jsp?return=http%3A%2F%2Fmy.pcbaby.com.cn%2Fuser%2FadminIndex.jsp',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36',
}

# --- Simulated login ---
login_url = 'http://passport3.pcbaby.com.cn/passport3/passport/login_ajax_do_new.jsp?req_enc=UTF-8'
data = {
    'return': 'https://my.pcbaby.com.cn/user/adminIndex.jsp',
    'bindUrl': 'https://my.pcbaby.com.cn/passport/bindMobile.jsp?return=https://my.pcbaby.com.cn/user/adminIndex.jsp',
    'username': 'mb51222353',
    'password': '123456..',
    'auto_login': '30',
    'checkbox': 'on',
}
login_response = session.post(login_url, headers=headers, data=data)
print(login_response.json())
print(login_response.status_code)
"""
如果直接用requests请求, 那么上下的这两次请求是没有半毛钱关系的 √
需要用会话维持
"""
"""请求个人中心页面"""
# Fetch the personal-centre page through the same session that performed the
# login, so the cookies obtained there are sent along.
my_home_url = 'https://my.pcbaby.com.cn/user/adminIndex.jsp'
response = session.get(url=my_home_url)
print(response.text)
print(response.status_code)
# Keep a copy of the returned page for inspection.
with open('my_home_2.html', mode='w', encoding='gb2312') as f:
    f.write(response.text)
"""
当我们没有登录的时候, 请求个人中心页面的数据会自动的重定向
"""
2. 异常处理
乱码错误
import requests
# Fetch the page, then set the charset explicitly before reading ``.text``
# so the body is decoded as gb2312 rather than with the guessed encoding.
response = requests.get(url='http://www.pcbaby.com.cn/')
response.encoding = 'gb2312'
html_str = response.text
print(html_str)
请求头参数错误
import requests
# NOTE(review): the User-Agent value starts with a space.  Given the section
# this snippet sits in (header parameter errors), that looks like the
# deliberate fault being demonstrated - confirm before "fixing" it.
headers = {
    'User-Agent': ' Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36(KHTML, like Gecko) Chrome/84.0.4128.3 Safari/537.36',
}
response = requests.get(
    'http://www.shuquge.com/txt/8659/index.html',
    headers=headers,
)
# Decode with the encoding detected from the body itself.
response.encoding = response.apparent_encoding
html = response.text
print(html)
"""
请求头字段中, 空格不能多不能少
"""
请求不到数据
import requests
# Replay of a captured Baidu translate request where the cookies are sent as
# a raw ``Cookie`` header instead of through the ``cookies=`` argument.
url = 'https://fanyi.baidu.com/v2transapi?from=zh&to=en'
headers = {
    'Acs-Token': '1687354513445_1687354530324_U0QUCd0KA/F7BYp74tXcgnoFsNtOOxo4iufv+Hk5xXqn2+frnr0XUBVQuvTA3dfcUNYPfwpE/Y/JKtFNRsVrPchR4jO1sLxlmyw0hvh3usx51exIBKNRgH4NQXBDqAt3YJadXkNDjTR67nCTZiw+RJk7dF5HUYF5tJQ2b6P7MOd74rkMTn+xiwSraonXITV1rfLX6Pljrf7BCAACg8KuPEJplI1HlqnRHpoq54OKlcGiWXm2ZWfAcq4EVmqb1nVSge61u6U85j/n7R3JJ4LA96Vw0kcKtFi5X8GAw2SHCZ1fAZREBFeYdhG6fXVEZP+e6mkHJn/yUmb3IUb+GxtEhS1alaQMFv9QQZSBx6tXbfW6ncLHgcZfcDTqoKWSe3tdX39s1qnOEoWGvwLFFe/XMszJzUdMuOhndbQPdjkofy58aIlMTJErOTeELJ+21UOigR2VuwxiD/k9oI7vmMH0UUYzjVqojZcGNU2GrWMcfto=',
    # The header value must be a single line: the two adjacent literals below
    # are concatenated by Python, so no line break ends up inside the string.
    'Cookie': (
        "BIDUPSID=A8D9EA340531252B16551CBD43A8D395; PSTM=1681976911; BAIDUID=A8D9EA340531252BDEF2C13A73AFA5E7:FG=1; APPGUIDE_10_0_2=1; REALTIME_TRANS_SWITCH=1; FANYI_WORD_SWITCH=1; HISTORY_SWITCH=1; SOUND_SPD_SWITCH=1; SOUND_PREFER_SWITCH=1; H_WISE_SIDS=131862_114552_216844_213346_214803_219942_110085_243887_244712_249892_256348_256447_256739_254317_257586_257996_258372_258375_230288_259102_259287_258772_234207_234295_253022_260335_260806_259299_253631_261575_261718_261459_261983_259782_260440_261793_259629_236312_262490_262452_261869_262607_262677_262597_262604_249411_259519_259948_262743_262746_262913_263190_256998_263221_263306_263279_243615_263343_261683_263434_254299_261411_263584_257289_262439_262533_263644_262408_262910_257169_262289_263906_263363_256419_264175_264089_264228_257442_256225_262260_255224_264018_264368_259558_256083_264383_264423_264452_264285_256152_264626_264246_258698_264749_261934_264820_264136_261035_261663; H_PS_PSSID=38516_36550_38686_38860_38793_38841_38581_38802_38828_38840_38640_26350; BDORZ=B490B5EBF6F3CD402E515D22BCDA1598; BAIDUID_BFESS=A8D9EA340531252BDEF2C13A73AFA5E7:FG=1; delPer=0; PSINO=6; BA_HECTOR=ah2g2ka48k0g8h2h00a4842h1i95r4t1p; ZFY=CMMricp5SfogOfi1RswFaP4NBZN6t5zy:Axurblw8al4:C; BCLID=8504214758825411246; BCLID_BFESS=8504214758825411246; BDSFRCVID=TO8OJexroG0ZmSbfuwStIGta5LweG7bTDYrEOwXPsp3LGJLVFakFEG0Pts1-dEu-S2OOogKKLmOTHpKF_2uxOjjg8UtVJeC6EG0Ptf8g0M5; BDSFRCVID_BFESS=TO8OJexroG0ZmSbfuwStIGta5LweG7bTDYrEOwXPsp3LGJLVFakFEG0Pts1-dEu-S2OOogKKLmOTHpKF_2uxOjjg8UtVJeC6EG0Ptf8g0M5; H_BDCLCKID_SF=tRAOoC_-tDvDqTrP-trf5DCShUFsLU4OB2Q-XPoO3KJADfOPbRob0n0PQpOKtx7f5mkf3fbgy4op8P3y0bb2DUA1y4vp0toW3eTxoUJ2-KDVeh5Gqq-KXU4ebPRiWPr9QgbjahQ7tt5W8ncFbT7l5hKpbt-q0x-jLTnhVn0MBCK0HPonHjDMDTJy3j; H_BDCLCKID_SF_BFESS=tRAOoC_-tDvDqTrP-trf5DCShUFsLU4OB2Q-XPoO3KJADfOPbRob0n0PQpOKtx7f5mkf3fbgy4op8P3y0bb2DUA1y4vp0toW3eTxoUJ2-KDVeh5Gqq-KXU4ebPRiWPr9QgbjahQ7tt5W8ncFbT7l5hKpbt-q0x-jLTnhVn0MBCK0HPonHjDMDTJy3j; "
        "Hm_lvt_64ecd82404c51e03dc91cb9e8c025574=1687354513; Hm_lpvt_64ecd82404c51e03dc91cb9e8c025574=1687354513; ab_sr=1.0.1_NDJjZTI3ZGQwZjJhN2I2YThjMWMxNDgxNWJhOGM5YTEwMjYwYWM0NDU3Mjk0Y2MxZTAyNWE0MzIyNDlhYjJmYjQ5NDUxNzEzOTI4YmIyZjUyNDFiNjFkM2Q0ZTYyMjZjMGU1ZTU3MDFiMTNhMWU5NTY2NDdlYWExZWEyZDZiMWNkNGVjMTExNTQyM2MyNzYxYThiNzAzYmUxNTAxZGI2NA=="
    ),
    'Host': 'fanyi.baidu.com',
    'Origin': 'https://fanyi.baidu.com',
    'Referer': 'https://fanyi.baidu.com/?aldtype=16047',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36',
}
# Form fields captured together with the headers above.
# NOTE(review): sign/token/ts look request-specific - presumably they expire,
# which would explain an empty or error response; confirm.
data = {
    'from': 'zh',
    'to': 'en',
    'query': '你好',
    'transtype': 'realtime',
    'simple_means_flag': '3',
    'sign': '232427.485594',
    'token': '351b986af9e8a703056ff2f022cdf830',
    'domain': 'common',
    'ts': '1687354530307',
}
response = requests.post(url=url, data=data, headers=headers)
json_data = response.json()
print(json_data)
"""
当我们请求不到数据的时候, 可通过以下方式考虑
是不是请求头反爬了?
params 参数有没有加密
data参数 有没有加密
是不是服务器问题
"""
目标计算机积极拒绝
import requests
# Ask a service on localhost port 5010 for a proxy entry.  When nothing is
# listening there, requests raises requests.exceptions.ConnectionError.
proxy_response = requests.get(url='http://127.0.0.1:5010/get')
proxy = proxy_response.json()
print(proxy)
"""
如果目标计算机积极拒绝 --> requests.exceptions.ConnectionError:
此问题是你在服务器没有权限
或者服务器没有服务程序
"""
连接超时
import requests
# The timeout is 0.0001 s - presumably chosen so small on purpose so that the
# request times out and the resulting exception can be observed.
proxy_response = requests.get(url='http://134.175.188.27:5010/get', timeout=0.0001)
proxy = proxy_response.json()
print(proxy)
异常重试
import requests
# First attempt; on a request or JSON-decoding failure the call is repeated
# exactly once.  Only those failures are caught, so Ctrl-C (KeyboardInterrupt)
# and programming errors are no longer swallowed.
try:
    proxy_response = requests.get('http://134.175.188.27:5010/get', timeout=0.0001)
    proxy = proxy_response.json()
    print(proxy)
except (requests.exceptions.RequestException, ValueError):
    # ValueError covers JSON decoding errors on requests versions where the
    # decode error is not a RequestException subclass.
    # A failure of this second attempt is not handled and propagates.
    proxy_response = requests.get('http://134.175.188.27:5010/get', timeout=0.0001)
    proxy = proxy_response.json()
    print(proxy)
案例 - 异常重试
import parsel
import requests
def get_one_chapter(url, times=3):
    """Download one chapter page and save its text, retrying on failure.

    Reads two module-level names: ``headers`` (sent with the request) and
    ``count`` (prefixed to the output file name).

    Args:
        url: Address of the chapter page.
        times: Number of additional attempts allowed after a failed one, so
            at most ``times + 1`` requests are made.

    Returns:
        None. The chapter is written to ``小说/<count><title>.txt``; failures
        are printed, and after the last attempt the function gives up quietly.
    """
    import os  # local import: this snippet's import block only has parsel/requests

    retries_left = times
    while True:
        try:
            response = requests.get(url=url, headers=headers, timeout=0.1)
            selector = parsel.Selector(response.text)
            # [0] raises IndexError when the heading does not match, which is
            # treated like any other failed attempt.
            title = selector.css('h1.wap_none::text').re(' (.*)')[0]
            paragraphs = selector.css('#chaptercontent::text').getall()
            # Drop the full-width double-space paragraph indent.
            contend = '\n'.join(p.replace('\u3000\u3000', '') for p in paragraphs)

            # Make sure the output directory exists, and build the path with
            # the platform's separator instead of a hard-coded backslash.
            os.makedirs('小说', exist_ok=True)
            file_path = os.path.join('小说', str(count) + title + '.txt')
            with open(file_path, mode='w', encoding='utf-8') as f:
                f.write(contend)
            print('保存完成:', file_path)
            return
        except (requests.exceptions.RequestException, IndexError, OSError) as e:
            print(e)
            print('*' * 100)  # visual separator after a failed attempt
            if retries_left < 1:
                return
            retries_left -= 1
if __name__ == '__main__':
    # Running chapter number; get_one_chapter() reads it as a global and uses
    # it as the file-name prefix.
    count = 1
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36',
    }
    # Table-of-contents page listing the chapter links.
    url = 'https://www.bqg70.com/book/1031/'
    response = requests.get(url=url, headers=headers)
    selector = parsel.Selector(response.text)
    dds = selector.css('.listmain>dl dd')
    for dd in dds:
        title = dd.css('a::text').get()
        link_url = dd.css('a::attr(href)').get()
        # .get() returns None when the <dd> has no matching <a>; skip such
        # rows instead of failing on the string operations below.
        if title is None or link_url is None:
            continue
        # The "expand all chapters" row is a toggle, not a chapter.
        if '展开全部章节' in title:
            continue
        all_url = 'https://www.bqg70.com' + link_url
        get_one_chapter(all_url)
        count += 1
案例 - 爬取我的钢铁网
import csv
import requests
url = 'https://index.mysteel.com/api/pricetrend/getChartMultiCity.htm'
# Values are written decoded: requests percent-encodes ``params`` itself, so
# text that is already encoded would be encoded a second time (% -> %25).
params = {
    'catalog': '角钢_:_角钢',
    'city': '长沙',
    'spec': 'Q235B 50*50*5_:_Q235B_50*50*5',
    'startTime': '2023-03-01',
    'endTime': '2023-04-01',
    'callback': 'json',
    'v': '1688557188999',
}
response = requests.get(url=url, params=params)
json_data = response.json()
print(json_data)

# Only the first series in the response is exported.
city_name = json_data['data'][0]['lineName']
print(city_name)
with open('data.csv', mode='a', encoding='utf-8', newline='') as f:
    csv_write = csv.DictWriter(f, fieldnames=['city_name', 'date', 'value'])
    # The file is opened in append mode: write the header only when the file
    # is still empty so repeated runs do not insert extra header rows.
    if f.tell() == 0:
        csv_write.writeheader()
    for i in json_data['data'][0]['dateValueMap']:
        # Each entry is a dict; add the city so it matches the CSV columns.
        i['city_name'] = city_name
        csv_write.writerow(i)

# Show what the percent-encoded fragments seen in the captured URL decode to.
print(requests.utils.unquote('%E9%95%BF%E6%B2%99'))
print(requests.utils.unquote('%E8%A7%92%E9%92%A2_'))
print(requests.utils.unquote('%20'))