The approach implemented below is extremely simple. Readers who already know a bit about web crawlers can skip this post, because honestly it is not very useful.
Implementation Principle
When you fetch a page's source with requests, you are actually sending a request to the server to obtain the page content. Every request is recorded by the server, and the server typically uses the information carried in each request to judge the visitor's identity and behavior.
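To see what a server actually receives, you can point a bare requests call at the public echo service httpbin.org (a stand-in used here for illustration, not the actual blog platform) and inspect the headers it reports back:

import requests

# httpbin.org/headers echoes back the headers it received from the client
resp = requests.get("https://httpbin.org/headers")
# By default the User-Agent is something like "python-requests/2.x", an obvious bot signature,
# which is exactly why the script below supplies a full browser-style header set
print(resp.json())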
Using different proxies lets you simulate different visitors. The proxy server forwards your request to the target site, and since each proxy has a different IP address, geographic location, and so on, the target site cannot determine where the request really came from. In this way you can simulate different visitor profiles (IP address, location, device information), avoid being identified by the server, and push the article's view count up at the same time.
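A minimal sketch of routing a request through a proxy and checking the exit IP, again against httpbin.org (the proxy URL below is a made-up placeholder):

import requests

# Placeholder proxy; replace with a real http://user:pass@host:port entry
proxy = "http://user:pass@1.2.3.4:8080"
proxies = {"http": proxy, "https": proxy}

# httpbin.org/ip reports the IP the request appears to come from;
# with a working proxy this prints the proxy's address rather than your own
resp = requests.get("https://httpbin.org/ip", proxies=proxies, timeout=10)
print(resp.json())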
So as long as these conditions are satisfied, the bot-detection problem can be avoided. Of course, some sites use a CAPTCHA for human verification; in that case you have to rely on dedicated tools or reverse-engineer the JavaScript to reproduce the encryption. The target site here is fairly simple, so plain requests is enough: as long as the cookies differ on every request, the traffic boost works. The complete script follows.
import requests
import random
import string
import hashlib
import secrets
import time


def generate_hex_string(length):
    # Generate random bytes (half as many bytes as hex characters requested)
    random_bytes = secrets.token_bytes(length // 2)
    # Convert the random bytes to a hexadecimal string
    hex_string = random_bytes.hex()
    return hex_string


def generate_random_number():
    # Generate a random 32-character hexadecimal value
    random_number = generate_hex_string(32)
    return random_number


def generate_random_string():
    # Three random lowercase letters
    letters = ''.join(random.choices(string.ascii_lowercase, k=3))
    # Followed by three random digits
    numbers = ''.join(random.choices(string.digits, k=3))
    # Concatenate and return the six-character result
    return letters + numbers
headers = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
    "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6",
    "Cache-Control": "max-age=0",
    "Connection": "keep-alive",
    "Sec-Fetch-Dest": "document",
    "Sec-Fetch-Mode": "navigate",
    "Sec-Fetch-Site": "none",
    "Sec-Fetch-User": "?1",
    "Upgrade-Insecure-Requests": "1",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36 Edg/120.0.0.0",
    "sec-ch-ua": "\"Not_A Brand\";v=\"8\", \"Chromium\";v=\"120\", \"Microsoft Edge\";v=\"120\"",
    "sec-ch-ua-mobile": "?0",
    "sec-ch-ua-platform": "\"Windows\""
}
# ?spm=1001.2014.3001.5501
# Articles to send traffic to; index below cycles over this list, so it expects two entries
url = ["URL of the first article", "URL of the second article"]
params = {
    "spm": "spm value carried by the share link"
}
# Proxy list exported from Webshare, one proxy per line
with open(r"D:\桌面\pythoncode\Webshare 10 proxies (1).txt", 'r', encoding='utf-8') as file:
    datalist = file.readlines()
count = 0  # successful requests
index = 0  # which article to hit next
erro = 0   # failed proxy attempts
while True:
    for i in datalist:
        cookies = {
            "uuid_tt_dd": "10_37471902900-1694849986886-528459",
            "UN": "screamn",
            "BT": "1694850004848",
            "p_uid": "U010000",
            "Hm_up_6bcd52f51e9b3dce32bec4a3997715ac": "%7B%22islogin%22%3A%7B%22value%22%3A%221%22%2C%22scope%22%3A1%7D%2C%22isonline%22%3A%7B%22value%22%3A%220%22%2C%22scope%22%3A1%7D%2C%22isvip%22%3A%7B%22value%22%3A%220%22%2C%22scope%22%3A1%7D%2C%22uid_%22%3A%7B%22value%22%3A%22screamn%22%2C%22scope%22%3A1%7D%7D",
            "log_Id_click": "66",
            "dc_session_id": "10_1704072680215.626007",
            "c_pref": "default",
            "c_ref": "default",
            "c_first_ref": "default",
            "c_first_page": "https%3A//blog.csdn.net/screamn/article/details/135310244%3Fspm%3D1001.2014.3001.5501",
            "c_segment": "11",
            "hide_login": "1",
            "Hm_lvt_6bcd52f51e9b3dce32bec4a3997715ac": "1704072684",
            "SESSION": "49e537ff-4030-400c-95ab-8052d7d016d9",
            "loginbox_strategy": "%7B%22taskId%22%3A317%2C%22abCheckTime%22%3A1704072684948%2C%22version%22%3A%22ExpA%22%2C%22nickName%22%3A%22screamn%22%7D",
            "c_dsid": "11_1704072695124.471246",
            "c_page_id": "default",
            "log_Id_pv": "71",
            "creative_btn_mp": "2",
            "Hm_lpvt_6bcd52f51e9b3dce32bec4a3997715ac": "1704072696",
            "log_Id_view": "543",
            # Randomize these two fields so every request looks like a different visitor
            "dc_sid": generate_hex_string(32),
            "dc_tos": generate_random_string()
        }
        # Strip the trailing newline from the proxy line
        i = i.strip()
        prox = {
            "http": i,
            "https": i
        }
        try:
            response = requests.get(url[index], headers=headers, cookies=cookies, params=params, proxies=prox)
            print(response)
            count += 1  # only count requests that actually went through
        except Exception:
            print(f"Proxy that caused a problem: {i}")
            erro += 1
        index += 1
        if index == 2:
            index = 0
        if erro == 1000:
            break
        time.sleep(5)
    # Too many proxy failures: leave the outer loop as well
    if erro >= 1000:
        break
print(f"Added {count} page views in this run")