根据scrapy运行流程中所在位置不同分为:
但在scrapy默认的情况下 两种中间件都在middlewares.py一个文件中
爬虫中间件使用方法和下载中间件相同,且功能重复,通常使用下载中间件
接下来我们对腾讯招聘爬虫进行修改完善,通过下载中间件来学习如何使用中间件
编写一个Downloader Middlewares和我们编写一个pipeline一样,定义一个类,然后在setting中开启
Downloader Middlewares默认的方法:
import random
from Tencent.settings import USER_AGENTS_LIST # 注意导入路径,请忽视pycharm的错误提示
class UserAgentMiddleware(object):
def process_request(self, request, spider):
user_agent = random.choice(USER_AGENTS_LIST)
request.headers['User-Agent'] = user_agent
# 不写return
class CheckUA:
def process_response(self,request,response,spider):
print(request.headers['User-Agent'])
return response # 不能少!
DOWNLOADER_MIDDLEWARES = {
'Tencent.middlewares.UserAgentMiddleware': 543, # 543是权重值
'Tencent.middlewares.CheckUA': 600, # 先执行543权重的中间件,再执行600的中间件
}
USER_AGENTS_LIST = [
"Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
"Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
"Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)",
"Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)",
"Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
"Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
"Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
"Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5"
]
proxy
字段request.meta['proxy']
免费代理ip:
class ProxyMiddleware(object):
def process_request(self,request,spider):
# proxies可以在settings.py中,也可以来源于代理ip的webapi
# proxy = random.choice(proxies)
# 免费的会失效,报 111 connection refused 信息!重找一个代理ip再试
proxy = 'https://1.71.188.37:3128'
request.meta['proxy'] = proxy
return None # 可以不写return
收费代理ip:
# 人民币玩家的代码(使用abuyun提供的代理ip)
import base64
# 代理隧道验证信息 这个是在那个网站上申请的
proxyServer = 'http://proxy.abuyun.com:9010' # 收费的代理ip服务器地址,这里是abuyun
proxyUser = 用户名
proxyPass = 密码
proxyAuth = "Basic " + base64.b64encode(proxyUser + ":" + proxyPass)
class ProxyMiddleware(object):
def process_request(self, request, spider):
# 设置代理
request.meta["proxy"] = proxyServer
# 设置认证
request.headers["Proxy-Authorization"] = proxyAuth
在使用了代理ip的情况下可以在下载中间件的process_response()方法中处理代理ip的使用情况,如果该代理ip不能使用可以替换其他代理ip
class ProxyMiddleware(object):
......
def process_response(self, request, response, spider):
if response.status != '200':
request.dont_filter = True # 重新发送的请求对象能够再次进入队列
return requst
#设置一个无头无可视化界面的浏览器
chrome_options = Options()
# 无可视化界面
chrome_options.add_argument("--headless")
chrome_options.add_argument("--di sable-gpu")
# 规避监测
chrome_options.add_experimental_option('excludeSwitches', ['enable-automation'])
self.browse = webdriver.Chrome(options=chrome_options)
# 开始拦截篡改下载中间件
class NewsSpiderDownloaderMiddleware:
# Not all methods need to be defined. If a method is not defined,
# scrapy acts as if the downloader middleware does not modify the
# passed objects.
@classmethod
def from_crawler(cls, crawler):
# This method is used by Scrapy to create your spiders.
s = cls()
crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
return s
def process_request(self, request, spider):
# Called for each request that goes through the downloader
# middleware.
# Must either:
# - return None: continue processing this request
# - or return a Response object
# - or return a Request object
# - or raise IgnoreRequest: process_exception() methods of
# installed downloader middleware will be called
return None
# 该方法拦截四大板块对应的响应对象 且篡改
# 注意settings文件中一定要启动对应权限
def process_response(self, request, response, spider):
# 对需要篡改部分做判断 否则会影响其他请求对应的repsonse
if request.url in spider.module_urls:
# 获取从爬虫程序中创建出的浏览器对象
browse = spider.browse
# 通过selenium向四大板块的url发起请求,获取到动态加载的数据
browse.get(request.url)
# 下拉翻页
browse.execute_script('window.scrollTo(0,document.body.scrollHeight)')
time.sleep(1.5)
# 获取源码
page_text = browse.page_source
#针对定位到的response进行篡改
# 这里的篡改指实例化一个新的响应对象(符合需求:包含动态加载的数据),替换(HtmlResponse)原来的响应对象
# 参数解释:url:响应对应的url body:响应体 requests:scrapy中的请求对象,数据都是跟着请求对象走的
new_response = HtmlResponse(url=request.url, body=page_text, encoding='utf-8', request=request)
return new_response # 篡改响应对象 不再经过download,直接将新的响应体返回给引擎
# 此时就可以回到爬虫文件继续往下写 判断如果是selenium请求过来的我们才返回处理后的new_response
else:
# 其他请求对应的响应对象
return response
# return response
def process_exception(self, request, exception, spider):
# Called when a download handler or a process_request()
# (from other downloader middleware) raises an exception.
# Must either:
# - return None: continue processing this exception
# - return a Response object: stops process_exception() chain
# - return a Request object: stops process_exception() chain
pass
def spider_opened(self, spider):
spider.logger.info('Spider opened: %s' % spider.name)
中间件的使用: