# Number of times Scrapy's RetryMiddleware re-attempts a failed request
# (the Scrapy default is also 2, so this makes the setting explicit).
RETRY_TIMES = 2
在 `middlewares.py` 文件中(如果该文件不存在,则需要创建它,并在 `settings.py` 中启用),添加一个新的中间件类:
import logging

from scrapy import signals
class ExceptionLoggingMiddleware:
    """Downloader middleware that records error responses and download exceptions.

    Non-200 responses and exceptions raised during download are written
    both to the standard logger and appended to a local ``error_report.txt``
    file, while leaving normal request/response processing untouched.
    """

    def process_response(self, request, response, spider):
        # Treat any non-200 status as an error worth recording.
        if response.status != 200:  # Or use any other criteria for error
            logging.error(f"Error! Response Status: {response.status}, URL: {response.url}")
            with open("error_report.txt", "a") as file:
                file.write(f"Error! Response Status: {response.status}, URL: {response.url}\n")
        # Always return the response unchanged so later middlewares and the
        # spider still receive it.
        return response

    def process_exception(self, request, exception, spider):
        # Log the exception, then return None so downstream exception
        # handling (e.g. Scrapy's retry middleware) continues normally.
        logging.error(f"Exception caught: {exception}, URL: {request.url}")
        with open("error_report.txt", "a") as file:
            file.write(f"Exception caught: {exception}, URL: {request.url}\n")
        return None
# Register the custom middleware in settings.py; the integer is its
# priority, which determines where it runs in the downloader chain.
DOWNLOADER_MIDDLEWARES = {
    'myproject.middlewares.ExceptionLoggingMiddleware': 540,
}
## DOWNLOADER_MIDDLEWARES 与 SPIDER_MIDDLEWARES 的区别
# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
SPIDER_MIDDLEWARES = {
    "ip_pool.middlewares.IpPoolSpiderMiddleware": 543,
}
# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
    "ip_pool.middlewares.IpPoolDownloaderMiddleware": 543,
    "ip_pool.middlewares.ExceptionLoggingMiddleware": 544,
}
下载器中间件(Downloader Middleware)的重要用处包括:
- 设置代理服务器
- 修改User-Agent或其他请求头
- 自动重定向处理
- 请求/响应重写,如填充默认表单数据
- 请求重试,捕获并处理下载过程中的异常
Spider 中间件(Spider Middleware)主要用于:
- 修改进入Spider的响应
- 修改从Spider出来的结果(Items和Requests)
- 捕获Spider处理过程中的异常
- 收集统计数据
- 扩展Spider进行请求的深度限制或宽度优先/深度优先排序