[Advanced] [Python Web Crawler] [17. Crawler Frameworks] Scrapy Advanced (with extensive example code) (worth bookmarking)

Published: January 5, 2024

1. Scrapy Advanced

Case study - qd_04_kfc
items.py
import scrapy

class Qd04KfcItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    pass
middlewares.py
from scrapy import signals

# useful for handling different item types with a single interface
from itemadapter import is_item, ItemAdapter


class Qd04KfcSpiderMiddleware:

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        return None

    def process_spider_output(self, response, result, spider):
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        pass

    def process_start_requests(self, start_requests, spider):
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info("Spider opened: %s" % spider.name)


class Qd04KfcDownloaderMiddleware:

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        return None

    def process_response(self, request, response, spider):
        return response

    def process_exception(self, request, exception, spider):
        pass

    def spider_opened(self, spider):
        spider.logger.info("Spider opened: %s" % spider.name)
pipelines.py
from itemadapter import ItemAdapter

class Qd04KfcPipeline:
    def process_item(self, item, spider):
        return item
settings.py
BOT_NAME = "qd_04_KFC"

SPIDER_MODULES = ["qd_04_KFC.spiders"]
NEWSPIDER_MODULE = "qd_04_KFC.spiders"

ROBOTSTXT_OBEY = False

REQUEST_FINGERPRINTER_IMPLEMENTATION = "2.7"
TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
FEED_EXPORT_ENCODING = "utf-8"
spiders
kfc.py
import scrapy

class KfcSpider(scrapy.Spider):
    name = "KFC"
    allowed_domains = ["kfc.com.cn"]

    # start_urls = ["https://kfc.com.cn"]
    def start_requests(self):
        # for page in range(1, 11):  # pagination method 1
        # FormRequest sends a POST request for us
        yield scrapy.FormRequest(
            # Scrapy does not build query-string parameters for you, so they stay in the URL
            url='http://www.kfc.com.cn/kfccda/ashx/GetStoreList.ashx?op=keyword',
            # POST form parameters
            formdata={
                'cname': '',
                'pid': '',
                'keyword': '北京',
                'pageIndex': '1',
                'pageSize': '10'},
            callback=self.parse,
            # per-request headers
            # headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36'}
            meta={'page': 2}
        )
    """
    meta用于两函数间的数据传递
    meta是一次性的, 每一次构建请求对象都会被创建
    如果在代码中meta需要再多个函数间传递到最后一个函数, 那么需要一级一级传递
    meta是一个字典, 可以自定义键值对
    """
    def parse(self, response):
        # print(response.json())
        json_data = response.json()

        list_data = json_data['Table1']
        for res in list_data:
            storeName = res['storeName']
            addressDetail = res['addressDetail']
            pro = res['pro']
            print(storeName, addressDetail, pro)

        print('meta passed down from the previous callback:', response.meta.get('page'))
        page = response.meta.get('page')  # 2

        if page <= 10:
            yield scrapy.FormRequest(
                url='http://www.kfc.com.cn/kfccda/ashx/GetStoreList.ashx?op=keyword',
                formdata={
                    'cname': '',
                    'pid': '',
                    'keyword': '北京',
                    'pageIndex': str(page),
                    'pageSize': '10'},
                callback=self.parse,
                meta={'page': page + 1}  # 3  # pagination method 2
            )

    def parse_2(self, response):
        pass
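The parse_2 stub above exists to make the meta note concrete: meta does not flow onward by itself. A minimal sketch (these methods would live inside KfcSpider; the detail URL is hypothetical) of handing a value through two callbacks:

def parse(self, response):
    # meta is per-request: to reach parse_2, the value must be
    # re-attached when building the next request
    yield scrapy.Request(
        url='http://www.kfc.com.cn/some-detail-page',  # hypothetical URL
        callback=self.parse_2,
        meta={'page': response.meta.get('page')},
    )

def parse_2(self, response):
    # the value arrives here only because each hop re-attached it
    print('meta after two hops:', response.meta.get('page'))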
Case study - qd_05_info
items.py
import scrapy

class Qd05InfoItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    pass
middlewares.py
from scrapy import signals

# useful for handling different item types with a single interface
from itemadapter import is_item, ItemAdapter


class Qd05InfoSpiderMiddleware:

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        return None

    def process_spider_output(self, response, result, spider):
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        pass

    def process_start_requests(self, start_requests, spider):
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info("Spider opened: %s" % spider.name)


class Qd05InfoDownloaderMiddleware:
    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        return None

    def process_response(self, request, response, spider):
        return response

    def process_exception(self, request, exception, spider):
        pass

    def spider_opened(self, spider):
        spider.logger.info("Spider opened: %s" % spider.name)
pipelines.py
from itemadapter import ItemAdapter

class Qd05InfoPipeline:
    def process_item(self, item, spider):
        return item
settings.py
BOT_NAME = "qd_05_info"

SPIDER_MODULES = ["qd_05_info.spiders"]
NEWSPIDER_MODULE = "qd_05_info.spiders"

ROBOTSTXT_OBEY = False

REQUEST_FINGERPRINTER_IMPLEMENTATION = "2.7"
TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
FEED_EXPORT_ENCODING = "utf-8"
spiders
info.py
import scrapy

class InfoSpider(scrapy.Spider):
    name = "info"
    allowed_domains = ["zfcg.sh.gov.cn"]

    # GET by default
    # start_urls = ["http://www.zfcg.sh.gov.cn/portal/category"]
    def start_requests(self):
        yield scrapy.http.JsonRequest(
            url='http://www.zfcg.sh.gov.cn/portal/category',
            # JSON request payload
            data={"pageNo": 1, "pageSize": 15, "categoryCode": "ZcyAnnouncement1", "_t": 1687780627000},
            callback=self.parse
        )

    def parse(self, response):
        print(response.json())
        pass

"""
scrapy.Request              -- get
scrapy.FormRequest          -- post --> formdata
scrapy.http.JsonRequest     -- post --> json
"""
Case study - qd_06_douban
items.py
import scrapy

class Qd06DoubanItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    pass
middlewares.py
import requests
from scrapy import signals

from itemadapter import is_item, ItemAdapter
"""
中间件文件, 处理请求与响应的

主要有爬虫中间件和下载中间件:
    两个中间件都能处理请求和响应, 但是使用场景不同
    SpiderMiddleware: 爬虫中间件
        主要作用: 过滤错误请求, 框架底层会自动帮助我们处理
    
    DownloaderMiddleware; 下载中间件
        主要作用: 处理请求 headers  cookies  proxies 
"""
from faker import Faker

""" headers中间件 """
class HeadersDownloaderMiddleware:
    def __init__(self):
        self.result = Faker()

    def process_request(self, request, spider):
        # request.headers is the request object's headers, a dict-like mapping
        request.headers.update(
            {
                'Host': 'movie.douban.com',
                # 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36'
                # the static UA above is useful when the site merely checks the headers,
                # since some servers refuse data to outdated UA strings
                'User-Agent': self.result.user_agent()  # swap in a random User-Agent
            }
        )
        return None

""" cookies中间件 """
class CookiesDownloaderMiddleware:
    # the raw Cookie header string copied from the browser
    cookie_str = 'll="118267"; bid=VrC8tT1GWz8; __yadk_uid=iHqVKZD4ZHIVREbOrlu9k4uWFSsAdZtO; _pk_id.100001.4cf6=b39d476add4f5658.1683638062.; __gads=ID=744f53c3cb2ebb52-22841ef3a4e00021:T=1683638065:RT=1687952998:S=ALNI_MZhRKuML1OBDnNRafe3qd6-ndhaiQ; __gpi=UID=00000c03bafcda5c:T=1683638065:RT=1687952998:S=ALNI_MbkLLsUm467wiS6ZZ6Mn2ohKIWBZw; _pk_ref.100001.4cf6=%5B%22%22%2C%22%22%2C1690203081%2C%22https%3A%2F%2Fwww.baidu.com%2Flink%3Furl%3Dlc41QNmlrwGQ8O8PjCJunGz_0h_G6BB1gDtLlqVFX7RwhX14GJJd57QXeYyq92TY%26wd%3D%26eqid%3D910ad14e0025a91d0000000664be73c7%22%5D; _pk_ses.100001.4cf6=1; ap_v=0,6.0; __utma=30149280.1169382564.1682168622.1688732870.1690203082.13; __utmb=30149280.0.10.1690203082; __utmc=30149280; __utmz=30149280.1690203082.13.11.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; __utma=223695111.1640817040.1683638062.1687952054.1690203082.6; __utmb=223695111.0.10.1690203082; __utmc=223695111; __utmz=223695111.1690203082.6.5.utmcsr=baidu|utmccn=(organic)|utmcmd=organic'

    def process_request(self, request, spider):
        # request.cookies is the request object's cookies: a plain dict of
        # cookie name -> value pairs, so the copied header string must be
        # split into individual pairs first (form 3 from the note below)
        request.cookies.update(
            dict(pair.split('=', 1) for pair in self.cookie_str.split('; '))
        )
"""
cookies添加形式:
1.加请求头里面
2.自己构建cookies字典, 通过cookies关键字传递
3.将cookies的每一个片段构建键值对
"""


 """ 代理ip中间件 """
class ProxiesDownloaderMiddleware:
    def __init__(self):
        """初始化的时候就获取代理数据"""
        url = 'http://zltiqu.pyhttp.taolop.com/getip?count=1&neek=13873&type=2&yys=0&port=2&sb=&mr=2&sep=0'
        proxy_json = requests.get(url=url).json()
        print('fetched proxy:', proxy_json)

        self.ip = proxy_json['data'][0]['ip']
        self.port = str(proxy_json['data'][0]['port'])

        # in the Scrapy framework a proxy must be formatted as --> https://ip:port

    def process_request(self, request, spider):
        request.meta.update(
            {'proxy': 'https://' + self.ip + ':' + self.port}
        )
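To make the three cookie forms from the note above concrete, a minimal sketch (cookie values are placeholders):

import scrapy

COOKIE_STR = 'bid=VrC8tT1GWz8; ll="118267"'  # placeholder values

# form 1: inside the request headers
req1 = scrapy.Request(
    url='https://movie.douban.com/top250',
    headers={'Cookie': COOKIE_STR},
)

# form 2: a dict passed through the cookies keyword
req2 = scrapy.Request(
    url='https://movie.douban.com/top250',
    cookies={'bid': 'VrC8tT1GWz8', 'll': '"118267"'},
)

# form 3: split a copied cookie string into individual key-value pairs
req3 = scrapy.Request(
    url='https://movie.douban.com/top250',
    cookies=dict(pair.split('=', 1) for pair in COOKIE_STR.split('; ')),
)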
pipelines.py
from itemadapter import ItemAdapter

class Qd06DoubanPipeline:
    def process_item(self, item, spider):
        return item
settings.py
BOT_NAME = "qd_06_douban"

SPIDER_MODULES = ["qd_06_douban.spiders"]
NEWSPIDER_MODULE = "qd_06_douban.spiders"

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

DOWNLOADER_MIDDLEWARES = {
    # headers middleware   (the number is the priority)
    "qd_06_douban.middlewares.HeadersDownloaderMiddleware": 543,
    # cookies middleware
    "qd_06_douban.middlewares.CookiesDownloaderMiddleware": 544,
    # proxies middleware
    "qd_06_douban.middlewares.ProxiesDownloaderMiddleware": 542,
}

REQUEST_FINGERPRINTER_IMPLEMENTATION = "2.7"
TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
FEED_EXPORT_ENCODING = "utf-8"

"""常见配置项"""
# 设置请求响应时间, 单位秒, 如果超过这个时间还没有数据返回,直接过滤
DOWNLOAD_TIMEOUT = 180  # 3mins

# 设置请求延迟, 单位秒
DOWNLOAD_DELAY = 5

# 开启异常重试
RETRY_ENABLED = True
# 开启异常重试   单位/次数
RETRY_TIMES = 2  # initial response + 2 retries = 3 requests
# 重试的状态码
RETRY_HTTP_CODES = [500, 502, 503, 504, 522, 524, 408, 429, 404]
spiders
douban.py
import scrapy

class DoubanSpider(scrapy.Spider):
    name = "douban"
    allowed_domains = ["douban.com"]
    # start_urls issues its requests directly, so headers cannot be attached to them in the spider file
    # start_urls = ["https://movie.douban.com/top250"]
    def start_requests(self):
        for page in range(0, 226, 25):
            yield scrapy.Request(
                url=f'https://movie.douban.com/top250?start={page}&filter=',
                callback=self.parse,
                # way 1 to add request headers
                # headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36'}
            )

    def parse(self, response):
        print(response.text)
        print(response.request.headers)
        pass

"""
如果请求返回的状态码是错误的状态码, 会自动被框架过滤
403
"""
"""
添加请求头的方式:
1. 直接在请求对象方法中使用headers关键字添加, 添加一个字典对象
    除了咱们自己添加请求头字段以外, 会默认加上常用的请求头
2. 在middlewares.py写中间件, 添加请求头
3. 在settings.py中设置 DEFAULT_REQUEST_HEADERS 配置信息
"""
Case study - qd_07_maoyan
items.py
import scrapy

class Qd06MaoyanItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    name = scrapy.Field()
    star = scrapy.Field()
    releasetime = scrapy.Field()
    score = scrapy.Field()
middlewares.py
from scrapy import signals

from itemadapter import is_item, ItemAdapter

class Headers:
    def process_request(self, request, spider):
        request.headers.update(
            {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36',
                # 'Cookie': 'iuuid=F5783590DF7F11ED9B25CD5E5234694FA2DAA684142046EEA9F6AB456B2AFF02; _lxsdk_cuid=1879ededf23c8-0483e83e302f2e-26031b51-1fa400-1879ededf23c8; _lxsdk=F5783590DF7F11ED9B25CD5E5234694FA2DAA684142046EEA9F6AB456B2AFF02; ci=70%2C%E9%95%BF%E6%B2%99; ci.sig=6ddYdqOybjnPiJJMwTWmS44rF4o; ci=70%2C%E9%95%BF%E6%B2%99; ci.sig=6ddYdqOybjnPiJJMwTWmS44rF4o; ci=70%2C%E9%95%BF%E6%B2%99; ci.sig=6ddYdqOybjnPiJJMwTWmS44rF4o; Hm_lvt_703e94591e87be68cc8da0da7cbd0be2=1690368528; Hm_lpvt_703e94591e87be68cc8da0da7cbd0be2=1690368528',
                'Host': 'm.maoyan.com',
            }
        )
        return None
pipelines.py
from itemadapter import ItemAdapter

class Qd06MaoyanPipeline:
    def process_item(self, item, spider):
        return item
settings.py
BOT_NAME = "qd_06_maoyan"

SPIDER_MODULES = ["qd_06_maoyan.spiders"]
NEWSPIDER_MODULE = "qd_06_maoyan.spiders"

ROBOTSTXT_OBEY = False

DOWNLOADER_MIDDLEWARES = {
    "qd_06_maoyan.middlewares.Headers": 543,
}

REQUEST_FINGERPRINTER_IMPLEMENTATION = "2.7"
TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
FEED_EXPORT_ENCODING = "utf-8"
spiders
maoyan.py
import scrapy

from ..items import Qd06MaoyanItem

class MaoyanSpider(scrapy.Spider):
    name = "maoyan"
    allowed_domains = ["maoyan.com"]
    start_urls = ["https://m.maoyan.com/asgard/board/4"]

    # def start_requests(self):

    def parse(self, response):
        # print(response.text)
        divs = response.css('.clearfix')

        for div in divs:
            name = div.css('.title::text').get()
            star = div.css('.actors::text').get()
            releasetime = div.css('.date::text').get()
            score = div.css('.number::text').get()
            yield Qd06MaoyanItem(name=name, star=star, releasetime=releasetime, score=score)
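Qd06MaoyanPipeline above only passes items through; a minimal sketch of a pipeline that actually persists them (the CsvWriterPipeline name and the maoyan.csv path are made up for illustration):

import csv

from itemadapter import ItemAdapter

class CsvWriterPipeline:
    def open_spider(self, spider):
        # called once when the spider starts: open the output file
        self.file = open('maoyan.csv', 'w', encoding='utf-8', newline='')
        self.writer = csv.DictWriter(
            self.file, fieldnames=['name', 'star', 'releasetime', 'score'])
        self.writer.writeheader()

    def process_item(self, item, spider):
        # write one row per item, then hand the item on unchanged
        self.writer.writerow(ItemAdapter(item).asdict())
        return item

    def close_spider(self, spider):
        # called once when the spider finishes: close the file
        self.file.close()

It would still need enabling in settings.py, e.g. ITEM_PIPELINES = {"qd_06_maoyan.pipelines.CsvWriterPipeline": 300}.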
Case study - qd_08_zzs
items.py
import scrapy

class Qd07ZzsItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    title = scrapy.Field()  # title
    info = scrapy.Field()  # summary
    put_time = scrapy.Field()  # publish time
    likes = scrapy.Field()  # favourites count (hearts)
    stars = scrapy.Field()  # likes count (thumbs-up)
    comments = scrapy.Field()  # comments count
middlewares.py
from scrapy import signals

from itemadapter import is_item, ItemAdapter


class Qd07ZzsSpiderMiddleware:

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        return None

    def process_spider_output(self, response, result, spider):
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        pass

    def process_start_requests(self, start_requests, spider):
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info("Spider opened: %s" % spider.name)


class Qd07ZzsDownloaderMiddleware:

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        return None

    def process_response(self, request, response, spider):
        return response

    def process_exception(self, request, exception, spider):
        pass

    def spider_opened(self, spider):
        spider.logger.info("Spider opened: %s" % spider.name)
pipelines.py
from itemadapter import ItemAdapter

class Qd07ZzsPipeline:
    def process_item(self, item, spider):
        return item
settings.py
BOT_NAME = "qd_07_zzs"

SPIDER_MODULES = ["qd_07_zzs.spiders"]
NEWSPIDER_MODULE = "qd_07_zzs.spiders"

# Crawl responsibly by identifying yourself (and your website) on the user-agent
# USER_AGENT = "qd_07_zzs (+http://www.yourdomain.com)"

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

REQUEST_FINGERPRINTER_IMPLEMENTATION = "2.7"
TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
FEED_EXPORT_ENCODING = "utf-8"

# Delay between requests, in seconds
DOWNLOAD_DELAY = 3
spiders
zzs.py
import scrapy

from ..items import Qd07ZzsItem

class ZzsSpider(scrapy.Spider):
    name = "zzs"
    allowed_domains = ["zhangzs.com"]

    # start_urls cannot send POST requests
    # start_urls = ["https://zhangzs.com"]

    def start_requests(self):
        for page in range(1, 11):
            yield scrapy.FormRequest(
                url='https://www.zhangzs.com/wp-admin/admin-ajax.php',
                formdata={
                    'action': 'wpcom_load_posts',
                    'page': str(page),
                    'type': 'default'
                },
                callback=self.parse
            )

    def parse(self, response):
        # print(response.text)
        lis = response.css('.item')
        for li in lis:
            title = li.css('h2>a::text').getall()  # title
            if len(title) == 1:
                title = title[0].replace('???', '').strip()
            else:
                title = title[1].replace('???', '').strip()

            # some posts have no summary, so check first
            info = li.css('.item-excerpt>p::text').get()  # summary
            if not info:
                info = 'null'

            put_time = li.css('.item-meta-li.date::text').get()  # publish time
            likes = li.css('.item-meta-li.hearts::text').get()  # favourites count
            stars = li.css('.item-meta-li.likes::text').get()  # likes count
            comments = li.css('.item-meta-li.comments::text').get()  # comments count
            print(Qd07ZzsItem(title=title, info=info, put_time=put_time,
                              likes=likes, stars=stars, comments=comments))
            yield Qd07ZzsItem(title=title, info=info, put_time=put_time,
                              likes=likes, stars=stars, comments=comments)
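None of these projects wires up an exporter by hand; since every settings.py above sets FEED_EXPORT_ENCODING = "utf-8", the yielded items can be dumped straight to a file with the built-in feed exports, e.g. scrapy crawl zzs -O zzs.json (-O overwrites the output file, -o appends to it).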
Source: https://blog.csdn.net/weixin_43612602/article/details/135335002