import scrapy
class Qd04KfcItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    pass
middlewares.py
from scrapy import signals
# useful for handling different item types with a single interface
from itemadapter import is_item, ItemAdapter


class Qd04KfcSpiderMiddleware:
    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        return None

    def process_spider_output(self, response, result, spider):
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        pass

    def process_start_requests(self, start_requests, spider):
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info("Spider opened: %s" % spider.name)


class Qd04KfcDownloaderMiddleware:
    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        return None

    def process_response(self, request, response, spider):
        return response

    def process_exception(self, request, exception, spider):
        pass

    def spider_opened(self, spider):
        spider.logger.info("Spider opened: %s" % spider.name)
pipelines.py
from itemadapter import ItemAdapter
class Qd04KfcPipeline:
    def process_item(self, item, spider):
        return item
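None of these generated classes take effect until they are registered in settings.py. A minimal sketch of that registration, assuming the default qd_04_kfc package layout (the priority numbers are the usual template defaults):

# settings.py (sketch): register the generated components so Scrapy actually uses them
ITEM_PIPELINES = {
    "qd_04_kfc.pipelines.Qd04KfcPipeline": 300,  # lower number = runs earlier
}
SPIDER_MIDDLEWARES = {
    "qd_04_kfc.middlewares.Qd04KfcSpiderMiddleware": 543,
}
DOWNLOADER_MIDDLEWARES = {
    "qd_04_kfc.middlewares.Qd04KfcDownloaderMiddleware": 543,
}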
import scrapy
class Qd05InfoItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    pass
middlewares.py
from scrapy import signals
# useful for handling different item types with a single interface
from itemadapter import is_item, ItemAdapter


class Qd05InfoSpiderMiddleware:
    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        return None

    def process_spider_output(self, response, result, spider):
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        pass

    def process_start_requests(self, start_requests, spider):
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info("Spider opened: %s" % spider.name)


class Qd05InfoDownloaderMiddleware:
    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        return None

    def process_response(self, request, response, spider):
        return response

    def process_exception(self, request, exception, spider):
        pass

    def spider_opened(self, spider):
        spider.logger.info("Spider opened: %s" % spider.name)
pipelines.py
from itemadapter import ItemAdapter
class Qd05InfoPipeline:
    def process_item(self, item, spider):
        return item
import scrapy
class Qd06MaoyanItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    name = scrapy.Field()
    star = scrapy.Field()
    releasetime = scrapy.Field()
    score = scrapy.Field()
middlewares.py
from scrapy import signals
from itemadapter import is_item, ItemAdapter
class Headers:
    def process_request(self, request, spider):
        # Attach the headers every request needs. Note the User-Agent value must not
        # repeat the "User-Agent:" header name inside the string.
        request.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36',
            # 'Cookie': 'iuuid=F5783590DF7F11ED9B25CD5E5234694FA2DAA684142046EEA9F6AB456B2AFF02; _lxsdk_cuid=1879ededf23c8-0483e83e302f2e-26031b51-1fa400-1879ededf23c8; _lxsdk=F5783590DF7F11ED9B25CD5E5234694FA2DAA684142046EEA9F6AB456B2AFF02; ci=70%2C%E9%95%BF%E6%B2%99; ci.sig=6ddYdqOybjnPiJJMwTWmS44rF4o; Hm_lvt_703e94591e87be68cc8da0da7cbd0be2=1690368528; Hm_lpvt_703e94591e87be68cc8da0da7cbd0be2=1690368528',
            'Host': 'm.maoyan.com',
        })
        return None
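Defining the Headers class alone does nothing; it has to be enabled as a downloader middleware in settings.py. A minimal sketch, assuming the default qd_06_maoyan package layout:

# settings.py (sketch): enable the custom Headers downloader middleware
DOWNLOADER_MIDDLEWARES = {
    "qd_06_maoyan.middlewares.Headers": 543,
}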
pipelines.py
from itemadapter import ItemAdapter
class Qd06MaoyanPipeline:
    def process_item(self, item, spider):
        return item
import scrapy
from ..items import Qd06MaoyanItem


class MaoyanSpider(scrapy.Spider):
    name = "maoyan"
    allowed_domains = ["maoyan.com"]
    start_urls = ["https://m.maoyan.com/asgard/board/4"]

    # def start_requests(self):
    def parse(self, response):
        # print(response.text)
        divs = response.css('.clearfix')
        for div in divs:
            name = div.css('.title::text').get()
            star = div.css('.actors::text').get()
            releasetime = div.css('.date::text').get()
            score = div.css('.number::text').get()
            yield Qd06MaoyanItem(name=name, star=star, releasetime=releasetime, score=score)
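Besides the scrapy crawl maoyan command, the spider can be driven from a plain script with Scrapy's CrawlerProcess; a minimal sketch (the run.py filename is illustrative):

# run.py (sketch): launch the maoyan spider from a script inside the project directory
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

process = CrawlerProcess(get_project_settings())  # loads the project's settings.py
process.crawl("maoyan")  # spider name, not the class
process.start()  # blocks until the crawl finishes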
Example - qd_08_zzs
items.py
import scrapy
class Qd07ZzsItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    title = scrapy.Field()      # title
    info = scrapy.Field()       # summary
    put_time = scrapy.Field()   # publish time
    likes = scrapy.Field()      # favorite count
    stars = scrapy.Field()      # like count
    comments = scrapy.Field()   # comment count
middlewares.py
from scrapy import signals
from itemadapter import is_item, ItemAdapter
class Qd07ZzsSpiderMiddleware:
    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        return None

    def process_spider_output(self, response, result, spider):
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        pass

    def process_start_requests(self, start_requests, spider):
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info("Spider opened: %s" % spider.name)


class Qd07ZzsDownloaderMiddleware:
    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        return None

    def process_response(self, request, response, spider):
        return response

    def process_exception(self, request, exception, spider):
        pass

    def spider_opened(self, spider):
        spider.logger.info("Spider opened: %s" % spider.name)
pipelines.py
from itemadapter import ItemAdapter
class Qd07ZzsPipeline:
    def process_item(self, item, spider):
        return item
settings.py
BOT_NAME = "qd_07_zzs"

SPIDER_MODULES = ["qd_07_zzs.spiders"]
NEWSPIDER_MODULE = "qd_07_zzs.spiders"

# Crawl responsibly by identifying yourself (and your website) on the user-agent
# USER_AGENT = "qd_07_zzs (+http://www.yourdomain.com)"

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

REQUEST_FINGERPRINTER_IMPLEMENTATION = "2.7"
TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
FEED_EXPORT_ENCODING = "utf-8"

# Set the download delay, in seconds
DOWNLOAD_DELAY = 3
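A fixed DOWNLOAD_DELAY is the simplest way to throttle, but Scrapy also ships an AutoThrottle extension that adapts the delay to server latency; a minimal sketch of the alternative settings (not part of the original project):

# Optional alternative to a fixed delay (sketch): let AutoThrottle pace requests
AUTOTHROTTLE_ENABLED = True
AUTOTHROTTLE_START_DELAY = 1   # initial delay in seconds
AUTOTHROTTLE_MAX_DELAY = 10    # ceiling when the server responds slowly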
spiders
zzs.py
import scrapy
from ..items import Qd07ZzsItem


class ZzsSpider(scrapy.Spider):
    name = "zzs"
    allowed_domains = ["zhangzs.com"]
    # start_urls cannot send POST requests
    # start_urls = ["https://zhangzs.com"]

    def start_requests(self):
        for page in range(1, 11):
            yield scrapy.FormRequest(
                url='https://www.zhangzs.com/wp-admin/admin-ajax.php',
                formdata={'action': 'wpcom_load_posts', 'page': str(page), 'type': 'default'},
                callback=self.parse
            )

    def parse(self, response):
        # print(response.text)
        lis = response.css('.item')
        for li in lis:
            title = li.css('h2>a::text').getall()  # title
            if len(title) == 1:
                title = title[0].replace('???', '').strip()
            else:
                title = title[1].replace('???', '').strip()
            # some posts have no summary, so check before using it
            info = li.css('.item-excerpt>p::text').get()  # summary
            if not info:
                info = 'null'
            put_time = li.css('.item-meta-li.date::text').get()      # publish time
            likes = li.css('.item-meta-li.hearts::text').get()       # favorite count
            stars = li.css('.item-meta-li.likes::text').get()        # like count
            comments = li.css('.item-meta-li.comments::text').get()  # comment count
            yield Qd07ZzsItem(title=title, info=info, put_time=put_time,
                              likes=likes, stars=stars, comments=comments)
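Since Qd07ZzsPipeline only passes items through, the yielded items are easiest to persist with Scrapy's feed exports, either on the command line with scrapy crawl zzs -o zzs.csv or with a FEEDS entry in settings.py; a minimal sketch (the zzs.csv filename is illustrative):

# settings.py (sketch): write every yielded item to a CSV file
FEEDS = {
    "zzs.csv": {"format": "csv", "encoding": "utf-8"},
}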