# Run Scrapy from inside a Python script
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings
from myproject.spiders.spider_one import SpiderOne
from myproject.spiders.spider_two import SpiderTwo

process = CrawlerProcess(get_project_settings())
# Queue up the different spiders
process.crawl(SpiderOne)
process.crawl(SpiderTwo)
# Start crawling
process.start()  # the script blocks here until all spiders have finished
# After crawling completes, e.g. send an email notification or generate a report
send_email_notification()
generate_report()
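The send_email_notification() and generate_report() calls above are just placeholders for whatever post-crawl step you need. As a rough illustration, a minimal send_email_notification helper might look like the sketch below, assuming a local SMTP server and made-up addresses:

import smtplib
from email.message import EmailMessage

def send_email_notification():
    # Hypothetical helper: adjust the SMTP host, port and addresses to your setup
    msg = EmailMessage()
    msg['Subject'] = 'Scrapy run finished'
    msg['From'] = 'crawler@example.com'   # placeholder sender
    msg['To'] = 'team@example.com'        # placeholder recipient
    msg.set_content('All spiders have finished crawling.')
    with smtplib.SMTP('localhost', 25) as smtp:  # assumes a local SMTP server
        smtp.send_message(msg)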
from threading import Thread
from twisted.internet import reactor
from scrapy.crawler import CrawlerRunner
from scrapy.utils.project import get_project_settings
from myproject.spiders.spider_one import SpiderOne
from myproject.spiders.spider_two import SpiderTwo

settings = get_project_settings()
runner = CrawlerRunner(settings)

def run_spider(spider):
    # CrawlerRunner does not start the Twisted reactor itself: the thread only
    # schedules the crawl; it actually runs once reactor.run() is called below.
    reactor.callFromThread(runner.crawl, spider)

# Start threads that schedule the spiders
thread1 = Thread(target=run_spider, args=(SpiderOne,))
thread2 = Thread(target=run_spider, args=(SpiderTwo,))
thread1.start()
thread2.start()
thread1.join()
thread2.join()

# Stop the reactor once every scheduled crawl has finished, then run it
reactor.callFromThread(lambda: runner.join().addBoth(lambda _: reactor.stop()))
reactor.run()  # blocks until both spiders have finished
from multiprocessing import Process
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings
from myproject.spiders.spider_one import SpiderOne
from myproject.spiders.spider_two import SpiderTwo

def run_spider(spider):
    # Each child process gets its own CrawlerProcess (and its own reactor),
    # so the spiders run in parallel and in isolation from each other
    process = CrawlerProcess(get_project_settings())
    process.crawl(spider)
    process.start()

if __name__ == '__main__':
    # Create and start one process per spider
    process1 = Process(target=run_spider, args=(SpiderOne,))
    process2 = Process(target=run_spider, args=(SpiderTwo,))
    process1.start()
    process2.start()
    process1.join()
    process2.join()
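Because a Twisted reactor cannot be restarted within the same process, this process-per-crawl pattern is also the easiest way to run spiders repeatedly on a schedule. A minimal sketch is shown below; the one-hour interval and the run_all_spiders helper name are just illustrative:

import time
from multiprocessing import Process
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings
from myproject.spiders.spider_one import SpiderOne
from myproject.spiders.spider_two import SpiderTwo

def run_spider(spider):
    process = CrawlerProcess(get_project_settings())
    process.crawl(spider)
    process.start()

def run_all_spiders():
    # Run both spiders in fresh child processes and wait for them to finish
    procs = [Process(target=run_spider, args=(s,)) for s in (SpiderOne, SpiderTwo)]
    for p in procs:
        p.start()
    for p in procs:
        p.join()

if __name__ == '__main__':
    while True:
        run_all_spiders()
        time.sleep(60 * 60)  # wait an hour before the next scheduled run

For production use, an external scheduler such as cron that simply invokes the script is usually more robust than a sleep loop.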
Suppose we have a spider that needs to search for data based on a keyword entered by the user:
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings
from myproject.spiders import MySpider

process = CrawlerProcess(get_project_settings())
# Run the spider, passing the keyword as an argument
keyword = 'python'
process.crawl(MySpider, keyword=keyword)
process.start()
In the script above, we pass a keyword argument to the spider through the crawl method. Inside the spider, you can receive this argument in the __init__ method.
import scrapy

class MySpider(scrapy.Spider):
    name = 'myspider'

    def __init__(self, keyword=None, *args, **kwargs):
        super(MySpider, self).__init__(*args, **kwargs)
        self.keyword = keyword  # store the keyword that was passed in

    def start_requests(self):
        url = f'http://search.example.com/?q={self.keyword}'
        yield scrapy.Request(url, self.parse)

    def parse(self, response):
        # the spider's parsing logic
        pass
A Scrapy spider can return the scraped data as Items, which can then be processed further in an Item Pipeline (a minimal pipeline sketch is shown at the end of this section). If you need to work with the data directly in the script, you can capture the items there and handle them yourself.
import scrapy
from scrapy import signals
from scrapy.crawler import CrawlerRunner
from scrapy.utils.project import get_project_settings
from twisted.internet import reactor

class MySpider(scrapy.Spider):
    # ... other code as in the definition above (name, __init__, start_requests) ...

    def parse(self, response):
        # parsing logic: extract the data
        yield {'url': response.url}

def item_collected(item):
    print(item['url'])  # print or otherwise process the item

runner = CrawlerRunner(get_project_settings())

def crawl_job():
    # Create the Crawler explicitly so we can connect to its signals;
    # CrawlerRunner itself has no signals attribute
    crawler = runner.create_crawler(MySpider)
    crawler.signals.connect(item_collected, signal=signals.item_scraped)
    deferred = runner.crawl(crawler, keyword='python')
    deferred.addBoth(lambda _: reactor.stop())  # stop the reactor when the crawl ends
    reactor.run()  # start the event loop; blocks until the crawl is done

crawl_job()
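For the Pipeline route mentioned earlier, a minimal item pipeline might look like the following sketch. The class name and the JSON-lines output file are just placeholders; the pipeline also has to be enabled in ITEM_PIPELINES in settings.py:

import json

class SaveUrlsPipeline:
    # Hypothetical pipeline: writes each scraped item to a JSON-lines file
    def open_spider(self, spider):
        self.file = open('items.jl', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        self.file.write(json.dumps(dict(item), ensure_ascii=False) + '\n')
        return item  # always return the item so later pipelines can still see it

    def close_spider(self, spider):
        self.file.close()

To enable it, add an entry such as ITEM_PIPELINES = {'myproject.pipelines.SaveUrlsPipeline': 300} to settings.py (the module path here is only an example).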