本项目爬取江南大学官网的新闻信息
https://news.jiangnan.edu.cn/yw.htm
mac | Linux : pip install scrapy
windows:
安装完成后,在本项目的终端中输入 scrapy,若不报错则说明安装成功
创建工程:scrapy startproject 工程名称
创建爬虫文件:scrapy genspider spiderName www.xxx.com
执行后spiders中会多出一个文件,用来编写爬取规则
执行工程: scrapy crawl spiderName(指定执行的爬虫文件)
正则表达式,各语言差不多
pip install bs4
pip install lxml
from bs4 import BeautifulSoup
本地实例化
抓取网络页面
本地
网络
class SchoolItem(scrapy.Item):
    """Container for the data scraped from one news article.

    Every attribute is declared as a ``scrapy.Field()``; values are
    assigned with dictionary syntax (``item["Title"] = ...``).
    """

    school = scrapy.Field()  # name of the school the article belongs to
    Time = scrapy.Field()  # NOTE(review): presumably the publication time — confirm
    Col = scrapy.Field()  # news column / category label
    Title = scrapy.Field()  # article title
    Text = scrapy.Field()  # article body text
    Provenance = scrapy.Field()  # source the article is attributed to
    URL = scrapy.Field()  # address of the article page
    FWLCount = scrapy.Field()  # NOTE(review): presumably a visit count — confirm
    Heat = scrapy.Field()  # popularity ("heat") value
# Build one SchoolItem for the article being parsed and yield it.
# This excerpt is the tail of a generator whose definition is not shown here.
item = SchoolItem()
# Fixed labels applied to every item produced by this code.
item["school"] = "江南大学"
item["Col"] = "综合新闻"
# Heat and FWLCount are filled with random integers, not scraped values.
item["Heat"] = random.randint(500, 1000)
item["FWLCount"] = random.randint(100, 2000)
# NOTE(review): `data`, `handle_url`, `title`, `content` and `source` are
# defined outside this excerpt; `data` presumably holds the publication
# date — confirm against the enclosing parse method.
item["Time"] = data
item["URL"] = handle_url
item["Title"] = title
item["Text"] = content
item["Provenance"] = source
yield item
# Item pipelines enabled for this project, keyed by dotted class path.
ITEM_PIPELINES = {
# The number is the pipeline's weight; pipelines with smaller values run first.
"school.pipelines.SchoolPipeline": 300,
}
class SchoolPipeline:
    """Scrapy item pipeline that writes every scraped item to MySQL.

    A single connection is opened when the spider starts and closed when it
    finishes. Each item becomes one row of ``app01_schoolnews``.
    """

    conn = None  # pymysql connection, created in open_spider
    cursor = None  # cursor shared by all inserts, created on the first item
    new_Num = 0  # number of rows inserted successfully

    # Columns are named explicitly so each value is matched to its column by
    # name rather than by its position in the table definition. Placeholders
    # are left unquoted: the driver escapes and quotes the bound values.
    _INSERT_SQL = (
        "insert into app01_schoolnews "
        "(`school`, `Time`, `Col`, `Title`, `Text`, "
        "`Provenance`, `URL`, `FWLCount`, `Heat`) "
        "values (%s, %s, %s, %s, %s, %s, %s, %s, %s)"
    )

    def open_spider(self, spider):
        """Open the MySQL connection when the spider starts."""
        print("网站信息开始收集...")
        # NOTE(review): credentials are hard-coded; consider reading them
        # from the project settings instead.
        self.conn = pymysql.Connect(
            user='root',
            password='root',
            host='localhost',
            port=3306,
            database='yu',
            # The target table is utf8mb4; make the client encoding explicit.
            charset='utf8mb4',
        )

    def process_item(self, item, spider):
        """Insert ``item`` as one row and return it.

        Failures are reported on stdout and do not stop the crawl. The item
        is always returned so that any later pipeline still receives it.
        """
        # Reuse one cursor instead of leaking a new one for every item.
        if self.cursor is None:
            self.cursor = self.conn.cursor()
        try:
            values = (
                item['school'], item['Time'], item['Col'], item['Title'],
                item['Text'], item['Provenance'], item['URL'],
                item['FWLCount'], item['Heat'],
            )
            # Values are passed separately from the SQL text, so quotes or
            # other special characters in scraped text cannot break or
            # alter the statement.
            self.cursor.execute(self._INSERT_SQL, values)
            self.conn.commit()
        except Exception as e:
            print("mysql连接异常...", e)
        else:
            print("插入数据库成功...")
            self.new_Num += 1
        return item

    def close_spider(self, spider):
        """Release the cursor and connection and report the insert count."""
        # Either attribute may still be None if the spider produced no items
        # or the connection was never opened.
        if self.cursor is not None:
            self.cursor.close()
        if self.conn is not None:
            self.conn.close()
        print("关闭数据库连接。共入库%d条数据..." % self.new_Num)
表模型(MySQL 建表语句)
-- Table definition for app01_schoolnews.
-- No primary key or index is declared. `school` is the only nullable column.
-- All text columns use the utf8mb4_general_ci collation.
CREATE TABLE `app01_schoolnews` (
`school` varchar(255) COLLATE utf8mb4_general_ci DEFAULT NULL,
`Time` varchar(64) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci NOT NULL,
`Col` longtext COLLATE utf8mb4_general_ci NOT NULL,
`Title` longtext COLLATE utf8mb4_general_ci NOT NULL,
`Text` longtext COLLATE utf8mb4_general_ci NOT NULL,
`Provenance` longtext COLLATE utf8mb4_general_ci NOT NULL,
`URL` varchar(255) COLLATE utf8mb4_general_ci NOT NULL,
`FWLCount` int NOT NULL,
`Heat` double NOT NULL
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_general_ci;