import time
from lxml import etree
import requests
import pymysql
class MaoYanSpider(object):
    """Scrape one page of the Maoyan board and store it in MySQL.

    Each board entry is reduced to a ``(name, star, time)`` tuple and
    inserted into the ``filmtab`` table of the ``maoyandb`` database.
    """

    # Seconds to wait for the HTTP request before giving up, so a stalled
    # connection cannot hang the script forever.
    REQUEST_TIMEOUT = 10

    # XPath expressions, relative to one <dd> entry of the board list.
    _NAME_XPATH = './/p[@class="name"]/a/text()'
    _STAR_XPATH = './/p[@class="star"]/text()'
    _TIME_XPATH = './/p[@class="releasetime"]/text()'

    def __init__(self):
        """Set up the URL template, request headers and the DB connection."""
        self.url = "https://www.maoyan.com/board/4?offset={}"
        self.headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Cache-Control': 'no-cache',
            'Connection': 'keep-alive',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36',
            'referer': 'https://passport.meituan.com/',
            'Cookie': '__mta=42753434.1633656738499.1634781127005.1634781128998.34; uuid_n_v=v1; _lxsdk_cuid=17c5d879290c8-03443510ba6172-6373267-144000-17c5d879291c8; uuid=60ACEF00317A11ECAAC07D88ABE178B722CFA72214D742A2849B46660B8F79A8; _lxsdk=60ACEF00317A11ECAAC07D88ABE178B722CFA72214D742A2849B46660B8F79A8; _csrf=94b23e138a83e44c117736c59d0901983cb89b75a2c0de2587b8c273d115e639; Hm_lvt_703e94591e87be68cc8da0da7cbd0be2=1634716251,1634716252,1634719353,1634779997; Hm_lpvt_703e94591e87be68cc8da0da7cbd0be2=1634781129; _lxsdk_s=17ca07b2470-536-b73-84%7C%7C12'
        }
        self.db = pymysql.connect(host="127.0.0.1", user='root', password='', db='maoyandb')
        self.cursor = self.db.cursor()

    @staticmethod
    def _first_text(node, path):
        """Return the stripped first result of *path* under *node*, or None.

        Returning None instead of indexing blindly lets the caller skip an
        incomplete entry rather than crash with IndexError.
        """
        found = node.xpath(path)
        if not found:
            return None
        return found[0].strip()

    @classmethod
    def _parse_entry(cls, dd):
        """Turn one board ``<dd>`` node into a ``(name, star, time)`` tuple.

        Returns None when any of the three text nodes is missing.
        """
        name = cls._first_text(dd, cls._NAME_XPATH)
        star = cls._first_text(dd, cls._STAR_XPATH)
        released = cls._first_text(dd, cls._TIME_XPATH)
        if name is None or star is None or released is None:
            return None
        # Drop the first 3 characters of the star line, and keep
        # characters 5..14 of the release line (the text after its
        # 5-character label, at most 10 characters long).
        return (name, star[3:], released[5:15])

    def get_html(self, url):
        """Download *url*, extract the board entries and save them.

        Raises:
            requests.RequestException: on connection failure, timeout or a
                non-success HTTP status.
        """
        response = requests.get(
            url=url, headers=self.headers, timeout=self.REQUEST_TIMEOUT
        )
        # Fail loudly on an error status instead of parsing an error page.
        response.raise_for_status()
        html = response.text
        if not html.strip():
            print("Empty response body; nothing to parse.")
            return

        root = etree.HTML(html)
        if root is None:
            # Defensive: never call .xpath() on a missing tree.
            print("Response could not be parsed as HTML.")
            return

        items = []
        skipped = 0
        for dd in root.xpath('//dl[@class="board-wrapper"]/dd'):
            entry = self._parse_entry(dd)
            if entry is None:
                skipped += 1
            else:
                items.append(entry)
        if skipped:
            print("Skipped %d incomplete board entries." % skipped)
        self.save_html(items)

    def save_html(self, items):
        """Insert the ``(name, star, time)`` tuples in *items* into filmtab.

        The insert is committed as one batch; on any failure the
        transaction is rolled back and the error arguments are printed.
        """
        if not items:
            # Nothing to write; avoid an empty round trip to the database.
            return
        try:
            sql = 'insert into filmtab(name,star,time)values(%s,%s,%s)'
            self.cursor.executemany(sql, items)
            self.db.commit()
        except Exception as e:
            self.db.rollback()
            print(str(e.args))

    def run(self):
        """Ask for a page number, scrape that page, then close the DB.

        Raises:
            ValueError: if the entered page number is not an integer or is
                smaller than 1.
        """
        try:
            offset = int(input("请输入页码:"))
            if offset < 1:
                # A page below 1 would produce a negative offset in the URL.
                raise ValueError("page number must be >= 1")
            url = self.url.format((offset - 1) * 10)
            self.get_html(url)
        finally:
            # Always release the cursor and connection, even when the
            # download or parsing step raised.
            self.cursor.close()
            self.db.close()
if __name__ == '__main__':
    # Script entry point: run one scrape and report the wall-clock time
    # spent, including spider construction.
    began = time.time()
    MaoYanSpider().run()
    elapsed = time.time() - began
    print("数据抓取完毕,总耗时:%.2f" % elapsed)