Scraping Kuaishou Short Videos with a Crawler

Published: January 19, 2024

Scrape Kuaishou short videos and download and save them automatically.

First, enter a search keyword, e.g. "慢摇" or "美女".

Next, enter a page count (pages are downloaded from the start up to that page), e.g. entering 5 downloads pages 0 through 5.

Then just wait for the downloads to finish.
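
A typical run only needs the two prompts answered (慢摇 and 5 here are example inputs); the script then prints each video's URL and title, followed by a success or failure message:

Enter a Kuaishou search keyword: 慢摇
Enter the number of pages to download: 5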

import requests
import os
import re
import json

def get_response(url, keywords, pcursor):
    headers = {
        'Accept': '*/*',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Connection': 'keep-alive',
        'Content-Type': 'application/json',
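        # NOTE: these Cookie values (did, didv, ktrace-context) are tied to one
        # browser session; copy fresh values from your own devtools if requests fail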
        'Cookie': 'kpf=PC_WEB; clientid=3; did=web_713774521487450db89fcfc3892aae65; didv=1705562481178; ktrace-context=1|MS43NjQ1ODM2OTgyODY2OTgyLjQzOTc2MzU1LjE3MDU1NjM4MDkxNTEuNzUzNzYy|MS43NjQ1ODM2OTgyODY2OTgyLjk2MjU0NDIxLjE3MDU1NjM4MDkxNTEuNzUzNzYz|0|graphql-server|webservice|false|NA; kpn=KUAISHOU_VISION',
        'Host': 'www.kuaishou.com',
        'Origin': 'https://www.kuaishou.com',
        'Referer': 'https://www.kuaishou.com/search/video?searchKey=%E6%85%A2%E6%91%87',
        'Sec-Ch-Ua': '"Not_A Brand";v="8", "Chromium";v="120", "Google Chrome";v="120"',
        'Sec-Ch-Ua-Mobile': '?0',
        'Sec-Ch-Ua-Platform': '"Windows"',
        'Sec-Fetch-Dest': 'empty',
        'Sec-Fetch-Mode': 'cors',
        'Sec-Fetch-Site': 'same-origin',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
    }
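    # GraphQL payload for the site's visionSearchPhoto query; the long query string
    # below is copied verbatim from the browser's network traffic, and only the
    # variables (keyword, pcursor) change between requests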
    data = {
        "operationName": "visionSearchPhoto",
        "query": "fragment photoContent on PhotoEntity {\n  __typename\n  id\n  duration\n  caption\n  originCaption\n  likeCount\n  viewCount\n  commentCount\n  realLikeCount\n  coverUrl\n  photoUrl\n  photoH265Url\n  manifest\n  manifestH265\n  videoResource\n  coverUrls {\n    url\n    __typename\n  }\n  timestamp\n  expTag\n  animatedCoverUrl\n  distance\n  videoRatio\n  liked\n  stereoType\n  profileUserTopPhoto\n  musicBlocked\n}\n\nfragment recoPhotoFragment on recoPhotoEntity {\n  __typename\n  id\n  duration\n  caption\n  originCaption\n  likeCount\n  viewCount\n  commentCount\n  realLikeCount\n  coverUrl\n  photoUrl\n  photoH265Url\n  manifest\n  manifestH265\n  videoResource\n  coverUrls {\n    url\n    __typename\n  }\n  timestamp\n  expTag\n  animatedCoverUrl\n  distance\n  videoRatio\n  liked\n  stereoType\n  profileUserTopPhoto\n  musicBlocked\n}\n\nfragment feedContent on Feed {\n  type\n  author {\n    id\n    name\n    headerUrl\n    following\n    headerUrls {\n      url\n      __typename\n    }\n    __typename\n  }\n  photo {\n    ...photoContent\n    ...recoPhotoFragment\n    __typename\n  }\n  canAddComment\n  llsid\n  status\n  currentPcursor\n  tags {\n    type\n    name\n    __typename\n  }\n  __typename\n}\n\nquery visionSearchPhoto($keyword: String, $pcursor: String, $searchSessionId: String, $page: String, $webPageArea: String) {\n  visionSearchPhoto(keyword: $keyword, pcursor: $pcursor, searchSessionId: $searchSessionId, page: $page, webPageArea: $webPageArea) {\n    result\n    llsid\n    webPageArea\n    feeds {\n      ...feedContent\n      __typename\n    }\n    searchSessionId\n    pcursor\n    aladdinBanner {\n      imgUrl\n      link\n      __typename\n    }\n    __typename\n  }\n}\n",
        "variables": {"keyword": keywords, "pcursor": pcursor, "page": "search"}
    }
    data = json.dumps(data)
    response = requests.post(url=url, data=data, headers=headers)
    return response
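
# A minimal sketch (not part of the original script) of cursor-based paging:
# the visionSearchPhoto query above returns a `pcursor` string that can be fed
# back into the next request; the "no_more" end-marker used here is an
# assumption and should be verified against real responses.
def iter_pages(url, keywords, max_pages=5):
    pcursor = ""  # an empty cursor fetches the first page
    for _ in range(max_pages):
        result = get_response(url, keywords, pcursor).json()['data']['visionSearchPhoto']
        yield result['feeds']
        pcursor = result.get('pcursor') or ""
        if not pcursor or pcursor == "no_more":
            break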


def save(url, dir_name, pcursor):
    response = get_response(url, dir_name, pcursor)
    json_data = response.json()
    feed_list = json_data['data']['visionSearchPhoto']['feeds']  # this is a list
    for feeds in feed_list:
        try:
            video_url = feeds['photo']['photoUrl']
            title = feeds['photo']['caption']
            # strip characters that are illegal in filenames, keep the first word
            new_title = re.sub(r'[/\\:*?"<>|@#]', '', title).split(' ')[0]
            print(video_url, new_title)
            mp4_data = requests.get(video_url).content
            os.makedirs(dir_name, exist_ok=True)
            with open(dir_name + '/' + new_title + '.mp4', mode='wb') as f:
                f.write(mp4_data)
            print(new_title + ' downloaded successfully')
        except Exception as e:
            print('download failed:', e)
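
# An optional refinement (not in the original post): stream large videos to disk
# in chunks instead of loading the whole file into memory via `.content`.
def download_video(video_url, filepath, chunk_size=1024 * 1024):
    with requests.get(video_url, stream=True, timeout=30) as resp:
        resp.raise_for_status()
        with open(filepath, mode='wb') as f:
            for chunk in resp.iter_content(chunk_size=chunk_size):
                f.write(chunk)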

if __name__ == '__main__':
    url = "https://www.kuaishou.com/graphql"
    dir_name = input('Enter a Kuaishou search keyword: ')
    pcursor = input('Enter the number of pages to download: ')
    # download pages 0 through N inclusive
    for i in range(int(pcursor) + 1):
        save(url, dir_name, str(i))
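
One caveat worth noting: new_title keeps only the first word of the caption, so similarly captioned videos overwrite one another, and an empty caption yields an empty filename. A minimal sketch of a safer naming scheme, falling back to the photo's id field (which the GraphQL query above already fetches):

def safe_filename(feeds):
    title = re.sub(r'[/\\:*?"<>|@#]', '', feeds['photo'].get('caption') or '')
    title = title.split(' ')[0]
    # fall back to the unique photo id when the caption yields nothing usable
    return title or str(feeds['photo']['id'])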

Source: https://blog.csdn.net/m0_74045628/article/details/135682298