爬虫案例

2022-08-24 12:30:13 浏览数 (1)

一、壁纸网站

代码语言:python
# coding=utf-8
"""
    作者:gaojs
    功能:
    新增功能:
    日期:2022/3/25 19:35
"""
import os.path

import requests
import parsel


def get_address():
    """
    Download 1920x1080 wallpapers from netbian.com list pages 2-10 into ``photo/``.

    For every list page, each ``.list li`` entry that carries a ``<b>`` title
    is followed to its detail page, and the image referenced by ``.pic img``
    there is saved as ``photo/<title>.jpg``.

    :return: None
    """
    dirname = 'photo/'

    if not os.path.exists(dirname):
        os.mkdir(dirname)

    # Loop-invariant values, built once instead of once per page.
    site_root = 'http://www.netbian.com'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.74 Safari/537.36 Edg/99.0.1150.46'
    }

    for page in range(2, 11):
        print(f'=====================正在爬取第{page}页内容========================')
        url = f'http://www.netbian.com/1920x1080/index_{page}.htm'

        res = requests.get(url, headers=headers)
        # Let requests guess the charset from the body so the titles decode correctly.
        res.encoding = res.apparent_encoding

        selector = parsel.Selector(res.text)
        for lis in selector.css('.list li'):
            title = lis.css('b::text').get()
            # Entries without a <b> title are advertisement slots; skip them.
            if not title:
                continue

            detail_path = lis.css('a::attr(href)').get()
            if not detail_path:
                # No link to follow (would otherwise fail on str + None).
                continue
            list_url = site_root + detail_path

            res1 = requests.get(list_url, headers=headers)
            selector1 = parsel.Selector(res1.text)
            img_url = selector1.css('.pic img::attr(src)').get()
            if not img_url:
                # Detail page without a picture: nothing to download.
                continue

            # Save the image under its title.
            img_content = requests.get(url=img_url).content
            with open('photo/' + title + '.jpg', 'wb') as f:
                f.write(img_content)
                print(title, img_url)


get_address()

二、彼岸壁纸

代码语言:python
# coding=utf-8
"""
    作者:gaojs
    功能:
    新增功能:
    日期:2022/4/2 14:59
"""
import os.path
import re
import requests


# Script: scan the netbian.com home page, follow every wallpaper link and
# save the images found on each detail page into photo/.
if not os.path.exists('photo/'):
    os.mkdir('photo/')

url = 'http://www.netbian.com'
# Example detail pages:
# http://www.netbian.com/desk/26344-1920x1080.htm
# http://www.netbian.com/desk/26345-1920x1080.htm
headers = {
    'Host': 'www.netbian.com',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36',
    'Upgrade-Insecure-Requests': '1',
    'Cookie': '__yjs_duid=1_4535c561a20964f1ade88776981a0f411648389371877; Hm_lvt_0f461eb489c245a31c209d36e41fcc0f=1648389374,1648986956; Hm_lpvt_0f461eb489c245a31c209d36e41fcc0f=1648986956'
}
rsp = requests.get(url, headers=headers)
# Let requests guess the charset from the body so the titles decode correctly.
rsp.encoding = rsp.apparent_encoding

# Each match is (relative link to the detail page, wallpaper title).
url_list = re.findall('<a href="(.*?)"title="(.*?)" target="_blank"><img src=".*?" alt=".*?" />', rsp.text)

for index in url_list:
    url_lis = index[0]
    title = index[1]
    # The captured href is appended to the site root to form the detail URL.
    new_url = url + url_lis

    rsp1 = requests.get(new_url)
    rsp1.encoding = rsp1.apparent_encoding
    # Each match is (image URL, alt text); the alt text becomes the file name.
    img_list = re.findall('<a href=".*?" target="_blank"><img src="(.*?)" alt="(.*?)" title=".*?"></a>', rsp1.text)

    for img in img_list:
        img_url = img[0]
        img_title = img[1]
        content_data = requests.get(img_url).content

        with open('photo/' + img_title + '.jpg', 'wb') as f:
            f.write(content_data)
            print(f'***************正在爬取{title}中****************')

三、某手视频

代码语言:python
# coding=utf-8
"""
    作者:gaojs
    功能:
    新增功能:
    日期:2022/4/15 20:13
"""
import json
import os.path
import pprint

import requests


def get_page(pcursor):
    """
    Download every video of one Kuaishou profile feed into ``video/``.

    Starting from ``pcursor``, repeatedly posts the ``visionProfilePhotoList``
    GraphQL query, saves each feed item's ``photoUrl`` as
    ``video/<caption>.mp4`` and follows the returned cursor until the server
    answers ``"no_more"``.

    :param pcursor: feed cursor to start from (the script passes ``""`` for
        the first page)
    :return: None
    """
    path = 'video/'
    if not os.path.exists(path):
        os.mkdir(path)

    # Target profile: https://www.kuaishou.com/profile/3xhv7zhkfr3rqag
    # Example video pages of that profile:
    # https://www.kuaishou.com/short-video/3xw5fmcf9jdap29?authorId=3xhv7zhkfr3rqag&streamSource=profile&area=profilexxnull
    # https://www.kuaishou.com/short-video/3xf98wc5q2cuxtq?authorId=3xhv7zhkfr3rqag&streamSource=profile&area=profilexxnull
    url = 'https://www.kuaishou.com/graphql'
    headers = {
        'content-type': 'application/json',
        'Cookie': 'kpf=PC_WEB; kpn=KUAISHOU_VISION; clientid=3; did=web_72314bf978cb158dd7034b2370d2ae70',
        'Host': 'www.kuaishou.com',
        'Origin': 'https://www.kuaishou.com',
        'Referer': 'https://www.kuaishou.com/short-video/3x6v3xmcjsd5cki?authorId=3xhv7zhkfr3rqag&streamSource=profile&area=profilexxnull',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36',
    }
    # The query's line breaks had been reduced to a literal "n" (e.g.
    # "resultn    llsidn"), which fuses field names into invalid ones; the
    # "\n" separators are restored here.
    query = (
        "query visionProfilePhotoList($pcursor: String, $userId: String, $page: String, $webPageArea: String) {\n"
        "  visionProfilePhotoList(pcursor: $pcursor, userId: $userId, page: $page, webPageArea: $webPageArea) {\n"
        "    result\n    llsid\n    webPageArea\n"
        "    feeds {\n"
        "      type\n"
        "      author {\n        id\n        name\n        following\n        headerUrl\n"
        "        headerUrls {\n          cdn\n          url\n          __typename\n        }\n"
        "        __typename\n      }\n"
        "      tags {\n        type\n        name\n        __typename\n      }\n"
        "      photo {\n        id\n        duration\n        caption\n        likeCount\n        realLikeCount\n        coverUrl\n"
        "        coverUrls {\n          cdn\n          url\n          __typename\n        }\n"
        "        photoUrls {\n          cdn\n          url\n          __typename\n        }\n"
        "        photoUrl\n        liked\n        timestamp\n        expTag\n        animatedCoverUrl\n        stereoType\n        videoRatio\n        profileUserTopPhoto\n        __typename\n      }\n"
        "      canAddComment\n      currentPcursor\n      llsid\n      status\n      __typename\n    }\n"
        "    hostName\n    pcursor\n    __typename\n  }\n}\n"
    )
    # Captions are free-form text; remove characters that cannot appear in a
    # file name so open() does not fail on them.
    unsafe_chars = str.maketrans('', '', '\\/:*?"<>|\r\n')

    # Iterate page by page instead of recursing once per page, so a long
    # feed cannot exhaust the recursion limit.
    while True:
        data = {
            "operationName": "visionProfilePhotoList",
            "query": query,
            "variables": {"userId": "3xhv7zhkfr3rqag", "pcursor": pcursor, "page": "detail", "webPageArea": "profilexxnull"}
        }
        rsp = requests.post(url=url, json=data, headers=headers)

        profile = rsp.json()['data']['visionProfilePhotoList']
        for key in profile['feeds']:
            # Video caption, used as the file name.
            title = key['photo']['caption'].translate(unsafe_chars)
            # Direct URL of the video file.
            new_url = key['photo']['photoUrl']
            content_data = requests.get(url=new_url).content
            with open(f'video/{title}.mp4', mode='wb') as f:
                f.write(content_data)
                print(f'=======================正在下载标题为 {title} 的快手短视频==========================')

        # The response carries the cursor of the next page; "no_more" ends the feed.
        pcursor = profile['pcursor']
        if pcursor == "no_more":
            break


get_page("")

四、拉勾数据

代码语言:python
# coding=utf-8
"""
    作者:gaojs
    功能:
    新增功能:
    日期:2022/4/3 17:58
"""
import csv
import json
import os.path
import pprint

import requests
import re


# Script: collect the first 10 result pages of Lagou's "python" job search
# into info/招聘.csv and write each job description to its own text file.
if not os.path.exists('info/'):
    os.makedirs('info/')

csv_path = 'info/招聘.csv'
# The CSV is opened in append mode, so only write the header row when the
# file is new or empty; otherwise every run would insert another header.
need_header = not os.path.exists(csv_path) or os.path.getsize(csv_path) == 0

# Loop-invariant request headers.
headers = {
    'Host': 'www.lagou.com',
    'Referer': 'https://www.lagou.com/utrack/trackMid.html?f=https://www.lagou.com/wn/jobs?pn=2&fromSearch=true&kd=python&t=1648984113&_ti=1',
    'Cookie': 'user_trace_token=20211122110451-60eec88a-fbaf-47fd-9a53-188f3632144b; Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1637550277; _ga=GA1.2.1219095688.1637550277; LGUID=20211122110452-94ffa347-2c46-4c2d-8429-b83e30e86693; RECOMMEND_TIP=true; __lg_stoken__=9ec31e7a3301bab4f215bd5f80c8af0ab0dc2b8ce81af654fe848cf33ad7c4f33d0748020b30281d56a28a756342ce5d42e6c218bcfd56dbf764c51686741cbaf14de987ef24; JSESSIONID=ABAAABAABEIABCIA45B6C458598FF70789BDFD5A4574786; WEBTJ-ID=20220403173842-17feeca7ea0402-090b1b6ee61841-a3e3164-3686400-17feeca7ea15f1; sensorsdata2015session={}; X_HTTP_TOKEN=1ca92d1d8ffe4ecb3114898461b10fa2c7054519c6; X_MIDDLE_TOKEN=3e27b9a5a69f9fa78d5d2fe99174c9c5; sensorsdata2015jssdkcross={"distinct_id":"9659966","$device_id":"17d459f2858540-02719bae0efae1-4343363-2073600-17d459f2859704","props":{"$latest_traffic_source_type":"直接流量","$latest_search_keyword":"未取到值_直接打开","$latest_referrer":"","$os":"Windows","$browser":"Chrome","$browser_version":"98.0.4758.102"},"first_id":"17d459f2858540-02719bae0efae1-4343363-2073600-17d459f2859704"}',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36'
}

# The CSV handle lives in a with-block so it is closed (and flushed) on exit;
# the per-job text files below use a different name so they no longer rebind it.
with open(csv_path, encoding='utf-8', mode='a', newline='') as f:
    csv_writer = csv.DictWriter(f, fieldnames=[
        '职位名字',
        '公司名字',
        '工作城市',
        '学历要求',
        '经验要求',
        '薪资要求',
        '公司地址',
        '详情页'
    ])
    if need_header:
        csv_writer.writeheader()

    for page in range(1, 11):
        url = f'https://www.lagou.com/wn/jobs?pn={page}&fromSearch=true&kd=python'

        rsp = requests.get(url=url, headers=headers)
        print(rsp.status_code)
        # The result list is embedded in the page as JSON inside the
        # __NEXT_DATA__ script tag.
        html_data = re.findall('<script id="__NEXT_DATA__" type="application/json">(.*?)</script>', rsp.text)[0]

        json_data = json.loads(html_data)
        result = json_data['props']['pageProps']['initData']['content']['positionResult']['result']
        for index in result:
            # Job description with the HTML line breaks removed.
            job_index = index['positionDetail'].replace('<br />', '').replace('<br>', '')
            # A "/" was missing between "jobs" and the position id.
            href = f'https://www.lagou.com/wn/jobs/{index["positionId"]}.html'
            dict1 = {
                '职位名字': index['positionName'],
                '公司名字': index['companyFullName'],
                '工作城市': index['city'],
                '学历要求': index['education'],
                '经验要求': index['workYear'],
                # Previously read 'workYear' again, duplicating the experience
                # column. NOTE(review): 'salary' is the expected field name in
                # Lagou's position JSON -- confirm against a live response.
                '薪资要求': index['salary'],
                '公司地址': index['positionAddress'],
                '详情页': href
            }
            csv_writer.writerow(dict1)
            title = index['positionName'] + index['companyFullName']
            # Remove characters that are not allowed in file names.
            new_title = re.sub(r'[/?:"<>|]', '', title)
            with open('info/' + new_title + '.txt', 'w', encoding='utf-8') as detail_file:
                detail_file.write(job_index)
            print(dict1)

五、王者荣耀英雄皮肤高清壁纸

代码语言:python
# coding=utf-8
"""
    作者:gaojs
    功能:
    新增功能:
    日期:2022/4/2 13:05
"""

import requests
import os
import re


# Script: download the full-size skin artwork of every hero listed in the
# Honor of Kings hero list into photo/.
url = 'https://pvp.qq.com/web201605/js/herolist.json'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.74 Safari/537.36 Edg/99.0.1150.55'
}

# The images are written below photo/, which nothing else in this script creates.
os.makedirs('photo', exist_ok=True)

rsp = requests.get(url, headers=headers)
print(rsp.status_code)
for index in rsp.json():
    # Hero display name and numeric id.
    hero_name = index['cname']
    hero_id = index['ename']

    index_url = f'https://pvp.qq.com/web201605/herodetail/{hero_id}.shtml'
    rsp1 = requests.get(url=index_url, headers=headers)
    # Let requests detect the page encoding automatically.
    rsp1.encoding = rsp1.apparent_encoding
    # data-imgname holds the skin names separated by "|".
    title_list = re.findall('<ul class="pic-pf-list pic-pf-list3" data-imgname="(.*?)">', rsp1.text)[0]
    # Strip the "&<digits>" marker from the names before splitting. The
    # pattern had lost its backslash and "+" and read '&d '.
    # NOTE(review): confirm the marker format against a live hero page.
    title_list = re.sub(r'&\d+', '', title_list).split('|')

    # Skin images are numbered from 1, in the same order as the names.
    for num, img_title in enumerate(title_list, start=1):
        img_url = f'https://game.gtimg.cn/images/yxzj/img201606/skin/hero-info/{hero_id}/{hero_id}-bigskin-{num}.jpg'

        img_data = requests.get(url=img_url, headers=headers).content
        with open('photo/' + img_title + '.jpg', 'wb') as f:
            print(f'=====================正在爬取{hero_name}的皮肤========================')
            f.write(img_data)

六、美图网站

代码语言:python
# coding=utf-8
"""
    作者:gaojs
    功能:
    新增功能:
    日期:2022/3/26 12:17
"""
import os.path
from time import sleep

import requests
import re


# Script: download every image linked from one vmgirls.com article into photo/.
dirname = 'photo/'
if not os.path.exists(dirname):
    os.mkdir(dirname)

url = 'https://www.vmgirls.com/17081.html'

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.74 Safari/537.36 Edg/99.0.1150.46'
}
res = requests.get(url, headers=headers)
print(res.status_code)

# Only the parenthesised group (the href value) is captured.
url_list = re.findall('<a href="(.*?)" alt=".*?" title=".*?">', res.text)
print(url_list)

for urls in url_list:
    # The last path segment of the link is used as the file name.
    name = urls.split('/')[-1]
    # Presumably the captured hrefs are scheme-relative ("//host/..."),
    # hence the explicit "https:" prefix -- verify against the page source.
    new_url = 'https:' + urls
    res_content = requests.get(url=new_url, headers=headers).content
    # Pause between downloads.
    sleep(2)
    # Save the image.
    with open('photo/' + name + '.jpeg', mode='wb') as f:
        f.write(res_content)

七、表情包

代码语言:python
# coding=utf-8
"""
    作者:gaojs
    功能:
    新增功能:
    日期:2022/3/25 17:35
"""

import requests
import re


def download_photo(name, url):
    """
    Download one image and save it as ``photo/<name>.<ext>``.

    :param name: file name (without extension) to save the image under
    :param url: address of the image; the text after its last ``.`` is used
        as the file extension
    :return: None
    """
    import os  # kept local so the function does not rely on a file-level import

    res = requests.get(url)
    print(res.status_code)
    suffix = url.split('.')[-1]
    # Make sure the output directory exists before writing into it.
    os.makedirs('photo', exist_ok=True)
    with open('photo/' + name + '.' + suffix, 'wb') as f:
        f.write(res.content)


"""
http://tva1.sinaimg.cn/large/6a2a7a61ly1gy5fd1pb7ij20iz0iz41l.jpg
http://tva1.sinaimg.cn/large/6a2a7a61ly1gy5fd3od4lg208w08wdvb.gif

https://www.fabiaoqing.com/bqb/lists/page/3.html
"""


def download_page(url):
    """
    Download every sticker image found on one list page.

    :param url: address of the list page to scan
    :return: None
    """
    # NOTE(review): ``res1`` and ``temp`` were never defined in this function
    # (it raised NameError); the request and the head of the pattern are
    # reconstructed. Only the pattern's tail, from `` title=`` onward, comes
    # from the original text -- confirm the ``data-original`` part against
    # the live page markup.
    res1 = requests.get(url)
    temp = '<img[^>]*?data-original="(.*?)"[^>]*? title="(.*?)" alt="(.*?)" style="max-height:188;margin: 0 auto"/>'
    result1 = re.findall(temp, res1.text)
    print(result1)
    for img in result1:
        print(img)
        # Group 0 is passed as the URL and group 1 (the title) as the file name.
        download_photo(img[1], img[0])

# download_page('https://www.fabiaoqing.com/biaoqing/lists/page/3.html')


def download_all_page():
    """
    Run ``download_page`` on list pages 1 through 49 of fabiaoqing.com.

    :return: None
    """
    for page in range(1, 50):
        pages = 'https://www.fabiaoqing.com/biaoqing/lists/page/' + str(page) + '.html'
        download_page(pages)


download_all_page()

八、酷狗music

代码语言:python
# coding=utf-8
"""
    作者:gaojs
    功能:
    新增功能:
    日期:2022/4/8 12:59
"""
import os.path
import pprint
import re

import requests


# Script: download every track found on the Kugou ranking page into music/.
if not os.path.exists('music/'):
    os.mkdir('music/')
url = 'https://www.kugou.com/yy/html/rank.html'

headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.93 Safari/537.36'
}

rsp = requests.get(url, headers=headers)
# The ranking page embeds each track's hash and album id; they are paired
# by position.
hash_list = re.findall('"Hash":"(.*?)"', rsp.text)
album_list = re.findall('"album_id":(.*?),', rsp.text)
zip_list = zip(hash_list, album_list)

index_url = 'https://wwwapi.kugou.com/yy/index.php'
for hash1, album_id in zip_list:
    data = {
        'r': 'play/getdata',
        'hash': hash1,
        'dfid': '34dlds4MjPyk0XgC5n0MobxL',
        'appid': '1014',
        'mid': 'fbcb28bbcbd1758696a1eb4363b645d6',
        'platid': '4',
        'album_id': album_id,
        '_': '1649395118742'
    }
    rsp1 = requests.get(url=index_url, params=data, headers=headers)
    # Parse the JSON body once instead of once per field.
    song = rsp1.json()['data']
    audioname = song['audio_name']
    playurl = song['play_url']
    if not playurl:
        # No downloadable URL for this track; requests.get('') would raise.
        continue

    music_content = requests.get(url=playurl, headers=headers).content
    # The original path literal 'music\' had lost a backslash and was an
    # unterminated string; os.path.join builds the path portably.
    with open(os.path.join('music', audioname + '.mp3'), 'wb') as f:
        print(f'*************************正在爬取歌曲{audioname}中***********************')
        f.write(music_content)

0 人点赞