一、壁纸网站
代码语言:python
# coding=utf-8
"""
Author: gaojs
Purpose: download 1920x1080 wallpapers (pages 2-10) from netbian.com into photo/.
Date: 2022/3/25 19:35
"""
import os.path

import requests
import parsel


def get_address():
    """Crawl each wallpaper list page, follow every non-ad detail page and
    save the full-size image as photo/<title>.jpg.

    Returns: None (pure side effects: directory creation, file writes).
    """
    dirname = 'photo/'
    if not os.path.exists(dirname):
        os.mkdir(dirname)
    for page in range(2, 11):
        print(f'=====================正在爬取第{page}页内容========================')
        url = f'http://www.netbian.com/1920x1080/index_{page}.htm'
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.74 Safari/537.36 Edg/99.0.1150.46'
        }
        res = requests.get(url, headers=headers)
        # Site serves GBK; let requests guess so Chinese titles decode correctly.
        res.encoding = res.apparent_encoding
        selector = parsel.Selector(res.text)
        # Example tile:
        # <img src="http://img.netbian.com/file/2022/0326/small003835uYAUe1648226315.jpg" alt="...">
        url_lis = selector.css('.list li')
        for lis in url_lis:
            title = lis.css('b::text').get()
            # Ad tiles have no <b> title — skip them.
            if title:
                # NOTE(review): '+' operators were lost in extraction; restored.
                list_url = 'http://www.netbian.com' + lis.css('a::attr(href)').get()
                res1 = requests.get(list_url, headers=headers)
                selector1 = parsel.Selector(res1.text)
                img_url = selector1.css('.pic img::attr(src)').get()
                # Save the picture bytes.
                img_content = requests.get(url=img_url).content
                with open('photo/' + title + '.jpg', 'wb') as f:
                    f.write(img_content)
                print(title, img_url)


get_address()
二、彼岸壁纸
代码语言:python
# coding=utf-8
"""
Author: gaojs
Purpose: scrape netbian.com front page, follow each detail page and save the
wallpapers into photo/.
Date: 2022/4/2 14:59
"""
import os.path
import re

import requests

if not os.path.exists('photo/'):
    os.mkdir('photo/')

url = 'http://www.netbian.com'
# Pagination / detail page examples:
# http://www.netbian.com/index_2.htm
# http://www.netbian.com/desk/26344-1920x1080.htm
headers = {
    'Host': 'www.netbian.com',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36',
    'Upgrade-Insecure-Requests': '1',
    'Cookie': '__yjs_duid=1_4535c561a20964f1ade88776981a0f411648389371877; Hm_lvt_0f461eb489c245a31c209d36e41fcc0f=1648389374,1648986956; Hm_lpvt_0f461eb489c245a31c209d36e41fcc0f=1648986956'
}
rsp = requests.get(url, headers=headers)
rsp.encoding = rsp.apparent_encoding
# Capture (detail-page href, title) pairs from the list markup.
url_list = re.findall('<a href="(.*?)"title="(.*?)" target="_blank"><img src=".*?" alt=".*?" />', rsp.text)
for index in url_list:
    url_lis = index[0]
    title = index[1]
    # NOTE(review): '+' was lost in extraction; restored.
    new_url = url + url_lis
    rsp1 = requests.get(new_url)
    rsp1.encoding = rsp1.apparent_encoding
    # On the detail page, capture (image src, image alt).
    img_list = re.findall('<a href=".*?" target="_blank"><img src="(.*?)" alt="(.*?)" title=".*?"></a>', rsp1.text)
    for img in img_list:
        img_url = img[0]
        img_title = img[1]
        content_data = requests.get(img_url).content
        with open('photo/' + img_title + '.jpg', 'wb') as f:
            f.write(content_data)
    print(f'***************正在爬取{title}中****************')
三、某手视频
代码语言:python
# coding=utf-8
"""
Author: gaojs
Purpose: page through a Kuaishou user's video feed via the graphql endpoint and
download each video into video/.
Date: 2022/4/15 20:13
"""
import json
import os.path
import pprint

import requests


def get_page(pcursor):
    """Fetch one feed page for the hard-coded user, save its videos, then
    recurse with the returned cursor until the API reports "no_more".

    pcursor: paging cursor string; "" requests the first page.
    """
    path = 'video/'
    if not os.path.exists(path):
        os.mkdir(path)
    # Target profile: 'https://www.kuaishou.com/profile/3xhv7zhkfr3rqag'
    url = 'https://www.kuaishou.com/graphql'
    headers = {
        'content-type': 'application/json',
        'Cookie': 'kpf=PC_WEB; kpn=KUAISHOU_VISION; clientid=3; did=web_72314bf978cb158dd7034b2370d2ae70',
        'Host': 'www.kuaishou.com',
        'Origin': 'https://www.kuaishou.com',
        'Referer': 'https://www.kuaishou.com/short-video/3x6v3xmcjsd5cki?authorId=3xhv7zhkfr3rqag&streamSource=profile&area=profilexxnull',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36',
    }
    # NOTE(review): the backslashes of the '\n' escapes in this GraphQL query
    # were stripped during extraction (e.g. "resultn"); reconstructed here —
    # verify against the original post.
    data = {
        "operationName": "visionProfilePhotoList",
        "query": "query visionProfilePhotoList($pcursor: String, $userId: String, $page: String, $webPageArea: String) {\n visionProfilePhotoList(pcursor: $pcursor, userId: $userId, page: $page, webPageArea: $webPageArea) {\n result\n llsid\n webPageArea\n feeds {\n type\n author {\n id\n name\n following\n headerUrl\n headerUrls {\n cdn\n url\n __typename\n }\n __typename\n }\n tags {\n type\n name\n __typename\n }\n photo {\n id\n duration\n caption\n likeCount\n realLikeCount\n coverUrl\n coverUrls {\n cdn\n url\n __typename\n }\n photoUrls {\n cdn\n url\n __typename\n }\n photoUrl\n liked\n timestamp\n expTag\n animatedCoverUrl\n stereoType\n videoRatio\n profileUserTopPhoto\n __typename\n }\n canAddComment\n currentPcursor\n llsid\n status\n __typename\n }\n hostName\n pcursor\n __typename\n }\n}\n",
        "variables": {"userId": "3xhv7zhkfr3rqag", "pcursor": pcursor, "page": "detail", "webPageArea": "profilexxnull"}
    }
    rsp = requests.post(url=url, json=data, headers=headers)
    json_data = rsp.json()
    url_list = json_data['data']['visionProfilePhotoList']['feeds']
    pcursor = json_data['data']['visionProfilePhotoList']['pcursor']
    for key in url_list:
        # Video title (caption) — presumably may contain characters illegal in
        # filenames; TODO confirm and sanitize if needed.
        title = key['photo']['caption']
        # Direct video URL.
        new_url = key['photo']['photoUrl']
        content_data = requests.get(url=new_url).content
        with open(f'video/{title}.mp4', mode='wb') as f:
            f.write(content_data)
        print(f'=======================正在下载标题为 {title} 的快手短视频==========================')
    # Recurse until the API signals the last page.
    if pcursor != "no_more":
        get_page(pcursor)


get_page("")
四、拉钩数据
代码语言:python
# coding=utf-8
"""
Author: gaojs
Purpose: scrape 10 pages of lagou.com python job listings into a CSV plus one
job-description text file per posting.
Date: 2022/4/3 17:58
"""
import csv
import json
import os.path
import re

import requests

if not os.path.exists('info/'):
    os.makedirs('info/')

f = open('info/招聘.csv', encoding='utf-8', mode='a', newline='')
csv_writer = csv.DictWriter(f, fieldnames=[
    '职位名字',
    '公司名字',
    '工作城市',
    '学历要求',
    '经验要求',
    '薪资要求',
    '公司地址',
    '详情页'
])
# Write the header row once.
csv_writer.writeheader()

for page in range(1, 11):
    url = f'https://www.lagou.com/wn/jobs?pn={page}&fromSearch=true&kd=python'
    headers = {
        'Host': 'www.lagou.com',
        'Referer': 'https://www.lagou.com/utrack/trackMid.html?f=https://www.lagou.com/wn/jobs?pn=2&fromSearch=true&kd=python&t=1648984113&_ti=1',
        'Cookie': 'user_trace_token=20211122110451-60eec88a-fbaf-47fd-9a53-188f3632144b; Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1637550277; _ga=GA1.2.1219095688.1637550277; LGUID=20211122110452-94ffa347-2c46-4c2d-8429-b83e30e86693; RECOMMEND_TIP=true; __lg_stoken__=9ec31e7a3301bab4f215bd5f80c8af0ab0dc2b8ce81af654fe848cf33ad7c4f33d0748020b30281d56a28a756342ce5d42e6c218bcfd56dbf764c51686741cbaf14de987ef24; JSESSIONID=ABAAABAABEIABCIA45B6C458598FF70789BDFD5A4574786; WEBTJ-ID=20220403173842-17feeca7ea0402-090b1b6ee61841-a3e3164-3686400-17feeca7ea15f1; sensorsdata2015session={}; X_HTTP_TOKEN=1ca92d1d8ffe4ecb3114898461b10fa2c7054519c6; X_MIDDLE_TOKEN=3e27b9a5a69f9fa78d5d2fe99174c9c5; sensorsdata2015jssdkcross={"distinct_id":"9659966","$device_id":"17d459f2858540-02719bae0efae1-4343363-2073600-17d459f2859704","props":{"$latest_traffic_source_type":"直接流量","$latest_search_keyword":"未取到值_直接打开","$latest_referrer":"","$os":"Windows","$browser":"Chrome","$browser_version":"98.0.4758.102"},"first_id":"17d459f2858540-02719bae0efae1-4343363-2073600-17d459f2859704"}',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36'
    }
    rsp = requests.get(url=url, headers=headers)
    print(rsp.status_code)
    # The listing data is embedded as JSON in the Next.js bootstrap script tag.
    html_data = re.findall('<script id="__NEXT_DATA__" type="application/json">(.*?)</script>', rsp.text)[0]
    json_data = json.loads(html_data)
    result = json_data['props']['pageProps']['initData']['content']['positionResult']['result']
    for index in result:
        # Job description with HTML line breaks removed.
        job_index = index['positionDetail'].replace('<br />', '').replace('<br>', '')
        # NOTE(review): looks like a '/' is missing before the id in the
        # original URL pattern — confirm against the live site.
        href = f'https://www.lagou.com/wn/jobs{index["positionId"]}.html'
        dict1 = {
            '职位名字': index['positionName'],
            '公司名字': index['companyFullName'],
            '工作城市': index['city'],
            '学历要求': index['education'],
            '经验要求': index['workYear'],
            # BUG in original: reused index['workYear'] for salary; field name
            # 'salary' assumed from the lagou payload — TODO confirm.
            '薪资要求': index.get('salary', index['workYear']),
            '公司地址': index['positionAddress'],
            '详情页': href
        }
        csv_writer.writerow(dict1)
        title = index['positionName'] + index['companyFullName']
        # Strip characters that are illegal in filenames.
        new_title = re.sub(r'[/?:"<>|]', '', title)
        # Renamed from 'f' to avoid shadowing the module-level CSV handle.
        with open('info/' + new_title + '.txt', 'w', encoding='utf-8') as txt_file:
            txt_file.write(job_index)
        print(dict1)
五、王者荣耀英雄皮肤高清壁纸
代码语言:python
# coding=utf-8
"""
Author: gaojs
Purpose: download every Honor of Kings hero-skin wallpaper into photo/ using the
official hero list JSON and the predictable bigskin image URL pattern.
Date: 2022/4/2 13:05
"""
import os
import re

import requests

url = 'https://pvp.qq.com/web201605/js/herolist.json'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.74 Safari/537.36 Edg/99.0.1150.55'
}
rsp = requests.get(url, headers=headers)
print(rsp.status_code)

# The original never created photo/ — added so the open() below cannot fail.
if not os.path.exists('photo/'):
    os.mkdir('photo/')

for index in rsp.json():
    # Hero display name and numeric id.
    hero_name = index['cname']
    hero_id = index['ename']
    index_url = f'https://pvp.qq.com/web201605/herodetail/{hero_id}.shtml'
    rsp1 = requests.get(url=index_url, headers=headers)
    rsp1.encoding = rsp1.apparent_encoding  # auto-detect page encoding
    # data-imgname holds a '|'-separated list of skin names, each possibly
    # suffixed with '&<digits>'.
    title_list = re.findall('<ul class="pic-pf-list pic-pf-list3" data-imgname="(.*?)">', rsp1.text)[0]
    # NOTE(review): backslash lost in extraction — pattern reconstructed as
    # r'&\d+' to strip the numeric '&' suffixes; verify against original post.
    title_list = re.sub(r'&\d+', '', title_list).split('|')
    for num in range(1, len(title_list) + 1):
        img_url = f'https://game.gtimg.cn/images/yxzj/img201606/skin/hero-info/{hero_id}/{hero_id}-bigskin-{num}.jpg'
        img_title = title_list[num - 1]
        img_data = requests.get(url=img_url, headers=headers).content
        with open('photo/' + img_title + '.jpg', 'wb') as f:
            print(f'=====================正在爬取{hero_name}的皮肤========================')
            f.write(img_data)
六、美图网站
代码语言:python
# coding=utf-8
"""
Author: gaojs
Purpose: download every photo linked from one vmgirls.com gallery page into
photo/.
Date: 2022/3/26 12:17
"""
import os.path
from time import sleep

import re

import requests

dirname = 'photo/'
if not os.path.exists(dirname):
    os.mkdir(dirname)

url = 'https://www.vmgirls.com/17081.html'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.74 Safari/537.36 Edg/99.0.1150.46'
}
res = requests.get(url, headers=headers)
print(res.status_code)
# Capture only the href (the parenthesized group) of each photo anchor.
url_list = re.findall('<a href="(.*?)" alt=".*?" title=".*?">', res.text)
print(url_list)
for urls in url_list:
    # Use the last path segment as the file name.
    name = urls.split('/')[-1]
    # hrefs are protocol-relative ('//...'); restore the '+' lost in extraction.
    new_url = 'https:' + urls
    res_content = requests.get(url=new_url, headers=headers).content
    sleep(2)  # be polite between downloads
    with open('photo/' + name + '.jpeg', mode='wb') as f:
        f.write(res_content)
七、表情包
代码语言:python
# coding=utf-8
"""
Author: gaojs
Purpose: crawl meme images (jpg/gif) from fabiaoqing.com list pages 1-49 and
save them into photo/.
Date: 2022/3/25 17:35
"""
import re

import requests


def download_photo(name, url):
    """Download one image at *url* and save it as photo/<name>.<ext>, keeping
    the original extension (jpg or gif)."""
    res = requests.get(url)
    print(res.status_code)
    suffix = url.split('.')[-1]
    with open('photo/' + name + '.' + suffix, 'wb') as f:
        f.write(res.content)


"""
Sample asset / page URLs:
http://tva1.sinaimg.cn/large/6a2a7a61ly1gy5fd1pb7ij20iz0iz41l.jpg
http://tva1.sinaimg.cn/large/6a2a7a61ly1gy5fd3od4lg208w08wdvb.gif
https://www.fabiaoqing.com/bqb/lists/page/3.html
"""


def download_page(url):
    """Scrape one list page and download every (url, title, alt) image on it.

    NOTE(review): this function's opening lines were corrupted in extraction
    (the request and the start of the regex were fused into one garbled
    comment). The request and the regex prefix below are reconstructed from
    the surviving fragment — verify against the original post.
    """
    res1 = requests.get(url)
    temp = '<img class="ui image lazy" data-original="(.*?)" title="(.*?)" alt="(.*?)" style="max-height:188;margin: 0 auto"/>'
    result1 = re.findall(temp, res1.text)
    print(result1)
    for img in result1:
        print(img)
        # img = (image url, title, alt); save under the title.
        download_photo(img[1], img[0])


def download_all_page():
    """Walk list pages 1-49, downloading everything on each."""
    for page in range(1, 50):
        pages = 'https://www.fabiaoqing.com/biaoqing/lists/page/' + str(page) + '.html'
        download_page(pages)


download_all_page()
八、酷狗music
代码语言:python
# coding=utf-8
"""
Author: gaojs
Purpose: read the Kugou ranking page, then fetch each track's play URL from the
wwwapi endpoint and save the MP3s into music/.
Date: 2022/4/8 12:59
"""
import os.path
import pprint
import re

import requests

if not os.path.exists('music/'):
    os.mkdir('music/')

url = 'https://www.kugou.com/yy/html/rank.html'
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.93 Safari/537.36'
}
rsp = requests.get(url, headers=headers)
# Hash / album_id pairs are embedded in the page's inline JSON.
hash_list = re.findall('"Hash":"(.*?)"', rsp.text)
album_list = re.findall('"album_id":(.*?),', rsp.text)
zip_list = zip(hash_list, album_list)
for hash1, album_id in zip_list:
    index_url = 'https://wwwapi.kugou.com/yy/index.php'
    data = {
        'r': 'play/getdata',
        'hash': hash1,
        'dfid': '34dlds4MjPyk0XgC5n0MobxL',
        'appid': '1014',
        'mid': 'fbcb28bbcbd1758696a1eb4363b645d6',
        'platid': '4',
        'album_id': album_id,
        '_': '1649395118742'
    }
    rsp1 = requests.get(url=index_url, params=data, headers=headers)
    audioname = rsp1.json()['data']['audio_name']
    playurl = rsp1.json()['data']['play_url']
    music_content = requests.get(url=playurl, headers=headers).content
    # Original literal "'music\' audioname '.mp3'" was broken (stray backslash,
    # lost '+'); fixed to the forward-slash path matching mkdir('music/') above.
    # NOTE(review): audioname presumably may contain filename-illegal chars —
    # confirm and sanitize if needed.
    with open('music/' + audioname + '.mp3', 'wb') as f:
        print(f'*************************正在爬取歌曲{audioname}中***********************')
        f.write(music_content)