Scrape a specified nhentai gallery.
Just enter the gallery's URL followed by a single space. Mind the file layout; see the previous post for how the toolkits/ip_proxies and toolkits/down_load packages are built.
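If you don't have those two helper packages at hand, here is a minimal sketch of the interfaces the script below relies on. The signatures are inferred from the call sites; the proxy pool, file-extension handling, and progress output are placeholder assumptions of mine, not the implementations from the previous post.

# toolkits/ip_proxies.py -- hypothetical stand-in
import random

PROXY_POOL = ['http://127.0.0.1:1080']  # placeholder; the real helper keeps a pool of live proxies

def get_proxies():
    # Return a dict in the format the requests library expects for its proxies= argument.
    p = random.choice(PROXY_POOL)
    return {'http': p, 'https': p}

# toolkits/down_load.py -- hypothetical stand-in
import requests
from toolkits.ip_proxies import get_proxies

def down_load(img_url, save_path, current, total, headers):
    # Fetch one image through a proxy and write it to disk, echoing simple progress.
    r = requests.get(img_url, headers=headers, proxies=get_proxies(), timeout=30)
    ext = img_url.rsplit('.', 1)[-1]  # keep the extension of the source image
    with open(save_path + '.' + ext, 'wb') as f:
        f.write(r.content)
    print('downloaded {}/{}'.format(current, total))

With those two modules in place, the full crawler reads: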
from bs4 import BeautifulSoup
from toolkits.ip_proxies import get_proxies
from toolkits.down_load import down_load
from fake_useragent import UserAgent
import requests
import os
import pymongo

# MongoDB keeps one record per gallery: index, title, URL, page count, and save path.
client = pymongo.MongoClient('localhost', 27017)
nhentai = client['nhentai']
imgs = nhentai['imgs']

rootpath = '/home/x/BORBER/File/Tmp/nhentai'  # change this to your own path
seq = '/'
ua = UserAgent()

# Randomized browser User-Agent plus a site Referer for every request.
headers = {
    'User-Agent': ua.random,
    'Referer': 'https://nhentai.net'
}

def mkdir(path):
    # Create the target directory if it does not exist yet.
    folder = os.path.exists(path)
    if not folder:
        os.makedirs(path)

def get_title_pages(url, index=1):
    # Fetch the gallery page, read its title and page count, and record them in MongoDB.
    r = requests.get(url, headers=headers, proxies=get_proxies())
    soup = BeautifulSoup(r.text, 'lxml')
    title = soup.select('#info > h2')
    path = rootpath + seq + title[0].get_text()
    mkdir(path)
    max_pages = soup.select('#info > div:nth-child(4)')  # the "NN pages" line
    data = {
        'index': index,
        'title': title[0].get_text(),
        'url': url,
        'max': int(max_pages[0].get_text().split()[0]),
        'path': path
    }
    imgs.insert_one(data)

def get_img(url):
    # Page N of a gallery lives at <gallery-url>N; the real image URL sits in #image-container.
    r = requests.get(url, headers=headers, proxies=get_proxies())
    soup = BeautifulSoup(r.text, 'lxml')
    return soup.select('#image-container > a > img')[0].get('src')

def download_all():
    # Walk every recorded gallery and pull its pages one by one.
    for item in imgs.find():
        for i in range(1, item['max'] + 1):
            down_load(get_img(item['url'] + str(i)), item['path'] + seq[0] + str(i), i, item['max'] + 1, headers)

if __name__ == '__main__':
    nhentai.drop_collection('imgs')  # start from a clean collection on every run
    print('Enter the specific link to the pictures:')
    ur = input()[:-1]  # drop the trailing space the user was asked to type
    get_title_pages(ur)
    download_all()
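
A sample session, with a made-up gallery ID (note the trailing slash, which lets the script append page numbers directly, and the trailing space, which input()[:-1] strips):

Enter the specific link to the pictures:
https://nhentai.net/g/123456/ 

Because the imgs collection is dropped at startup, each run handles exactly one gallery; the record left in MongoDB only drives the download loop that follows.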