1.分析获取小说内容的URL
这里以某度小说网站举例说明,其余网站均可类似处理,打开小说网站的首页(网页链接见评论区),打开网页,输入并查询我们想要下载的小说,点击相应章节就能跳转到对应内容中,此时要检查页面源代码是否包含所有的小说内容数据。
2.查看网页源代码
- 鼠标停留在页面,右键点击“查看网页源代码”,通过“查找”发现小说数据并不是包含在源代码中。此时就需要抓包进行分析查看,按下"F12",切换到网络——Fetch/XHR——刷新页面就得到了这三个动态请求。打开getCatalog(获取目录),点击预览,就能看到一大堆名称了,这就是我们要拿的所有章节名称以及它所对应的id了,但是内容还没有出现。
- 切换到下一个getChapterContent(获取章节内容),打开果然发现了文本!这就是我们要的内容。
3.捋清思路,手撕代码
细心的网友会发现,请求地址URL中出现了一堆“%22”的乱码,其实它就是URL编码后的ASCII双引号字符,参阅百度百科的词条内容:URL编码。
- 首先要获取这个小说所有章节的id
def get_child_url(book_id):
    """Fetch a book's catalog and return the list of its chapter ids.

    Sends a GET request to the site's ``getCatalog`` endpoint and extracts
    the ``cid`` of every chapter listed under ``data.novel.items``.

    Relies on the module-level ``main_url`` (the site's API base URL).

    :param book_id: id of the book whose catalog should be fetched
    :return: list of chapter-id strings, in catalog order
    """
    # NOTE: the original text dropped the '+' operators on this line,
    # which is a SyntaxError -- Python requires explicit concatenation.
    url = main_url + '/getCatalog?data={"book_id":"' + book_id + '"}'
    header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36',
        'Referer': url
    }
    resp = requests.get(url, headers=header)
    try:
        result = resp.json()
    finally:
        # Release the connection even if the JSON decode raises.
        resp.close()
    items = result['data']['novel']['items']
    # One catalog entry per chapter; 'cid' is the chapter identifier.
    return [item['cid'] for item in items]
- 根据上一步章节的id去下载对应的内容
def download_one_page(book_id, title_id):
    """Download one chapter's text and save it as ``小说/<title>.txt``.

    ``title_id`` is a chapter cid obtained from ``get_child_url``; the
    endpoint expects it combined with the book id as ``"book_id|cid"``.
    Relies on the module-level ``main_url``.

    :param book_id: id of the book the chapter belongs to
    :param title_id: cid of the chapter to download
    """
    import os  # function-scope import keeps the snippet self-contained

    # Query payload expected by the getChapterContent endpoint.
    data = {
        "book_id": book_id,
        "cid": f"{book_id}|{title_id}",
        "need_bookinfo": 1
    }
    header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36'
    }
    # '+' operators were missing in the original (SyntaxError as written).
    url = main_url + '/getChapterContent?data=' + json.dumps(data)
    resp = requests.get(url, headers=header)
    try:
        result = resp.json()
    finally:
        # Release the connection even if the JSON decode raises.
        resp.close()
    title = result['data']['novel']['chapter_title']
    content = result['data']['novel']['content']
    # Create the output directory up front; open() would otherwise raise
    # FileNotFoundError the first time the script runs.
    os.makedirs('小说', exist_ok=True)
    with open(f'小说/{title}.txt', 'w', encoding='utf-8') as file:
        file.write(content)
- 完整代码如下
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Download every chapter of a novel from the site's JSON API, in parallel.

Flow: fetch the catalog (list of chapter ids), then hand one download
task per chapter to a thread pool.  Chapters are written to 小说/<title>.txt.
"""
import json
import os

import requests
from concurrent.futures import ThreadPoolExecutor


def get_child_url(book_id):
    """Fetch the book's catalog and return the list of chapter ids (cids)."""
    # NOTE: the '+' operators were missing in the original (SyntaxError).
    url = main_url + '/getCatalog?data={"book_id":"' + book_id + '"}'
    header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36',
        'Referer': url
    }
    resp = requests.get(url, headers=header)
    try:
        result = resp.json()
    finally:
        resp.close()  # release the connection even if .json() raises
    items = result['data']['novel']['items']
    return [item['cid'] for item in items]


def download_one_page(book_id, title_id):
    """Download one chapter and save it as 小说/<chapter_title>.txt."""
    # Query payload expected by the getChapterContent endpoint.
    data = {
        "book_id": book_id,
        "cid": f"{book_id}|{title_id}",
        "need_bookinfo": 1
    }
    header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36'
    }
    # '+' operators were missing in the original (SyntaxError as written).
    url = main_url + '/getChapterContent?data=' + json.dumps(data)
    resp = requests.get(url, headers=header)
    try:
        result = resp.json()
    finally:
        resp.close()
    title = result['data']['novel']['chapter_title']
    content = result['data']['novel']['content']
    with open(f'小说/{title}.txt', 'w', encoding='utf-8') as file:
        file.write(content)


if __name__ == '__main__':
    book_id = '4305593636'
    # The original had `main_url=网址` (an undefined name). Fill in the
    # site's API base URL here -- e.g. the one linked in the article's
    # comments section.
    main_url = 'https://dushu.baidu.com/api/pc'  # TODO: replace with the real base URL
    # Make sure the output directory exists before the workers write to it.
    os.makedirs('小说', exist_ok=True)
    cids = get_child_url(book_id)
    # Fan the per-chapter downloads out over a pool of worker threads
    # (I/O-bound work, so threads overlap the network waits).
    with ThreadPoolExecutor(50) as t:
        for cid in cids:
            t.submit(download_one_page, book_id, cid)
    print('全部下载完毕!')