一、Program Design
This project is a data visualization system built on a Python crawler that collects and analyzes content from the Shuqi novel website. It mainly covers:
Main functional modules: novel data collection, novel popularity analysis, user comment analysis, novel keyword analysis, and so on.
Main technologies: the Python programming language, Flask, web crawling, Scrapy, MySQL, HTML, JavaScript, and ECharts.
Main algorithms: data analysis and statistical computation.
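To give a sense of how the Flask backend might feed aggregated data to the ECharts frontend, here is a minimal sketch. The route name, connection parameters, and query are illustrative assumptions rather than the project's actual code; the books table and its book_name/hot_count columns follow the insert statements shown later in this article.

# Minimal sketch: a Flask endpoint returning book popularity data as JSON
# for an ECharts bar chart. Route, credentials and query are assumptions.
from flask import Flask, jsonify
import pymysql

app = Flask(__name__)

@app.route('/api/book_hot')
def book_hot():
    # placeholder connection parameters
    db = pymysql.connect(host='localhost', user='root', password='***', database='shuqi')
    cursor = db.cursor()
    cursor.execute('SELECT book_name, hot_count FROM books ORDER BY hot_count DESC LIMIT 10')
    rows = cursor.fetchall()
    db.close()
    # ECharts typically consumes parallel lists for the x axis and the series data
    return jsonify({
        'names': [r[0] for r in rows],
        'hots': [r[1] for r in rows],
    })

if __name__ == '__main__':
    app.run(debug=True)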
二、Implementation Results
Book popularity (chart screenshot)
Other screenshots are omitted.
三、Core Code
1. Novel Spider
The collection module is built on the Scrapy framework. For the Shuqi site it parses the novel list pages, follows each novel's detail page, extracts the novel fields with XPath, writes the results into a MySQL database, and deduplicates the full set of novel URLs with Redis.
# Master spider: push the book links collected from the home page to Redis
# (SITE_URL and redis_client are project-level globals: the site root URL and a Redis wrapper)
import scrapy

class MasterSpider(scrapy.Spider):
    name = 'kc_master'
    redis_key = 'kc:start_urls'
    allowed_domains = ["shuqi.com"]
    start_urls = [
        "https://www.shuqi.com",
    ]

    def __init__(self, *args, **kwargs):
        super(MasterSpider, self).__init__(*args, **kwargs)

    def parse(self, response):
        # book covers, ranking list and category entries on the home page
        book_list = response.xpath('//ul[@class="coverrec"]/li')
        rank_list = response.xpath('//ul[@class="cp-ranks-list js-ranksList"]/li')
        cate_list = response.xpath('//div[@class="first"]')
        total_list = book_list + rank_list + cate_list
        for book in total_list:
            book_link = book.xpath('./a/@href')[0]
            if book_link:
                link = SITE_URL + book_link.extract()
                self.logger.debug('book cover: %s' % link)
                try:
                    # lpush the link onto the slave queue
                    self._filter_url(link)
                except Exception as e:
                    self.logger.error('slave error: %s' % e, exc_info=True)

    # URL deduplication: only push URLs that have not been seen before
    def _filter_url(self, url, key="kc_slave:start_urls"):
        is_exist = redis_client.set_member(url, key='url_set')
        if not bool(is_exist):
            redis_client.set_add(url, key="url_set")
            res = redis_client.lpush(key, url)
            self.logger.debug('slave url add %s' % res)
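The spider above calls a project-level redis_client helper (set_member, set_add, lpush) whose implementation is not shown here. Below is a minimal sketch of what such a wrapper could look like on top of the standard redis-py package; the method names mirror the calls above, while the rest is an assumption.

# Sketch of a redis_client helper for URL deduplication, built on redis-py.
# Method names follow the spider's calls; implementation details are assumptions.
import redis

class RedisClient(object):
    def __init__(self, host='localhost', port=6379, db=0):
        self.conn = redis.StrictRedis(host=host, port=port, db=db, decode_responses=True)

    def set_member(self, value, key='url_set'):
        # True if the URL is already in the dedup set
        return self.conn.sismember(key, value)

    def set_add(self, value, key='url_set'):
        # record the URL as seen
        return self.conn.sadd(key, value)

    def lpush(self, key, value):
        # push a URL onto the slave spider's start_urls list
        return self.conn.lpush(key, value)

redis_client = RedisClient()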
2. Writing Data to MySQL
After collection, the novel records and user comments are written into a MySQL database. The core code is shown below.
def insert_book(self, item, spider):
    try:
        values = (
            item['author_id'],
            item['book_id'],
            item['author_name'],
            item['book_name'],
            int(item['book_hot'])
        )
        sql = ('INSERT INTO books (author_id, book_id, author_name, book_name, hot_count) '
               'VALUES ("%s", "%s", "%s", "%s", "%d")'
               % (values[0], values[1], values[2], values[3], values[4]))
        self.cursor.execute(sql)
        self.db.commit()
        spider.logger.debug('insert book ok %s', item)
    except Exception as e:
        self.db.rollback()
        spider.logger.error('insert book error %s %s' % (item, e), exc_info=True)

def insert_comment(self, item, spider):
    try:
        values = (
            item['book_id'],
            item['username'],
            item['text'],
            item['comment_time'],
            int(item['up_count']),
            item['book_name']
        )
        sql = ('INSERT INTO comments (book_id, username, text, comment_time, up_count, book_name) '
               'VALUES ("%s", "%s", "%s", "%s", "%d", "%s");'
               % (values[0], values[1], values[2], values[3], values[4], values[5]))
        self.cursor.execute(sql)
        self.db.commit()
        spider.logger.debug('insert comment ok %s', item)
    except Exception as e:
        self.db.rollback()
        spider.logger.error('insert comment error %s %s' % (item, e), exc_info=True)
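Interpolating values directly into the SQL string, as above, breaks easily when a book title or comment contains quotes. A safer variant is to let the MySQL driver handle escaping by passing a parameterized query to cursor.execute; the sketch below keeps the same table layout and item fields but is not the project's original code.

# Sketch: the same insert using a parameterized query, so the driver
# escapes quotes in titles and comment text.
def insert_book(self, item, spider):
    sql = ('INSERT INTO books (author_id, book_id, author_name, book_name, hot_count) '
           'VALUES (%s, %s, %s, %s, %s)')
    values = (
        item['author_id'],
        item['book_id'],
        item['author_name'],
        item['book_name'],
        int(item['book_hot']),
    )
    try:
        self.cursor.execute(sql, values)
        self.db.commit()
        spider.logger.debug('insert book ok %s', item)
    except Exception as e:
        self.db.rollback()
        spider.logger.error('insert book error %s %s' % (item, e), exc_info=True)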
3. Simulated Requests
The snippet below shows the general pattern of simulating a browser request with custom cookies and headers and then parsing the result with BeautifulSoup; this particular example fetches the Weibo trending list and appends the entries to a text file.
import os
import requests
from bs4 import BeautifulSoup

# crawler request data: cookies
cookies = {
    'SINAGLOBAL': '6797875236621.702.1603159218040',
    'SUB': '_2AkMXbqMSf8NxqwJRmfkTzmnhboh1ygvEieKhMlLJJRMxHRl-yT9jqmg8tRB6PO6N_Rc_2FhPeZF2iThYO9DfkLUGpv4V',
    'SUBP': '0033WrSXqPxfM72-Ws9jqgMF55529P9D9Wh-nU-QNDs1Fu27p6nmwwiJ',
    '_s_tentry': 'www.baidu.com',
    'UOR': 'www.hfut.edu.cn,widget.weibo.com,www.baidu.com',
    'Apache': '7782025452543.054.1635925669528',
    'ULV': '1635925669554:15:1:1:7782025452543.054.1635925669528:1627316870256',
}
# crawler request data: headers
headers = {
    'Connection': 'keep-alive',
    'Cache-Control': 'max-age=0',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36 SLBrowser/7.0.0.6241 SLBChan/25',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'Sec-Fetch-Site': 'cross-site',
    'Sec-Fetch-Mode': 'navigate',
    'Sec-Fetch-User': '?1',
    'Sec-Fetch-Dest': 'document',
    'Accept-Language': 'zh-CN,zh;q=0.9',
}
params = (
    ('cate', 'realtimehot'),
)
# output file
fo = open("./微博热搜.txt", 'a', encoding="utf-8")
# fetch the page
response = requests.get('https://s.weibo.com/top/summary', headers=headers, params=params, cookies=cookies)
# parse the page
response.encoding = 'utf-8'
soup = BeautifulSoup(response.text, 'html.parser')
# CSS selector for the trending entries
content = "#pl_top_realtimehot > table > tbody > tr > td.td-02 > a"
# extract the text of each entry and write it out
a = soup.select(content)
for i in range(0, len(a)):
    a[i] = a[i].text
    fo.write(a[i] + '\n')
fo.close()
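The same requests + BeautifulSoup pattern can also be pointed at the Shuqi home page. The sketch below reuses the coverrec list structure that the Scrapy spider parses with XPath; the CSS selector, headers, and output file name are assumptions, and the page layout may of course differ in practice.

# Sketch: applying the requests + BeautifulSoup approach to the Shuqi home page.
# The CSS selector mirrors the spider's XPath '//ul[@class="coverrec"]/li';
# headers and the output file are assumptions.
import requests
from bs4 import BeautifulSoup

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36',
}

response = requests.get('https://www.shuqi.com', headers=headers)
response.encoding = 'utf-8'
soup = BeautifulSoup(response.text, 'html.parser')

# collect book links from the cover list on the home page
with open('./shuqi_books.txt', 'a', encoding='utf-8') as fo:
    for a in soup.select('ul.coverrec li a'):
        href = a.get('href', '')
        if href:
            fo.write(href + '\n')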