1. 用到的技术
爬虫 ——> xpath
数据可视化 ——> matplotlib
2. 爬虫
1. 正常爬取
代码语言:python
# -*- coding:UTF-8 -*-
import requests
from lxml import etree
import urllib
"""
正常爬取
爬取17173游戏排行榜前1500名
目的网站:
http://top.17173.com/list-0-0-0-0-0-0-0-0-0-0-1.html
"""
def parse_html(url):
    """Download ``url`` and return the response body as text.

    Despite its name this function does no parsing; it only performs the
    HTTP GET (with a desktop-browser User-Agent header) and hands back the
    decoded body for the caller to parse.

    Args:
        url: Absolute URL of the page to download.

    Returns:
        The response body as a string (``resp.text``).
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36"
    }
    # requests has no default timeout; without one a stalled connection
    # would block the whole crawl indefinitely.
    resp = requests.get(url, headers=headers, timeout=30)
    return resp.text
"""
解析主页面
获取主页面需要信息(排名,子页面链接,游戏名称,票数)
"""
def fetch(home_url):
    """Scrape one ranking list page and process every game row on it.

    For each ``<li>`` row the rank, detail-page link, game name and vote
    count are extracted and passed on to ``fetch_child``, which scrapes the
    detail page and stores the record.

    Args:
        home_url: URL of one ranking list page.
    """
    home_page = etree.HTML(parse_html(home_url))
    lis = home_page.xpath('//div[@class="main-c1"]/div/div[2]/div/div[2]/ul/li')
    for li in lis:
        # Skip the advertisement slot. xpath() returns a list of attribute
        # strings, so the previous comparison against the integer 4 could
        # never be true and the skip never fired.
        # NOTE(review): confirm that data-index="4" really is the ad row.
        if "4" in li.xpath('./@data-index'):
            continue
        # Fields taken from the list page, e.g. rank "1",
        # name "暗黑破坏神4", votes "93303".
        rank = li.xpath('./div/div[1]/em/text()')[0]
        # The href is protocol-relative, so the scheme is prepended.
        child_url = "https:" + li.xpath('./div/div[2]/div/a/@href')[0]
        name = li.xpath('./div/div[2]/div/a/text()')[0]
        votes = li.xpath('./div/div[3]/text()')[0].strip()
        fetch_child(child_url, rank, name, votes)
"""
解析子页面
获取子页面需要信息(标签,类型,语言,开发商,注册,运营商)
"""
def fetch_child(child_url, rank, name, votes):
    """Scrape a game's detail page and append the full record to the CSV.

    Every detail field starts as the placeholder "暂无" and is only
    replaced when the corresponding element is found, so a missing or
    redirected detail page still produces a complete row.

    Args:
        child_url: URL of the game's detail page.
        rank: Rank text taken from the list page.
        name: Game name taken from the list page.
        votes: Vote count text taken from the list page.
    """
    # Imported explicitly: a bare ``import urllib`` does not guarantee that
    # the ``urllib.parse`` submodule has been loaded.
    from urllib.parse import unquote

    child_page = etree.HTML(parse_html(child_url))
    info = '//ul[@class="list-mater-info"]'

    game_label = "暂无"
    game_type = "暂无"
    game_language = "暂无"
    game_developer = "暂无"
    game_registered = "暂无"
    game_operator = "暂无"

    # An absent info list means the detail page does not exist and the
    # request ended up on some other page; keep the placeholders then.
    if len(child_page.xpath(info + '/li')) != 0:
        # Tags joined with "|", e.g. "PK|虚幻引擎|TPS|射击".
        game_label = "|".join(child_page.xpath('//div[@class="box-mater-cate"]/a/text()'))

        # Game type, e.g. "第三人称射击"; may be absent.
        type_texts = child_page.xpath(info + '/li[1]/a/text()')
        if type_texts:
            game_type = type_texts[0]

        # Languages are concatenated without a separator,
        # e.g. "简体中文英语葡萄牙语土耳其语".
        game_language = "".join(child_page.xpath(info + '/li[2]/div/span/text()'))

        # Developer, e.g. "PUBG Corporation"; may be absent.
        developer_texts = child_page.xpath(info + '/li[3]/span[2]/text()')
        if developer_texts:
            game_developer = developer_texts[0]

        # Registration: either plain text in a second <span>, or a link
        # whose last "=" separated URL component holds the percent-encoded
        # value.
        if len(child_page.xpath(info + '/li[4]/span')) == 2:
            registered_texts = child_page.xpath(info + '/li[4]/span[2]/text()')
            if registered_texts:
                game_registered = registered_texts[0]
        else:
            hrefs = child_page.xpath(info + '/li[4]/a/@href')
            if hrefs:
                game_registered = unquote(hrefs[0].rsplit("=")[-1])

        # Operator, e.g. "PUBG Corporation(中国)"; may be absent.
        operator_texts = child_page.xpath(info + '/li[5]//span[2]/text()')
        if operator_texts:
            game_operator = operator_texts[0]

    msg = [rank, name, votes, game_label, game_type, game_language, game_developer, game_registered, game_operator]
    writer(msg)
"""
信息写入文件
"""
def writer(msg):
    """Append one game record as a CSV row to ``gameRank.csv``.

    The row is produced by the ``csv`` module so that fields containing
    commas or quotes are quoted instead of silently shifting columns.
    Progress is reported on stdout before and after the write.

    Args:
        msg: Sequence of nine strings in column order: rank, name, votes,
            label, type, language, developer, registered, operator.

    Raises:
        UnicodeEncodeError: If a field cannot be encoded as GBK.
    """
    import csv

    with open("gameRank.csv", mode="a", encoding="GBK", newline="") as f:
        print(msg[1], "开始")
        # lineterminator="\n" keeps the one-line-per-record, LF-terminated
        # layout of the file.
        csv.writer(f, lineterminator="\n").writerow(msg[:9])
        print(msg[1], "over!")
if __name__ == '__main__':
    # Crawl list pages 1 through 70; the page number is the last field of
    # the list URL.
    for i in range(1, 71):
        url = f"https://top.17173.com/list-0-0-0-0-0-0-0-0-0-0-{i}.html"
        fetch(url)
2. 异步协程爬取
代码语言:python
from lxml import etree
import urllib
import aiohttp
import aiofiles
import asyncio
"""
使用异步协程进行爬取
爬取17173游戏排行榜前1500名
目的网站:
http://top.17173.com/list-0-0-0-0-0-0-0-0-0-0-1.html
"""
async def parse_html(url):
    """Download ``url`` asynchronously and return the response body as text.

    Despite its name this coroutine does no parsing; it only performs the
    HTTP GET with a desktop-browser User-Agent header. A new
    ``aiohttp.ClientSession`` is opened and closed for every call.

    Args:
        url: Absolute URL of the page to download.

    Returns:
        The decoded response body.
    """
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.120 Safari/537.36',
    }
    async with aiohttp.ClientSession() as session:
        # Author's note on proxies: as with requests, a proxy is passed as
        # an extra argument to get(), here named ``proxy``. Its value is a
        # string, and only the http proxy scheme is supported; https
        # raises an error.
        async with session.get(url, headers=headers) as resp:
            return await resp.text()
"""
函数目标 : 获取需要的信息
"""
async def fetch(child_url, rank, name, votes):
    """Scrape a game's detail page and append the full record to the CSV.

    Every detail field starts as the placeholder "暂无" and is only
    replaced when the corresponding element is found. The assembled record
    is printed and then appended to ``gameRank.csv`` as one CSV row.

    Args:
        child_url: URL of the game's detail page.
        rank: Rank text taken from the list page.
        name: Game name taken from the list page.
        votes: Vote count text taken from the list page.
    """
    import csv
    import io
    # Imported explicitly: a bare ``import urllib`` does not guarantee that
    # the ``urllib.parse`` submodule has been loaded.
    from urllib.parse import unquote

    child_page = etree.HTML(await parse_html(child_url))
    info = '//ul[@class="list-mater-info"]'

    game_label = "暂无"
    game_type = "暂无"
    game_language = "暂无"
    game_developer = "暂无"
    game_registered = "暂无"
    game_operator = "暂无"

    # An absent info list means the detail page does not exist and the
    # request ended up on some other page; keep the placeholders then.
    if len(child_page.xpath(info + '/li')) != 0:
        # Tags joined with "|", e.g. "PK|虚幻引擎|TPS|射击".
        game_label = "|".join(child_page.xpath('//div[@class="box-mater-cate"]/a/text()'))

        # Game type, e.g. "第三人称射击"; may be absent.
        type_texts = child_page.xpath(info + '/li[1]/a/text()')
        if type_texts:
            game_type = type_texts[0]

        # Languages are concatenated without a separator,
        # e.g. "简体中文英语葡萄牙语土耳其语".
        game_language = "".join(child_page.xpath(info + '/li[2]/div/span/text()'))

        # Developer, e.g. "PUBG Corporation"; may be absent.
        developer_texts = child_page.xpath(info + '/li[3]/span[2]/text()')
        if developer_texts:
            game_developer = developer_texts[0]

        # Registration: either plain text in a second <span>, or a link
        # whose last "=" separated URL component holds the percent-encoded
        # value.
        if len(child_page.xpath(info + '/li[4]/span')) == 2:
            registered_texts = child_page.xpath(info + '/li[4]/span[2]/text()')
            if registered_texts:
                game_registered = registered_texts[0]
        else:
            hrefs = child_page.xpath(info + '/li[4]/a/@href')
            if hrefs:
                game_registered = unquote(hrefs[0].rsplit("=")[-1])

        # Operator, e.g. "PUBG Corporation(中国)"; may be absent.
        operator_texts = child_page.xpath(info + '/li[5]//span[2]/text()')
        if operator_texts:
            game_operator = operator_texts[0]

    # Debug output of the assembled record.
    print(
        rank,
        name,
        votes,
        game_label,
        game_type,
        game_language,
        game_developer,
        game_registered,
        game_operator,
    )

    # Format the row with the csv module so that fields containing commas
    # or quotes are quoted, then write it in a single call.
    buffer = io.StringIO()
    csv.writer(buffer, lineterminator="\n").writerow(
        [rank, name, votes, game_label, game_type, game_language, game_developer, game_registered, game_operator]
    )
    async with aiofiles.open("gameRank.csv", mode="a", encoding="GBK", newline="") as f:
        print(name, "开始")
        await f.write(buffer.getvalue())
        print(name, "over!")
async def main():
    """Scrape one ranking list page and fetch all its detail pages concurrently.

    The list page URL is read from the module-level ``home_url`` variable,
    which the caller must set before running this coroutine. One ``fetch``
    task is scheduled per game row and all of them are awaited together.
    """
    tasks = []
    home_page = etree.HTML(await parse_html(home_url))
    lis = home_page.xpath('//div[@class="main-c1"]/div/div[2]/div/div[2]/ul/li')
    for li in lis:
        # Skip the advertisement slot. xpath() returns a list of attribute
        # strings, so the previous comparison against the integer 4 could
        # never be true and the skip never fired.
        # NOTE(review): confirm that data-index="4" really is the ad row.
        if "4" in li.xpath('./@data-index'):
            continue
        # Fields taken from the list page: rank, detail-page link, game
        # name and vote count.
        rank = li.xpath('./div/div[1]/em/text()')[0]
        # The href is protocol-relative, so the scheme is prepended.
        a = "https:" + li.xpath('./div/div[2]/div/a/@href')[0]
        name = li.xpath('./div/div[2]/div/a/text()')[0]
        votes = li.xpath('./div/div[3]/text()')[0].strip()
        tasks.append(asyncio.ensure_future(fetch(a, rank, name, votes)))
    # asyncio.wait() raises ValueError when given an empty collection,
    # which happens when the page contains no matching rows.
    if tasks:
        await asyncio.wait(tasks)
if __name__ == '__main__':
    # Create a dedicated event loop; asyncio.get_event_loop() is deprecated
    # when no loop is running.
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
    try:
        # List pages are numbered from 1 (see the target URL in the module
        # docstring), so crawl pages 1 through 10 rather than 0 through 9.
        for i in range(1, 11):
            # main() reads this module-level variable.
            home_url = f"https://top.17173.com/list-0-0-0-0-0-0-0-0-0-0-{i}.html"
            loop.run_until_complete(main())
    finally:
        # Close the loop even if one of the pages fails.
        loop.close()
3. 数据可视化
代码语言:python
import matplotlib.pyplot as plt
import matplotlib as mpl
import numpy as np
import pandas as pd
# Apply the ggplot style sheet first. A style sheet overwrites every
# rcParam it defines (ggplot defines font.size among others), so the
# individual overrides below must come after it to take effect.
plt.style.use('ggplot')
# Fonts able to render Chinese text, tried in order: KaiTi, then SimHei,
# then FangSong.
mpl.rcParams['font.sans-serif'] = ['KaiTi', 'SimHei', 'FangSong']
# Base font size.
mpl.rcParams['font.size'] = 12
# Render the minus sign correctly with the fonts above.
mpl.rcParams['axes.unicode_minus'] = False
# Load the scraped data. The plotting functions in this script access the
# columns "type", "developer", "popularity" and "game_name", so the file
# needs a header row providing those names.
csv_data = pd.read_csv("../reptile/gameRank.csv", encoding="GBK")
# 1 - Distribution of game types (bar chart)
def picture01():
    """Show a bar chart of the number of games per game type.

    Reads the module-level ``csv_data`` frame, counts the rows for each
    value of the "type" column and labels every bar with its count.
    """
    # One row per type with its row count in the "size" column.
    group_type = csv_data.groupby("type", as_index=False).size()
    type_names = group_type["type"].values
    type_counts = group_type["size"].values

    plt.bar(type_names, type_counts, facecolor='#ff9999', edgecolor='white')
    # Put each count centred just above its bar. Separate loop names are
    # used so the arrays being iterated are not rebound mid-loop.
    for type_name, count in zip(type_names, type_counts):
        plt.text(type_name, count, count, ha='center', va='bottom')
    # Rotate the x tick labels so that they read vertically.
    plt.xticks(rotation=270)
    plt.xlabel("type")
    plt.ylabel("number")
    plt.title("1-各游戏类型条形图")
    plt.show()
# 2 - Distribution of game types (pie chart)
def picture02():
    """Show a pie chart of each game type's share of all games.

    Reads the module-level ``csv_data`` frame and counts the rows for each
    value of the "type" column.
    """
    # Equal aspect ratio so the pie is drawn as a circle, not an ellipse.
    plt.axes(aspect='equal')
    # One row per type with its row count in the "size" column.
    group_type = csv_data.groupby("type", as_index=False).size()
    type_types = group_type["type"].values
    number_types = group_type["size"].values
    # Fraction of all games that belongs to each type.
    pro_types = number_types / number_types.sum()
    plt.pie(
        x=pro_types,                # wedge sizes
        labels=type_types,          # game type label for each wedge
        autopct='%.2f%%',           # percentage with two decimals
        pctdistance=0.8,            # distance of percentage text from centre
        labeldistance=1.05,         # distance of type labels from centre
        startangle=180,             # angle of the first wedge
        radius=1.1,                 # pie radius
        counterclock=False,         # draw the wedges clockwise
        wedgeprops={'linewidth': 1.5, 'edgecolor': 'green'},  # wedge borders
        textprops={'fontsize': 5, 'color': 'black'},           # label text
    )
    plt.title('2-各大游戏类型分布')
    plt.show()
# 3 - Top 20 developers by number of games (line chart)
def picture03():
    """Show a line chart of the 20 developers with the most games.

    Reads the module-level ``csv_data`` frame and counts the rows for each
    value of the "developer" column.
    """
    # One row per developer with its row count in the "size" column.
    group_dev = csv_data.groupby("developer", as_index=False).size()
    # Sort by count descending, drop the row labelled 1046, keep the top 20.
    # NOTE(review): 1046 is a hard-coded index label that is only valid for
    # one particular CSV file; presumably it is the "暂无" placeholder
    # developer. Confirm and filter by value instead.
    group_dev = group_dev.sort_values(by='size', axis=0, ascending=False).drop(index=1046, axis=0)[0:20]
    type_dev = group_dev["developer"].values
    number_dev = group_dev["size"].values

    # Rotate the x tick labels so that they read vertically.
    plt.xticks(rotation=270)
    plt.title("3-各游戏隶属厂商TOP20")
    plt.plot(type_dev, number_dev)
    plt.show()
# 4 - Top 10 game types by number of games (radar chart)
def picture04():
    """Show a radar chart of the ten game types with the most games.

    Reads the module-level ``csv_data`` frame and counts the rows for each
    value of the "type" column.
    """
    # One row per type with its row count, largest ten first.
    group_type = csv_data.groupby("type", as_index=False).size()
    group_type = group_type.sort_values(by='size', axis=0, ascending=False)[0:10]
    labels = group_type["type"].values
    values = group_type["size"].values

    # Radial limit: at least 500, raised when a count exceeds it so that
    # no data point falls outside the plotted range.
    radial_max = max(500, int(values.max()))

    # One evenly spaced angle per data point.
    angles = np.linspace(0, 2 * np.pi, len(values), endpoint=False)
    # Repeat the first point at the end so that the outline is closed.
    values = np.concatenate((values, [values[0]]))
    angles = np.concatenate((angles, [angles[0]]))
    labels = np.concatenate((labels, [labels[0]]))

    fig = plt.figure()
    # Polar axes turn the line plot into a radar chart.
    ax = fig.add_subplot(111, polar=True)
    ax.plot(angles, values, 'o-', linewidth=2)
    ax.fill(angles, values, alpha=0.25)
    # Label every data point's angle (converted to degrees) with its type.
    ax.set_thetagrids(angles * 180 / np.pi, labels)
    ax.set_ylim(0, radial_max)
    plt.title('4-最火游戏类型top10')
    ax.grid(True)
    plt.show()
# 5 - Top 20 games by popularity (horizontal bar chart)
def picture05():
    """Show a horizontal bar chart of the 20 games with the highest popularity.

    Reads the module-level ``csv_data`` frame and uses its "popularity" and
    "game_name" columns.
    """
    data = csv_data.sort_values(by='popularity', axis=0, ascending=False)[0:20]
    # Reversed so that the highest value gets the largest y position.
    y = data["game_name"].values[::-1]
    x = data["popularity"].values[::-1]

    fig, ax = plt.subplots()
    b = ax.barh(range(len(y)), x, color='#ff9999')
    # Write each bar's value to the right of the bar, vertically centred.
    for rect in b:
        w = rect.get_width()
        ax.text(w, rect.get_y() + rect.get_height() / 2, '%d' % int(w), ha='left', va='center')
    # Label the y ticks with the game names.
    ax.set_yticks(range(len(y)))
    ax.set_yticklabels(y)
    plt.title("5-最火游戏top20")
    plt.show()
# 6 - Top 20 developers by number of games (horizontal bar chart)
def picture06():
    """Show a horizontal bar chart of the 20 developers with the most games.

    Reads the module-level ``csv_data`` frame and counts the rows for each
    value of the "developer" column.
    """
    # One row per developer with its row count in the "size" column.
    group_dev = csv_data.groupby("developer", as_index=False).size()
    # Sort by count descending, drop the row labelled 1046, keep the top 20.
    # NOTE(review): 1046 is a hard-coded index label that is only valid for
    # one particular CSV file; presumably it is the "暂无" placeholder
    # developer. Confirm and filter by value instead.
    group_dev = group_dev.sort_values(by='size', axis=0, ascending=False).drop(index=1046, axis=0)[0:20]
    # Reversed so that the highest value gets the largest y position.
    y = group_dev["developer"].values[::-1]
    x = group_dev["size"].values[::-1]

    fig, ax = plt.subplots()
    b = ax.barh(range(len(y)), x, color='#ff9999')
    # Write each bar's value to the right of the bar, vertically centred.
    for rect in b:
        w = rect.get_width()
        ax.text(w, rect.get_y() + rect.get_height() / 2, '%d' % int(w), ha='left', va='center')
    # Label the y ticks with the developer names.
    ax.set_yticks(range(len(y)))
    ax.set_yticklabels(y)
    plt.title("6-公司游戏版权数量TOP20")
    plt.show()
if __name__ == '__main__':
    # Uncomment the chart to display; only one is enabled at a time.
    picture01()
    # picture02()
    # picture03()
    # picture04()
    # picture05()
    # picture06()