I'm new to web scraping; modeling this on code I wrote earlier, the script below crawls movie information from the 80s site. The address crawled is 80s.
Libraries used
Code language: python
import re                              # regular expressions
import urllib.request, urllib.error    # build requests and fetch page data
from bs4 import BeautifulSoup          # HTML parsing
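re and urllib ship with Python; BeautifulSoup comes from the third-party beautifulsoup4 package, so it has to be installed first, e.g. with pip install beautifulsoup4.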
Scraper code
Code language: python
# from api import test as t    # project-specific import from the original post;
#                              # unused here and shadowed by the loop variable t below
# Imports (re and urllib are standard library, BeautifulSoup is third-party)
import re                              # regular expressions
import urllib.request, urllib.error    # build requests and fetch page data
from bs4 import BeautifulSoup          # HTML parsing

baseurl = 'https://www.80s.tw/hot'
# Regular expressions describing the tags to extract
imglink = re.compile(r'<img alt="(.*?)" class="p" id="(.*?)" src="(.*?)"/>')
titlelink = re.compile(r'<a href="(.*?)" title="(.*?)">')
findlink = re.compile(r'<a href="(.*?)">')

# 1. Crawl the page
def getData():
    # 2. Parse the data
    img = []       # poster image URLs
    src = []       # detail-page URLs
    title = []     # movie titles
    fens = []      # scores
    contents = []  # plot synopses
    html = askURL(baseurl)
    bs = BeautifulSoup(html, "html.parser")
    for item in bs.find_all('div', class_="lpelmt2 me2li"):
        item = str(item)
        titlel = re.findall(titlelink, item)
        for t in titlel:
            title.append(t[1])
            print(t[1])
            tsrc = "https://www.80s.tw" + t[0]   # turn the relative link into an absolute URL
            fen, content = getContentAndFen(tsrc)
            # fen, content = "6", "2"            # stub values for testing without extra requests
            fens.append(fen)
            contents.append(content)
            src.append(tsrc)
            print(fen, content)
        imgl = re.findall(imglink, item)
        for i in imgl:
            img.append("https:" + i[2])          # the src attribute is protocol-relative
    return title, img, src, fens, contents
# Fetch the HTML content of a single URL
def askURL(url):
    # Headers copied from a real browser session so the site treats the
    # request like a normal visit (User-Agent plus session cookies)
    head = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.190 Safari/537.36",
        "Cookie": "BAIDU_SSP_lcr=https://www.baidu.com/link?url=HMnQR6d-rPO0YlyHtrIM7E4dn4YUvW6Vm1bNsMLt4WO&wd=&eqid=e3e4166c0000b93600000003603caae8; Hm_lvt_caa88905362b7957005130b28e579d36=1614588653; _ga=GA1.2.617854400.1614588655; _gid=GA1.2.1808945187.1614588655; beitouviews_3758=OUHKI5ksCimBxsKCLklg%2BlwvUZh1FuJ6Vyi9m6XmS6eaAV9W6jgPS14FvCyFS4GHUf3YfgIhBBj5A%2FQLXbdsgSrgYpyHGtzo%2BLBHH0vHJdqh8jvMZEDRH%2FSbbFZITKsr5ErvsUY2Ao%2B5ID8ZFZIeOtAU%2F%2F6wFTelIC3oCspNs%2BbSHJcV2GtqrjikD4mrMGEkdsd3tL0z9v6mHtZh8cPS48AvWQtlpbvQi%2F6jyNUEP1ziCm9fHUmufiDHQEPZNMx0LXzlQATlHuRiScjiXziIgn9w%2BXqCyODFwuwkhDsdEmE1W%2FpFNiIfS9FE1Om0jr22Ig5Ybaavihtfb4NPt89qtQ%3D%3D; 3758_2470_111.36.138.122=1; richviews_3760=tNiZFpEMqXWe%2BFIoHRJd6y6X7RfaTIM3payNSGO2qHjxpAF9DWNOhKKdRJppp4O4V5EHhtbdcrsdgMHtJ04HLqx%2B94djknSuo1i%2B4mFZgv1hOId%2FB49VuDfByAxn5GkjahAWEq3XZww2iosVDdJQtudDjU5V%2BZH17hqG%2FQQB0XHUTOpmaLSMwQB8uoBynw%2F3xAd0ZnPNenng5MOlP2jZBh4%2Fnyan4yKv1zE33NWayTbIyXKnk1NVN1xaiKlRWO6r2Xo9b71Uk97wu9TAG9qJ54szIm90ke%2BDsPoBO1M3ZjeLBgPwN%2F9djQV6daKpCeJjPJqkY2tzbrxnKvddMmFJ1Q%3D%3D; 3760_2444_111.36.138.122=1; Hm_lpvt_caa88905362b7957005130b28e579d36=1614588658"
    }
    req = urllib.request.Request(url=url, headers=head)
    html = ""
    try:
        response = urllib.request.urlopen(req)
        html = response.read()   # bytes; BeautifulSoup accepts bytes directly
    except Exception as result:
        print(result)
    return html
def getContentAndFen(url):
    # Patterns for the synopsis ("剧情介绍:") and the score on a detail page
    contentlink = re.compile(r'<span class="font_888">剧情介绍:</span>(.*?)<', re.S)
    fenlink = re.compile(r'<span class="score .*?"></span>(.*?)</span>', re.S)
    html = askURL(str(url))
    f = ""   # score
    c = ""   # synopsis
    bs = BeautifulSoup(html, "html.parser")
    for item in bs.find_all('div', class_="info"):
        item = str(item)
        content = re.findall(contentlink, item)
        fen = re.findall(fenlink, item)
        if len(fen) > 0:
            f = fen[0]
        if len(content) > 0:
            c = content[0]
    return f, c
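To try the scraper end to end, a minimal entry point can be appended to the script. The __main__ guard below is my own sketch, not part of the original code, and it only produces output if 80s.tw is reachable and the page markup still matches the patterns above.

Code language: python
if __name__ == "__main__":
    titles, imgs, srcs, fens, contents = getData()
    # Pair each title with its detail-page URL and score for a quick sanity check
    for row in zip(titles, srcs, fens):
        print(row)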