import requests
from bs4 import BeautifulSoup
# import pandas
def GetBlogByPage(pageNum):
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36"
    }
    targetUrl = "https://blog.csdn.net/CJB_King/article/list/{}?"
    response = requests.get(targetUrl.format(pageNum), headers=headers)
    response.encoding = 'utf-8'
    contentText = response.text
    soup = BeautifulSoup(contentText, "html.parser")
    getTargetInfo = []
    articles = soup.select('.article-item-box')
    for article in articles:
        info = {}
        info["title"] = article.a.text.strip()              # article title
        info["source"] = article.a['href'].strip()          # article URL
        info["sendTime"] = article.div.p.span.text.strip()  # publish time
        info["ReadNum"] = article.div.select('span')[1].text
        info["writeNum"] = article.div.select('span')[3].text
        getTargetInfo.append(info)
    # append so that later pages do not overwrite earlier ones
    with open("blog.txt", 'a', encoding='utf-8') as f:
        for info in getTargetInfo:
            print(info)
            f.write(str(info) + '\n')
    # df = pandas.DataFrame(getTargetInfo)
    # df.head()
    # df.to_excel('blog.xlsx')
    return getTargetInfo

for i in range(1, 9):  # crawl page by page
    GetBlogByPage(i)
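The commented-out pandas lines hint at an Excel export. A minimal sketch of that idea, assuming pandas and openpyxl are installed and using the return value of GetBlogByPage shown above (the CollectAllPages name is illustrative, not from the original):

import pandas

def CollectAllPages(lastPage=8):
    """Gather the per-page records and write one spreadsheet instead of blog.txt."""
    allInfo = []
    for i in range(1, lastPage + 1):
        allInfo.extend(GetBlogByPage(i))  # GetBlogByPage returns the page's list of dicts
    df = pandas.DataFrame(allInfo)
    df.to_excel('blog.xlsx')              # needs openpyxl as the Excel writer

CollectAllPages()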
Scraping rental listings
from bs4 import BeautifulSoup
import requests
import csv
import time
import lxml  # the "lxml" parser used below needs this package installed

url = "https://bj.58.com/pinpaigongyu/pn/{page}/?minprice=2000_4000"
# number of pages fetched so far, starts at 0
page = 0

csv_file = open("rent1.csv", "w", newline="", encoding="utf-8")
csv_writer = csv.writer(csv_file, delimiter=',')

while True:
    page += 1
    print("fetch: ", url.format(page=page))
    # the site blocks crawlers aggressively, so sleep 1 second before each page request
    time.sleep(1)
    response = requests.get(url.format(page=page))
    html = BeautifulSoup(response.text, features="lxml")
    house_list = html.select(".list > li")
    # stop the loop when no new listings come back
    if not house_list:
        break
    for house in house_list:
        house_title = house.select("h2")[0].string
        house_url = house.select("a")[0]["href"]
        house_info_list = house_title.split()
        print(house_info_list)
        # if the second column is the apartment/community name, use the first column as the location
        if "公寓" in house_info_list[1] or "青年社区" in house_info_list[1]:
            house_location = house_info_list[0]
        else:
            house_location = house_info_list[1]
        house_money = house.select(".money")[0].select("b")[0].string
        csv_writer.writerow([house_title.strip(), house_location.strip(), house_money.strip(), house_url.strip()])

csv_file.close()
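The comment about the site's anti-crawler checks suggests slowing down and looking more like a browser. A minimal sketch of that idea using a shared requests.Session (the delay range is illustrative, and the header simply reuses the Chrome User-Agent from the CSDN example above):

import random
import time

import requests

session = requests.Session()
session.headers.update({
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36"
})

def fetch_listing_page(page):
    """Fetch one listing page, pausing a random 1-3 seconds to look less like a bot."""
    time.sleep(random.uniform(1, 3))
    resp = session.get("https://bj.58.com/pinpaigongyu/pn/{}/?minprice=2000_4000".format(page))
    resp.raise_for_status()
    return resp.text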
Scraping imooc course information
import requests
from pyquery import PyQuery as pq
from urllib.parse import urljoin
import pandas

totalInfo = []

def GetTargetPageInfo(pageNum):
    with requests.Session() as s:
        res = s.get("https://www.imooc.com/course/list?page={}".format(pageNum))
        d = pq(res.text)
        courses = d.items(".course-card-container")  # all course cards on the page
        for course in courses:  # walk the cards and pull out each course's details
            title = course.find('.course-card-name').text()             # course name
            des = course.find('.course-card-desc').text()               # course description
            level = course.find('.course-card-info>span:eq(0)').text()  # course level
            users = course.find('.course-card-info>span:eq(1)').text()  # number of learners
            prices = course.find('.price').text()                       # course price
            labels = course.find('.course-label').text().split(' ')     # tags
            url = urljoin("https://www.imooc.com/learn/", course.find("a").attr("href"))     # build the course URL
            img_url = urljoin("https://img3.mukewang.com/", course.find("img").attr("src"))  # build the cover image URL
            infoDict = {
                "title": title,
                "des": des,
                "level": level,
                "users": users,
                "prices": prices,
                "labels": labels,
                "url": url,
                "img_url": img_url
            }
            totalInfo.append(infoDict)

for i in range(1, 8):  # pages 1-7
    GetTargetPageInfo(i)
df = pandas.DataFrame(totalInfo)
df.to_excel('videoInfo.xlsx')  # export to xlsx
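If the total page count is not known ahead of time, the stop-on-empty pattern from the rental scraper above could replace the fixed range; a sketch, assuming GetTargetPageInfo keeps appending to totalInfo as defined:

def GetAllCourses():
    """Request pages until one comes back without adding any new courses."""
    page = 1
    while True:
        before = len(totalInfo)
        GetTargetPageInfo(page)
        if len(totalInfo) == before:  # nothing new: the last page has been passed
            break
        page += 1

# GetAllCourses()  # would replace the fixed-range loop above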
Screenshot of the exported content for verification
Scraping links with urllib.request
import urllib.request
from urllib.error import URLError, HTTPError, ContentTooShortError
import re

"""
# first version: downloads raw bytes and retries on any error
def download(url, user_agent='wswp', num_retries=2):
    request = urllib.request.Request(url)
    request.add_header('User-agent', user_agent)
    try:
        html = urllib.request.urlopen(request).read()
    except (URLError, HTTPError, ContentTooShortError) as e:
        print('download error:', e.reason)
        html = None
        if num_retries > 0:
            return download(url, num_retries - 1)
    return html

download("http://httpstat.us/500")
"""

def download(url, user_agent='wswp', num_retries=2, charset='utf-8'):
    print("downloading:", url)
    request = urllib.request.Request(url)
    request.add_header("User-agent", user_agent)
    try:
        resp = urllib.request.urlopen(request)
        cs = resp.headers.get_content_charset()  # charset declared in the response headers
        if not cs:
            cs = charset                          # fall back to the default charset
        html = resp.read().decode(cs)
    except (URLError, HTTPError, ContentTooShortError) as e:
        print('download error:', e.reason)
        html = None
        if num_retries > 0:
            # retry, keeping the same user agent and charset
            return download(url, user_agent, num_retries - 1, charset)
    return html

def crawl_sitemap(url):
    sitemap = download(url)                           # download the sitemap XML
    if sitemap is None:                               # give up if the download failed
        return
    links = re.findall('<loc>(.*?)</loc>', sitemap)   # pull every <loc> URL out of the sitemap
    for link in links:
        html = download(link)                         # download each page listed in the sitemap

crawl_sitemap("http://example.python-scraping.com/sitemap.xml")
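The commented-out test against http://httpstat.us/500 suggests the retries are aimed at transient server errors. A variant sketch (my assumption, not part of the original code) that retries only when the response status is 5xx could look like this:

import urllib.request
from urllib.error import URLError, HTTPError, ContentTooShortError

def download_retry_5xx(url, user_agent='wswp', num_retries=2, charset='utf-8'):
    """Like download(), but retry only when the server answered with a 5xx status."""
    print("downloading:", url)
    request = urllib.request.Request(url)
    request.add_header("User-agent", user_agent)
    try:
        resp = urllib.request.urlopen(request)
        cs = resp.headers.get_content_charset() or charset
        return resp.read().decode(cs)
    except (URLError, HTTPError, ContentTooShortError) as e:
        print('download error:', e.reason)
        # only HTTPError carries a status code; 5xx means the failure may be temporary
        if num_retries > 0 and hasattr(e, 'code') and 500 <= e.code < 600:
            return download_retry_5xx(url, user_agent, num_retries - 1, charset)
        return None

download_retry_5xx("http://httpstat.us/500")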