前言
本文的文字及图片来源于网络,仅供学习、交流使用,不具有任何商业用途,版权归原作者所有,如有问题请及时联系我们以作处理
本次目标
爬取惠农网信息
受害者地址
https://www.cnhnb.com/
环境
Python 3.6
PyCharm
爬虫代码
导入工具
import requests
import parsel
import csv
import time
请求头
# Request headers sent with every listing-page fetch on www.cnhnb.com.
# NOTE(review): the Cookie embeds session values (sessionId, hnUserTicket,
# hnUserId) captured from a browser session; presumably they expire and must
# be refreshed before running - confirm.
headers = {
    'Cookie': 'deviceIdRenew=1; Hm_lvt_91cf34f62b9bedb16460ca36cf192f4c=1604579356,1604659451; deviceId=d1dd5b9-d191-406b-971d-391916a0e; sessionId=S_0KH64T2IHLHSO77N; lmvid=b24dcd0ad2a8f0b783f248c7ff2675a8; lmvid.sig=w1UBnTUKSDq-GfAlx6TyR_K7SjyujGIlF-1kRjTrOAI; hnUserTicket=b80e6b3a-38a3-432c-816d-aeb0376228bd; hnUserId=870048250; Hm_lpvt_91cf34f62b9bedb16460ca36cf192f4c=1604659904',
    'Host': 'www.cnhnb.com',
    'Referer': 'https://www.cnhnb.com/supply/pingguo/',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36',
}
解析网站、爬取数据
def _clean_text(raw, strip_chars=None):
    """Return ``raw.strip(strip_chars)``, or ``None`` when ``raw`` is ``None``."""
    if raw is None:
        return None
    return raw.strip(strip_chars)


def get_page_url(page_url, page_id):
    """Scrape one listing page and append its details to the CSV.

    Fetches ``page_url`` with the module-level ``headers``, extracts the
    title, update time, price and shipping address with CSS selectors, then
    POSTs to the ``wechatcall`` gateway endpoint to look up the contact
    string. One row is written through the module-level ``csv_writer`` and
    also printed.

    Args:
        page_url: URL of the listing page to fetch.
        page_id: Listing id; interpolated as ``/gongying/{page_id}/`` into
            the ``referer`` and ``x-client-page`` headers of the POST.

    Returns:
        None.

    Raises:
        AttributeError: If the first-link selector matches nothing, so no
            ``userId`` can be extracted.
        IndexError: If that link's href has fewer than three '/'-separated
            parts.
        KeyError: If the gateway JSON lacks ``errorCode`` (or ``data`` /
            ``messageTitle`` on success).

    Exceptions raised by ``requests`` (connection, JSON decoding) propagate.
    """
    response_2 = requests.get(url=page_url, headers=headers)
    selector_2 = parsel.Selector(response_2.text)

    # Index 2 of the '/'-split href of the first list link; sent as `userId`.
    num_id = selector_2.css('#__layout > div > div > div > div > ul > li:nth-child(1) > a::attr(href)').get().split('/')[2]

    # `.get()` returns None when a selector matches nothing; keep such fields
    # as None (written as an empty CSV cell) instead of crashing on `.strip()`.
    title = _clean_text(selector_2.css('.proinfo-title::text').get())  # title
    # NOTE: strip() removes these *characters* from both ends, not a prefix.
    update_time = _clean_text(selector_2.css('.update-time::text').get(), '更新时间:')  # update time
    price = _clean_text(selector_2.css('.priceTxt .orange .fs30::text').get())  # price
    if price is not None:
        price = price + '元'
    ads = selector_2.css('div:nth-child(5) > span.fs14.gray6::text').get()  # shipping address

    get_phone_url = 'https://gateway.cnhnb.com/banana/im/operate/wechatcall'
    # NOTE(review): the ticket and the x-client-* / x-b3-traceid values below
    # are hard-coded from a captured browser session; presumably they expire
    # and need refreshing - confirm.
    data = {
        'businessType': '1',
        'sourceFrom': '2',
        'ticket': '"b80e6b3a-38a3-432c-816d-aeb0376228bd"',
        'userId': '{}'.format(num_id),
    }
    head = {
        'authority': 'gateway.cnhnb.com',
        'method': 'POST',
        'path': '/banana/im/operate/wechatcall',
        'scheme': 'https',
        'accept': 'application/json, text/plain, */*',
        'accept-encoding': 'gzip, deflate, br',
        'accept-language': 'zh-CN,zh;q=0.9',
        'cache-control': 'no-cache',
        'content-length': '98',
        'content-type': 'application/json',
        'origin': 'https://www.cnhnb.com',
        'pragma': 'no-cache',
        'referer': 'https://www.cnhnb.com/gongying/{}/'.format(page_id),
        'sec-fetch-dest': 'empty',
        'sec-fetch-mode': 'cors',
        'sec-fetch-site': 'same-site',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36',
        'x-b3-traceid': '0KH64WG5WL1GXPNG',
        'x-client-appid': '5',
        'x-client-id': 'c10e4e9a-5e19-4ba2-a934-c8c5c56680f5',
        'x-client-nonce': '62f080cd-ad30-4590-b362-b1c9e660a8d5',
        'x-client-page': '/gongying/{}/'.format(page_id),
        'x-client-sid': 'S_0KH64W0GT18JX07L',
        'x-client-ticket': 'b80e6b3a-38a3-432c-816d-aeb0376228bd',
        'x-client-time': '1604659611092',
        'x-hn-job': 'If you see these message, I hope you dont hack us, I hope you can join us! Please visit https://www.cnhnkj.com/job.html',
    }
    response_3 = requests.post(url=get_phone_url, json=data, headers=head)
    html_data = response_3.json()

    # errorCode 0 carries the contact string; any other code is recorded with
    # the fixed placeholder text.
    if html_data['errorCode'] == 0:
        contact = html_data['data']['messageTitle']
    else:
        contact = '商家设置防打扰'

    dit = {
        '标题': title,
        '更新时间': update_time,
        '价格': price,
        '发货地址': ads,
        '联系方式': contact,
    }
    csv_writer.writerow(dit)
    print(dit)
保存数据
# Output CSV, opened in append mode so rows from earlier runs are kept.
# utf-8-sig writes a BOM at the start of a new file.
# `f` is deliberately left open: get_page_url writes rows through csv_writer.
f = open('惠农网信息.csv', mode='a', encoding='utf-8-sig', newline='')
csv_writer = csv.DictWriter(f, fieldnames=['标题', '更新时间', '价格', '发货地址', '联系方式'])
# In append mode the stream starts at end-of-file, so position 0 means the
# file is empty; only then write the header, otherwise every rerun would
# insert a duplicate header row in the middle of the data.
if f.tell() == 0:
    csv_writer.writeheader()