This article uses the selenium library under Python 3.4 to open a browser and save the browser's login cookies to a local file, so that on the next login the cookies can be reused directly:
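Before the full script, here is a minimal sketch of that idea, assuming chromedriver is reachable on the PATH; the helper names save_cookies/load_cookies and the file name cookie.txt are illustrative only and do not appear in the script below:

from selenium import webdriver
import requests


def save_cookies(login_url, path="cookie.txt"):
    # Open Chrome, let the user log in by hand, then dump the session cookies.
    browser = webdriver.Chrome()
    browser.get(login_url)
    input("Log in inside the browser window, then press Enter...")
    pairs = [item["name"] + ":" + item["value"] for item in browser.get_cookies()]
    with open(path, "w") as f:
        f.write(";".join(pairs))
    browser.quit()


def load_cookies(path="cookie.txt"):
    # Rebuild the dict that requests accepts as its cookies= argument.
    cookies = {}
    with open(path, "r") as f:
        for pair in f.read().split(";"):
            name, _, value = pair.partition(":")
            cookies[name] = value
    return cookies


# Save once with a manual login, then reuse without opening the browser again:
# save_cookies("https://sellercentral.amazon.com/")
# html = requests.get("https://sellercentral.amazon.com/productsearch?q=fish",
#                     cookies=load_cookies(), timeout=60).text

The full script below follows the same pattern, with extra handling for captchas, expired cookies, and the actual product scraping.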
#!/usr/bin/python3.4
# -*- coding: utf-8 -*-

from selenium import webdriver
import time
import requests
from bs4 import BeautifulSoup
import os
import re
import random
import xlsxwriter

# List all files under a folder with the given suffix (default .xml); optionally include subfolders
def listfiles(rootdir, prefix='.xml', iscur=False):
    file = []
    for parent, dirnames, filenames in os.walk(rootdir):
        if parent == rootdir:
            for filename in filenames:
                if filename.endswith(prefix):
                    file.append(filename)
            if not iscur:
                return file
        else:
            if iscur:
                for filename in filenames:
                    if filename.endswith(prefix):
                        file.append(filename)
            else:
                pass
    return file

# Regex to extract the dp (ASIN) part of a product URL
def getdp(string):
    reg = r'(http.+?/dp/)(.+)'
    all = re.compile(reg)
    alllist = re.findall(all, string)
    return alllist[0][1]


# Regex to extract the filter value
# https://sellercentral.amazon.com/productsearch?filter=grocery&q=fish
def getfilter(string):
    reg = r'(https.+?filter=)(.+?)(&)'
    all = re.compile(reg)
    alllist = re.findall(all, string)
    return alllist[0][1]


# Regex to extract the maximum page count
def getpagenum(string):
    reg = r'(.+?\()(\d+)(\))'
    all = re.compile(reg)
    alllist = re.findall(all, string)
    return alllist[0][1]

# Create a folder (ignore the error if it already exists)
def createjia(path):
    try:
        os.makedirs(path)
    except:
        pass


# Convert a duration in seconds into a readable days/hours/minutes/seconds string
def timetochina(longtime, formats='{}天{}小时{}分钟{}秒'):
    day = 0
    hour = 0
    minutue = 0
    second = 0
    try:
        if longtime > 60:
            second = longtime % 60
            minutue = longtime // 60
        else:
            second = longtime
        if minutue > 60:
            hour = minutue // 60
            minutue = minutue % 60
        if hour > 24:
            day = hour // 24
            hour = hour % 24
        return formats.format(day, hour, minutue, second)
    except:
        raise Exception('时间非法')

# Open the browser and grab the login cookies
def openbrowser(url):
    # Open Chrome
    # Firefox() Chrome()
    browser = webdriver.Chrome()
    # browser = webdriver.Chrome(executable_path='C:/Python34/chromedriver.exe')
    # Load the URL
    browser.get(url)
    # Time to wait for the browser to open
    # print("等待10秒打开浏览器...")
    # time.sleep(10)

    # Find the input box with id="ap_email"
    # Clear the input boxes
    browser.find_element_by_id("ap_email").clear()
    browser.find_element_by_id("ap_password").clear()

    # Enter the account and password
    inputemail = input("请输入账号:")
    inputpassword = input("请输入密码:")
    browser.find_element_by_id("ap_email").send_keys(inputemail)
    browser.find_element_by_id("ap_password").send_keys(inputpassword)

    # Click the sign in button
    # id="signInSubmit"
    browser.find_element_by_id("signInSubmit").click()

    # Wait 10 seconds for the login
    # print('等待登陆10秒...')
    # time.sleep(10)
    print("等待网址加载完毕...")

    select = input("请观察浏览器网站是否已经登陆(y/n):")
    while 1:
        if select == "y" or select == "Y":
            print("登陆成功!")
            # Grab the cookies
            cookie = [item["name"] + ":" + item["value"] for item in browser.get_cookies()]
            cookiestr = ';'.join(item for item in cookie)
            print("正在复制网页cookie...")

            # Write them to a local txt file
            if "jp" in url:
                path = "../data/Japcookie.txt"
            else:
                path = "../data/Amecookie.txt"

            filecookie = open(path, "w")
            filecookie.write(cookiestr)
            filecookie.close()

            time.sleep(1)
            print("准备关闭浏览器...")
            browser.quit()
            # print(cookiestr)
            break

        elif select == "n" or select == "N":
            selectno = input("账号密码错误请按0,验证码出现请按1...")
            # Wrong account or password: enter them again
            if selectno == "0":

                # Find the input box with id="ap_email"
                # Clear the input boxes
                browser.find_element_by_id("ap_email").clear()
                browser.find_element_by_id("ap_password").clear()

                # Enter the account and password
                inputemail = input("请输入账号:")
                inputpassword = input("请输入密码:")
                browser.find_element_by_id("ap_email").send_keys(inputemail)
                browser.find_element_by_id("ap_password").send_keys(inputpassword)
                # Click the sign in button
                # id="signInSubmit"
                browser.find_element_by_id("signInSubmit").click()

            elif selectno == "1":
                # The captcha input box has id="ap_captcha_guess"
                input("请在浏览器中输入验证码并登陆...")
                select = input("请观察浏览器网站是否已经登陆(y/n):")

        else:
            print("请输入“y”或者“n”!")
            select = input("请观察浏览器网站是否已经登陆(y/n):")

    return cookiestr

def gethtml(url):
    # Read the saved cookies
    # and load them into a dict
    mycookie = {}
    if "jp" in url:
        path = "../data/Japcookie.txt"
    else:
        path = "../data/Amecookie.txt"

    try:
        filecookie = open(path, "r")
        cookies = filecookie.read().split(";")
        for items in cookies:
            item = items.split(":")
            mycookie[item[0]] = item[1]
        # print(mycookie)
        filecookie.close()
    except:
        print("cookie为空...")

    if "jp" in url:
        referer = "https://sellercentral.amazon.co.jp/"
        host = "www.amazon.co.jp"
    else:
        referer = "https://sellercentral.amazon.com/"
        host = "www.amazon.com"

    # Build the request headers
    header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:49.0) Gecko/20100101 Firefox/49.0',
        'Referer': referer,
        'Host': host,
        'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
        'Connection': 'keep-alive',
        'Upgrade-Insecure-Requests': '1',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, br'
    }

    htmlget = requests.get(url=url, headers=header, cookies=mycookie, timeout=60)
    htmlcontent = htmlget.content.decode("UTF-8", "ignore")

    return htmlcontent

def getinfo(html, Loginurl):
    # Parse what we need with BeautifulSoup
    soups = BeautifulSoup(html, "html.parser")
    # Pick out the product divs
    sellyours = soups.find_all("div", attrs={"class": "a-box product"})
    information = []
    for item in sellyours:
        # Go through the products one by one
        # First pass: keep only products that show a "Sell yours" button
        temp = item.find("a", attrs={"class", "a-button-text"})

        if temp != None:
            if "sellYoursClick" in temp["data-csm"]:
                # Second pass: keep only products with no offer count (no number, no new offers)
                temp = item.find("span", attrs={"class", "offerCountDetails"})
                if temp == None:
                    temp = item.find("div", attrs={"class", "a-fixed-right-grid-col description a-col-left"})

                    # Get the detail-page URL
                    hrefurl = temp.find('a').get('href')
                    # Get all the text under the current class,
                    # including title, UPC, EAN and Rank
                    try:
                        spans = temp.get_text()
                    except:
                        spans = "Nothing"
                    # Put the text into a list
                    temparr = spans.strip().split("\n")
                    # Extract the ASIN with a regex
                    asin = getdp(hrefurl)
                    temparr.append(asin)
                    temparr.append(hrefurl)

                    # Keep a copy in a txt file so nothing is lost if the program is interrupted
                    txtcontent = ' '.join(temparr)
                    filename = time.strftime('%Y%m%d', time.localtime())
                    path = "../xls/" + filename
                    createjia(path)
                    file = open(path + "/" + filename + ".txt", "a")
                    file.write("\n" + txtcontent)
                    file.close()

                    # Parse the detail page: check the price label, grab the star rating and review count,
                    # store them in the list, and later write everything to Excel
                    # Fetch and parse the detail page
                    htmldetail = gethtml(hrefurl)

                    if 'id="words"' in htmldetail or 'ap_email' in htmldetail or "Amazon.com Page Not Found" in htmldetail:
                        print("抓取得太快!需要重新登陆...")
                        openbrowser(Loginurl)
                        htmldetail = gethtml(hrefurl)

                    # Parse what we need with BeautifulSoup
                    soups = BeautifulSoup(htmldetail, "html.parser")
                    # Select the product's centerCol block
                    centerCols = soups.findAll('div', attrs={'id': "centerCol"})
                    if centerCols:
                        for item in centerCols:
                            temp = item.find("td", attrs={"id": "priceblock_ourprice_lbl"})
                            if temp == None:
                                # Get the star rating
                                star = item.find("a", attrs={"id": "reviewStarsLinkedCustomerReviews"}).get_text()
                                # Get the number of reviews
                                reviews = item.find("span", attrs={"id": "acrCustomerReviewText"}).get_text()
                                # Append what we grabbed to the list
                                if star:
                                    temparr.append(star.strip().replace(" out of 5 stars", ""))
                                else:
                                    temparr.append("")
                                if reviews:
                                    temparr.append(reviews.strip().replace(" customer reviews", ""))
                                else:
                                    temparr.append("")

                                information.append(temparr)
                                print(information)
                    else:
                        temparr.append("")
                        temparr.append("")
                        information.append(temparr)
                        print(information)
    return information

def begin():
    taoyanbai = '''
        -----------------------------------------
        |        欢迎使用后台爬虫系统           |
        |        时间:2016年10月21日           |
        |        出品:技术部                   |
        -----------------------------------------
    '''
    print(taoyanbai)

if __name__ == "__main__":

    a = time.clock()

    while 1:
        try:
            LoginWhere = int(input("抓取美国请按0,日本请按1:"))
            if LoginWhere == 0:
                Loginurl = "https://sellercentral.amazon.com/"
                break
            elif LoginWhere == 1:
                Loginurl = "https://sellercentral.amazon.co.jp/"
                break
        except:
            print("请正确输入0或1!!")
            LoginWhere = int(input("抓取美国请按0,日本请按1:"))

    keywords = input("请输入查找的关键词:")
    keyword = keywords.replace(" ", "+")

    print("正在检查登陆状态...")

    if "jp" in Loginurl:
        seekurl = "https://sellercentral.amazon.co.jp/productsearch?q=" + str(keyword)
    else:
        seekurl = "https://sellercentral.amazon.com/productsearch?q=" + str(keyword)

    try:
        htmlpage = gethtml(seekurl)
    except Exception as err:
        input("网络似乎有点问题...")
        print(err)
        exit()

    while 1:
        if 'ap_email' in htmlpage or "Amazon.com Page Not Found" in htmlpage or "<title>404" in htmlpage:
            print("cookie已经过期,需要重新登陆...")
            print("等待网页打开...")
            openbrowser(Loginurl)
            htmlpage = gethtml(seekurl)
        else:
            print("直接使用cookie登陆...")
            break

    # Parse what we need with BeautifulSoup
    soups = BeautifulSoup(htmlpage, "html.parser")
    # Pick out the categories and their URLs
    categorys = soups.findAll('ul', attrs={'class': "a-nostyle a-vertical"})
    categoryurl = []
    categoryname = ""
    pagenum = []
    filtername = []

    for item in categorys:
        for temp in item.find_all("a"):
            hrefurl = temp.get('href')
            categoryurl.append(hrefurl)

        for temp in item.find_all("span", attrs={"class", "a-color-tertiary"}):
            spantext = temp.get_text()
            pagenum.append(getpagenum(spantext))
    for i in range(0, len(categoryurl)):
        name = getfilter(categoryurl[i])
        filtername.append(name)
        categoryname = categoryname + "抓取(" + str(name) + ")请按" + str(i) + ","

    # Choose which category to scrape
    try:
        print(categoryname)
        selectcategory = int(input("请选择你要抓取类型的数字号码:"))
    except:
        print("请正确输入前面的数字!!!")
        print(categoryname)
        selectcategory = int(input("请选择你要抓取类型的数字编码:"))

    filter = filtername[selectcategory]
    mustpage = int(pagenum[selectcategory]) // 10

    try:
        print("温馨提醒:(1)后台仅仅展现1000页...(2)你要抓取的类型大约有" + str(mustpage) + "页...")
        page = int(input("请问你要抓取多少页?(默认15页):"))
        if page > 1000:
            print("后台最多只能看到1000页!!!")
            page = int(input("后台仅仅展现1000页!!!你要抓取的类型大约有" + str(mustpage) + "页!!!请问你要抓取多少页?(默认15页):"))
    except:
        page = 15

    # Store everything that gets scraped
    information = []
    temparr = []

    for i in range(0, page):
        try:
            if "jp" in Loginurl:
                # https://sellercentral.amazon.co.jp/productsearch?filter=sporting&q=空気入れ&page=2
                openurl = "https://sellercentral.amazon.co.jp/productsearch?filter=" + str(filter) + "&q=" + str(
                    keyword) + "&page=" + str(i + 1)
            else:
                # https://sellercentral.amazon.com/productsearch?filter=pets&q=dog
                openurl = "https://sellercentral.amazon.com/productsearch?filter=" + str(filter) + "&q=" + str(
                    keyword) + "&page=" + str(i + 1)

            print("开始抓取:" + str(openurl))
            openhtml = gethtml(openurl)

            # Parse what we need with BeautifulSoup
            soups = BeautifulSoup(openhtml, "html.parser")
            # Pick out the product divs
            sellyours = soups.findAll('div', attrs={'class': "product"})

            if 'ap_email' in openhtml or "Amazon.com Page Not Found" in openhtml:
                print("抓取得太快!需要重新登陆...")
                openbrowser(Loginurl)
                openhtml = gethtml(openurl)

            elif not sellyours:
                print("已经翻到最后一页了...")
                break
            temparr = getinfo(openhtml, Loginurl)
        except Exception as err:
            print(err)
            print("访问抓取过程中出现小错误...")
            print("暂停20秒记录bug并尝试自我修复...")
            time.sleep(20)

        if temparr:
            information.append(temparr[0])
        loadtime = random.randint(5, 10)
        print("防止反爬虫设定暂停" + str(loadtime) + "秒...")
        time.sleep(loadtime)

    print("抓到的列表如下:")
    print(information)

    # Write the results to Excel
    # Create the output folder
    filename = time.strftime('%Y%m%d', time.localtime())
    path = "../xls/" + filename
    createjia(path)

    # Write the Excel file
    timename = time.strftime('%Y%H%M%S', time.localtime())
    with xlsxwriter.Workbook(path + "/" + timename + '.xlsx') as workbook:
        # workbook = xlsxwriter.Workbook(path + "/" + timename + '.xlsx')
        worksheet = workbook.add_worksheet()

        first = ['title', 'UPC', 'EAN', 'Rank', 'Nothing', 'ASIN', 'DetailUrl', 'Star', 'Reviews']
        # Write the header row
        for i in range(0, len(first)):
            worksheet.write(0, i, first[i])
        # Write the remaining rows
        for m in range(0, len(information)):
            for n in range(0, len(information[m])):
                insert = str(information[m][n]).replace("UPC: ", "").replace("EAN: ", "").replace("Sales Rank:", "").replace(
                    "customer reviews", "").replace("out of 5 stars", "")
                worksheet.write(m + 1, n, insert)
        workbook.close()

    b = time.clock()
    print('运行时间:' + timetochina(b - a))
    input('请关闭窗口')  # keep the window from closing immediately so the elapsed time stays visible
The selenium library can drive some older browsers directly, but the Chrome browser used in this article needs an extra driver plugin: download it and put it in the C:\Python34 directory (a directory on the PATH).
The plugin is chromedriver.exe; search for it online, there are plenty of copies available.
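If you would rather not rely on the PATH at all, the selenium 2.x/3.x API used in this article also lets you point directly at the driver executable, as the commented-out line in openbrowser() hints. A minimal sketch, assuming chromedriver.exe sits at C:/Python34/chromedriver.exe:

from selenium import webdriver

# Pass the driver location explicitly instead of relying on the PATH.
browser = webdriver.Chrome(executable_path='C:/Python34/chromedriver.exe')
browser.get("https://sellercentral.amazon.com/")
browser.quit()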