Python Web 采集(上)

2021-06-21 14:21:18 浏览数 (1)

一.访问页面

  1. import webbrowser
  2. webbrowser.open('http://www.baidu.com/')
  3. pip3 install requests
  4. import requests
  5. res = requests.get('http://www.gutenberg.org/cache/epub/1112/pg1112.txt')
  6. res.status_code == requests.codes.ok #返回真假
  7. len(res.text) #变量保存
  8. print(res.text[:250])
  9. res.raise_for_status() #下载出错抛出异常,成功则不返回
  10. playFile = open('a.txt', 'wb') #写入二进制文件,保存Unicode编码
  11. for chunk in res.iter_content(100000): #指定字节数
  12.     playFile.write(chunk)
  13. playFile.close()
  14. pip3 install sqlalchemy
  15. import sqlalchemy as sa
  16. conn = sa.create_engine('sqlite://')
  17. meta = sa.MetaData()
  18. zoo = sa.Table('zoo', meta,
  19.     sa.Column('critter', sa.String, primary_key=True),
  20.     sa.Column('count', sa.Integer),
  21.     sa.Column('damages', sa.Float)
  22. )
  23. meta.create_all(conn)
  24. conn.execute(zoo.insert(('bear', 2, 1000.0)))
  25. conn.execute(zoo.insert(('weasel', 1, 2000.0)))
  26. result = conn.execute(zoo.select()) #类似select *
  27. rows = result.fetchall()
  28. print(rows)
  29. #web
  30. import urllib.request as ur
  31. url = 'http://www.iheartquotes.com/api/v1/random'
  32. conn = ur.urlopen(url)
  33. print(conn)
  34. data = conn.read() #获取网页数据
  35. print(data)
  36. conn.status #状态码
  37. print(conn.getheader('Content-Type')) #数据格式
  38. for key, value in conn.getheaders(): #查看所有http头
  39. print(key, value)
  40. pip3 install requests
  41. import requests
  42. url = 'http://www.iheartquotes.com/api/v1/random'
  43. resp = requests.get(url)
  44. resp
  45. <Response [200]>
  46. print(resp.text)

二.页面过滤

  1. pip3 install beautifulsoup4
  2. import requests,bs4
  3. res = requests.get('http://nostarch.com')
  4. res.raise_for_status()
  5. noStarchSoup = bs4.BeautifulSoup(res.text)
  6. exampleFile = open('example.html')
  7. exampleSoup = bs4.BeautifulSoup(exampleFile)
  8. soup.select('p #author')
  9. soup.select('p')[0] #只取第一个放里面
  10. xx.get('id') #返回id的值

0 人点赞