1、解析html并以友好形式显示:BeautifulSoup(html_doc,'html.parser') print(soup.prettify()) html_doc = """ <html><head><title>The Dormouse's story</title></head> <body> <p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were <a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>, <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>; and they lived at the bottom of a well.</p>
<p class="story">...</p> """ from bs4 import BeautifulSoup soup = BeautifulSoup(html_doc, 'html.parser')
print(soup.prettify())
2、结构语句: soup.title #获取标题<title>The Dormouse's story</title> soup.title.name soup.title.string #获取标题标签内的内容 The Dormouse's story soup.title.parent.name soup.p #获取第一个标签p soup.p['class'] #获取第一个标签p的class内容 soup.a #获取第一个标签a soup.find_all('a') #获取所有标签a,以列表返回 soup.find(id="link3") #根据属性查找 for link in soup.find_all('a'): print(link.get('href')) # http://example.com/elsie # http://example.com/lacie # http://example.com/tillie
print(soup.get_text()) #获取文档内容,不带任何标签 3、其他组件安装: pip install lxml pip install html5lib 4、几种解析器: BeautifulSoup(markup, "html.parser") BeautifulSoup(markup, "lxml") BeautifulSoup(markup, "html5lib") 5、tag的用法: soup = BeautifulSoup('<b class="boldest">Extremely bold</b>') tag = soup.b tag.name tag.name = "blockquote" tag.string tag.string.replace_with("No longer bold") tag['class'] tag.attrs tag['class'] = 'verybold' tag['id'] = 1 del tag['class'] del tag['id'] 6、tag.contents 将子节点以列表输出。 通过tag的 .children 生成器,可以对tag的子节点进行循环: for child in title_tag.children: print(child) .descendants 属性可以对所有tag的子孙节点进行递归循环 for child in head_tag.descendants: print(child) 7、循环输出不带标签的所有内容: for string in soup.strings: print(repr(string)) 去掉空白 for string in soup.stripped_strings: print(repr(string)) 8、.parent 获得父节点 .parents获得所有父节点 .next_sibling / .previous_sibling 兄弟节点 .next_element 和 .previous_element 指向解析过程中下一个被解析的对象 9、find/find_all 使用正则: import re for tag in soup.find_all(re.compile("^b")): print(tag.name)
body
b
列表 soup.find_all(["a", "b"])
tag.has_attr('id') soup.find_all(href=re.compile("elsie"), id='link1') data_soup.find_all(attrs={"data-foo": "value"}) soup.find_all("a", class_="sister") soup.find_all(string="Elsie")
soup.find_all("a", limit=2) #只返回2个 soup.html.find_all("title", recursive=False) #只检查1级子节点
find_parents() 和 find_parent() find_next_siblings() 和 find_next_sibling() find_previous_siblings() 和 find_previous_sibling() find_all_next() 和 find_next() find_all_previous() 和 find_previous()
css选择器方式查找: soup.select("p:nth-of-type(3)")
[<p class="story">...</p>]
soup.select("body a")
[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
soup.select("html head title")
[<title>The Dormouse's story</title>]
soup.select("body > a") #>一级子标签,多级的不匹配
兄弟节点
soup.select("#link1 ~ .sister")
[<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
soup.select("#link1 + .sister")
[<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>]
查找类:.xx
soup.select(".sister")
[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
soup.select("[class~=sister]")
[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
通过ID查找:
soup.select("#link1")
[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]
soup.select("a#link2")
[<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>]
soup.select("#link1,#link2")
[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>]
通过属性查找
soup.select('a[href]')
[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
通过属性的值查找:
soup.select('a[href="http://example.com/elsie"]')
[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]
soup.select('a[href^="http://example.com/"]')
[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
soup.select('a[href$="tillie"]')
[<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
soup.select('a[href*=".com/el"]')
[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]
只查找1个
soup.select_one(".sister")
10、append()追加内容 soup = BeautifulSoup("<a>Foo</a>") soup.a.append("Bar")
soup
<html><head></head><body><a>FooBar</a></body></html>
soup.a.contents
[u'Foo', u'Bar']
insert markup = '<a href="http://example.com/">I linked to <i>example.com</i></a>' soup = BeautifulSoup(markup) tag = soup.a
tag.insert(1, "but did not endorse ") tag
<a href="http://example.com/">I linked to but did not endorse <i>example.com</i></a>
tag.contents
[u'I linked to ', u'but did not endorse ', <i>example.com</i>]
soup = BeautifulSoup("<b>stop</b>") tag = soup.new_tag("i") tag.string = "Don't" soup.b.string.insert_before(tag) soup.b
<b><i>Don't</i>stop</b>
soup.b.i.insert_after(soup.new_string(" ever ")) soup.b
<b><i>Don't</i> ever stop</b>
soup.b.contents
[<i>Don't</i>, u' ever ', u'stop']
clear()清除string markup = '<a href="http://example.com/">I linked to <i>example.com</i></a>' soup = BeautifulSoup(markup) tag = soup.a
tag.clear() tag
<a href="http://example.com/"></a>
extract移除元素 markup = '<a href="http://example.com/">I linked to <i>example.com</i></a>' soup = BeautifulSoup(markup) a_tag = soup.a
i_tag = soup.i.extract()
a_tag
<a href="http://example.com/">I linked to</a>
i_tag
<i>example.com</i>
print(i_tag.parent) None
decompose也是移除元素 markup = '<a href="http://example.com/">I linked to <i>example.com</i></a>' soup = BeautifulSoup(markup) a_tag = soup.a
soup.i.decompose()
a_tag
<a href="http://example.com/">I linked to</a>
replace_with替换 markup = '<a href="http://example.com/">I linked to <i>example.com</i></a>' soup = BeautifulSoup(markup) a_tag = soup.a
new_tag = soup.new_tag("b") new_tag.string = "example.net" a_tag.i.replace_with(new_tag)
a_tag
<a href="http://example.com/">I linked to <b>example.net</b></a>
wrap包装 soup = BeautifulSoup("<p>I wish I was bold.</p>") soup.p.string.wrap(soup.new_tag("b"))
<b>I wish I was bold.</b>
soup.p.wrap(soup.new_tag("div"))
<div><p><b>I wish I was bold.</b></p></div>
unwrap markup = '<a href="http://example.com/">I linked to <i>example.com</i></a>' soup = BeautifulSoup(markup) a_tag = soup.a
a_tag.i.unwrap() a_tag
<a href="http://example.com/">I linked to example.com</a>
prettify格式化输出,可以指定编码格式 get_text 获得文档内容,指定分隔符
soup.get_text("|")
u'\nI linked to |example.com|\n'
如果不知道文档编码,使用UnicodeDammit来自动检测编码 from bs4 import UnicodeDammit dammit = UnicodeDammit(b"Sacr\xc3\xa9 bleu!") print(dammit.unicode_markup)
Sacré bleu!
dammit.original_encoding
'utf-8'
11、lxml解析比其他解析器快 Beautiful Soup对文档的解析速度不会比它所依赖的解析器更快,如果对计算时间要求很高或者计算机的时间比程序员的时间更值钱,那么就应该直接使用 lxml .
换句话说,还有提高Beautiful Soup效率的办法,使用lxml作为解析器.Beautiful Soup用lxml做解析器比用html5lib或Python内置解析器速度快很多.
https://www.crummy.com/software/BeautifulSoup/bs4/doc.zh/