Python 系列文章 —— BeautifulSoup 实战

2022-01-15 22:57:20 浏览数 (3)

  • BeautifulSoup 实战
代码语言:python
from bs4 import BeautifulSoup

# Sample document parsed by the demos below: a "title" paragraph, a "main"
# paragraph holding three links (each with class "website" and its own id),
# a div that contains only an HTML comment, and two placeholder paragraphs.
# NOTE(review): the literal has no closing </html> tag; the demos presumably
# rely on the parser tolerating that -- confirm.
html_doc = """
<html><head><title>index</title></head>
<body>
<p class="title"><b>首页</b></p>
<p class="main">我常用的网站
<a href="https://www.google.com" class="website" id="google">Google</a>
<a href="https://www.baidu.com" class="website" id="baidu">Baidu</a>
<a href="https://cn.bing.com" class="website" id="bing">Bing</a>
</p>
<div><!--这是注释内容--></div>
<p class="content1">...</p>
<p class="content2">...</p>
</body>
"""

# demo 1: parse the sample document and look tags up by name.
soup = BeautifulSoup(html_doc, "lxml")
bold_tags = soup.find_all("b")
print(bold_tags)


# demo 2: a compiled regular expression is matched against tag names;
# "^b" selects every tag whose name begins with "b".
import re

starts_with_b = re.compile(r"^b")
for element in soup.find_all(starts_with_b):
    print(element.name)


# demo 3: a list of names matches a tag whose name is any one of them.
wanted_names = ["a", "b"]
for element in soup.find_all(wanted_names):
    print(element)


# demo 4: passing True matches every tag in the document.
# The names are joined before printing so the line has no trailing ", " and
# ends with a newline; printing each name with end=', ' left the next demo's
# output running on from the same line.
print(', '.join(tag.name for tag in soup.find_all(True)))


# demo 5
def has_id_class(tag):
    """Return whether *tag* has both an ``id`` and a ``class`` attribute.

    *tag* only needs to provide ``has_attr(name)``; ``id`` is checked first
    and ``class`` is not consulted when ``id`` is missing.
    """
    required = ("id", "class")
    return all(tag.has_attr(name) for name in required)

# A function can serve as the filter: it is called with each tag and the
# tags for which it returns a true value are kept.
for matched in soup.find_all(has_id_class):
    print(matched)


# demo 6: keyword arguments filter on attribute values.
google_matches = soup.find_all(id="google")
first_match = google_matches[0]
print(first_match["href"])

# id=True keeps every tag that has an id attribute, whatever its value.
tags_with_id = soup.find_all(id=True)
for element in tags_with_id:
    print(element["href"])


# demo 7: filter by CSS class. The keyword is spelled class_ because
# "class" is a reserved word in Python.
for link in soup.find_all("a", class_="website"):
    print(link["href"])

def has_seven_characters(css_class):
    """Return True when *css_class* is not None and is exactly 7 long."""
    if css_class is None:
        return False
    return len(css_class) == 7

# A function passed as class_ is used as the filter on the class value.
seven_char_matches = soup.find_all(class_=has_seven_characters)
for element in seven_char_matches:
    print(element["id"])


# demo 8: search a tag that carries two CSS classes by one of them.
css_soup = BeautifulSoup('<p class="body strikeout"></p>', "lxml")
print(css_soup.find_all("p", class_="strikeout"))


# demo 9: search by text content.
# Uses the `string` keyword; `text` is the legacy spelling of the same
# argument (superseded since Beautiful Soup 4.4.0).
tags = soup.find_all(string="Google")
print("google : ", tags)

# A list matches any string equal to one of its items.
tags = soup.find_all(string=["Baidu", "Bing"])
print("baidu & bing : ", tags)

# Combined with a tag name, the result is the matching <a> tags rather
# than the bare strings.
tags = soup.find_all('a', string="Google")
print("a[text=google] : ", tags)


# demo 10: limit caps the number of results; the return value is still a list.
first_link_only = soup.find_all("a", limit=1)
print(first_link_only)

# recursive=False restricts the search to direct children of `soup`.
# NOTE(review): every <p> in html_doc sits inside <body>, so this presumably
# prints [] -- confirm that is the intended demonstration.
top_level_paragraphs = soup.find_all("p", recursive=False)
print(top_level_paragraphs)


# demo 11: CSS descendant selector -- every <a> anywhere under <body>.
for link in soup.select("body a"):
    print(link["href"])


# demo 12: CSS child combinator -- direct children of a <p>, first by tag
# name and then by id.
for selector in ("p > a", "p > #google"):
    print(soup.select(selector))


# demo 13: CSS class selector; print the string of each matching element.
for site in soup.select(".website"):
    print(site.string)


# demo 14: CSS id selector.
print(soup.select("#google"))


# demo 15: CSS attribute selector matching an exact href value.
print(soup.select('a[href="https://cn.bing.com"]'))

0 人点赞