Python 系列文章 —— lxml 详解

2022-01-13 11:06:34 浏览数 (1)

  • lxml.py
代码语言:python
from lxml import etree
from io import StringIO

# Sample document used by every demo step below. It is parsed with the XML
# parser (etree.parse), so it must stay well-formed.
test_html = '''
<html>
    <body>
        <div>
            <!-- 这里是注释 -->
            <h4>手机品牌商<span style="margin-left:10px">4</span></h4>
            <ul>
               <li>小米</li>
               <li>华为</li>
               <li class='blank'> OPPO </li>
               <li>苹果</li>
            </ul>
        </div>
        <div id="last_div">
            <h4>电脑品牌商<span style="margin-left:10px">3</span></h4>
            <ul class="ul" style="color:red">
                <li>戴尔</li>
                <li>机械革命</li>
                <li>ThinkPad</li>
            </ul>
        </div>
    </body>
</html>'''

# Parse the string into an ElementTree (not a bare root element).
html = etree.parse(StringIO(test_html))
print(html)

# --- Select every <li> in the document with XPath ---
li_list = html.xpath('//li')

print("类型:")
print(type(li_list))

print("值:")
print(li_list)

print("个数:")
print(len(li_list))

for li in li_list:
    # f-string instead of "+" so an element with no text cannot raise TypeError.
    print(f"li文本为:{li.text}")

# --- Select by attribute value ---
blank_li_list = html.xpath('//li[@class="blank"]')

print("类型:")
print(type(blank_li_list))

print("值:")
print(blank_li_list)

print("个数:")
print(len(blank_li_list))


for li in blank_li_list:
    print(f"blank_li文本为:{li.text}")


# --- Read and write attributes ---
# ElementPath expressions passed to ElementTree.find() must be relative:
# an absolute '//ul' makes lxml emit a FutureWarning, so use './/ul'.
ul = html.find('.//ul')
for name, value in ul.attrib.items():
    print(f'{name}="{value}"')
ul.set("new_attr", "true")
new_attr = ul.get('new_attr')
print(new_attr)


# --- Tag name and text of the last <div> among its siblings ---
last_div = html.xpath('//div[last()]')[0]
print("TAG:")
print(last_div.tag)
print("值:")
print(last_div.text)


# --- Append a newly created child element ---
child = etree.Element("child")
child.text = "这里是新的子元素"
last_div.append(child)
child_text = last_div.find("child").text
print(child_text)

# --- Remove every <li> from the first <ul> ---
first_ul = html.find(".//ul")
# xpath() returns a plain list, so removing while looping over it is safe.
for li in first_ul.xpath("li"):
    first_ul.remove(li)

if not first_ul.xpath("li"):
    print("元素被删除了")

# --- Walk <body> and all of its descendants in document order ---
body = html.find("body")
for sub in body.iter():
    print(sub.tag)
    print(sub.text)

0 人点赞