Python从入门到入土-网络爬虫(urllib、正则表达式)

2022-11-28 15:48:29 浏览数 (2)

urllib

urllib 获取网页(1)

代码语言:python
# urllib 获取网页(1)
#
# 将 url 对应的网页下载到本地

import urllib.request


def get_html(url):
    """Download the resource at ``url`` and return its body as text.

    Args:
        url: Address to fetch; passed unchanged to ``urllib.request.urlopen``.

    Returns:
        The response body decoded as UTF-8.

    Raises:
        urllib.error.URLError: If the request cannot be completed.
        UnicodeDecodeError: If the body is not valid UTF-8.
    """
    # The context manager closes the response even when read() raises,
    # so the underlying connection is never leaked.
    with urllib.request.urlopen(url) as response:
        buff = response.read()
    return buff.decode("utf8")

if __name__ == "__main__":
    # Demo: fetch the Baidu home page and dump its HTML to stdout.
    print(get_html("http://www.baidu.com"))

urllib 获取网页(2) with header

代码语言:python
# urllib 获取网页(2) with header
# 将 url 对应的网页下载到本地

# -*- coding: UTF-8 -*-
import urllib.request

# 在此实现带头部信息的网页请求
def get_html(url, headers):
    """Download ``url`` with custom request headers and return the body as text.

    Args:
        url: Address to fetch.
        headers: Mapping of header name to header value to send with the
            request. ``None`` is treated as "no extra headers".

    Returns:
        The response body decoded as UTF-8.

    Raises:
        urllib.error.URLError: If the request cannot be completed.
        UnicodeDecodeError: If the body is not valid UTF-8.
    """
    # Request registers each entry via add_header itself, so no manual loop
    # is needed.
    req = urllib.request.Request(url, headers=dict(headers or {}))
    # The context manager closes the response even when read() raises.
    with urllib.request.urlopen(req) as response:
        buff = response.read()
    return buff.decode("utf8")


if __name__ == "__main__":
    # Demo: request the Baidu home page while sending a desktop Chrome
    # User-Agent header, then print the returned HTML.
    browser_headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36"
    }
    print(get_html("http://www.baidu.com", browser_headers))

urllib post请求

代码语言:python
# urllib post请求

import urllib.request
import urllib.parse

# 在此编写代码
def get_response(url, data):
    """Send ``data`` as a form-encoded POST to ``url`` and return the reply text.

    Args:
        url: Address to post to.
        data: Mapping (or sequence of pairs) of form fields; it is
            URL-encoded and sent as the request body.

    Returns:
        The response body decoded as UTF-8.

    Raises:
        urllib.error.URLError: If the request cannot be completed.
        UnicodeDecodeError: If the body is not valid UTF-8.
    """
    # urlopen requires a bytes body; supplying one is what makes this a POST.
    payload = urllib.parse.urlencode(data).encode("utf8")
    # The context manager closes the response even when read() raises.
    with urllib.request.urlopen(url, data=payload) as response:
        buff = response.read()
    return buff.decode("utf8")

if __name__ == "__main__":
    # Demo: POST two form fields to httpbin and print the response body.
    form_fields = {"key1": "value1", "key2": "value2"}
    print(get_response("http://httpbin.org/post", form_fields))

正则表达式

获取中文个数

代码语言:python
# 获取中文个数

import re

def getnum_of_cn(inputdata):
    """Count the Chinese characters in ``inputdata``.

    A character counts as Chinese when it lies in the CJK Unified Ideographs
    block, U+4E00 through U+9FFF.

    Args:
        inputdata: Text to scan.

    Returns:
        Number of characters in that range (0 for an empty string).
    """
    # The \u escapes are required: without the backslashes the class
    # degenerates into the ASCII range '0'-'u' and counts Latin letters
    # and digits instead of ideographs.
    chi = re.findall(r'[\u4e00-\u9fff]', inputdata)
    return len(chi)

def test():
    """Print the count getnum_of_cn reports for a fixed sample string."""
    print(getnum_of_cn('你好,lajfldkjaklda123'))

if __name__ == "__main__":
    # Run the demo only when executed as a script, not on import.
    test()

返回匹配到的第一个

代码语言:python
# 返回匹配到的第一个

# -*- coding: UTF-8 -*-
import re

def search_text(inputdata):
    """Locate the first occurrence of the literal text 'nlp' in ``inputdata``.

    Returns the ``re.Match`` object for the first hit, or ``None`` when the
    text does not contain 'nlp'.
    """
    matcher = re.compile('nlp')
    return matcher.search(inputdata)

def test():
    """Print what search_text returns for a fixed sample sentence."""
    print(search_text('你好,nlp先生!nlp先生!'))

if __name__ == "__main__":
    # Run the demo only when executed as a script, not on import.
    test()

去除html标签

代码语言:python
# 去除html标签

import re
from typing import Text

# 在此实现代码
def remove_html(content):
    """Return ``content`` with every ``<...>`` tag removed.

    Only the tags themselves are deleted; the text between tags, including
    whitespace and newlines, is kept as-is.

    Args:
        content: Markup text to clean.

    Returns:
        The text with all tags stripped.
    """
    # '+' is essential: a tag is '<', one or more non-'>' characters, then
    # '>'. The negated class already matches newlines, so tags that span
    # lines are handled without the DOTALL flag.
    pattern = re.compile(r'<[^>]+>')
    return pattern.sub('', content)

if __name__ == '__main__':
    # Demo: strip the tags from a small sample page and print what is left.
    html = '''
        <html>
            <head>
                <title>这是一个简单的测试页面</title>
            </head>
            <body>
                <p class="item-0">body 元素的内容会显示在浏览器中。</p>
                <p class="item-1">title 元素的内容会显示在浏览器的标题栏中。</p>
            </body>
        </html>
        '''
    # Use a lowercase name: binding the result to ``Text`` would overwrite
    # the ``Text`` name imported from ``typing`` at the top of the snippet.
    text = remove_html(html)
    print(text)

查找字符串里含有的全部IPV4和IPV6地址

代码语言:python
# 查找字符串里含有的全部IPV4和IPV6地址

import re


def find_all_ipv4(text):
    """Find every dotted-quad IPv4 address in ``text``.

    Args:
        text: Text to scan.

    Returns:
        A list of ``{'type': 'ipv4', 'value': <address>}`` dicts in order of
        appearance; empty when nothing matches.
    """
    # Each octet is limited to 0-255. The leading \b stops a match from
    # starting in the middle of a longer word or number, and the separator
    # must be an escaped literal dot. Without the backslashes the pattern
    # would require a literal letter 'b' and accept any separator character.
    ipv4 = (
        r"((\b25[0-5]|\b2[0-4][0-9]|\b[01]?[0-9][0-9]?)"
        r"(\.(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)){3})"
    )

    # group(0) is the whole address, independent of the capture groups used
    # for alternation inside the pattern.
    return [
        {'type': 'ipv4', 'value': m.group(0)}
        for m in re.finditer(ipv4, text)
    ]


def find_all_ipv6(text):
    """Find IPv6 addresses in ``text``.

    Args:
        text: Text to scan.

    Returns:
        A list of ``{'type': 'ipv6', 'value': <address>}`` dicts in order of
        appearance; empty when nothing matches.
    """
    # Dotted-quad tail used by the IPv4-embedded forms. The separator is an
    # escaped literal dot; a bare '.' would accept any character there.
    dotted_quad = (
        r"((25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])\.){3,3}"
        r"(25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])"
    )

    # Alternatives are tried in this order at each position and the first
    # one that matches wins, so the order must not be changed.
    alternatives = (
        r"([0-9a-fA-F]{1,4}:){7,7}[0-9a-fA-F]{1,4}",            # full 8 groups
        r"([0-9a-fA-F]{1,4}:){1,7}:",                           # trailing '::'
        r"([0-9a-fA-F]{1,4}:){1,6}:[0-9a-fA-F]{1,4}",
        r"([0-9a-fA-F]{1,4}:){1,5}(:[0-9a-fA-F]{1,4}){1,2}",
        r"([0-9a-fA-F]{1,4}:){1,4}(:[0-9a-fA-F]{1,4}){1,3}",
        r"([0-9a-fA-F]{1,4}:){1,3}(:[0-9a-fA-F]{1,4}){1,4}",
        r"([0-9a-fA-F]{1,4}:){1,2}(:[0-9a-fA-F]{1,4}){1,5}",
        r"[0-9a-fA-F]{1,4}:((:[0-9a-fA-F]{1,4}){1,6})",
        r":((:[0-9a-fA-F]{1,4}){1,7}|:)",                       # leading '::'
        r"fe80:(:[0-9a-fA-F]{0,4}){0,4}%[0-9a-zA-Z]{1,}",       # with zone id
        r"::(ffff(:0{1,4}){0,1}:){0,1}" + dotted_quad,          # IPv4-mapped
        r"([0-9a-fA-F]{1,4}:){1,4}:" + dotted_quad,             # IPv4-embedded
    )
    ipv6 = "(" + "|".join(alternatives) + ")"

    return [
        {'type': 'ipv6', 'value': m.group(0)}
        for m in re.finditer(ipv6, text)
    ]


def find_all_ip(text):
    """Find all IPv4 and IPv6 addresses in ``text``.

    Args:
        text: Text to scan.

    Returns:
        The results of ``find_all_ipv4`` followed by those of
        ``find_all_ipv6``, as one list.
    """
    # The two result lists must be joined with '+'; with the operator
    # missing the line is a syntax error.
    return find_all_ipv4(text) + find_all_ipv6(text)


if __name__ == '__main__':
    # Demo: scan a sample sentence and print each address found.
    # The sample is named ``sample_text`` rather than ``input`` so the
    # builtin input() is not shadowed.
    sample_text = 'IP地址有IPV4,例如:192.168.100.2,也有IPV6,例如:fe80:0000:0000:0000:0204:61ff:fe9d:f156,以及:fe80:0000:0000:0000:0204:61ff:fe9d:f156,还有 192.168.100.50'
    for item in find_all_ip(sample_text):
        print('type: {}, value: {}'.format(item['type'], item['value']))

本文内容到此结束了, 如有收获欢迎点赞

0 人点赞