urllib
urllib 获取网页(1)
代码语言:python
# urllib 获取网页(1)
#
# 将 url 对应的网页下载到本地
import urllib.request
def get_html(url):
    """Download the resource at ``url`` and return its body as text.

    Args:
        url: Address to fetch; passed unchanged to
            ``urllib.request.urlopen``.

    Returns:
        The response body decoded as UTF-8.

    Raises:
        urllib.error.URLError: If the request cannot be completed.
        UnicodeDecodeError: If the body is not valid UTF-8.
    """
    # The context manager closes the response even when read() raises,
    # so the underlying connection is never leaked.
    with urllib.request.urlopen(url) as response:
        buff = response.read()
    return buff.decode("utf8")
if __name__ == '__main__':
    # Demo: download the Baidu home page and print its HTML.
    url = "http://www.baidu.com"
    html = get_html(url)
    print(html)
urllib 获取网页(2) with header
代码语言:python
# urllib 获取网页(2) with header
# 将 url 对应的网页下载到本地
# -*- coding: UTF-8 -*-
import urllib.request
# 在此实现带头部信息的网页请求
def get_html(url, headers):
    """Download ``url`` with custom request headers and return its body.

    Args:
        url: Address to fetch.
        headers: Mapping of header name to header value sent with the
            request. ``None`` or an empty mapping sends no extra headers.

    Returns:
        The response body decoded as UTF-8.

    Raises:
        urllib.error.URLError: If the request cannot be completed.
        UnicodeDecodeError: If the body is not valid UTF-8.
    """
    # Request registers every item of ``headers`` itself, so no manual
    # add_header() loop is needed.
    req = urllib.request.Request(url, headers=dict(headers or {}))
    # The context manager closes the response even when read() raises.
    with urllib.request.urlopen(req) as response:
        buff = response.read()
    return buff.decode("utf8")
if __name__ == '__main__':
    # Demo: fetch the Baidu home page while sending a desktop-browser
    # User-Agent header, then print the HTML.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36"
    }
    url = "http://www.baidu.com"
    html = get_html(url, headers)
    print(html)
urllib post请求
代码语言:python
# urllib post请求
import urllib.request
import urllib.parse
# 在此编写代码
def get_response(url, data):
    """Send ``data`` as a form-encoded request body and return the reply.

    Args:
        url: Address to send the request to.
        data: Mapping (or sequence of pairs) of form fields; it is
            URL-encoded and sent as the request body, which makes
            ``urlopen`` issue a POST for HTTP URLs.

    Returns:
        The response body decoded as UTF-8.

    Raises:
        urllib.error.URLError: If the request cannot be completed.
        UnicodeDecodeError: If the body is not valid UTF-8.
    """
    payload = urllib.parse.urlencode(data).encode("utf8")
    # The context manager closes the response even when read() raises.
    with urllib.request.urlopen(url, data=payload) as response:
        buff = response.read()
    return buff.decode("utf8")
if __name__ == '__main__':
    # Demo: post two form fields to httpbin and print the reply.
    data = {
        "key1": "value1",
        "key2": "value2"
    }
    url = "http://httpbin.org/post"
    html = get_response(url, data)
    print(html)
正则表达式
获取中文个数
代码语言:python
# 获取中文个数
import re
def getnum_of_cn(inputdata):
    """Count the Chinese characters in ``inputdata``.

    A character counts as Chinese when it lies in the CJK Unified
    Ideographs block, U+4E00 through U+9FFF. Punctuation, digits and
    Latin letters are not counted.

    Args:
        inputdata: The string to inspect.

    Returns:
        The number of matching characters as an ``int``.
    """
    # The ``\u`` escapes are required: without the backslashes the class
    # degrades to ASCII ranges and counts letters and digits instead.
    return len(re.findall(r'[\u4E00-\u9FFF]', inputdata))
def test():
    """Print the Chinese-character count of a fixed sample string."""
    n = getnum_of_cn('你好,lajfldkjaklda123')
    print(n)


if __name__ == '__main__':
    test()
返回匹配到的第一个
代码语言:python
# 返回匹配到的第一个
# -*- coding: UTF-8 -*-
import re
def search_text(inputdata):
    """Find the first occurrence of the literal text ``nlp``.

    Args:
        inputdata: The string to search.

    Returns:
        The ``re.Match`` object for the first occurrence, or ``None``
        when ``inputdata`` does not contain ``nlp``.
    """
    # re.search stops at the first hit; later occurrences are ignored.
    return re.search('nlp', inputdata)
def test():
    """Print the match found by ``search_text`` in a fixed sample string."""
    n = search_text('你好,nlp先生!nlp先生!')
    print(n)


if __name__ == '__main__':
    test()
去除html标签
代码语言:python
# 去除html标签
import re
from typing import Text
# 在此实现代码
def remove_html(content):
    """Strip HTML tags from ``content`` and keep the text between them.

    A tag is anything from ``<`` up to the next ``>`` with at least one
    character in between. Whitespace and line breaks around the tags are
    left untouched.

    Args:
        content: Text that may contain HTML markup.

    Returns:
        ``content`` with every tag removed.
    """
    # ``[^>]+`` needs the ``+`` quantifier to span a whole tag. The
    # negated class already matches newlines, so no DOTALL flag is needed.
    pattern = re.compile(r'<[^>]+>')
    return pattern.sub('', content)
if __name__ == '__main__':
    # Demo: strip the markup from a small sample page and print the text.
    html = '''
<html>
<head>
<title>这是一个简单的测试页面</title>
</head>
<body>
<p class="item-0">body 元素的内容会显示在浏览器中。</p>
<p class="item-1">title 元素的内容会显示在浏览器的标题栏中。</p>
</body>
</html>
'''
    # Lower-case name so the imported ``typing.Text`` is not rebound.
    text = remove_html(html)
    print(text)
查找字符串里含有的全部IPV4和IPV6地址
代码语言:python
# 查找字符串里含有的全部IPV4和IPV6地址
import re
# One decimal octet, 0-255 (a leading zero such as "01" is accepted).
_IPV4_OCTET = r"(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)"
# Four dot-separated octets. ASCII mode keeps ``\b`` working when an
# address sits directly next to CJK text, which Unicode mode would treat
# as word characters. The trailing ``\b`` stops "1.2.3.256" from being
# reported as "1.2.3.25".
_IPV4_RE = re.compile(
    r"\b(?:" + _IPV4_OCTET + r"\.){3}" + _IPV4_OCTET + r"\b",
    re.ASCII,
)


def find_all_ipv4(text):
    """Find every dotted-quad IPv4 address contained in ``text``.

    Args:
        text: The string to scan.

    Returns:
        A list with one ``{'type': 'ipv4', 'value': <address>}`` dict per
        address, in order of appearance. Empty when nothing matches.
    """
    return [
        {'type': 'ipv4', 'value': m.group(0)}
        for m in _IPV4_RE.finditer(text)
    ]
# Building blocks for the IPv6 pattern.
_IPV6_H16 = r"[0-9A-Fa-f]{1,4}"
_IPV6_V4_OCTET = r"(?:25[0-5]|(?:2[0-4]|1?[0-9])?[0-9])"
_IPV6_V4_TAIL = r"(?:" + _IPV6_V4_OCTET + r"\.){3}" + _IPV6_V4_OCTET

# Python tries alternatives left to right, so the forms that extend a
# plain address (zone id, embedded IPv4) must come first; otherwise the
# plain form would match a prefix and cut the address short.
_IPV6_FORMS = (
    # Link-local with zone id, e.g. fe80::7:8%eth0
    r"[fF][eE]80:(?::[0-9A-Fa-f]{0,4}){0,4}%[0-9A-Za-z]+",
    # IPv4-mapped / IPv4-translated, e.g. ::ffff:192.0.2.1
    r"::(?:[fF]{4}(?::0{1,4})?:)?" + _IPV6_V4_TAIL,
    # IPv4-embedded, e.g. 2001:db8:3:4::192.0.2.33
    r"(?:" + _IPV6_H16 + r":){1,4}:" + _IPV6_V4_TAIL,
    # Full form with eight groups
    r"(?:" + _IPV6_H16 + r":){7}" + _IPV6_H16,
    # Compressed forms, one per possible position of "::"
    r"(?:" + _IPV6_H16 + r":){1,7}:",
    r"(?:" + _IPV6_H16 + r":){1,6}:" + _IPV6_H16,
    r"(?:" + _IPV6_H16 + r":){1,5}(?::" + _IPV6_H16 + r"){1,2}",
    r"(?:" + _IPV6_H16 + r":){1,4}(?::" + _IPV6_H16 + r"){1,3}",
    r"(?:" + _IPV6_H16 + r":){1,3}(?::" + _IPV6_H16 + r"){1,4}",
    r"(?:" + _IPV6_H16 + r":){1,2}(?::" + _IPV6_H16 + r"){1,5}",
    _IPV6_H16 + r":(?::" + _IPV6_H16 + r"){1,6}",
    r":(?:(?::" + _IPV6_H16 + r"){1,7}|:)",
)

# The lookbehind keeps a match from starting in the middle of a word.
# The lookahead rejects a match that is followed by more address text
# (an alphanumeric, or a colon that continues the address), which forces
# the engine to fall through to the alternative covering the whole
# address instead of reporting a truncated prefix such as "fe80::".
_IPV6_RE = re.compile(
    r"(?<![0-9A-Za-z])(?:"
    + "|".join(_IPV6_FORMS)
    + r")(?![0-9A-Za-z]|:[0-9A-Fa-f:])"
)


def find_all_ipv6(text):
    """Find every IPv6 address contained in ``text``.

    Recognises the full eight-group form, ``::``-compressed forms,
    link-local addresses with a zone id and addresses with an embedded
    dotted-quad IPv4 tail.

    Args:
        text: The string to scan.

    Returns:
        A list with one ``{'type': 'ipv6', 'value': <address>}`` dict per
        address, in order of appearance. Empty when nothing matches.
    """
    return [
        {'type': 'ipv6', 'value': m.group(0)}
        for m in _IPV6_RE.finditer(text)
    ]
def find_all_ip(text):
    """Find every IPv4 and IPv6 address contained in ``text``.

    Args:
        text: The string to scan.

    Returns:
        The results of ``find_all_ipv4`` followed by the results of
        ``find_all_ipv6``, as a single list.
    """
    # The two result lists are concatenated, IPv4 entries first.
    return find_all_ipv4(text) + find_all_ipv6(text)
if __name__ == '__main__':
    # Demo: list every address found in a sample sentence. The variable
    # is named ``text`` so the builtin ``input`` is not shadowed.
    text = 'IP地址有IPV4,例如:192.168.100.2,也有IPV6,例如:fe80:0000:0000:0000:0204:61ff:fe9d:f156,以及:fe80:0000:0000:0000:0204:61ff:fe9d:f156,还有 192.168.100.50'
    results = find_all_ip(text)
    for item in results:
        print('type: {}, value: {}'.format(item['type'], item['value']))
本文内容到此结束了, 如有收获欢迎点赞