一、字符相关
固定文本
- 匹配字符串中固定的文本
import re
text = "你好wade3maimi,我是你的fans,你是永远的wadegreast3"
data_list1 = re.findall("wade", text)
data_list2 = re.findall("你好", text)
print(data_list1)
print(data_list2)
输出:
['wade', 'wade']
['你好']
匹配字符
- 匹配字符串里a或b或c
import re
text = "你好wade3maimi,我是gfdbc你的fans,abcfgh你是永远的wadegreast3cb"
data_list = re.findall("[abc]", text)
print(data_list)
输出:
['a', 'a', 'b', 'c', 'a', 'a', 'b', 'c', 'a', 'a', 'c', 'b']
- 匹配qaw或qbw或qcw的字符
import re
text = "你好qcwmaimi,我是qbw你的fansqabcw,abcfgh你是qaw永远的wadegreast3cb"
data_list = re.findall("q[abc]w", text)
print(data_list)
输出:
['qcw', 'qbw', 'qaw']
- 字符范围a-z,0-9
import re
text = "tahhhuutywkatbhjklntctz"
data_list = re.findall("t[a-z]", text)
print(data_list)
输出:
['ta', 'ty', 'tb', 'tc', 'tz']
import re
text = "tahhhuutywkat79bhjklntctz"
data_list = re.findall("t[0-9]", text)
print(data_list)
输出:
['t7']
import re
text = "tahhhuutywkatbhjklntctz"
data_list = re.findall("t[0-9]", text)
print(data_list)
输出:
[]
- \d 代表1个数字;+代表1个或n个;*代表0个或者n个;?代表0个或者1个;{n}代表固定n个;{n,}代表n+个(n个或n个以上);{n,m}代表n<=个数<=m;
import re
text = "asd2wyt-yd3hj-jjd123"
data_list = re.findall(r"d\d", text)
print(data_list)
输出:
['d2', 'd3', 'd1']
import re
text = "rodt-yad3hdd9888j-jjd123"
data_list = re.findall(r"d\d+", text) # +,1个或n个
print(data_list)
输出:
['d3', 'd9888', 'd123']
import re
text = "rodt-yad3hdd9888j-jjd123"
data_list = re.findall(r"d\d*", text) # *,0个或n个
print(data_list)
输出:
['d', 'd3', 'd', 'd9888', 'd123']
import re
text = "rodt-yad3hdd9888j-jjd123"
data_list = re.findall(r"d\d?", text) # ?,0个或1个
print(data_list)
输出:
['d', 'd3', 'd', 'd9', 'd1']
import re
text = "rodt-yad3hdd9888j-jjd123"
data_list = re.findall(r"d\d{2}", text) # {n},固定n个
print(data_list)
输出:
['d98', 'd12']
import re
text = "rodt-yad3hdd9888j-jjd123"
data_list = re.findall(r"d\d{2,}", text) # {n,},n个或n个以上
print(data_list)
输出:
['d9888', 'd123']
import re
text = "rodt-yad32hdd9888j-jjd123-jjd123789"
data_list = re.findall(r"d\d{2,4}", text) # {n,m},固定[n,m]个
print(data_list)
输出:
['d32', 'd9888', 'd123', 'd1237']
import re
text = "rodt-yad32hdd9888j-jjd123-jjd123789"
data_list = re.findall("d+", text) # d出现1次或者n次
print(data_list)
输出:
['d', 'd', 'dd', 'd', 'd']
- \w 字母、数字、下划线(汉字);正则默认是贪婪匹配,如需改成非贪婪匹配则在数量词后面加?;
import re
text = "韦德wade迈阿密 韦德3e 哈哈哈韦德maimie 韦德美国_e"
# 韦德开头,e结尾,中间是1个或n个字母、数字、下划线(汉字);中间空格无法识别则分开提取
data_list = re.findall(r"韦德\w+e", text)
print(data_list)
输出:
['韦德wade', '韦德3e', '韦德maimie', '韦德美国_e']
import re
text = "韦德wade迈阿密韦德3e哈哈哈韦德maimie韦德美国_e"
# 韦德开头,e结尾,中间是1个或n个字母、数字、下划线(汉字);没有空格会尽可能多去匹配(默认贪婪匹配)
data_list = re.findall(r"韦德\w+e", text)
print(data_list)
输出:
['韦德wade迈阿密韦德3e哈哈哈韦德maimie韦德美国_e']
import re
text = "韦德wade迈阿密韦德3e哈哈哈韦德maimie韦德美国_e"
# 韦德开头,e结尾,中间是1个或n个字母、数字、下划线(汉字);找到第一个匹配就不再继续(非贪婪匹配)
data_list = re.findall(r"韦德\w+?e", text)
print(data_list)
输出:
['韦德wade', '韦德3e', '韦德maimie', '韦德美国_e']
- . 代表除换行符以外任意字符
import re
text = "rtyto-raoyuo-rboa"
data_list = re.findall("r.o", text) # .代表任意1个字符
print(data_list)
输出:
['rao', 'rbo']
import re
text = "rtyto-raoyuo-rboa"
data_list = re.findall("r.+o", text) # .+代表1个或n个字符,默认贪婪匹配
print(data_list)
输出:
['rtyto-raoyuo-rbo']
import re
text = "rtyto-raoyuo-rboa"
data_list = re.findall("r.+?o", text) # .+?代表1个或n个字符,非贪婪匹配
print(data_list)
输出:
['rtyto', 'rao', 'rbo']
- \s 代表任意空白字符
import re
text = "root admin add admin"
data_list = re.findall(r"a\w+\s\w+", text) # \s代表1个空白字符
print(data_list)
输出:
['admin add']
import re
text = "root admin  fdd admin"
data_list = re.findall(r"a\w+\s\w+", text) # \s代表1个空白字符,连续两个空格则无法匹配
print(data_list)
输出:
[]
import re
text = "root admin  fdd admin"
data_list = re.findall(r"a\w+\s\s\w+", text) # \s\s代表2个空白字符
print(data_list)
输出:
['admin  fdd']
二、数量相关总结
- *,0或n
- +,1或n
- ?,0或1
- {n},固定n个
- {n,},n+个(n个或n个以上)
- {n,m},n-m闭区间个
注意:默认贪婪匹配;如需非贪婪匹配,在数量词后加?(如+?、*?)
三、分组
- 提取数据区域
import re
text = "楼主手机13046788791,邮箱789987666@126.com;群主号码13046787654,邮箱是8976777@qq.com"
data_list = re.findall(r"130467\d{5}", text) #匹配130467开头,后面5位是数字的字符
print(data_list)
输出:
['13046788791', '13046787654']
import re
text = "楼主手机13046788791,邮箱789987666@126.com;群主号码13046787654,邮箱是8976777@qq.com"
data_list = re.findall(r"13046(7\d{5})", text) # 匹配后只截取()部分字符
print(data_list)
输出:
['788791', '787654']
import re
text = "楼主手机13046788791,邮箱789987666@126.com;群主号码13046787654,邮箱是8976777@qq.com"
data_list = re.findall(r"(1\d{2})46(7\d{5})", text) # 匹配后将多个分组以元组形式保存列表
print(data_list)
输出:
[('130', '788791'), ('130', '787654')]
- 提取数据区域 或
import re
text = "楼主手机13046root,邮箱789987666@126.com;群主号码13046787654,邮箱是8976777@qq.com"
# 匹配13046(7\d{5})
# 匹配13046(r\w+t)
data_list = re.findall(r"13046(7\d{5}|r\w+t)", text) #将匹配结果截取7\d{5}或r\w+t部分
print(data_list)
输出:
['root', '787654']
四、练习题
- 提取身份证号
import re
text = "我的身份证号是360722199808073032,周杰伦的身份证号是45678919790921675X"
data_list = re.findall(r"\d{17}[\dX]", text) #匹配前面17位是数字,最后一位是数字或者X
print(data_list)
输出:
['360722199808073032', '45678919790921675X']
- 提取身份证中的出生年月
import re
text = "我的身份证号是360722199808073032,周杰伦的身份证号是45678919790921675X"
data_list = re.findall(r"\d{6}(\d{4})(\d{2})\d{5}[\dX]", text) # 将年份和月份分别提取出来
print(data_list)
输出:
[('1998', '08'), ('1979', '09')]
- 提取邮箱
import re
text = "楼主手机13046root,邮箱789987666@126.com;群主号码13046787654,邮箱是8976777@qq.com"
# .代表任意字符,如果特定提取.则用\.转义
data_list = re.findall(r"\w+@\w+\.\w+", text)
print(data_list)
输出
['邮箱789987666@126.com', '邮箱是8976777@qq.com']
import re
text = "楼主手机13046root,邮箱789987666@126.com;群主号码13046787654,邮箱是8976777@qq.com"
# 带上re.ASCII后,\w就不包含中文
# .代表任意字符,如果特定提取.则用\.转义
data_list = re.findall(r"\w+@\w+\.\w+", text, re.ASCII)
print(data_list)
输出:
['789987666@126.com', '8976777@qq.com']
import re
text = "楼主手机13046root,邮箱789987666@126.com;群主号码13046787654,邮箱是8976777@qq.com"
# .代表任意字符,如果特定提取.则用\.转义
data_list = re.findall(r"[a-zA-Z0-9_-]+@[a-zA-Z0-9_-]+\.[a-zA-Z0-9_-]+", text)
print(data_list)
输出:
['789987666@126.com', '8976777@qq.com']
五、re模块
- re.findall,获取匹配成功的所有结果
import re
text = "换了斗地主,2B逗3B"
# .代表任意字符,如果特定提取.则用\.转义
data_list = re.findall(r"\dB", text)
print(data_list)
输出:
['2B', '3B']
- re.match,从开始进行匹配,开头未匹配成功就不再向后看,返回第一个对象
import re
text = "换了斗地主,2B逗3B"
# .代表任意字符,如果特定提取.则用\.转义
data_list = re.match(r"\dB", text)
print(data_list)
输出:
None
import re
text = "2B逗3B"
# .代表任意字符,如果特定提取.则用\.转义
data_list = re.match(r"\dB", text) # 返回<re.Match object; span=(0, 2), match='2B'>对象
print(data_list)
content = data_list.group() # 通过group接收对象
print(content)
输出
<re.Match object; span=(0, 2), match='2B'>
2B
match拓展应用
import re
mobile = input("请输入手机号:")
mobile = mobile.strip() # 去除空格
# 1.校验手机号是否正确
result = re.match(r"^1[3-9]\d{9}$", mobile) # ^表示开头,$结尾,手机号非1开头或超出11位都会校验住
if result:
    print("格式正确")
else:
    print("格式错误")
- re.search,浏览整个字符串去匹配,返回第一个匹配成功的对象
import re
text = "2B逗3B"
# .代表任意字符,如果特定提取.则用\.转义
data_list = re.search(r"\dB", text) # 返回<re.Match object; span=(0, 2), match='2B'>对象
print(data_list)
content = data_list.group() # 通过group接收对象
print(content)
输出:
<re.Match object; span=(0, 2), match='2B'>
2B