正则表达式
正则表达式简介
搜索(searching),即在字符串任意部分中搜索匹配的模式;
匹配(matching)是指判断一个字符串能否从起始处全部或者部分地匹配某个模式。
(|),使用择一匹配符号匹配多个正则表达式模式
匹配任意单个字符:点号或者句点(.)符号匹配除了换行符\n 以外的任何字符(匹配句点本身需要转义,即写成\.)
特殊字符\b 和\B 可以用来匹配字符边界(\b 匹配单词边界,\B 匹配非单词边界)。
正则表达式和Python 语言
re 模块来支持正则表达式(re 取代了旧的regex 模块和regsub 模块,导入这两个旧模块会触发ImportError 异常。)
re.compile()预编译来提升执行性能
group()要么返回整个匹配对象,要么根据要求返回特定子组。
groups()则仅返回一个包含唯一或者全部子组的元组。
如果没有子组的要求,那么当group()仍然返回整个匹配时,groups()返回一个空元组。
从字符串的起始部分对模式进行匹配。如果匹配成功,就返回一个匹配对象;如果匹配失败,就返回None,匹配对象的group()方法能够用于显示那个成功的匹配。
>>> m = re.match('foo', 'foo') # 模式匹配字符串
>>> if m is not None: # 如果匹配成功,就输出匹配内容
...     m.group()
...
'foo'
>>> m # 确认返回的匹配对象
#失败的匹配示例,它返回None
>>> m = re.match('foo', 'bar')# 模式并不能匹配字符串
>>> if m is not None: m.group() # (实际中最好不要省略,单行版本的if 语句)
...
>>>
>>> m = re.match('foo', 'food on the table') # 匹配成功
>>> m.group()
'foo'
#如果匹配失败,将会抛出AttributeError 异常。
>>> re.match('foo', 'food on the table').group()
'foo'
使用search()在一个字符串中查找模式(搜索与匹配的对比)
>>> m = re.search('foo', 'seafood') # 使用 search() 代替
>>> if m is not None: m.group()
...
'foo' # 搜索成功,但是匹配失败
>>>
匹配多个字符串
>>> bt = 'bat|bet|bit' # 正则表达式模式: bat、bet、bit
>>> m = re.match(bt, 'bat') # 'bat' 是一个匹配
>>> if m is not None: m.group()
...
'bat'
>>> m = re.match(bt, 'blt') # 对于 'blt' 没有匹配
>>> if m is not None: m.group()
...
>>> m = re.match(bt, 'He bit me!') # 不能匹配字符串
>>> if m is not None: m.group()
...
>>> m = re.search(bt, 'He bit me!') # 通过搜索查找 'bit'
>>> if m is not None: m.group()
...
'bit'
匹配任何单个字符
>>> anyend = '.end'
>>> m = re.match(anyend, 'bend') # 点号匹配 'b'
>>> if m is not None: m.group()
...
'bend'
>>> m = re.match(anyend, 'end') # 不匹配任何字符
>>> if m is not None: m.group()
...
>>> m = re.match(anyend, '\nend') # 除了 \n 之外的任何字符
>>> if m is not None: m.group()
...
>>> m = re.search('.end', 'The end.')# 在搜索中匹配 ' '
>>> if m is not None: m.group()
...
' end'
>>> patt314 = '3.14' # 表示正则表达式的点号
>>> pi_patt = r'3\.14' # 表示字面量的点号 (dec. point)
>>> m = re.match(pi_patt, '3.14') # 精确匹配
>>> if m is not None: m.group()
...
'3.14'
>>> m = re.match(patt314, '3014') # 点号匹配'0'
>>> if m is not None: m.group()
...
'3014'
>>> m = re.match(patt314, '3.14') # 点号匹配 '.'
>>> if m is not None: m.group()
...
'3.14'
创建字符集([ ])
>>> m = re.match('[cr][23][dp][o2]', 'c3po')# 匹配 'c3po'
>>> if m is not None: m.group()
...
'c3po'
>>> m = re.match('[cr][23][dp][o2]', 'c2do')# 匹配 'c2do'
>>> if m is not None: m.group()
...
'c2do'
>>> m = re.match('r2d2|c3po', 'c2do')# 不匹配 'c2do'
>>> if m is not None: m.group()
...
>>> m = re.match('r2d2|c3po', 'r2d2')# 匹配 'r2d2'
>>> if m is not None: m.group()
...
'r2d2'
重复、特殊字符以及分组
>>> patt = r'\w+@(\w+\.)?\w+\.com'
>>> re.match(patt, 'nobody@xxx.com').group()
'nobody@xxx.com'
>>> re.match(patt, 'nobody@www.xxx.com').group()
'nobody@www.xxx.com'
>>> patt = r'\w+@(\w+\.)*\w+\.com'
>>> re.match(patt, 'nobody@www.xxx.yyy.zzz.com').group()
'nobody@www.xxx.yyy.zzz.com'
>>> m = re.match(r'\w\w\w-\d\d\d', 'abc-123')
>>> if m is not None: m.group()
...
'abc-123'
>>> m = re.match(r'\w\w\w-\d\d\d', 'abc-xyz')
>>> if m is not None: m.group()
...
>>>
group()通常用于以普通方式显示所有的匹配部分,但也能用于获取各个匹配的子组。
>>> m = re.match(r'(\w\w\w)-(\d\d\d)', 'abc-123')
>>> m.group() # 完整匹配
'abc-123'
>>> m.group(1) # 子组 1
'abc'
>>> m.group(2) # 子组 2
'123'
>>> m.groups() # 全部子组
('abc', '123')
groups()方法来获取一个包含所有匹配子字符串的元组
>>> m = re.match('ab', 'ab') # 没有子组
>>> m.group() # 完整匹配
'ab'
>>> m.groups() # 所有子组
()
>>>
>>> m = re.match('(ab)', 'ab') # 一个子组
>>> m.group() # 完整匹配
'ab'
>>> m.group(1) # 子组 1
'ab'
>>> m.groups() # 全部子组
('ab',)
>>>
>>> m = re.match('(a)(b)', 'ab') # 两个子组
>>> m.group() # 完整匹配
'ab'
>>> m.group(1) # 子组 1
'a'
>>> m.group(2) # 子组 2
'b'
>>> m.groups() # 所有子组
('a', 'b')
匹配字符串的起始和结尾以及单词边界
>>> m = re.search('^The', 'The end.') # 匹配
>>> if m is not None: m.group()
...
'The'
>>> m = re.search('^The', 'end. The') # 不作为起始
>>> if m is not None: m.group()
...
>>> m = re.search(r'\bthe', 'bite the dog') # 在边界
>>> if m is not None: m.group()
...
'the'
>>> m = re.search(r'\bthe', 'bitethe dog') # 有边界
>>> if m is not None: m.group()
...
>>> m = re.search(r'\Bthe', 'bitethe dog') # 没有边界
>>> if m is not None: m.group()
...
'the'
使用findall()和finditer()查找每一次出现的位置
findall()查询字符串中某个正则表达式模式全部的非重叠出现情况。
>>> re.findall('car', 'car')
['car']
>>> re.findall('car', 'scary')
['car']
>>> re.findall('car', 'carry the barcardi to the car')
['car', 'car', 'car']
>>> s = 'This and that.'
>>> re.findall(r'(th\w+) and (th\w+)', s, re.I)
[('This', 'that')]
>>> re.finditer(r'(th\w+) and (th\w+)', s,
... re.I).next().groups()
('This', 'that')
>>> re.finditer(r'(th\w+) and (th\w+)', s,
... re.I).next().group(1)
'This'
>>> re.finditer(r'(th\w+) and (th\w+)', s,
... re.I).next().group(2)
'that'
>>> [g.groups() for g in re.finditer(r'(th\w+) and (th\w+)',
... s, re.I)]
[('This', 'that')]
>>> re.findall(r'(th\w+)', s, re.I)
['This', 'that']
>>> it = re.finditer(r'(th\w+)', s, re.I)
>>> g = it.next()
>>> g.groups()
('This',)
>>> g.group(1)
'This'
>>> g = it.next()
>>> g.groups()
('that',)
>>> g.group(1)
'that'
>>> [g.group(1) for g in re.finditer(r'(th\w+)', s, re.I)]
['This', 'that']
使用sub()和subn()搜索与替换
实现搜索和替换功能:sub()和subn()。
subn()和sub()一样,但subn()还返回一个表示替换的总数,替换后的字符串和表示替换总数的数字一起作为一个拥有两个
元素的元组返回。
>>> re.sub('X', 'Mr. Smith', 'attn: X\n\nDear X,\n')
'attn: Mr. Smith\n\nDear Mr. Smith,\n'
>>>
>>> re.subn('X', 'Mr. Smith', 'attn: X\n\nDear X,\n')
('attn: Mr. Smith\n\nDear Mr. Smith,\n', 2)
>>>
>>> print re.sub('X', 'Mr. Smith', 'attn: X\n\nDear X,\n')
attn: Mr. Smith

Dear Mr. Smith,
>>> re.sub('[ae]', 'X', 'abcdef')
'XbcdXf'
>>> re.subn('[ae]', 'X', 'abcdef')
('XbcdXf', 2)
>>> re.sub(r'(\d{1,2})/(\d{1,2})/(\d{2}|\d{4})',
... r'\2/\1/\3', '2/20/91') # Yes, Python is...
'20/2/91'
>>> re.sub(r'(\d{1,2})/(\d{1,2})/(\d{2}|\d{4})',
... r'\2/\1/\3', '2/20/1991') # ... 20+ years old!
'20/2/1991'
在限定模式上使用split()分隔字符串
>>> re.split(':', 'str1:str2:str3')
['str1', 'str2', 'str3']
>>> import re
>>> DATA = (
... 'Mountain View, CA 94040',
... 'Sunnyvale, CA',
... 'Los Altos, 94023',
... 'Cupertino 95014',
... 'Palo Alto CA',
... )
>>> for datum in DATA:
...     print re.split(r', |(?= (?:\d{5}|[A-Z]{2})) ', datum)
...
['Mountain View', 'CA', '94040']
['Sunnyvale', 'CA']
['Los Altos', '94023']
['Cupertino', '95014']
['Palo Alto', 'CA']
扩展符号
>>> re.findall(r'(?i)yes', 'yes? Yes. YES!!')
['yes', 'Yes', 'YES']
>>> re.findall(r'(?i)th\w+', 'The quickest way is through this tunnel.')
['The', 'through', 'this']
>>> re.findall(r'(?im)(^th[\w ]+)', """
... This line is the first,
... another line,
... that line, it's the best
... """)
['This line is the first', 'that line']
>>> re.findall(r'th.+', '''
... The first line
... the second line
... the third line
... ''')
['the second line', 'the third line']
>>> re.findall(r'(?s)th.+', '''
... The first line
... the second line
... the third line
... ''')
['the second line\nthe third line\n']
>>> re.search(r'''(?x)
... \((\d{3})\) # 区号
... [ ] # 空白符
... (\d{3}) # 前缀
... - # 横线
... (\d{4}) # 终点数字
... ''', '(800) 555-1212').groups()
('800', '555', '1212')
>>> re.findall(r'http://(?:\w+\.)*(\w+\.com)',
... 'http://google.com http://www.google.com http://code.google.com')
['google.com', 'google.com', 'google.com']
>>> re.search(r'\((?P<areacode>\d{3})\) (?P<prefix>\d{3})-(?:\d{4})',
... '(800) 555-1212').groupdict()
{'areacode': '800', 'prefix': '555'}
>>> re.sub(r'\((?P<areacode>\d{3})\) (?P<prefix>\d{3})-(?:\d{4})',
... r'(\g<areacode>) \g<prefix>-xxxx', '(800) 555-1212')
'(800) 555-xxxx'
>>> bool(re.match(r'\((?P<areacode>\d{3})\) (?P<prefix>\d{3})-(?P<number>\d{4}) (?P=areacode)-(?P=prefix)-(?P=number) 1(?P=areacode)(?P=prefix)(?P=number)',
... '(800) 555-1212 800-555-1212 18005551212'))
True
>>> bool(re.match(r'''(?x)
...
... # match (800) 555-1212, save areacode, prefix, no.
... \((?P<areacode>\d{3})\)[ ](?P<prefix>\d{3})-(?P<number>\d{4})
...
... # space
... [ ]
...
... # match 800-555-1212
... (?P=areacode)-(?P=prefix)-(?P=number)
...
... # space
... [ ]
...
... # match 18005551212
... 1(?P=areacode)(?P=prefix)(?P=number)
...
... ''', '(800) 555-1212 800-555-1212 18005551212'))
True
>>> re.findall(r'\w+(?= van Rossum)',
... '''
...     Guido van Rossum
...     Tim Peters
...     Alex Martelli
...     Just van Rossum
...     Raymond Hettinger
... ''')
['Guido', 'Just']
>>> re.findall(r'(?m)^\s+(?!noreply|postmaster)(\w+)',
... '''
...     sales@phptr.com
...     postmaster@phptr.com
...     eng@phptr.com
...     noreply@phptr.com
...     admin@phptr.com
... ''')
['sales', 'eng', 'admin']
>>> ['%s@aw.com' % e.group(1) for e in re.finditer(r'(?m)^\s+(?!noreply|postmaster)(\w+)',
... '''
...     sales@phptr.com
...     postmaster@phptr.com
...     eng@phptr.com
...     noreply@phptr.com
...     admin@phptr.com
... ''')]
['sales@aw.com', 'eng@aw.com', 'admin@aw.com']
>>> bool(re.search(r'(?:(x)|y)(?(1)y|x)', 'xy'))
True
>>> bool(re.search(r'(?:(x)|y)(?(1)y|x)', 'xx'))
False
正则表达式示例
#!/usr/bin/env python
"""Split each line of the ``who`` command's output into a list of fields."""
import os
from distutils.log import warn as printf  # compatible with Python 2
import re

# The pipe is closed automatically when the ``with`` block exits.
with os.popen('who', 'r') as f:
    for eachLine in f:
        # Split on a TAB, or on a whitespace character followed by one or
        # more further whitespace characters.
        print(re.split(r'\s\s+|\t', eachLine.strip()))
处理DOS 环境下tasklist 命令的输出(retasklist.py)
#!/usr/bin/env python
"""Extract fields from each line of the DOS ``tasklist /nh`` output."""
import os
import re

# The pattern captures three groups per line: a name built from word
# characters and dots (optionally several space-separated parts), a run of
# digits, and a digits-and-commas figure followed by " K".
# NOTE(review): presumably image name, PID and memory usage -- confirm
# against real tasklist output.
# ``with`` closes the pipe even if processing a line raises.
with os.popen('tasklist /nh', 'r') as f:
    for eachLine in f:
        # print(...) with a single argument behaves the same on Python 2 and 3.
        print(re.findall(
            r'([\w.]+(?: [\w.]+)*)\s\s+(\d+) \w+\s\s+\d+\s\s+([\d,]+ K)',
            eachLine.rstrip()))
用于正则表达式练习的数据生成器(gendata.py)
#!/usr/bin/env python
"""Emit a random number of random data lines for regular-expression practice.

Each line has the form ``<date string>::<login>@<domain>.<tld>::<int>-<n>-<m>``
where ``<int>`` is the integer the date string was made from and ``<n>``/``<m>``
are the lengths of the login and domain parts.
"""
from distutils.log import warn as printf
from random import randrange, choice
from string import ascii_lowercase as lc
from time import ctime

tlds = ('com', 'edu', 'net', 'org', 'gov')

# Use xrange where the builtins provide it, otherwise fall back to range.
if hasattr(__builtins__, 'xrange'):
    myrng = xrange
else:
    myrng = range

for i in myrng(randrange(5, 11)):           # between 5 and 10 lines
    dtint = randrange(2**32)                # pick date
    dtstr = ctime(dtint)                    # date string
    llen = randrange(4, 7)                  # login is shorter
    login = ''.join(choice(lc) for j in myrng(llen))
    dlen = randrange(llen, 13)              # domain is longer
    dom = ''.join(choice(lc) for j in myrng(dlen))
    printf('%s::%s@%s.%s::%d-%d-%d' % (dtstr, login,
        dom, choice(tlds), dtint, llen, dlen))



