python正则表达式

正则表达式

正则表达式简介
搜索（searching），即在字符串任意部分中搜索匹配的模式；
匹配（matching）是指判断一个字符串能否从起始处全部或者部分地匹配某个模式。

择一匹配的管道符号

（|）,使用择一匹配符号匹配多个正则表达式模式

匹配任意单个字符

点号或者句点（.）符号匹配除了换行符 \n 以外的任何单个字符（要匹配字面量的句点本身，需要使用反斜线转义，即 \.）

从字符串起始或者结尾或者单词边界匹配

特殊字符 \b 和 \B 可以用来匹配单词边界：\b 匹配单词边界，\B 匹配非单词边界。

创建字符集

限定范围和否定

使用闭包操作符实现存在性和频数匹配

表示字符集的特殊字符

使用圆括号指定分组

扩展表示法

正则表达式和Python 语言
Python 通过 re 模块来支持正则表达式（re 模块取代了已废弃的 regex 模块和 regsub 模块，若导入这两个旧模块会触发 ImportError 异常。）

re.compile()预编译来提升执行性能

匹配对象以及group()和groups()方法

group()要么返回整个匹配的字符串，要么根据要求返回特定子组。
groups()则仅返回一个包含唯一或者全部子组的元组。
如果没有子组的要求，那么当group()仍然返回整个匹配时，groups()返回一个空元组。

使用match()方法匹配字符串

从字符串的起始部分对模式进行匹配。如果匹配成功，就返回一个匹配对象；如果匹配失败，就返回None，匹配对象的group()方法能够用于显示那个成功的匹配。

>>> m = re.match('foo', 'foo') # 模式匹配字符串
>>> if m is not None: # 如果匹配成功，就输出匹配内容
...     m.group()
...
'foo'
>>> m # 确认返回的匹配对象
<_sre.SRE_Match object at 0x...>

#失败的匹配示例，它返回None
>>> m = re.match('foo', 'bar')# 模式并不能匹配字符串
>>> if m is not None: m.group() # 单行版本的if 语句（实际代码中最好不要省略这个判断）
...
>>>

>>> m = re.match('foo', 'food on the table') # 匹配成功
>>> m.group()
'foo'
# 不保存匹配对象而直接调用 group()；如果匹配失败，将会抛出AttributeError 异常。
>>> re.match('foo', 'food on the table').group()
'foo'

使用search()在一个字符串中查找模式（搜索与匹配的对比）

>>> m = re.search('foo', 'seafood') # 使用 search() 代替
>>> if m is not None: m.group()
...
'foo' # 搜索成功（而用 match() 则会匹配失败）
>>>

匹配多个字符串

>>> bt = 'bat|bet|bit' # 正则表达式模式: bat、bet、bit
>>> m = re.match(bt, 'bat') # 'bat' 是一个匹配
>>> if m is not None: m.group()
...
'bat'
>>> m = re.match(bt, 'blt') # 对于 'blt' 没有匹配
>>> if m is not None: m.group()
...
>>> m = re.match(bt, 'He bit me!') # 不能匹配字符串
>>> if m is not None: m.group()
...
>>> m = re.search(bt, 'He bit me!') # 通过搜索查找 'bit'
>>> if m is not None: m.group()
...
'bit'

匹配任何单个字符

>>> anyend = '.end'
>>> m = re.match(anyend, 'bend') # 点号匹配 'b'
>>> if m is not None: m.group()
...
'bend'
>>> m = re.match(anyend, 'end') # 不匹配任何字符
>>> if m is not None: m.group()
...
>>> m = re.match(anyend, '\nend') # 除了 \n 之外的任何字符
>>> if m is not None: m.group()
...
>>> m = re.search('.end', 'The end.')# 在搜索中匹配 ' '
>>> if m is not None: m.group()
...
' end'
>>> patt314 = '3.14' # 表示正则表达式的点号
>>> pi_patt = '3\.14' # 表示字面量的点号 (dec. point)
>>> m = re.match(pi_patt, '3.14') # 精确匹配
>>> if m is not None: m.group()
...
'3.14'
>>> m = re.match(patt314, '3014') # 点号匹配'0'
>>> if m is not None: m.group()
...
'3014'
>>> m = re.match(patt314, '3.14') # 点号匹配 '.'
>>> if m is not None: m.group()
...
'3.14'

创建字符集（[ ]）

>>> m = re.match('[cr][23][dp][o2]', 'c3po')# 匹配 'c3po'
>>> if m is not None: m.group()
...
'c3po'
>>> m = re.match('[cr][23][dp][o2]', 'c2do')# 匹配 'c2do'
>>> if m is not None: m.group()
...
'c2do'
>>> m = re.match('r2d2|c3po', 'c2do')# 不匹配 'c2do'
>>> if m is not None: m.group()
...
>>> m = re.match('r2d2|c3po', 'r2d2')# 匹配 'r2d2'
>>> if m is not None: m.group()
...
'r2d2'

重复、特殊字符以及分组

>>> patt = '\w+@(\w+\.)?\w+\.com'
>>> re.match(patt, 'nobody@xxx.com').group()
'nobody@xxx.com'
>>> re.match(patt, 'nobody@www.xxx.com').group()
'nobody@www.xxx.com'

>>> patt = '\w+@(\w+\.)*\w+\.com'
>>> re.match(patt, 'nobody@www.xxx.yyy.zzz.com').group()
'nobody@www.xxx.yyy.zzz.com'

>>> m = re.match('\w\w\w-\d\d\d', 'abc-123')
>>> if m is not None: m.group()
...
'abc-123'
>>> m = re.match('\w\w\w-\d\d\d', 'abc-xyz')
>>> if m is not None: m.group()
...
>>>

group()通常用于以普通方式显示所有的匹配部分，但也能用于获取各个匹配的子组。

>>> m = re.match('(\w\w\w)-(\d\d\d)', 'abc-123')
>>> m.group() # 完整匹配
'abc-123'
>>> m.group(1) # 子组 1
'abc'
>>> m.group(2) # 子组 2
'123'
>>> m.groups() # 全部子组
('abc', '123')

groups()方法来获取一个包含所有匹配子字符串的元组

>>> m = re.match('ab', 'ab') # 没有子组
>>> m.group() # 完整匹配
'ab'
>>> m.groups() # 所有子组
()
>>>
>>> m = re.match('(ab)', 'ab') # 一个子组
>>> m.group() # 完整匹配
'ab'
>>> m.group(1) # 子组 1
'ab'
>>> m.groups() # 全部子组
('ab',)
>>>
>>> m = re.match('(a)(b)', 'ab') # 两个子组
>>> m.group() # 完整匹配
'ab'
>>> m.group(1) # 子组 1
'a'
>>> m.group(2) # 子组 2
'b'
>>> m.groups() # 所有子组
('a', 'b')

匹配字符串的起始和结尾以及单词边界

>>> m = re.search('^The', 'The end.') # 匹配
>>> if m is not None: m.group()
...
'The'
>>> m = re.search('^The', 'end. The') # 不作为起始
>>> if m is not None: m.group()
...
>>> m = re.search(r'\bthe', 'bite the dog') # 在边界
>>> if m is not None: m.group()
...
'the'
>>> m = re.search(r'\bthe', 'bitethe dog') # \b 要求单词边界，此处没有边界，匹配失败
>>> if m is not None: m.group()
...
>>> m = re.search(r'\Bthe', 'bitethe dog') # \B 要求非单词边界，此处没有边界，匹配成功
>>> if m is not None: m.group()
...
'the'

使用findall()和finditer()查找每一次出现的位置

findall()查询字符串中某个正则表达式模式全部的非重叠出现情况，并以列表的形式返回。

>>> re.findall('car', 'car')
['car']
>>> re.findall('car', 'scary')
['car']
>>> re.findall('car', 'carry the barcardi to the car')
['car', 'car', 'car']

>>> s = 'This and that.'
>>> re.findall(r'(th\w+) and (th\w+)', s, re.I)
[('This', 'that')]
>>> re.finditer(r'(th\w+) and (th\w+)', s,
... re.I).next().groups()
('This', 'that')
>>> re.finditer(r'(th\w+) and (th\w+)', s,
... re.I).next().group(1)
'This'
>>> re.finditer(r'(th\w+) and (th\w+)', s,
... re.I).next().group(2)
'that'
>>> [g.groups() for g in re.finditer(r'(th\w+) and (th\w+)',
... s, re.I)]
[('This', 'that')]


>>> re.findall(r'(th\w+)', s, re.I)
['This', 'that']
>>> it = re.finditer(r'(th\w+)', s, re.I)
>>> g = it.next()
>>> g.groups()
('This',)
>>> g.group(1)
'This'
>>> g = it.next()
>>> g.groups()
('that',)
>>> g.group(1)
'that'
>>> [g.group(1) for g in re.finditer(r'(th\w+)', s, re.I)]
['This', 'that']

使用sub()和subn()搜索与替换

实现搜索和替换功能：sub()和subn()。
subn()和sub()一样，但subn()还返回一个表示替换的总数，替换后的字符串和表示替换总数的数字一起作为一个拥有两个
元素的元组返回。

>>> re.sub('X', 'Mr. Smith', 'attn: X\n\nDear X,\n')
'attn: Mr. Smith\n\nDear Mr. Smith,\n'
>>>
>>> re.subn('X', 'Mr. Smith', 'attn: X\n\nDear X,\n')
('attn: Mr. Smith\n\nDear Mr. Smith,\n', 2)
>>>
>>> print re.sub('X', 'Mr. Smith', 'attn: X\n\nDear X,\n')
attn: Mr. Smith

Dear Mr. Smith,

>>> re.sub('[ae]', 'X', 'abcdef')
'XbcdXf'
>>> re.subn('[ae]', 'X', 'abcdef')
('XbcdXf', 2)


>>> re.sub(r'(\d{1,2})/(\d{1,2})/(\d{2}|\d{4})',
... r'\2/\1/\3', '2/20/91') # Yes, Python is...
'20/2/91'
>>> re.sub(r'(\d{1,2})/(\d{1,2})/(\d{2}|\d{4})',
... r'\2/\1/\3', '2/20/1991') # ... 20+ years old!
'20/2/1991'

在限定模式上使用split()分隔字符串

>>> re.split(':', 'str1:str2:str3')
['str1', 'str2', 'str3']


>>> import re
>>> DATA = (
... 'Mountain View, CA 94040',
... 'Sunnyvale, CA',
... 'Los Altos, 94023',
... 'Cupertino 95014',
... 'Palo Alto CA',
... )
>>> for datum in DATA:
...     print re.split(', |(?= (?:\d{5}|[A-Z]{2})) ', datum)
...
['Mountain View', 'CA', '94040']
['Sunnyvale', 'CA']
['Los Altos', '94023']
['Cupertino', '95014']
['Palo Alto', 'CA']

扩展符号

>>> re.findall(r'(?i)yes', 'yes? Yes. YES!!')
['yes', 'Yes', 'YES']
>>> re.findall(r'(?i)th\w+', 'The quickest way is through this tunnel.')
['The', 'through', 'this']
>>> re.findall(r'(?im)(^th[\w ]+)', """
... This line is the first,
... another line,
... that line, it's the best
... """)
['This line is the first', 'that line']

>>> re.findall(r'th.+', '''
... The first line
... the second line
... the third line
... ''')
['the second line', 'the third line']
>>> re.findall(r'(?s)th.+', '''
... The first line
... the second line
... the third line
... ''')
['the second line\nthe third line\n']

>>> re.search(r'''(?x)
...     \((\d{3})\) # 区号
...     [ ]         # 空白符
...     (\d{3})     # 前缀
...     -           # 横线
...     (\d{4})     # 终点数字
... ''', '(800) 555-1212').groups()
('800', '555', '1212')


>>> re.findall(r'http://(?:\w+\.)*(\w+\.com)',
... 'http://google.com http://www.google.com http://code.google.com')
['google.com', 'google.com', 'google.com']
>>> re.search(r'\((?P<areacode>\d{3})\) (?P<prefix>\d{3})-(?:\d{4})',
... '(800) 555-1212').groupdict()
{'areacode': '800', 'prefix': '555'}

>>> re.sub(r'\((?P<areacode>\d{3})\) (?P<prefix>\d{3})-(?:\d{4})',
... '(\g<areacode>) \g<prefix>-xxxx', '(800) 555-1212')
'(800) 555-xxxx'


>>> bool(re.match(r'\((?P<areacode>\d{3})\) (?P<prefix>\d{3})-(?P<number>\d{4}) (?P=areacode)-(?P=prefix)-(?P=number) 1(?P=areacode)(?P=prefix)(?P=number)',
... '(800) 555-1212 800-555-1212 18005551212'))
True
>>> bool(re.match(r'''(?x)
...
... # match (800) 555-1212, save areacode, prefix, no.
... \((?P<areacode>\d{3})\)[ ](?P<prefix>\d{3})-(?P<number>\d{4})
...
... # space
... [ ]
...
... # match 800-555-1212
... (?P=areacode)-(?P=prefix)-(?P=number)
...
... # space
... [ ]
...
... # match 18005551212
... 1(?P=areacode)(?P=prefix)(?P=number)
...
... ''', '(800) 555-1212 800-555-1212 18005551212'))
True

>>> re.findall(r'\w+(?= van Rossum)',
... '''
... Guido van Rossum
... Tim Peters
... Alex Martelli
... Just van Rossum
... Raymond Hettinger
... ''')
['Guido', 'Just']
>>> re.findall(r'(?m)^\s+(?!noreply|postmaster)(\w+)',
... '''
...     sales@phptr.com
...     postmaster@phptr.com
...     eng@phptr.com
...     noreply@phptr.com
...     admin@phptr.com
... ''')
['sales', 'eng', 'admin']
>>> ['%s@aw.com' % e.group(1) for e in
... re.finditer(r'(?m)^\s+(?!noreply|postmaster)(\w+)',
... '''
...     sales@phptr.com
...     postmaster@phptr.com
...     eng@phptr.com
...     noreply@phptr.com
...     admin@phptr.com
... ''')]
['sales@aw.com', 'eng@aw.com', 'admin@aw.com']



>>> bool(re.search(r'(?:(x)|y)(?(1)y|x)', 'xy'))
True
>>> bool(re.search(r'(?:(x)|y)(?(1)y|x)', 'xx'))
False

正则表达式示例

#!/usr/bin/env python

import os
import re

try:
    # Print-function stand-in that runs on both Python 2 and Python 3.
    from distutils.log import warn as printf
except ImportError:
    # distutils was removed from the standard library in Python 3.12
    # (PEP 632); keep the name available instead of crashing on import.
    import sys

    def printf(msg, *args):
        """Write msg (%-formatted with args, if any) plus a newline to stderr."""
        sys.stderr.write((msg % args if args else msg) + '\n')

# Field separator for one line of `who` output: a run of two or more
# whitespace characters, or a single tab.  A lone space does not split, so
# a value such as "Jun 20 20:33" stays in one piece.
FIELD_SEP = re.compile(r'\s\s+|\t')

# Run `who` and print each output line as a list of its fields.
with os.popen('who', 'r') as f:
    for each_line in f:
        print(FIELD_SEP.split(each_line.strip()))

处理DOS 环境下tasklist 命令的输出（retasklist.py）

#!/usr/bin/env python

import os
import re

# Pattern for one row of `tasklist /nh` output.  Captures three groups:
#   1. a name made of word characters and dots, optionally containing
#      single spaces (e.g. "System Idle Process"),
#   2. the first integer column,
#   3. the trailing "<digits/commas> K" column.
# NOTE(review): these are presumably the image name, PID and memory usage
# columns of tasklist -- confirm against real output on the target system.
TASK_ROW = re.compile(
    r'([\w.]+(?: [\w.]+)*)\s\s+(\d+) \w+\s\s+\d+\s\s+([\d,]+ K)')

# Run `tasklist /nh` and print the captured columns for every output line;
# the `with` block closes the pipe even if printing raises.
with os.popen('tasklist /nh', 'r') as f:
    for each_line in f:
        print(TASK_ROW.findall(each_line.rstrip()))

用于正则表达式练习的数据生成器（gendata.py）

#!/usr/bin/env python

import sys
from random import randrange, choice
from string import ascii_lowercase as lc
from time import ctime

try:
    # Print-function stand-in that runs on both Python 2 and Python 3.
    from distutils.log import warn as printf
except ImportError:
    # distutils was removed from the standard library in Python 3.12
    # (PEP 632).  Fall back to an equivalent that writes to stderr.
    def printf(msg, *args):
        """Write msg (%-formatted with args, if any) plus a newline to stderr."""
        sys.stderr.write((msg % args if args else msg) + '\n')

tlds = ('com', 'edu', 'net', 'org', 'gov')

# Use xrange where it exists (Python 2), otherwise range.  Probing the name
# directly avoids inspecting __builtins__, which is a module in __main__ but
# a plain dict in imported modules, so hasattr() on it is unreliable.
try:
    myrng = xrange
except NameError:
    myrng = range


def gen_entry():
    """Return one random record 'DATE::LOGIN@DOMAIN.TLD::INT-LLEN-DLEN'.

    DATE is ctime() of a random integer timestamp below 2**32, INT is that
    same integer, LLEN and DLEN are the lengths of the random lowercase
    LOGIN (4-6 characters) and DOMAIN (LLEN-12 characters), and TLD is
    picked from ``tlds``.
    """
    dtint = randrange(2**32)    # pick date
    dtstr = ctime(dtint)        # date string
    llen = randrange(4, 7)      # login is shorter
    login = ''.join(choice(lc) for j in myrng(llen))
    dlen = randrange(llen, 13)  # domain is longer
    dom = ''.join(choice(lc) for j in myrng(dlen))
    return '%s::%s@%s.%s::%d-%d-%d' % (
        dtstr, login, dom, choice(tlds), dtint, llen, dlen)


# Emit between 5 and 10 random records.
for i in myrng(randrange(5, 11)):
    printf(gen_entry())

python正则表达式

Python相关栏目本月热门文章