可以使用正则表达式:只需在方括号(字符类)内列出所有不需要的字符,如下所示:
import re

# A character class replaces every listed character with a space in one pass.
# The sample string ends in '\x99', so the printed result has a trailing space.
# print(...) with a single argument behaves the same on Python 2 and Python 3.
print(re.sub(r'[\xc2\x99]', ' ', 'Hello\xc2There\x99'))
输出:'Hello There '(不需要的字符均被替换为空格,因此末尾也有一个空格)。
或者,如果每个字符需要替换成不同的内容,可以使用一个映射表:
# Remove annoying characters.
# Maps each unwanted two-character sequence to its plain-ASCII replacement.
# NOTE(review): the keys spell out UTF-8 byte pairs. On Python 3 `str` they
# match only text where each byte became one code point (for example input
# decoded as latin-1) -- confirm this matches how the caller decodes `text`.
chars = {
    '\xc2\x82': ',',        # High pre comma
    '\xc2\x84': ',,',       # High pre double comma
    '\xc2\x85': '...',      # Triple dot
    '\xc2\x88': '^',        # High caret
    '\xc2\x91': '\x27',     # Forward single quote
    '\xc2\x92': '\x27',     # Reverse single quote
    '\xc2\x93': '\x22',     # Forward double quote
    '\xc2\x94': '\x22',     # Reverse double quote
    '\xc2\x95': ' ',
    '\xc2\x96': '-',        # High hyphen
    '\xc2\x97': '--',       # Double hyphen
    '\xc2\x99': ' ',
    '\xc2\xa0': ' ',
    '\xc2\xa6': '|',        # Split vertical bar
    '\xc2\xab': '<<',       # Double less than
    '\xc2\xbb': '>>',       # Double greater than
    '\xc2\xbc': '1/4',      # One quarter
    '\xc2\xbd': '1/2',      # One half
    '\xc2\xbe': '3/4',      # Three quarters
    '\xca\xbf': '\x27',     # c-single quote
    '\xcc\xa8': '',         # Modifier - under curve
    '\xcc\xb1': '',         # Modifier - under line
}

# One alternation over every key, compiled once. Keys are escaped so that a
# future key containing a regex metacharacter is still matched literally.
_CHARS_PATTERN = re.compile('|'.join(re.escape(key) for key in chars))


def replace_chars(match):
    """Return the replacement string for the sequence matched by *match*.

    *match* is a regex match object whose full match (group 0) is one of the
    keys of ``chars``; any other matched text raises ``KeyError``.
    """
    return chars[match.group(0)]


def remove_annoying_chars(text):
    """Return *text* with every sequence listed in ``chars`` replaced.

    The original snippet ended in a bare ``return`` outside any function;
    this wrapper gives that statement a valid home.
    """
    return _CHARS_PATTERN.sub(replace_chars, text)


