从大型结构化文本文件中提取信息

很好，下面是一些建议，如果您喜欢，请告诉我：
import re
import pprint
import sys


class Despacho(object):
    """One record ("despacho") assembled line by line from the input text.

    Each line handed to :meth:`read` is tried against every pattern in
    :attr:`regexp`; the capture groups of a matching pattern are stored on
    the instance under the attribute names that key the pattern.
    """

    # Maps a tuple of attribute names to the compiled pattern whose capture
    # groups fill those attributes, in order. The ``\d`` and ``\*`` escapes
    # are required: ``[d]`` would only match the letter "d", and a bare
    # leading ``*`` is not a valid regular expression.
    regexp = {
        ('processo',
         'data',
         'despacho'): re.compile(r'No.(\d{9})  (\d{2}/\d{2}/\d{4})  (.*)'),
        ('titular',): re.compile(r'Tit.(.*)'),
        ('procurador',): re.compile(r'Procurador: (.*)'),
        ('documento',): re.compile(r'C.N.P.J./C.I.C./N INPI :(.*)'),
        ('apresentacao',
         'natureza'): re.compile(r'Apres.: (.*) ; Nat.: (.*)'),
        ('marca',): re.compile(r'Marca: (.*)'),
        ('classe',): re.compile(r'Clas.Prod/Serv: (.*)'),
        ('complemento',): re.compile(r'\*(.*)'),
    }

    def __init__(self):
        """Create an empty record.

        'complemento' is the only field that can occur several times in a
        single record, so it is collected in a list.
        """
        self.complemento = []

    def read(self, line):
        """Apply every known pattern to *line* and store what matches.

        Patterns are anchored at the start of the line (``match``); a line
        that matches nothing leaves the record untouched.
        """
        for attrs, pattern in self.regexp.items():
            match = pattern.match(line)
            if not match:
                continue
            # Every pattern has exactly one capture group per attribute name.
            for attr, value in zip(attrs, match.groups()):
                if attr == 'complemento':
                    self.complemento.append(value)
                else:
                    setattr(self, attr, value)

    def __repr__(self):
        """Return a pretty-printed dict of all known fields.

        Fields that were never set are shown as ``None``.
        """
        fields = {}
        for attrs in self.regexp:
            for attr in attrs:
                fields[attr] = getattr(self, attr, None)
        return pprint.pformat(fields)


def process(rpi):
    """Yield one :class:`Despacho` per record found in *rpi*.

    *rpi* is any iterable of text lines (an open file works). A record
    starts at a line beginning with ``'No.'`` and ends at the next blank
    line. Lines outside a record are ignored. A record still open when a
    new ``'No.'`` line appears, or when the input ends, is yielded as well
    so that no data is silently dropped.
    """
    current = None  # the record being filled, or None between records
    for line in rpi:
        if line.startswith('No.'):
            if current is not None:
                # Previous record was not closed by a blank line.
                yield current
            current = Despacho()
        elif current is not None and not line.strip():
            # Blank line: end of the current record.
            yield current
            current = None
        if current is not None:
            current.read(line)
    if current is not None:
        # Input ended without a trailing blank line.
        yield current


def main(path='rm1972.txt'):
    """Parse the file at *path* and print every record found.

    Each record is followed by a separator line of 20 dashes.
    Returns 0.
    """
    # The context manager guarantees the file is closed, even on error.
    with open(path) as arquivo:
        for desp in process(arquivo):
            print(desp)
            print('-' * 20)
    return 0


if __name__ == '__main__':
    main()
从大型结构化文本文件中提取信息

面试问答 | 相关栏目 | 本月热门文章