很好。下面是一些建议，如果您喜欢，请告诉我：
import pprint
import re
import sys


class Despacho(object):
    """Accumulate the fields of one record, parsed line by line.

    Each input line is tried against every pattern in ``regexp``; the
    captured groups are stored on the instance under the attribute names
    given by the matching key.
    """

    # Maps a tuple of attribute names to the pattern whose capture groups
    # fill them, in order. Literal dots and the leading asterisk are escaped
    # so they are matched verbatim rather than as regex metacharacters.
    regexp = {
        ('processo', 'data', 'despacho'):
            re.compile(r'No\.(\d{9}) (\d{2}/\d{2}/\d{4}) (.*)'),
        ('titular',): re.compile(r'Tit\.(.*)'),
        ('procurador',): re.compile(r'Procurador: (.*)'),
        ('documento',): re.compile(r'C\.N\.P\.J\./C\.I\.C\./N INPI :(.*)'),
        ('apresentacao', 'natureza'):
            re.compile(r'Apres\.: (.*) ; Nat\.: (.*)'),
        ('marca',): re.compile(r'Marca: (.*)'),
        ('classe',): re.compile(r'Clas\.Prod/Serv: (.*)'),
        ('complemento',): re.compile(r'\*(.*)'),
    }

    def __init__(self):
        # 'complemento' is the only field that may occur several times in a
        # single record, so it is collected in a list.
        self.complemento = []

    def read(self, line):
        """Apply every known pattern to *line* and store whatever matches.

        Lines that match no pattern are ignored.
        """
        # .items() (not .iteritems()) so this runs on Python 2 and 3 alike.
        for attrs, pattern in self.regexp.items():
            match = pattern.match(line)
            if not match:
                continue
            # Each key has exactly as many names as its pattern has groups.
            for attr, value in zip(attrs, match.groups()):
                if attr == 'complemento':
                    self.complemento.append(value)
                else:
                    setattr(self, attr, value)

    def __repr__(self):
        """Return a pretty-printed dict of all known fields.

        Fields that were never set are shown as ``None``.
        """
        fields = {}
        for attrs in self.regexp:
            for attr in attrs:
                fields[attr] = getattr(self, attr, None)
        return pprint.pformat(fields)


def process(rpi):
    """Yield one ``Despacho`` per record found in the iterable of lines *rpi*.

    A record starts at a line beginning with ``'No.'`` and ends at the next
    blank line, at the next ``'No.'`` line, or at the end of the input.
    Lines outside any record (for example a file header) are skipped.
    """
    current = None
    for line in rpi:
        if line.startswith('No.'):
            # A new header closes a record that had no blank-line terminator.
            if current is not None:
                yield current
            current = Despacho()
        if current is None:
            # Not inside a record: nothing to feed the line to.
            continue
        if not line.strip():
            # Blank line: end of the current record.
            yield current
            current = None
            continue
        current.read(line)
    # Input ended without a trailing blank line: do not lose the last record.
    if current is not None:
        yield current


def main(path='rm1972.txt'):
    """Parse the file at *path* and print each record followed by a rule.

    Returns 0 so the result can be used as the process exit status.
    """
    # The context manager closes the file even if parsing raises.
    with open(path) as arquivo:
        for desp in process(arquivo):
            print(desp)
            print('-' * 20)
    return 0


if __name__ == '__main__':
    sys.exit(main())


