lxml 在处理 Unicode 方面存在一些问题。
目前最好以字节(bytes)作为输入,并明确指定字符编码:
#!/usr/bin/env python
"""Print the <title> text of every ``*.html`` file in the current directory.

Each file is read as raw bytes, its character encoding is detected with
BeautifulSoup's ``UnicodeDammit``, and that encoding is passed explicitly
to lxml's HTML parser instead of letting lxml guess it.
"""
import glob

from lxml import html
from bs4 import UnicodeDammit

for filename in glob.glob('*.html'):
    # Read bytes, not text: the parser is given the bytes together with an
    # explicit encoding below.
    with open(filename, 'rb') as html_file:
        content = html_file.read()

    # Detect the document's encoding from the raw bytes.
    doc = UnicodeDammit(content, is_html=True)

    # Tell lxml which encoding to use when decoding the bytes.
    parser = html.HTMLParser(encoding=doc.original_encoding)
    root = html.document_fromstring(content, parser=parser)

    title_element = root.find('.//title')
    if title_element is None:
        # No <title> in this document; skip it rather than fail with
        # AttributeError on None.
        continue
    print(title_element.text_content())

# Output:
Unicode Chars: 은 —’
Unicode Chars: 은 —’
Unicode Chars: 은 —’



