UPDATE(已删除先前的函数)
UPDATE 2:已修复并简化。
我的第一个函数是错误的。下面是另一个函数,它可以正常工作,但还需要更多测试:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Expand HTML tables that use ``rowspan``/``colspan`` into plain 2-D lists.

Every spanned cell is repeated in each grid position it covers, so the
result has one entry per occupied (row, column) position.
"""
from collections import defaultdict


def table_to_list(table):
    """Return *table* as a list of rows, each row a list of cell texts.

    *table* is an element exposing the lxml element API used below
    (``xpath``, and on cells ``get`` and ``text_content``).
    """
    dct = table_to_2d_dict(table)
    return list(iter_2d_dict(dct))


def table_to_2d_dict(table):
    """Map ``row index -> column index -> cell text`` for *table*.

    Only ``tr`` elements that are direct children of *table*, and ``td``/``th``
    elements that are direct children of those rows, are examined. A cell with
    ``rowspan``/``colspan`` is written into every grid position it covers.
    Missing span attributes default to 1; a non-integer value raises
    ``ValueError`` from ``int()``.
    """
    # `str` is the factory for missing cells; every position handled here is
    # assigned explicitly, so it only matters to callers that read a position
    # which was never filled.
    result = defaultdict(lambda: defaultdict(str))
    for row_i, row in enumerate(table.xpath('./tr')):
        for col_i, col in enumerate(row.xpath('./td|./th')):
            colspan = int(col.get('colspan', 1))
            rowspan = int(col.get('rowspan', 1))
            col_data = col.text_content()
            # Skip positions already taken by a cell spanning down from an
            # earlier row or spanning across from earlier in this row.
            # Testing `row_i in result` first avoids creating empty rows.
            while row_i in result and col_i in result[row_i]:
                col_i += 1
            for i in range(row_i, row_i + rowspan):
                for j in range(col_i, col_i + colspan):
                    result[i][j] = col_data
    return result


def iter_2d_dict(dct):
    """Yield one list of values per row of *dct*, rows and columns by key order."""
    for _, row in sorted(dct.items()):
        yield [col for _, col in sorted(row.items())]


if __name__ == '__main__':
    import lxml.html
    from pprint import pprint

    doc = lxml.html.parse('tables.html')
    for table_el in doc.xpath('//table'):
        table = table_to_list(table_el)
        pprint(table)

# Sample input, tables.html:
<table border="1"> <tr> <td>1 </td> <td>1 </td> <td>1 </td> <td rowspan="4">Thing</td> <td>1 </td> </tr> <tr> <td>2 </td> <td>2 </td> <td>2 </td> <td>2 </td> </tr> <tr> <td>3 </td> <td>3 </td> <td>3 </td> <td>3 </td> </tr> <tr> <td>4 </td> <td>4 </td> <td>4 </td> <td>4 </td> </tr></table><table border="1"><tr> <td colspan="2" rowspan="4">#1</td> <td rowspan="4">#2</td> <td rowspan="2">#3</td> <td rowspan="2">#4</td></tr><tr></tr><tr> <td rowspan="2">#5</td> <td rowspan="2">#6</td></tr><tr></tr></table>
输出:
[['1 ', '1 ', '1 ', 'Thing', '1 '], ['2 ', '2 ', '2 ', 'Thing', '2 '], ['3 ', '3 ', '3 ', 'Thing', '3 '], ['4 ', '4 ', '4 ', 'Thing', '4 ']]
[['#1', '#1', '#2', '#3', '#4'], ['#1', '#1', '#2', '#3', '#4'], ['#1', '#1', '#2', '#5', '#6'], ['#1', '#1', '#2', '#5', '#6']]



