pandas
数据准备
from io import StringIO

import numpy as np
import pandas as pd

# Sample data: one row per position, eight 'M' columns, a 'hybrid_block'
# column holding two alleles joined by '|', and eight 'S' columns.
txt = """pos M1 M2 M3 M4 M5 M6 M7 M8 hybrid_block S1 S2 S3 S4 S5 S6 S7 S8
1 A T T A A G A C A|C C G C T T A G A
2 T G C T G T T G T|A A T A T C A A T
3 C A A C A G T C C|G G A C G C G C G
4 G T G T A T C T G|T C T T T A T C T
"""

# sep=r'\s+' splits on runs of whitespace; it replaces the deprecated
# delim_whitespace=True keyword, which newer pandas releases no longer accept.
df = pd.read_csv(StringIO(txt), sep=r'\s+', index_col='pos')
df
解决方案
主要使用 pandas 和 numpy
- 拆分 hybrid_block 混合列
- 在最前面添加一行与第一行相同的数据
- 与自身的偏移（shift）版本相加，得到 'AgA' 这类字符串
# Split 'hybrid_block' (e.g. 'A|C') into two columns named H0 and H1 and
# place them between the M columns and the S columns.
d1 = pd.concat(
    [
        df.filter(like='M'),
        df.hybrid_block.str.split('|', expand=True).rename(columns='H{}'.format),
        df.filter(like='S'),
    ],
    axis=1,
)

# Prepend a copy of row 1 under index 0 so that row 1 has a predecessor
# when the frame is shifted below.
d1 = pd.concat([d1.loc[[1]].rename(index={1: 0}), d1])

# Each cell becomes "<current>g<previous row>", e.g. 'AgA'.  The shifted
# frame has NaN in its first row, so dropna() removes the helper row 0.
d1 = d1.add('g').add(d1.shift()).dropna()
d1

# Next step: assign the convenient blocks to their own variable names.
# Slice d1 into its three column groups by the letter contained in the
# column labels: M columns, S columns, and the split hybrid columns (H).
m, s, h = (d1.filter(like=prefix) for prefix in ('M', 'S', 'H'))
统计每个块中与混合列取值相等的个数，然后把结果拼接起来
# For each row and each hybrid column, count how many cells of a block
# (M or S) equal that hybrid cell.  Broadcasting compares
# block[:, :, None] against h[:, None, :]; summing over axis 1 collapses
# the block's columns, leaving one count per (row, hybrid column).
h_values = h.to_numpy()
per_block = {}
for key, block in (('M', m), ('S', s)):
    matches = block.to_numpy()[:, :, None] == h_values[:, None, :]
    # pd.DataFrame (capital F) -- 'pd.Dataframe' does not exist.
    per_block[key] = pd.DataFrame(matches.sum(1), index=h.index, columns=h.columns)

mcounts = per_block['M']
scounts = per_block['S']

# Side-by-side result with a two-level column index: (block, hybrid column).
counts = pd.concat([mcounts, scounts], axis=1, keys=['M', 'S'])
counts
如果你确实需要字典形式的结果
from collections import defaultdict

# Nested mapping: block name ('M'/'S') -> hybrid column ('H0'/'H1')
# -> list of (condition string, match count) tuples, one per position.
d = defaultdict(lambda: defaultdict(list))

# One row per position; columns are pairs of
# ('M' | 'S' | 'condition', hybrid column).
dict_df = counts.stack().join(h.stack().rename('condition')).unstack()

for _, row in dict_df.iterrows():
    for hcol in h.columns:
        condition = row.loc[('condition', hcol)]
        for block in ('M', 'S'):
            d[block][hcol].append((condition, row.loc[(block, hcol)]))

dict(d)

# Output shown with the original answer:
# {'M': defaultdict(list,
#                   {'H0': [('AgA', 4), ('TgA', 3), ('CgT', 2), ('GgC', 1)],
#                    'H1': [('CgC', 1), ('AgC', 0), ('GgA', 0), ('TgG', 1)]}),
#  'S': defaultdict(list,
#                   {'H0': [('AgA', 2), ('TgA', 1), ('CgT', 0), ('GgC', 0)],
#                    'H1': [('CgC', 2), ('AgC', 2), ('GgA', 2), ('TgG', 3)]})}


