import pandas as pd
# 设置参数
pd.set_option("display.max_rows", 999)
pd.set_option("display.max_columns", 999)
pd.set_option("precision", 5)
pd.set_option('display.max_colwidth', -1)
各类文件读取
name_sample = pd.read_csv("./name_sample_usedirectly.csv") # csv读取
name_sample = pd.read_parquet("./name_sample_usedirectly.parquet") # parquet读取
name_sample = pd.read_excel("./name_sample_usedirectly.xls") # xls读取
print(len(name_sample), name_sample.columns) # 获取数据条数和列名
print(name_sample.shape) # 获取数据shape
print(name_sample.head(3)) # 获取前三行内容
print(name_sample[:3]) # 获取前三行内容
获取各列信息内容
# 获取满足某个条件或某几个条件的数据
a = name_sample[name_sample['c1'] == 123] # 可以写多个条件
# a = name_sample[(name_sample['c1'] == 123) & (name_sample['c2'] == 1)]
# 获取满足某个条件或某几个条件的数据
b = name_sample.loc[name_sample.c1 == 123] #可以写多个条件
# b = name_sample.loc[(name_sample.c1 == 100663) & (name_sample['c2'] == 1)]
# b = name_sample.loc[name_sample.c1 == 100663].loc[name_sample.c2 == 1]
# 获取值属于某个列表内的所有数据
c = name_sample[name_sample.c1.isin([123])] # 可以获取多个item_id对应的数据
# c = name_sample[name_sample.c1.isin([123, 234])]
# 获取值在某个范围内的数据
d = name_sample[name_sample['date'].between(datetime.date(2021, 12, 6), datetime.date(2021, 12, 8))]
# e = name_sample[name_sample['c1'].between(122, 130)]
# 获取值在另一个数据表内出现的所有数据,类似sql的left semi join,被isin的只能用于过滤
e = name_sample[name_sample.c1.isin(other_dataframe.d1)].reset_index()
获取某几列信息,修改列名,排序
name_sample = name_sample.rename(columns={
"c1": "new_c1",
"c2": "new_c2"
})
name_sample = name_sample[['c1','c3']]
g = name_sample.sort_values('c1', ascending=False).reset_index()
和其他表merge
# 如果是和其他表按照某列merge
h = name_sample.merge(other_table, on="c1", how="inner") # how可取left,outer等
# on可以取多列merge
# h = name_sample.merge(other_table, on=["c1",'c2'], how="inner") # how可取left,outer等
# 如果不按照某列merge,而是按照行对应join的话按index merge, 如模型predict结果是series,需先转换成y_pred0 = pd.Dataframe(y_pred)
i = name_sample.merge(y_pred0, left_index=True,right_index=True,how='outer')