比赛链接:https://www.datafountain.cn/competitions/520
数据:
代码:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
import lightgbm as lgb
from sklearn.model_selection import KFold
from category_encoders import TargetEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error
import math
from sklearn import ensemble
from datetime import datetime
# Load data: read the training CSV and the A test CSV (both decoded as
# GB18030), then stack them so feature processing is applied to both in one
# pass. Training rows come first and the combined index is rebuilt as 0..n-1.
CSV_ENCODING = 'gb18030'
train = pd.read_csv('train_data.csv', encoding=CSV_ENCODING)
test = pd.read_csv('A_test_data.csv', encoding=CSV_ENCODING)
data = pd.concat([train, test], ignore_index=True)
# Feature processing: integer-encode every column except 'ret', 'time' and
# 'id'. Each column gets its own fresh encoder, fitted on all rows of `data`.
passthrough_cols = ('ret', 'time', 'id')
for name in data.columns:
    if name in passthrough_cols:
        continue
    data[name] = LabelEncoder().fit_transform(data[name])
# Timestamp handling: parse 'time' into datetimes and derive calendar
# features (month, day of month, weekday, hour) as new columns.
# NOTE(review): the parse format is date-only ('%Y-%m-%d') yet an 'hour'
# feature is extracted; if the raw strings carry a time of day the format
# looks too narrow, and if they do not, 'hour' is constant — confirm against
# the CSV.
data['time'] = pd.to_datetime(data['time'], format='%Y-%m-%d')
for part in ('month', 'day', 'weekday', 'hour'):
    data[part] = getattr(data['time'].dt, part)
# Categorical features: expand 'vlan' and 'group' into indicator columns,
# dropping the first level of each so one category acts as the baseline.
onehot_cols = ['vlan', 'group']
data = pd.get_dummies(data, columns=onehot_cols, drop_first=True)
# Training setup: rows of `data` with a non-null 'ret' form the training
# matrix; rows with a null 'ret' are the ones to predict.
has_label = data['ret'].notnull()
train_x = data[has_label]
test_A = data[~has_label]

# Labels come from the raw training frame.
# NOTE(review): the positional indexing used during cross-validation assumes
# train_x and `train` hold the same rows in the same order, i.e. that no
# training row has a missing 'ret' — confirm.
train_y = train['ret']
target = train['ret']

# Every column except the label, the raw timestamp and the id is a feature.
non_feature_cols = {'ret', 'time', 'id'}
features = [name for name in data.columns if name not in non_feature_cols]

# Accumulator for test predictions, one slot per row of `test`.
predictions_lgb = np.zeros(len(test))

# Shuffled 5-fold splitter with a fixed seed for reproducible folds.
KF = KFold(n_splits=5, shuffle=True, random_state=2021)
# LightGBM training parameters shared by every fold: a regression objective
# evaluated with RMSE, with limits on tree depth and leaf count.
# NOTE(review): LightGBM's parameter docs say bagging_fraction only takes
# effect when bagging_freq is non-zero, and bagging_freq is not set here —
# confirm whether row subsampling was intended.
params = dict(
    objective="regression",
    learning_rate=0.05,
    max_depth=7,
    num_leaves=32,
    bagging_fraction=0.8,
    feature_fraction=0.8,
    num_threads=64,
    metric='rmse',
    verbose=-1,
)
# Cross-validated training. For each fold produced by KF, fit a LightGBM
# model on the training part, store its predictions for the held-out part in
# oof_lgb (out-of-fold predictions), and add its test-set predictions to
# predictions_lgb. After the loop the accumulated test predictions are
# averaged over the folds.
oof_lgb = np.zeros(len(train_x))
num_round = 6000  # boosting rounds per fold
n_folds = KF.get_n_splits()  # taken from the splitter instead of a literal 5
test_features = test_A[features]  # identical for every fold, so slice once

# `verbose_eval` was removed from lightgbm.train() in LightGBM 4.0, where
# passing it raises TypeError; periodic logging is done through the
# log_evaluation callback instead. Use the callback when this install has it
# and fall back to the legacy keyword on older releases.
if hasattr(lgb, "log_evaluation"):
    log_kwargs = {"callbacks": [lgb.log_evaluation(period=500)]}
else:
    log_kwargs = {"verbose_eval": 500}

for fold_, (trn_idx, val_idx) in enumerate(KF.split(train_x.values, train_y.values)):
    print("fold n°{}".format(fold_))
    trn_features = train_x.iloc[trn_idx][features]
    val_features = train_x.iloc[val_idx][features]
    trn_data = lgb.Dataset(trn_features, label=train_y.iloc[trn_idx])
    val_data = lgb.Dataset(val_features, label=train_y.iloc[val_idx])
    clf = lgb.train(
        params,
        trn_data,
        num_round,
        valid_sets=[trn_data, val_data],
        **log_kwargs,
    )
    # NOTE(review): no early stopping is configured, so best_iteration is
    # presumably unset and every boosting round is used for prediction —
    # confirm, or add early stopping if that was the intent.
    oof_lgb[val_idx] = clf.predict(val_features, num_iteration=clf.best_iteration)
    predictions_lgb += clf.predict(test_features, num_iteration=clf.best_iteration)

# Average the summed per-fold test predictions.
predictions_lgb = predictions_lgb / n_folds
# Report a score derived from the out-of-fold RMSE:
#     score = 1 / (sin(atan(rmse)) + 1)
# A perfect fit (rmse == 0) gives 1.0 and the score falls as the error grows.
oof_rmse = np.sqrt(mean_squared_error(oof_lgb, target))
score = 1 / (math.sin(math.atan(oof_rmse)) + 1)
print('score:', score)

# Attach the averaged predictions to the test frame and write the two
# submission columns to disk without the index.
test['ret'] = predictions_lgb
submission_cols = ['id', 'ret']
test[submission_cols].to_csv("submission.csv", index=False)
参考:
https://blog.csdn.net/qq_44694861/article/details/120423658



