在做点击率预估的 GBDT+LR 模型时,使用 LightGBM 训练遇到了报错,训练代码如下:
# Build the LightGBM datasets. The validation set is created with
# `reference=lgb_train` so that it reuses the training set's feature binning,
# as the LightGBM Dataset API requires for validation data.
lgb_train = lgb.Dataset(X_train, label=y_train)
lgb_eval = lgb.Dataset(X_test, label=y_test, reference=lgb_train)

# Number of leaves per tree; will also be used in feature transformation.
num_leaf = 64

# Training parameters. The number of boosting rounds is given only once, via
# `num_boost_round` below: `num_trees` is an alias of the same setting, and
# passing both makes LightGBM emit a "Found `num_trees` in params" warning.
params = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': {'binary_logloss'},
    'num_leaves': num_leaf,
    'learning_rate': 0.01,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': 0,
}

print('Start training...')
# Train the booster.
# NOTE(review): the metric is evaluated on the training set itself
# (valid_sets=lgb_train); pass lgb_eval instead if a held-out score is wanted.
gbm = lgb.train(
    params,
    lgb_train,
    num_boost_round=100,
    valid_sets=lgb_train,
)
有如下报错
ValueError                                Traceback (most recent call last)
in ()
     25 lgb_train,
     26 num_boost_round=100,
---> 27 valid_sets=lgb_train)

~/anaconda3/lib/python3.7/site-packages/lightgbm/engine.py in train(params, train_set, num_boost_round, valid_sets, valid_names, fobj, feval, init_model, feature_name, categorical_feature, early_stopping_rounds, evals_result, verbose_eval, learning_rates, keep_training_booster, callbacks)
    269     # construct booster
    270     try:
--> 271         booster = Booster(params=params, train_set=train_set)
    272         if is_valid_contain_train:
    273             booster.set_train_data_name(train_data_name)

~/anaconda3/lib/python3.7/site-packages/lightgbm/basic.py in __init__(self, params, train_set, model_file, model_str, silent)
   2603                 )
   2604             # construct booster object
-> 2605             train_set.construct()
   2606             # copy the parameters from train_set
   2607             params.update(train_set.get_params())

~/anaconda3/lib/python3.7/site-packages/lightgbm/basic.py in construct(self)
   1817                 init_score=self.init_score, predictor=self._predictor,
   1818                 silent=self.silent, feature_name=self.feature_name,
-> 1819                 categorical_feature=self.categorical_feature, params=self.params)
   1820             if self.free_raw_data:
   1821                 self.data = None

~/anaconda3/lib/python3.7/site-packages/lightgbm/basic.py in _lazy_init(self, data, label, reference, weight, group, init_score, predictor, silent, feature_name, categorical_feature, params)
   1475                 feature_name,
   1476                 categorical_feature,
-> 1477                 self.pandas_categorical)
   1478         label = _label_from_pandas(label)
   1479

~/anaconda3/lib/python3.7/site-packages/lightgbm/basic.py in _data_from_pandas(data, feature_name, categorical_feature, pandas_categorical)
    564     if isinstance(data, pd_Dataframe):
    565         if len(data.shape) != 2 or data.shape[0] < 1:
--> 566             raise ValueError('Input data must be 2 dimensional and non empty.')
    567         if feature_name == 'auto' or feature_name is None:
    568             data = data.rename(columns=str)

ValueError: Input data must be 2 dimensional and non empty.
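报错字面的意思就是“输入数据必须是二维且非空的”,于是检查我的X_train和y_train,发现X_train是二维的,而y_train是 pandas.core.series.Series。
(补充说明:从上面的堆栈可以看到,这个异常是在 _data_from_pandas 中对特征数据 data 做检查时抛出的,条件是 len(data.shape) != 2 或 data.shape[0] < 1;label 是随后由 _label_from_pandas 单独处理的,label 为 Series 本身是被支持的。因此更可能的原因是 X_train 为空——例如数据总行数不超过 2000 时,X.iloc[:-2000] 得到的就是一个空的 DataFrame。建议先打印 X_train.shape 确认。)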
ok,fine。。。是我切分数据集的时候的锅
X = df[cols_all[1:]]  # training dataset
# label encode
lbl = preprocessing.LabelEncoder()
X['site_domain'] = lbl.fit_transform(X['site_domain'].astype(str))#将提示的包含错误数据类型这一列进行转换
X['site_id'] = lbl.fit_transform(X['site_id'].astype(str))
X['site_category'] = lbl.fit_transform(X['site_category'].astype(str))
X['app_id'] = lbl.fit_transform(X['app_id'].astype(str))
X['app_category'] = lbl.fit_transform(X['app_category'].astype(str))
X_train = X.iloc[:-2000]
X_test = X.iloc[-2000:]  # testing dataset
y = df['click']
y_train = y.iloc[:-2000]  # training label
y_test = y.iloc[-2000:]  # testing label

解决方案:
1、不要用这种方式拆分数据集
# Features: every column from the third one onwards.
x = data.iloc[:, 2:]
# Label: the 'Survived' column (`data.loc['Survived']` would look up a row label instead).
y = data['Survived']
# Split the dataset with train_test_split (75% training set, 25% test set).
# The keyword is `random_state`; the misspelled `ramdon_state` raises a TypeError.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=0)
2、如果已经使用,那么再将pandas.core.series.Series转化为dataframe吧
# to_frame() returns a new single-column DataFrame and leaves the Series
# untouched, so the result has to be assigned back to take effect.
y_train = y_train.to_frame()
tip:这种方法不能将index一起转换(原来的 index 仍然作为索引保留,不会变成数据列),如果需要转化index的话,那就使用 y_train.reset_index(),它会把 index 也作为一列放进返回的 DataFrame 中。
ok,fine



