import gc
import pandas as pd

# The dataset files are tab-separated; sep='t' would split on the letter "t"
train_df = pd.read_csv('train_set.csv', sep='\t')
test_df = pd.read_csv('test_a.csv', sep='\t')
# Word count per document (tokens are space-separated anonymized IDs)
train_df['word'] = train_df['text'].apply(lambda x: len(x.split(' ')))
# Equivalent: train_df['word'] = train_df['text'].apply(lambda x: x.count(' ') + 1)
train_df['word'].mean()
# Average document length per class
train_df.groupby('label')['word'].mean()
# Class distribution over the 14 labels
train_df['label'].value_counts(normalize=True)
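# A quick visual check of the class balance helps when choosing a validation
# strategy later. This is an optional sketch and assumes matplotlib is
# installed; it only reuses the frame built above.
import matplotlib.pyplot as plt

train_df['label'].value_counts().sort_index().plot(kind='bar')
plt.xlabel('label')
plt.ylabel('document count')
plt.title('Class distribution of the 14 labels')
plt.show()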
# Top-5 most frequent tokens per label
label_dict = dict()
for i in range(14):
    tem_dict = {}
    tem = train_df.loc[train_df.label == i, 'text']
    for sen in tem.values:
        for word in sen.split(' '):
            try:
                tem_dict[word] += 1
            except KeyError:
                tem_dict[word] = 1
    sort_list = sorted(tem_dict.items(), key=lambda x: x[1], reverse=True)
    label_dict[i] = [w for w, _ in sort_list[:5]]
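# The same per-label frequencies can be computed more idiomatically with
# collections.Counter; this is an equivalent sketch, not a change in logic.
from collections import Counter

label_dict_alt = {}
for i in range(14):
    counter = Counter()
    for sen in train_df.loc[train_df.label == i, 'text'].values:
        counter.update(sen.split(' '))
    label_dict_alt[i] = [w for w, _ in counter.most_common(5)]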
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

# '3750' and '648' appear in nearly every document (likely anonymized
# punctuation), so they are treated as stop words; max_df/min_df prune
# tokens that are too common or too rare to be informative
tfidf = TfidfVectorizer(stop_words=['3750', '648'], max_df=0.95, min_df=0.05).fit(train_df['text'])
train_f = tfidf.transform(train_df['text'])
test_f = tfidf.transform(test_df['text'])
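# Sanity check: with min_df=0.05 / max_df=0.95 the vocabulary should be small,
# so both matrices should have the same modest number of columns.
print(train_f.shape, test_f.shape, len(tfidf.vocabulary_))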
gc.collect()
from sklearn.linear_model import LogisticRegression

# max_iter must be an integer; recent scikit-learn rejects the float 1e5
lr = LogisticRegression(random_state=2021,
                        solver='newton-cg',
                        max_iter=100000,
                        penalty='l2').fit(train_f, train_df['label'])
test_y_pred_lr = lr.predict(test_f)
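# The test set has no labels, so model quality has to be estimated on a
# hold-out split. A hedged sketch: macro F1 is assumed here as the scoring
# metric (a common choice for imbalanced multi-class tasks), not something
# the pipeline above computes.
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

X_tr, X_val, y_tr, y_val = train_test_split(
    train_f, train_df['label'], test_size=0.2,
    random_state=2021, stratify=train_df['label'])
lr_val = LogisticRegression(random_state=2021, solver='newton-cg',
                            max_iter=100000, penalty='l2').fit(X_tr, y_tr)
print('LR macro F1:', f1_score(y_val, lr_val.predict(X_val), average='macro'))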
from xgboost import XGBClassifier

xgb = XGBClassifier(booster='gbtree',
                    n_estimators=10,
                    max_depth=5,
                    learning_rate=0.1126,
                    objective='multi:softmax',
                    subsample=0.9,
                    random_state=2021,
                    # num_class is inferred automatically by the sklearn wrapper
                    colsample_bytree=0.8,
                    # early_stopping_rounds=5,
                    # verbose=10,
                    # eval_metric='auc'
                    )
xgb.fit(train_f, train_df['label'])
test_y_pred_xgb = xgb.predict(test_f)
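# To actually use the early stopping commented out above, XGBoost needs a
# validation set. Where the parameter lives depends on the version: in
# xgboost >= 1.6 it is a constructor argument, in older releases a fit()
# argument. A sketch of the constructor form, reusing the hold-out split
# from the LR validation above:
xgb_es = XGBClassifier(booster='gbtree', n_estimators=100, max_depth=5,
                       learning_rate=0.1126, objective='multi:softmax',
                       subsample=0.9, colsample_bytree=0.8, random_state=2021,
                       early_stopping_rounds=5, eval_metric='mlogloss')
xgb_es.fit(X_tr, y_tr, eval_set=[(X_val, y_val)], verbose=10)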
submit = pd.DataFrame()
submit['pred_lr'] = test_y_pred_lr
submit['pred_xgb'] = test_y_pred_xgb
# Rows where the two models disagree:
# submit[(submit['pred_lr'] - submit['pred_xgb']) != 0]
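# To produce a submission file, pick one model's predictions. The expected
# format is assumed here to be a single 'label' column; check the actual
# submission requirements before uploading.
submit_out = pd.DataFrame({'label': test_y_pred_lr})
submit_out.to_csv('submission.csv', index=False)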