You need a test-time BOW function that reuses the count-vectorizer model built during the training phase.
Consider using a pipeline to reduce code verbosity.
"""Spam-classification demo: bag-of-words features + multinomial naive Bayes.

The test-time BOW step (`test_BOW`) reuses the CountVectorizer fitted during
training (`BOW`), so train and test feature spaces share one vocabulary.
"""
import re
import string
from io import StringIO

import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.utils import resample


def fun(text):
    """Analyzer for CountVectorizer: strip punctuation, drop English stopwords.

    Returns the remaining whitespace-separated tokens as a list of strings.
    """
    no_punct = ''.join(c for c in text if c not in string.punctuation)
    stop = set(stopwords.words('english'))  # set: O(1) membership per word
    return [w for w in no_punct.split() if w.lower() not in stop]


def BOW(data):
    """Fit a CountVectorizer on the preprocessed training text.

    Returns (X, labels, fitted_vectorizer). The returned vectorizer MUST be
    passed to test_BOW so the test set is encoded with the same vocabulary.
    """
    df_temp = basic_preprocessing(data.copy(deep=True))
    count_vectorizer = CountVectorizer(analyzer=fun)
    count_vectorizer.fit(df_temp['Text'])
    X = count_vectorizer.transform(df_temp['Text'].tolist())
    return X, df_temp['Label'].tolist(), count_vectorizer


def test_BOW(data, count_vectorizer):
    """Encode test data with the ALREADY-FITTED training vectorizer.

    Returns (X, labels); no re-fitting happens here by design.
    """
    df_temp = basic_preprocessing(data.copy(deep=True))
    X = count_vectorizer.transform(df_temp['Text'].tolist())
    return X, df_temp['Label'].tolist()


def basic_preprocessing(df):
    """Rename the text column, clean each document, and add a Tokens column."""
    df_temp = df.copy(deep=True)
    df_temp = df_temp.rename(index=str, columns={'Clean_Titles_2': 'Text'})
    df_temp.loc[:, 'Text'] = [text_prepare(x) for x in df_temp['Text'].values]
    # BUG FIX: original pattern was 'w+' (matches the literal letter w);
    # word tokenization needs the escaped character class r'\w+'.
    tokenizer = RegexpTokenizer(r'\w+')
    df_temp['Tokens'] = df_temp['Text'].apply(tokenizer.tokenize)
    return df_temp


# Hoisted: compiled once at import time instead of on every text_prepare call.
# BUG FIX: the original defined REPLACe_BY_SPACE_RE (typo) but used
# REPLACE_BY_SPACE_RE (NameError), and its character class left [] unescaped.
REPLACE_BY_SPACE_RE = re.compile(r'[/(){}\[\]|@,;]')
BAD_SYMBOLS_RE = re.compile(r'[^0-9a-z #+_]')
STOPWORDS = set(stopwords.words('english'))


def text_prepare(text):
    """Lowercase, normalise punctuation/symbols, and drop English stopwords."""
    text = text.lower()
    # BUG FIX: substitute a space (as the constant's name and the original
    # comment state), not the empty string, so adjacent words don't fuse.
    text = REPLACE_BY_SPACE_RE.sub(' ', text)
    text = BAD_SYMBOLS_RE.sub('', text)
    # Stopword removal via comprehension replaces the original while/pop loop.
    return ' '.join(w for w in text.split() if w not in STOPWORDS)


# Toy dataset; columns are separated by two or more spaces.
s = """Label  Text  Year
1  bla bla bla  2000
0  add some words  2012
1  this is just an example  1998
0  unfortunately the pre does not work  2018
0  where should I apply the encoding?  2000
0  What am I missing here?  2005"""
# BUG FIX: separator must be the raw regex r'\s{2,}' (original lost the
# backslash); regex separators require the python parsing engine.
df = pd.read_csv(StringIO(s), sep=r'\s{2,}', engine='python')

X = df[['Text']]
y = df['Label']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=40)

# Re-join features and labels into a single training frame.
training_set = pd.concat([X_train, y_train], axis=1)

# Separate the classes so the majority class can be down-sampled.
spam = training_set[training_set.Label == 1]
not_spam = training_set[training_set.Label == 0]

# BUG FIX: undersampling must sample WITHOUT replacement; replace=True is
# bootstrapping, which can duplicate majority rows instead of shrinking them.
undersample = resample(not_spam,
                       replace=False,
                       # match the minority-class size
                       n_samples=len(spam),
                       random_state=40)
undersample_train = pd.concat([spam, undersample])

# BUG FIX: pd.Dataframe -> pd.DataFrame (correct capitalisation).
full_result = pd.DataFrame(columns=['Preprocessing', 'Model', 'Precision',
                                    'Recall', 'F1-score', 'Accuracy'])

train_x, train_y, count_vectorizer = BOW(undersample_train)
testing_set = pd.concat([X_test, y_test], axis=1)
# Reuse the vectorizer fitted on the training data — never refit on test data.
test_x, test_y = test_BOW(testing_set, count_vectorizer)


def training_naive(X_train_naive, X_test_naive, y_train_naive, y_test_naive,
                   preproc):
    """Train a multinomial naive Bayes classifier and score it.

    Returns a one-row DataFrame with weighted precision/recall/F1 and accuracy.
    """
    clf = MultinomialNB()  # multinomial (not Gaussian) naive Bayes
    clf.fit(X_train_naive, y_train_naive)
    y_pred = clf.predict(X_test_naive)
    # BUG FIX: sklearn metrics take (y_true, y_pred) in that order; the
    # original passed predictions first, which flips precision and recall.
    row = {
        'Preprocessing': preproc,
        'Model': 'Naive Bayes',
        'Precision': precision_score(y_test_naive, y_pred, average='weighted'),
        'Recall': recall_score(y_test_naive, y_pred, average='weighted'),
        'F1-score': f1_score(y_test_naive, y_pred, average='weighted'),
        'Accuracy': accuracy_score(y_test_naive, y_pred),
    }
    # DataFrame.append was removed in pandas 2.x; build the frame directly.
    return pd.DataFrame([row], columns=['Preprocessing', 'Model', 'Precision',
                                        'Recall', 'F1-score', 'Accuracy'])


# pd.concat replaces the removed DataFrame.append for accumulating results.
full_result = pd.concat(
    [full_result,
     training_naive(train_x, test_x, train_y, test_y, 'Count Vectorize')],
    ignore_index=True)


