感谢 Radim 和 Larsmans。我的目标是得到一个与 R 的 tm 包所生成的 DTM(文档-词项矩阵)类似的 DTM。我决定使用 scikit-learn,部分是受这篇博客文章的启发。下面是我写出的代码。
我把它发布在这里,希望对其他人也有用。
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer


def fn_tdm_df(docs, xColNames=None, **kwargs):
    """Create a term-document matrix as a pandas DataFrame.

    Args:
        docs: Iterable of documents handed to ``CountVectorizer.fit_transform``.
        xColNames: Optional column labels, one per document. When given,
            they replace the default integer column labels.
        **kwargs: Forwarded unchanged to the ``CountVectorizer`` constructor.

    Returns:
        A DataFrame with one row per vocabulary term (the index) and one
        column per document, holding the term counts.
    """
    # Initialize the vectorizer and count terms.
    vectorizer = CountVectorizer(**kwargs)
    counts = vectorizer.fit_transform(docs)

    # get_feature_names() was removed in scikit-learn 1.2 in favour of
    # get_feature_names_out(); fall back so older installs keep working.
    if hasattr(vectorizer, "get_feature_names_out"):
        terms = vectorizer.get_feature_names_out()
    else:
        terms = vectorizer.get_feature_names()

    # The fitted matrix is documents x terms; transpose it so terms become
    # rows, matching the layout of a term-document matrix.
    df = pd.DataFrame(counts.toarray().transpose(), index=terms)

    if xColNames is not None:
        # list() lets one-shot iterables (e.g. a map object) be used as labels.
        df.columns = list(xColNames)
    return df
下面对某个目录中的所有文本文件使用它:
DIR = 'C:/Data/'


def fn_CorpusFromDIR(xDIR):
    """Create a corpus from the files in a directory.

    Args:
        xDIR: Path of the directory to read.

    Returns:
        A dict with two parallel lists:
        ``'docs'`` holds the text content of each file, and ``'ColNames'``
        holds ``'P_'`` followed by the first six characters of each file name.
        Entries are ordered by sorted file name; anything in ``xDIR`` that is
        not a regular file (e.g. a subdirectory) is skipped.
    """
    import os

    # List the directory once so 'docs' and 'ColNames' are guaranteed to
    # describe the same files in the same order.
    file_names = [
        name for name in sorted(os.listdir(xDIR))
        if os.path.isfile(os.path.join(xDIR, name))
    ]

    docs = []
    for name in file_names:
        # Context manager closes each handle instead of leaking it.
        with open(os.path.join(xDIR, name)) as handle:
            docs.append(handle.read())

    # A real list (not a lazy map object) so the names can be reused and
    # assigned directly as DataFrame column labels.
    col_names = ['P_' + name[0:6] for name in file_names]

    return dict(docs=docs, ColNames=col_names)
创建数据框
# Read the directory once and reuse the result, rather than loading every
# file twice (once for the documents and once for the column names).
_corpus = fn_CorpusFromDIR(DIR)

# 'charset_error' was renamed to 'decode_error' in CountVectorizer and the old
# keyword was later removed, so the current name is used here.
d1 = fn_tdm_df(
    docs=_corpus['docs'],
    xColNames=_corpus['ColNames'],
    stop_words=None,
    decode_error='replace',
)



