from nltk.tokenize import RegexpTokenizer
#from stop_words import get_stop_words
from gensim import corpora, models
import gensim
import os
from os import path
from time import sleep

filename_2 = "buisness1.txt"
file1 = open(filename_2, encoding='utf-8')
Reader = file1.read()

tdm = []

# Tokenized the text to individual terms and created the stop list
tokens = Reader.split()

#insert stopwords files
stopwordfile = open("StopWords.txt", encoding='utf-8')

# Use this to read file content as a stream
readstopword = stopwordfile.read()
stop_words = readstopword.split()

for r in tokens:
    if not r in stop_words:
        #stopped_tokens = [i for i in tokens if not i in en_stop]
        tdm.append(r)

dictionary = corpora.Dictionary(tdm)
corpus = [dictionary.doc2bow(i) for i in tdm]
sleep(3)

#Implemented the LdaModel
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=10, id2word=dictionary)
print(ldamodel.print_topics(num_topics=1, num_words=1))
I am trying to remove stop words using a separate .txt file that contains the stop words. After filtering, I append each word from the text file that is not in the stop-word list to tdm. I am getting the error

doc2bow expects an array of unicode tokens on input, not a single string

at the line

dictionary = corpora.Dictionary(tdm)

Can anyone help me correct my code?
Answer
This is almost certainly a duplicate, but use this instead:
dictionary = corpora.Dictionary([tdm])
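The reason: gensim's corpora.Dictionary expects a collection of documents, where each document is itself a list of token strings. Your tdm is one flat list of tokens, i.e. a single document, so it has to be wrapped in an outer list. Note that the doc2bow line in your script will then hit the same error, because it hands doc2bow one token string at a time instead of a whole token list. A minimal sketch of the corrected tail of the script, using a small inline token list as a stand-in for your filtered file contents:

from gensim import corpora, models

# Stand-in for the question's tdm: the filtered tokens of one document
tdm = ["stock", "market", "shares", "profit", "market", "trade"]

# Dictionary wants a list of documents, each a list of tokens, hence [tdm]
dictionary = corpora.Dictionary([tdm])

# doc2bow likewise takes a whole document (a token list), not one token
corpus = [dictionary.doc2bow(tdm)]

# Train the LDA model on the one-document corpus, as in the question
ldamodel = models.ldamodel.LdaModel(corpus, num_topics=10, id2word=dictionary)
print(ldamodel.print_topics(num_topics=1, num_words=1))

If you later process several input files, each file's token list becomes one entry in the outer list passed to Dictionary, and the corpus is built with one doc2bow call per document.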