Below is the code for training a Naive Bayes classifier on the movie_reviews
dataset using a unigram model. I want to train the classifier and analyze its
performance using bigram and trigram models as well. How can we do that?
JavaScript
x
28
28
1
import nltk.classify.util
2
from nltk.classify import NaiveBayesClassifier
3
from nltk.corpus import movie_reviews
4
from nltk.corpus import stopwords
5
from nltk.tokenize import word_tokenize
6
7
def create_word_features(words):
8
useful_words = [word for word in words if word not in stopwords.words("english")]
9
my_dict = dict([(word, True) for word in useful_words])
10
return my_dict
11
12
pos_data = []
13
for fileid in movie_reviews.fileids('pos'):
14
words = movie_reviews.words(fileid)
15
pos_data.append((create_word_features(words), "positive"))
16
17
neg_data = []
18
for fileid in movie_reviews.fileids('neg'):
19
words = movie_reviews.words(fileid)
20
neg_data.append((create_word_features(words), "negative"))
21
22
train_set = pos_data[:800] + neg_data[:800]
23
test_set = pos_data[800:] + neg_data[800:]
24
25
classifier = NaiveBayesClassifier.train(train_set)
26
27
accuracy = nltk.classify.util.accuracy(classifier, test_set)
28
Advertisement
Answer
Simply change your featurizer so that it extracts n-grams instead of single words:
from nltk import ngrams


def create_ngram_features(words, n=2):
    """Map each n-gram tuple of *words* to True (NLTK boolean feature dict).

    ``n`` selects the n-gram order (2 = bigrams, 3 = trigrams, ...).
    """
    ngram_vocab = ngrams(words, n)
    my_dict = dict([(ng, True) for ng in ngram_vocab])
    return my_dict
BTW, your code will be a lot faster if you change your featurizer to use a set for your stopword list and initialize it only once:
# Build the stopword set ONCE at module level: set membership is O(1), and the
# corpus list is no longer re-read on every call to create_word_features.
stoplist = set(stopwords.words("english"))


def create_word_features(words):
    """Map each non-stopword token to True (NLTK boolean feature dict)."""
    useful_words = [word for word in words if word not in stoplist]
    my_dict = dict([(word, True) for word in useful_words])
    return my_dict
Someone should really tell the NLTK people to convert the stopwords list into a set type since it’s “technically” a unique list (i.e. a set).
>>> from nltk.corpus import stopwords
>>> type(stopwords.words('english'))
<class 'list'>
>>> type(set(stopwords.words('english')))
<class 'set'>
For the fun of benchmarking
import nltk.classify.util
from nltk.classify import NaiveBayesClassifier
from nltk.corpus import movie_reviews
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk import ngrams


def create_ngram_features(words, n=2):
    """Map each n-gram tuple of *words* to True (NLTK boolean feature dict)."""
    ngram_vocab = ngrams(words, n)
    my_dict = dict([(ng, True) for ng in ngram_vocab])
    return my_dict


# Benchmark classifier accuracy for each single n-gram order 1..5.
for n in [1, 2, 3, 4, 5]:
    pos_data = []
    for fileid in movie_reviews.fileids('pos'):
        words = movie_reviews.words(fileid)
        pos_data.append((create_ngram_features(words, n), "positive"))

    neg_data = []
    for fileid in movie_reviews.fileids('neg'):
        words = movie_reviews.words(fileid)
        neg_data.append((create_ngram_features(words, n), "negative"))

    # 800 reviews per class for training, the remaining per class for testing.
    train_set = pos_data[:800] + neg_data[:800]
    test_set = pos_data[800:] + neg_data[800:]

    classifier = NaiveBayesClassifier.train(train_set)

    accuracy = nltk.classify.util.accuracy(classifier, test_set)
    print(str(n)+'-gram accuracy:', accuracy)
[out]:
1-gram accuracy: 0.735
2-gram accuracy: 0.7625
3-gram accuracy: 0.8275
4-gram accuracy: 0.8125
5-gram accuracy: 0.74
Your original code returns an accuracy of 0.725.
You can also combine multiple orders of n-grams in one feature set (orders 1 through n, via `everygrams`):
import nltk.classify.util
from nltk.classify import NaiveBayesClassifier
from nltk.corpus import movie_reviews
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk import everygrams


def create_ngram_features(words, n=2):
    """Map every k-gram of *words*, for k in 1..n, to True.

    Unlike the single-order featurizer above, this combines all n-gram
    orders up to ``n`` in one feature dict.
    """
    ngram_vocab = everygrams(words, 1, n)
    my_dict = dict([(ng, True) for ng in ngram_vocab])
    return my_dict


# Benchmark accuracy when combining n-gram orders 1..n, for n = 1..5.
for n in range(1, 6):
    pos_data = []
    for fileid in movie_reviews.fileids('pos'):
        words = movie_reviews.words(fileid)
        pos_data.append((create_ngram_features(words, n), "positive"))

    neg_data = []
    for fileid in movie_reviews.fileids('neg'):
        words = movie_reviews.words(fileid)
        neg_data.append((create_ngram_features(words, n), "negative"))

    # 800 reviews per class for training, the remaining per class for testing.
    train_set = pos_data[:800] + neg_data[:800]
    test_set = pos_data[800:] + neg_data[800:]
    classifier = NaiveBayesClassifier.train(train_set)

    accuracy = nltk.classify.util.accuracy(classifier, test_set)
    print('1-gram to', str(n)+'-gram accuracy:', accuracy)
[out]:
1-gram to 1-gram accuracy: 0.735
1-gram to 2-gram accuracy: 0.7625
1-gram to 3-gram accuracy: 0.7875
1-gram to 4-gram accuracy: 0.8
1-gram to 5-gram accuracy: 0.82