-
Notifications
You must be signed in to change notification settings - Fork 33
/
nltk_classifiers.py
74 lines (55 loc) · 2.13 KB
/
nltk_classifiers.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
from __future__ import print_function
import os
import pickle
from __init__ import *
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import WordPunctTokenizer
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures
from nltk.classify import NaiveBayesClassifier
def extract_words(text):
    """Tokenize *text* into stemmed lowercase word features.

    Returns a list of Porter-stemmed tokens (unigrams plus the 500 most
    significant bigrams joined as "w1 w2" strings), excluding English
    stopwords and single-character tokens.
    """
    stemmer = PorterStemmer()
    tokenizer = WordPunctTokenizer()
    tokens = tokenizer.tokenize(text)

    # Append the top-500 bigrams (by chi-squared association) as extra
    # "word" features so two-word phrases can inform the classifier.
    bigram_finder = BigramCollocationFinder.from_words(tokens)
    bigrams = bigram_finder.nbest(BigramAssocMeasures.chi_sq, 500)
    tokens.extend("%s %s" % bigram_tuple for bigram_tuple in bigrams)

    # Hoist the stopword list into a set ONCE: the original called
    # stopwords.words('english') (a list scan) inside the filter for
    # every token, making this O(tokens * stopwords).
    stopset = set(stopwords.words('english'))
    return [stemmer.stem(x.lower()) for x in tokens
            if x not in stopset and len(x) > 1]
def get_feature(word):
    """Return a single-word feature dict ({word: True}) for NLTK classifiers."""
    # Dict literal instead of the needless dict([(word, True)]) wrapper.
    return {word: True}
def bag_of_words(words):
    """Map every word in *words* to True — the feature-dict shape NLTK expects."""
    # Dict comprehension instead of dict() over a list of tuples.
    return {word: True for word in words}
def create_training_dict(text, sense):
    """Return a one-element [(features, sense)] list ready for a
    classifier's test method, where features is the bag-of-words dict
    extracted from *text*."""
    features = bag_of_words(extract_words(text))
    return [(features, sense)]
def get_train_set(texts):
    """Build a NaiveBayes training set from a {sense: filepath} mapping.

    Reads each file, extracts word features, and returns a flat list of
    (feature_dict, sense) pairs — one pair per extracted word.
    """
    train_set = []
    # dict.iteritems() is Python-2-only; .items() works on both 2 and 3
    # (the file already opts into Py3-style print via __future__).
    for sense, path in texts.items():
        print("training %s " % sense)
        # with-block closes the file promptly; the original leaked the
        # handle returned by open().
        with open(path, 'r') as handle:
            text = handle.read()
        features = extract_words(text)
        # extend() appends in place instead of rebuilding the list with
        # `train_set = train_set + [...]` on every iteration.
        train_set.extend((get_feature(word), sense) for word in features)
    return train_set
if __name__ == '__main__':
    texts = {}
    texts['neg'] = 'data/neg-tokens'
    texts['pos'] = 'data/pos-tokens'
    if not os.path.exists('classifier.pickle'):
        # Train from scratch and cache the classifier on disk.
        train_set = get_train_set(texts)
        classifier = NaiveBayesClassifier.train(train_set)
        # Pickle data is binary: 'wb'/'rb' is required on Python 3 and
        # correct on Python 2 too (the original text-mode 'w'/'r' breaks
        # under Py3 and can corrupt data on Windows). The with-blocks
        # also close the handles the original left open.
        with open('classifier.pickle', 'wb') as out:
            pickle.dump(classifier, out)
    else:
        with open('classifier.pickle', 'rb') as saved:
            classifier = pickle.load(saved)
    #classifier.show_most_informative_features(20)
    # Classify the sample reviews line by line.
    with open("data/sample_review.txt", 'r') as reviews:
        for line in reviews:
            tokens = bag_of_words(extract_words(line))
            decision = classifier.classify(tokens)
            result = "%s - %s" % (decision, line)
            print(result)