-
Notifications
You must be signed in to change notification settings - Fork 19
/
naives.py
66 lines (52 loc) · 2.04 KB
/
naives.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
import pickle, nltk
from stopwords import stopwords, is_stopword
from nltk import TextCollection, Text
from pprint import pprint
from copy import copy
import time
from nltk.corpus import brown
with open("l2.pickle") as f:
lyrics = pickle.load(f)
tags = ["love", "motivation", "sadness"]
print tags
with open("/usr/share/dict/american-english") as dico:
dict_words = list(dico.readlines())
def valid(tag):
return tag and (tag.startswith("VB") or tag.startswith("JJ"))
def all_words(lyrics):
for l in lyrics:
for w in nltk.word_tokenize(" ".join(l["lyrics"])):
# print w, unigram_tagger.tag([w])
if l.get("title") and (not is_stopword(w)) and valid(unigram_tagger.tag([w])[0][1]):
yield w.lower()
def document_features(document):
document_words = set(document)
features = {}
for word in word_features:
features['contains(%s)' % word] = (word in document_words)
return features
def divide_lyrics(lyric):
for t in lyric["tags"]:
if t in tags:
l = copy(lyric)
l["tags"] = t
yield l
brown_tagged_sents = brown.tagged_sents(categories='news')
brown_sents = brown.sents(categories='news')
unigram_tagger = nltk.UnigramTagger(brown_tagged_sents)
unigram_tagger.tag(brown_sents[2007])
lyrics = [l for l in lyrics if l.get("title")]
aw = nltk.FreqDist(all_words(lyrics))
word_features = sorted(aw.keys())
#all_words = nltk.FreqDist(w.lower() for l in lyrics for w in nltk.word_tokenize(" ".join(l["lyrics"])) if l.get("title") if not is_stopword(w))
#word_features = sorted(all_words.keys())
print word_features
lyrics = [dl for l in lyrics for dl in divide_lyrics(l)]
t = time.time()
featuresets = [(document_features(l["lyrics"]), l["tags"]) for l in lyrics]
train_set, test_set = featuresets[100:], featuresets[:100]
classifier = nltk.NaiveBayesClassifier.train(train_set)
#print document_features(lyrics[1]["lyrics"])
print time.time() - t
print nltk.classify.accuracy(classifier, test_set)
classifier.show_most_informative_features(100)