-
Notifications
You must be signed in to change notification settings - Fork 0
/
find_tags.py
65 lines (45 loc) · 1.56 KB
/
find_tags.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
from __future__ import division
import os
import json
import math
from textblob import TextBlob as tb
def tf(word, blob):
    """
    Term Frequency.

    Return the number of occurrences of ``word`` in ``blob.words`` divided
    by the total number of words in the blob.

    A blob without any words yields 0.0 instead of raising
    ZeroDivisionError.
    """
    words = blob.words
    total = len(words)
    if total == 0:
        # Nothing to normalize by: the word cannot occur in an empty blob.
        return 0.0
    return words.count(word) / total
def n_containing(word, bloblist):
    """
    Count the blobs in ``bloblist`` for which ``word in blob`` is true.

    Membership is delegated to each blob's own ``in`` operator.
    NOTE(review): for a TextBlob this looks like a substring test on the
    raw text rather than a whole-word match -- confirm against TextBlob.
    """
    matches = 0
    for blob in bloblist:
        if word in blob:
            matches += 1
    return matches
def idf(word, bloblist):
    """
    Inverse Document Frequency.

    Measures how much information ``word`` provides, i.e. whether the term
    is common or rare across all blobs: the natural logarithm of the number
    of blobs divided by one plus the number of blobs containing the word.

    Raises ValueError when ``bloblist`` is empty, because the ratio is then
    zero and its logarithm is undefined.
    """
    corpus_size = len(bloblist)
    # The +1 keeps the denominator non-zero when no blob contains the word.
    smoothed_hits = n_containing(word, bloblist) + 1
    return math.log(corpus_size / smoothed_hits)
def tfidf(word, blob, bloblist):
    """
    Term Frequency - Inverse Document Frequency.

    Return ``tf(word, blob)`` multiplied by ``idf(word, bloblist)``.
    See https://en.wikipedia.org/wiki/Tf-idf
    """
    term_frequency = tf(word, blob)
    inverse_document_frequency = idf(word, bloblist)
    return term_frequency * inverse_document_frequency
# Build the corpus: one TextBlob per problem, read from the "text" field of
# data/1.json through data/501.json.
bloblist = []
for f in range(1, 502):
    with open("data/" + str(f) + ".json") as fp:
        bloblist.append(tb(json.load(fp)["text"]))

# The set of all tags: the best-scoring word of every problem, lowercased.
tags = set()

for blob in bloblist:
    # Tf-idf of every candidate word. Candidates are longer than four
    # characters and contain no digits. A word that repeats within the blob
    # is scored only on its first occurrence: the score would be identical
    # and every scoring call rescans the whole corpus.
    scores = {}
    for word in blob.words:
        if word in scores:
            continue
        if len(word) > 4 and not any(ch.isdigit() for ch in word):
            scores[word] = tfidf(word, blob, bloblist)

    # Rank the candidates from highest to lowest score.
    sorted_words = sorted(scores.items(), key=lambda x: x[1], reverse=True)

    # Keep only the single top-ranked word of the blob as its tag; a blob
    # with no candidate words contributes nothing.
    for word, score in sorted_words[:1]:
        tags.add(word.lower())

print(tags)