import nltk from math import log nltk.download("book") nltk.download('omw-1.4') class Docs(): def __init__(self,*doc_list): self.doc_list = doc_list def doc_hote(self, word): res = [] for doc in self.doc_list: if word in doc.racinisation(): res.append(doc.name) return res def word_frequency(self, word): res = [] for doc in self.doc_list: if doc.name in self.doc_hote(word): all_words = [] for w in doc.racinisation(): all_words.append(w.lower()) all_words = nltk.FreqDist(all_words) res.append((word,all_words[word],doc.name)) return res def weight(self, word): res = [] for doc in self.doc_list: if doc.name in self.doc_hote(word): all_words = [] for w in doc.racinisation(): all_words.append(w.lower()) all_words = nltk.FreqDist(all_words) formule = (1+log(all_words[word]))*log(len(self.doc_list) / len(self.doc_hote(word))) res.append((word,formule,doc.name)) return res def tf_idf(self,word): texts = [] for doc in self.doc_list: if doc.name in self.doc_hote(word): texts.append(doc.racinisation()) mytexts = nltk.TextCollection(texts) tf = [] for t in texts: for doc in self.doc_list: if doc.racinisation() == t: tf.append((mytexts.tf(word, t),doc.name)) continue return tf def most_relevant(self, word): tf_idf = self.tf_idf(word) max = tf_idf[0][0] doc = tf_idf[0][1] for i in range(1,len(tf_idf)): if tf_idf[i][0]>max: max = tf_idf[i][0] doc = tf_idf[i][1] return doc