{ "cells": [ { "cell_type": "code", "execution_count": 36, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[nltk_data] Downloading package wordnet to\n", "[nltk_data] C:\\Users\\jx\\AppData\\Roaming\\nltk_data...\n", "[nltk_data] Package wordnet is already up-to-date!\n" ] } ], "source": [ "import wikionly #script name is wikionly (no summary), class name is wiki\n", "import re as re\n", "import nltk\n", "nltk.download('wordnet')\n", "from nltk.corpus import wordnet\n", "import math\n", "\n", "class similar:\n", " def __init__(self,text1,text2):\n", "\n", " self.wn = nltk.corpus.wordnet #the corpus reader\n", "\n", " #check if both arguments input are string format\n", " checkstr = False\n", " if isinstance(text1, str) == True:\n", " if isinstance(text2, str) == True:\n", " self.text1 = text1\n", " self.text2 = text2\n", " checkstr = True\n", " else:\n", " print('Error! The second argument is not a string format!') \n", " else:\n", " print('Error! The first argument is not a string format!')\n", " \n", " #run internal wikipedia python file for processing for both wiki titles\n", " if checkstr == True:\n", " self.wiki1 = wikionly.wiki(text1)\n", " self.wiki2 = wikionly.wiki(text2)\n", " \n", " #call the function that calculates percentage\n", " self.percent(self.wiki1,self.wiki2)\n", " \n", " #call the function that shows list of words for both Wiki sites, disabled\n", " #self.words()\n", " \n", " #retrieve top 40 common words from wiki page, slice up and append .n01 for NLTK usage\n", " def percent(self,input1,input2):\n", " self.dotn01 = ('.','n','.','0','1')\n", " self.wiki1list = []\n", " for key in self.wiki1.commonwords(40):\n", " self.wiki1slice = list(key)\n", " for letter in self.dotn01:\n", " self.wiki1slice.append(letter)\n", " self.wiki1slice = ''.join(self.wiki1slice)\n", " self.wiki1list.append(self.wiki1slice)\n", "\n", " self.wiki2list = []\n", " for key in self.wiki2.commonwords(40):\n", " self.wiki2slice = list(key)\n", " for letter in self.dotn01:\n", " self.wiki2slice.append(letter)\n", " self.wiki2slice = ''.join(self.wiki2slice)\n", " self.wiki2list.append(self.wiki2slice)\n", " \n", " #count and sum for calculating similarity\n", " self.count = 0\n", " self.sum = 0\n", " #A count for the ranking of the word (how often it appears in both wiki passages)\n", " self.topten1 = 0\n", " self.topten2 = 0\n", "\n", " #For words that are 1-10th and 11-21st in popularity, if both wiki pages have the word, they get more points\n", " for word1 in self.wiki1list:\n", " #Reset self.topten2\n", " self.topten2 = 0\n", " self.topten1 += 1\n", " for word2 in self.wiki2list:\n", " self.topten2 += 1\n", " #reinitialize to zero to prevent old sums from going into maxsum\n", " self.sum1 = 0\n", " self.sum2 = 0\n", " self.sum3 = 0\n", " self.sum4 = 0\n", " self.maxsum = 0\n", " \n", " if self.topten1 < 11 and self.topten2 < 11:\n", " self.expvalue = 4.5 #3.5\n", " elif self.topten1 < 21 and self.topten2 < 21:\n", " self.expvalue = 2\n", " else:\n", " self.expvalue = 1.5\n", " \n", " try:\n", " if re.findall(r\"\\d+.n.01\", word1) == [] and re.findall(r\"\\d+.n.01\", word2) == []: #check both words not numbers\n", " #since words have many meanings, for every pair of words, use top two meanings n.01 and n.02 for comparison\n", " #two for loops will check every permutation pair of words between wiki pages, two meanings for each word, \n", " #Take the max similarity value taken for computation of similarity index\n", " #e.g. money.n.01 may have highest value with value.n.02 because value.n.01 has the obvious meaning of worth/significance and secondary for money\n", " word11 = word1.replace('n.01','n.02')\n", " word22 = word2.replace('n.01','n.02')\n", " #print(word11,word22)\n", " self.x = self.wn.synset(word1)\n", " self.y = self.wn.synset(word2)\n", " #get default similarity value of 1st definitions of word\n", " self.sum1 = self.x.path_similarity(self.y) * math.exp(self.expvalue * self.x.path_similarity(self.y)) + 10 * math.log(0.885+self.x.path_similarity(self.y))\n", " try: #get 2nd definitions of words and their similarity values, if it exist\n", " self.xx = self.wn.synset(word11)\n", " self.yy = self.wn.synset(word22)\n", " self.sum2 = self.xx.path_similarity(self.y) * math.exp(self.expvalue * self.xx.path_similarity(self.y)) + 10 * math.log(0.89+self.xx.path_similarity(self.y))\n", " self.sum3 = self.x.path_similarity(self.yy) * math.exp(self.expvalue * self.x.path_similarity(self.yy)) + 10 * math.log(0.89+self.x.path_similarity(self.yy))\n", " self.sum4 = self.xx.path_similarity(self.yy) * math.exp(self.expvalue * self.xx.path_similarity(self.yy)) + 10 * math.log(0.89+self.xx.path_similarity(self.yy))\n", " except:\n", " continue\n", " self.maxsum = max(self.sum1,self.sum2,self.sum3,self.sum4) #get the max similarity value between 2 words x 2 meanings = 4 comparisons\n", " #print(word1, word2, self.maxsum)\n", " self.sum += self.maxsum\n", " self.count += 1\n", " except:\n", " if word1 == word2 and re.findall(r\"\\d+.n.01\", word1) == []: #remove years/numbers being counted as match yyyy.n.01\n", " self.sum += math.exp(self.expvalue) + 10 * math.log(1.89)\n", " self.count += 1\n", " else:\n", " continue\n", "\n", " if self.count != 0:\n", " self.percent = round(self.sum/self.count*100)\n", " if self.percent > 100:\n", " self.percent = 100\n", " elif self.percent < 0:\n", " self.percent = 0\n", " print('Probability of topics being related is ' + str(self.percent) + '%')\n", " print('Count is ' + str(self.count) + ' and sum is ' + str(self.sum))\n", " print('\\n')\n", " else:\n", " print('No relation index can be calculated as words are all foreign')\n", " \n", " return self.percent\n", " \n", " def words(self):\n", " print(self.wiki1list)\n", " print('\\n')\n", " print(self.wiki2list)\n", " \n", " def ans(self):\n", " self.listans = [self.text1,self.text2,self.percent]\n", " if self.percent > 49:\n", " self.listans.append('Yes')\n", " else:\n", " self.listans.append('No')\n", " return self.listans\n", " \n", " def help(self):\n", " print(\"To start, assign var = comparewiki.similar('arg1','arg2'). To get values in a list for storage, use .ans(). To get the 40 common words for comparison, use .words()\")\n", " \n", "\n", " " ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.5" } }, "nbformat": 4, "nbformat_minor": 2 }