{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[nltk_data] Downloading package wordnet to\n",
      "[nltk_data]     C:\\Users\\jx\\AppData\\Roaming\\nltk_data...\n",
      "[nltk_data]   Package wordnet is already up-to-date!\n"
     ]
    }
   ],
   "source": [
    "import wikionly #script name is wikionly (no summary), class name is wiki\n",
    "import re as re\n",
    "import nltk\n",
    "nltk.download('wordnet')\n",
    "from nltk.corpus import wordnet\n",
    "import math\n",
    "\n",
    "class similar:\n",
    "    def __init__(self,text1,text2):\n",
    "\n",
    "        self.wn = nltk.corpus.wordnet #the corpus reader\n",
    "\n",
    "        #check if both arguments input are string format\n",
    "        checkstr = False\n",
    "        if isinstance(text1, str) == True:\n",
    "            if isinstance(text2, str) == True:\n",
    "                self.text1 = text1\n",
    "                self.text2 = text2\n",
    "                checkstr = True\n",
    "            else:\n",
    "                print('Error! The second argument is not a string format!')        \n",
    "        else:\n",
    "            print('Error! The first argument is not a string format!')\n",
    "        \n",
    "        #run internal wikipedia python file for processing for both wiki titles\n",
    "        if checkstr == True:\n",
    "            self.wiki1 = wikionly.wiki(text1)\n",
    "            self.wiki2 = wikionly.wiki(text2)\n",
    "        \n",
    "        #call the function that calculates percentage\n",
    "        self.percent(self.wiki1,self.wiki2)\n",
    "        \n",
    "        #call the function that shows list of words for both Wiki sites, disabled\n",
    "        #self.words()\n",
    "        \n",
    "    #retrieve top 40 common words from wiki page, slice up and append .n01 for NLTK usage\n",
    "    def percent(self,input1,input2):\n",
    "        self.dotn01 = ('.','n','.','0','1')\n",
    "        self.wiki1list = []\n",
    "        for key in self.wiki1.commonwords(40):\n",
    "            self.wiki1slice = list(key)\n",
    "            for letter in self.dotn01:\n",
    "                self.wiki1slice.append(letter)\n",
    "            self.wiki1slice = ''.join(self.wiki1slice)\n",
    "            self.wiki1list.append(self.wiki1slice)\n",
    "\n",
    "        self.wiki2list = []\n",
    "        for key in self.wiki2.commonwords(40):\n",
    "            self.wiki2slice = list(key)\n",
    "            for letter in self.dotn01:\n",
    "                self.wiki2slice.append(letter)\n",
    "            self.wiki2slice = ''.join(self.wiki2slice)\n",
    "            self.wiki2list.append(self.wiki2slice)\n",
    "        \n",
    "        #count and sum for calculating similarity\n",
    "        self.count = 0\n",
    "        self.sum = 0\n",
    "        #A count for the ranking of the word (how often it appears in both wiki passages)\n",
    "        self.topten1 = 0\n",
    "        self.topten2 = 0\n",
    "\n",
    "        #For words that are 1-10th and 11-21st in popularity, if both wiki pages have the word, they get more points\n",
    "        for word1 in self.wiki1list:\n",
    "            #Reset self.topten2\n",
    "            self.topten2 = 0\n",
    "            self.topten1 += 1\n",
    "            for word2 in self.wiki2list:\n",
    "                self.topten2 += 1\n",
    "                #reinitialize to zero to prevent old sums from going into maxsum\n",
    "                self.sum1 = 0\n",
    "                self.sum2 = 0\n",
    "                self.sum3 = 0\n",
    "                self.sum4 = 0\n",
    "                self.maxsum = 0\n",
    "                \n",
    "                if self.topten1 < 11 and self.topten2 < 11:\n",
    "                    self.expvalue = 4.5 #3.5\n",
    "                elif self.topten1 < 21 and self.topten2 < 21:\n",
    "                    self.expvalue = 2\n",
    "                else:\n",
    "                    self.expvalue = 1.5\n",
    "                \n",
    "                try:\n",
    "                    if re.findall(r\"\\d+.n.01\", word1) == [] and re.findall(r\"\\d+.n.01\", word2) == []: #check both words not numbers\n",
    "                        #since words have many meanings, for every pair of words, use top two meanings n.01 and n.02 for comparison\n",
    "                        #two for loops will check every permutation pair of words between wiki pages, two meanings for each word, \n",
    "                        #Take the max similarity value taken for computation of similarity index\n",
    "                        #e.g. money.n.01 may have highest value with value.n.02 because value.n.01 has the obvious meaning of worth/significance and secondary for money\n",
    "                        word11 = word1.replace('n.01','n.02')\n",
    "                        word22 = word2.replace('n.01','n.02')\n",
    "                        #print(word11,word22)\n",
    "                        self.x = self.wn.synset(word1)\n",
    "                        self.y = self.wn.synset(word2)\n",
    "                        #get default similarity value of 1st definitions of word\n",
    "                        self.sum1 = self.x.path_similarity(self.y) * math.exp(self.expvalue * self.x.path_similarity(self.y)) + 10 * math.log(0.885+self.x.path_similarity(self.y))\n",
    "                        try: #get 2nd definitions of words and their similarity values, if it exist\n",
    "                            self.xx = self.wn.synset(word11)\n",
    "                            self.yy = self.wn.synset(word22)\n",
    "                            self.sum2 = self.xx.path_similarity(self.y) * math.exp(self.expvalue * self.xx.path_similarity(self.y)) + 10 * math.log(0.89+self.xx.path_similarity(self.y))\n",
    "                            self.sum3 = self.x.path_similarity(self.yy) * math.exp(self.expvalue * self.x.path_similarity(self.yy)) + 10 * math.log(0.89+self.x.path_similarity(self.yy))\n",
    "                            self.sum4 = self.xx.path_similarity(self.yy) * math.exp(self.expvalue * self.xx.path_similarity(self.yy)) + 10 * math.log(0.89+self.xx.path_similarity(self.yy))\n",
    "                        except:\n",
    "                            continue\n",
    "                        self.maxsum = max(self.sum1,self.sum2,self.sum3,self.sum4) #get the max similarity value between 2 words x 2 meanings = 4 comparisons\n",
    "                        #print(word1, word2, self.maxsum)\n",
    "                        self.sum += self.maxsum\n",
    "                        self.count += 1\n",
    "                except:\n",
    "                    if word1 == word2 and re.findall(r\"\\d+.n.01\", word1) == []: #remove years/numbers being counted as match yyyy.n.01\n",
    "                        self.sum += math.exp(self.expvalue) + 10 * math.log(1.89)\n",
    "                        self.count += 1\n",
    "                    else:\n",
    "                        continue\n",
    "\n",
    "        if self.count != 0:\n",
    "            self.percent = round(self.sum/self.count*100)\n",
    "            if self.percent > 100:\n",
    "                self.percent = 100\n",
    "            elif self.percent < 0:\n",
    "                self.percent = 0\n",
    "            print('Probability of topics being related is ' + str(self.percent) + '%')\n",
    "            print('Count is ' + str(self.count) + ' and sum is ' + str(self.sum))\n",
    "            print('\\n')\n",
    "        else:\n",
    "            print('No relation index can be calculated as words are all foreign')\n",
    "            \n",
    "        return self.percent\n",
    "        \n",
    "    def words(self):\n",
    "        print(self.wiki1list)\n",
    "        print('\\n')\n",
    "        print(self.wiki2list)\n",
    "        \n",
    "    def ans(self):\n",
    "        self.listans = [self.text1,self.text2,self.percent]\n",
    "        if self.percent > 49:\n",
    "            self.listans.append('Yes')\n",
    "        else:\n",
    "            self.listans.append('No')\n",
    "        return self.listans\n",
    "    \n",
    "    def help(self):\n",
    "        print(\"To start, assign var = comparewiki.similar('arg1','arg2'). To get values in a list for storage, use .ans(). To get the 40 common words for comparison, use .words()\")\n",
    "            \n",
    "\n",
    "            "
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}