Moved cooccurrence code to its own script

hetio · May 15, 2015 · 754efa6 · 754efa6
1 parent 51f9836
commit 754efa6
Show file tree

Hide file tree

Showing 2 changed files with 91 additions and 57 deletions.
diff --git a/cooccurrence.py b/cooccurrence.py
@@ -0,0 +1,57 @@
+import itertools
+
+import scipy.stats
+import pandas
+
def score_pmid_cooccurrence(term0_to_pmids, term1_to_pmids, term0_name='term_0', term1_name='term_1', verbose=True):
    """
    Find pubmed cooccurrence between topics of two classes.

    term0_to_pmids -- a dictionary that returns the pubmed_ids for each term of class 0
    term1_to_pmids -- a dictionary that returns the pubmed_ids for each term of class 1
    term0_name -- label for class 0 terms; used in messages and as a column name
    term1_name -- label for class 1 terms; used in messages and as a column name
    verbose -- when true, print summary counts while scoring

    Only articles annotated with at least one term of each class are
    considered. Terms left without any such article are dropped, and every
    remaining (term0, term1) pair is scored.

    Returns a pandas.DataFrame with one row per pair and the columns
    term0_name, term1_name, 'cooccurrence', 'expected', 'enrichment',
    'odds_ratio' and 'p_fisher' (one-sided Fisher's exact test,
    alternative='greater'). The input dictionaries are not modified.
    """
    # set().union(...) instead of set.union(...): the unbound form raises a
    # TypeError when a dictionary is empty, since it receives no arguments.
    all_pmids0 = set().union(*term0_to_pmids.values())
    all_pmids1 = set().union(*term1_to_pmids.values())
    pmids_in_both = all_pmids0 & all_pmids1
    total_pmids = len(pmids_in_both)
    if verbose:
        print('Total articles containing a {}: {}'.format(term0_name, len(all_pmids0)))
        print('Total articles containing a {}: {}'.format(term1_name, len(all_pmids1)))
        print('Total articles containing both a {} and {}: {}'.format(term0_name, term1_name, total_pmids))

    # Build restricted copies so that the caller's dictionaries stay intact.
    # Terms with no article in the shared universe are dropped entirely.
    restricted = []
    for term_to_pmids in (term0_to_pmids, term1_to_pmids):
        kept = {}
        for term, pmids in term_to_pmids.items():
            shared = pmids_in_both.intersection(pmids)
            if shared:
                kept[term] = shared
        restricted.append(kept)
    term0_to_pmids, term1_to_pmids = restricted

    if verbose:
        print('\nAfter removing terms without any cooccurences:')
        print('+ {} {}s remain'.format(len(term0_to_pmids), term0_name))
        print('+ {} {}s remain'.format(len(term1_to_pmids), term1_name))

    rows = list()
    for term0, term1 in itertools.product(term0_to_pmids, term1_to_pmids):
        pmids0 = term0_to_pmids[term0]
        pmids1 = term1_to_pmids[term1]

        # 2x2 contingency table over the shared article universe.
        both = len(pmids0 & pmids1)
        only0 = len(pmids0) - both
        only1 = len(pmids1) - both
        neither = total_pmids - len(pmids0 | pmids1)
        contingency_table = [[both, only0], [only1, neither]]

        # Every surviving term has at least one article, so total_pmids and
        # expected are both strictly positive here.
        expected = len(pmids0) * len(pmids1) / total_pmids
        enrichment = both / expected

        oddsratio, pvalue = scipy.stats.fisher_exact(contingency_table, alternative='greater')
        rows.append([term0, term1, both, expected, enrichment, oddsratio, pvalue])

    columns = [term0_name, term1_name, 'cooccurrence', 'expected', 'enrichment', 'odds_ratio', 'p_fisher']
    df = pandas.DataFrame(rows, columns=columns)

    if verbose:
        print('\nCooccurrence scores calculated for {} {} -- {} pairs'.format(len(df), term0_name, term1_name))
    return df
diff --git a/symptoms.ipynb b/symptoms.ipynb
@@ -2,22 +2,21 @@
  "cells": [
  {
  "cell_type": "code",
- "execution_count": 160,
+ "execution_count": 165,
  "metadata": {
  "collapsed": false
  },
  "outputs": [],
  "source": [
  "import io\n",
- "import functools\n",
- "import itertools\n",
  "import gzip\n",
  "\n",
  "import pandas\n",
  "import requests\n",
  "import networkx\n",
  "\n",
- "import eutility"
+ "import eutility\n",
+ "import cooccurrence"
  ]
  },
  {
@@ -591,63 +590,30 @@
  },
  {
  "cell_type": "code",
- "execution_count": 150,
+ "execution_count": 176,
  "metadata": {
- "collapsed": true
+ "collapsed": false
  },
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "<module 'cooccurrence' from '/home/dhimmels/Documents/serg/rephetio/construct/medline/cooccurrence.py'>"
+ ]
+ },
+ "execution_count": 176,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
  "source": [
- "def score_pmid_cooccurrence(term0_to_pmids, term1_to_pmids, term0_name='term_0', term1_name='term_1', verbose=True):\n",
- " all_pmids0 = set.union(*term0_to_pmids.values())\n",
- " all_pmids1 = set.union(*term1_to_pmids.values())\n",
- " pmids_in_both = all_pmids0 & all_pmids1\n",
- " total_pmids = len(pmids_in_both)\n",
- " if verbose:\n",
- " print('Total articles containing a {}: {}'.format(term0_name, len(all_pmids0)))\n",
- " print('Total articles containing a {}: {}'.format(term1_name, len(all_pmids1)))\n",
- " print('Total articles containing both a {} and {}: {}'.format(term0_name, term1_name, total_pmids))\n",
- " \n",
- " term0_to_pmids = term0_to_pmids.copy()\n",
- " term1_to_pmids = term1_to_pmids.copy()\n",
- " for d in term0_to_pmids, term1_to_pmids:\n",
- " for key, value in list(d.items()):\n",
- " d[key] = value & pmids_in_both\n",
- " if not d[key]:\n",
- " del d[key]\n",
- " \n",
- " if verbose:\n",
- " print('\\nAfter removing terms without any cooccurences:')\n",
- " print('+ {} {}s remain'.format(len(term0_to_pmids), term0_name))\n",
- " print('+ {} {}s remain'.format(len(term1_to_pmids), term1_name))\n",
- " \n",
- " rows = list()\n",
- " for term0, term1 in itertools.product(term0_to_pmids, term1_to_pmids):\n",
- " pmids0 = term0_to_pmids[term0]\n",
- " pmids1 = term1_to_pmids[term1]\n",
- "\n",
- " a = len(pmids0 & pmids1)\n",
- " b = len(pmids0) - a\n",
- " c = len(pmids1) - a\n",
- " d = total_pmids - len(pmids0 | pmids1)\n",
- " contingency_table = [[a, b], [c, d]]\n",
- "\n",
- " expected = len(pmids0) * len(pmids1) / total_pmids\n",
- " enrichment = a / expected\n",
- " \n",
- " oddsratio, pvalue = scipy.stats.fisher_exact(contingency_table, alternative='greater')\n",
- " rows.append([term0, term1, a, expected, enrichment, oddsratio, pvalue])\n",
- " \n",
- " columns = [term0_name, term1_name, 'cooccurrence', 'expected', 'enrichment', 'odds_ratio', 'p_fisher']\n",
- " df = pandas.DataFrame(rows, columns=columns)\n",
- " \n",
- " if verbose:\n",
- " print('\\nCooccurrence scores calculated for {} {} -- {} pairs'.format(len(df), term0_name, term1_name))\n",
- " return df"
+ "import importlib\n",
+ "importlib.reload(cooccurrence)"
  ]
  },
  {
  "cell_type": "code",
- "execution_count": 151,
+ "execution_count": 177,
  "metadata": {
  "collapsed": false
  },
@@ -666,7 +632,19 @@
  "\n",
  "Cooccurrence scores calculated for 49608 doid_code -- mesh_id pairs\n"
  ]
- },
+ }
+ ],
+ "source": [
+ "cooc_df = cooccurrence.score_pmid_cooccurrence(disease_to_pmids, symptom_to_pmids, 'doid_code', 'mesh_id')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 178,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
  {
  "data": {
  "text/html": [
@@ -767,13 +745,12 @@
  "26423 251 10.884701 23.059889 35.878792 1.152183e-261 "
  ]
  },
- "execution_count": 151,
+ "execution_count": 178,
  "metadata": {},
  "output_type": "execute_result"
  }
  ],
  "source": [
- "cooc_df = score_pmid_cooccurrence(disease_to_pmids, symptom_to_pmids, 'doid_code', 'mesh_id')\n",
  "cooc_df = symptom_df[['mesh_id', 'mesh_name']].merge(cooc_df)\n",
  "cooc_df = disease_df[['doid_code', 'doid_name']].merge(cooc_df)\n",
  "cooc_df = cooc_df.sort(['doid_name', 'p_fisher'])\n",