Skip to content

Commit

Permalink
Moved cooccurrence code to its own script
Browse files Browse the repository at this point in the history
  • Loading branch information
dhimmel committed May 15, 2015
1 parent 51f9836 commit 754efa6
Show file tree
Hide file tree
Showing 2 changed files with 91 additions and 57 deletions.
57 changes: 57 additions & 0 deletions cooccurrence.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
import itertools

import scipy.stats
import pandas

def score_pmid_cooccurrence(term0_to_pmids, term1_to_pmids, term0_name='term_0', term1_name='term_1', verbose=True):
    """
    Find pubmed cooccurrence between topics of two classes.

    term0_to_pmids -- a dictionary that returns the pubmed_ids for each term of class 0
    term1_to_pmids -- a dictionary that returns the pubmed_ids for each term of class 1
    term0_name -- label for class 0 terms; used in printed messages and as a column name
    term1_name -- label for class 1 terms; used in printed messages and as a column name
    verbose -- when True, print summary counts while scoring

    Only articles annotated with at least one term of each class are
    considered. Terms left with no such articles are dropped, and every
    remaining (term0, term1) pair is scored. The input dictionaries are
    not modified.

    Returns a pandas.DataFrame with one row per scored pair and the columns
    [term0_name, term1_name, 'cooccurrence', 'expected', 'enrichment',
    'odds_ratio', 'p_fisher']. The frame is empty (but keeps these columns)
    when either input is empty or the two classes share no articles.
    """
    # set().union(...) instead of set.union(...): the unbound form raises
    # TypeError when a mapping is empty because it has no first argument.
    all_pmids0 = set().union(*term0_to_pmids.values())
    all_pmids1 = set().union(*term1_to_pmids.values())
    pmids_in_both = all_pmids0 & all_pmids1
    total_pmids = len(pmids_in_both)
    if verbose:
        print('Total articles containing a {}: {}'.format(term0_name, len(all_pmids0)))
        print('Total articles containing a {}: {}'.format(term1_name, len(all_pmids1)))
        print('Total articles containing both a {} and {}: {}'.format(term0_name, term1_name, total_pmids))

    # Restrict every term to the shared articles and drop terms that end up
    # empty. New dictionaries are built so the caller's inputs stay untouched.
    restricted = []
    for term_to_pmids in (term0_to_pmids, term1_to_pmids):
        kept = {}
        for term, pmids in term_to_pmids.items():
            shared = pmids_in_both.intersection(pmids)
            if shared:
                kept[term] = shared
        restricted.append(kept)
    term0_to_pmids, term1_to_pmids = restricted

    if verbose:
        print('\nAfter removing terms without any cooccurences:')
        print('+ {} {}s remain'.format(len(term0_to_pmids), term0_name))
        print('+ {} {}s remain'.format(len(term1_to_pmids), term1_name))

    rows = []
    # When total_pmids is 0 both dictionaries are empty, so the loop body
    # (and its division by total_pmids) never runs.
    for term0, term1 in itertools.product(term0_to_pmids, term1_to_pmids):
        pmids0 = term0_to_pmids[term0]
        pmids1 = term1_to_pmids[term1]
        n_pmids0 = len(pmids0)
        n_pmids1 = len(pmids1)

        # 2x2 contingency table over the shared articles:
        #   both terms | term0 only
        #   term1 only | neither term
        n_both = len(pmids0 & pmids1)
        n_only0 = n_pmids0 - n_both
        n_only1 = n_pmids1 - n_both
        n_neither = total_pmids - len(pmids0 | pmids1)
        contingency_table = [[n_both, n_only0], [n_only1, n_neither]]

        # Expected = product of the marginal counts over the total,
        # i.e. the count if the two terms were independent.
        expected = n_pmids0 * n_pmids1 / total_pmids
        enrichment = n_both / expected

        oddsratio, pvalue = scipy.stats.fisher_exact(contingency_table, alternative='greater')
        rows.append([term0, term1, n_both, expected, enrichment, oddsratio, pvalue])

    columns = [term0_name, term1_name, 'cooccurrence', 'expected', 'enrichment', 'odds_ratio', 'p_fisher']
    df = pandas.DataFrame(rows, columns=columns)

    if verbose:
        print('\nCooccurrence scores calculated for {} {} -- {} pairs'.format(len(df), term0_name, term1_name))
    return df
91 changes: 34 additions & 57 deletions symptoms.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -2,22 +2,21 @@
"cells": [
{
"cell_type": "code",
"execution_count": 160,
"execution_count": 165,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"import io\n",
"import functools\n",
"import itertools\n",
"import gzip\n",
"\n",
"import pandas\n",
"import requests\n",
"import networkx\n",
"\n",
"import eutility"
"import eutility\n",
"import cooccurrence"
]
},
{
Expand Down Expand Up @@ -591,63 +590,30 @@
},
{
"cell_type": "code",
"execution_count": 150,
"execution_count": 176,
"metadata": {
"collapsed": true
"collapsed": false
},
"outputs": [],
"outputs": [
{
"data": {
"text/plain": [
"<module 'cooccurrence' from '/home/dhimmels/Documents/serg/rephetio/construct/medline/cooccurrence.py'>"
]
},
"execution_count": 176,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"def score_pmid_cooccurrence(term0_to_pmids, term1_to_pmids, term0_name='term_0', term1_name='term_1', verbose=True):\n",
" all_pmids0 = set.union(*term0_to_pmids.values())\n",
" all_pmids1 = set.union(*term1_to_pmids.values())\n",
" pmids_in_both = all_pmids0 & all_pmids1\n",
" total_pmids = len(pmids_in_both)\n",
" if verbose:\n",
" print('Total articles containing a {}: {}'.format(term0_name, len(all_pmids0)))\n",
" print('Total articles containing a {}: {}'.format(term1_name, len(all_pmids1)))\n",
" print('Total articles containing both a {} and {}: {}'.format(term0_name, term1_name, total_pmids))\n",
" \n",
" term0_to_pmids = term0_to_pmids.copy()\n",
" term1_to_pmids = term1_to_pmids.copy()\n",
" for d in term0_to_pmids, term1_to_pmids:\n",
" for key, value in list(d.items()):\n",
" d[key] = value & pmids_in_both\n",
" if not d[key]:\n",
" del d[key]\n",
" \n",
" if verbose:\n",
" print('\\nAfter removing terms without any cooccurences:')\n",
" print('+ {} {}s remain'.format(len(term0_to_pmids), term0_name))\n",
" print('+ {} {}s remain'.format(len(term1_to_pmids), term1_name))\n",
" \n",
" rows = list()\n",
" for term0, term1 in itertools.product(term0_to_pmids, term1_to_pmids):\n",
" pmids0 = term0_to_pmids[term0]\n",
" pmids1 = term1_to_pmids[term1]\n",
"\n",
" a = len(pmids0 & pmids1)\n",
" b = len(pmids0) - a\n",
" c = len(pmids1) - a\n",
" d = total_pmids - len(pmids0 | pmids1)\n",
" contingency_table = [[a, b], [c, d]]\n",
"\n",
" expected = len(pmids0) * len(pmids1) / total_pmids\n",
" enrichment = a / expected\n",
" \n",
" oddsratio, pvalue = scipy.stats.fisher_exact(contingency_table, alternative='greater')\n",
" rows.append([term0, term1, a, expected, enrichment, oddsratio, pvalue])\n",
" \n",
" columns = [term0_name, term1_name, 'cooccurrence', 'expected', 'enrichment', 'odds_ratio', 'p_fisher']\n",
" df = pandas.DataFrame(rows, columns=columns)\n",
" \n",
" if verbose:\n",
" print('\\nCooccurrence scores calculated for {} {} -- {} pairs'.format(len(df), term0_name, term1_name))\n",
" return df"
"import importlib\n",
"importlib.reload(cooccurrence)"
]
},
{
"cell_type": "code",
"execution_count": 151,
"execution_count": 177,
"metadata": {
"collapsed": false
},
Expand All @@ -666,7 +632,19 @@
"\n",
"Cooccurrence scores calculated for 49608 doid_code -- mesh_id pairs\n"
]
},
}
],
"source": [
"cooc_df = cooccurrence.score_pmid_cooccurrence(disease_to_pmids, symptom_to_pmids, 'doid_code', 'mesh_id')"
]
},
{
"cell_type": "code",
"execution_count": 178,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
Expand Down Expand Up @@ -767,13 +745,12 @@
"26423 251 10.884701 23.059889 35.878792 1.152183e-261 "
]
},
"execution_count": 151,
"execution_count": 178,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"cooc_df = score_pmid_cooccurrence(disease_to_pmids, symptom_to_pmids, 'doid_code', 'mesh_id')\n",
"cooc_df = symptom_df[['mesh_id', 'mesh_name']].merge(cooc_df)\n",
"cooc_df = disease_df[['doid_code', 'doid_name']].merge(cooc_df)\n",
"cooc_df = cooc_df.sort(['doid_name', 'p_fisher'])\n",
Expand Down

0 comments on commit 754efa6

Please sign in to comment.