Skip to content

Commit

Permalink
Added more info for Lin's similarity score #120
Browse files Browse the repository at this point in the history
  • Loading branch information
dvklopfenstein committed Feb 14, 2019
1 parent 7267a54 commit 8269c2c
Show file tree
Hide file tree
Showing 4 changed files with 81 additions and 36 deletions.
1 change: 0 additions & 1 deletion goatools/gosubdag/gosubdag.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,6 @@
class GoSubDag(object):
"""Manages a user-specified subset of a GO DAG."""


def __init__(self, go_sources, go2obj, relationships=None, **kws):
# kws _Init: rcntobj relationships
# tic = timeit.default_timer()
Expand Down
Binary file added notebooks/images/nb_lin_go.png
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
98 changes: 71 additions & 27 deletions notebooks/semantic_similarity.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -23,16 +23,13 @@
},
{
"cell_type": "code",
"execution_count": 18,
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"The autoreload extension is already loaded. To reload it, use:\n",
" %reload_ext autoreload\n",
"go-basic.obo: fmt(1.2) rel(2019-01-12) 47,374 GO Terms\n",
"go-basic.obo: fmt(1.2) rel(2019-01-12) 47,374 GO Terms\n"
]
}
Expand All @@ -42,14 +39,12 @@
"%autoreload 2\n",
"\n",
"from goatools.obo_parser import GODag\n",
"godag = GODag(\"go-basic.obo\")\n",
"\n",
"go = obo_parser.GODag(\"go-basic.obo\")"
"godag = GODag(\"go-basic.obo\")"
]
},
{
"cell_type": "code",
"execution_count": 19,
"execution_count": 2,
"metadata": {},
"outputs": [
{
Expand All @@ -63,9 +58,9 @@
],
"source": [
"go_id3 = 'GO:0048364'\n",
"go_id4 = 'GO:0044707'\n",
"print(go[go_id3])\n",
"print(go[go_id4])"
"go_id4 = 'GO:0032501'\n",
"print(godag[go_id3])\n",
"print(godag[go_id4])"
]
},
{
Expand All @@ -77,27 +72,76 @@
},
{
"cell_type": "code",
"execution_count": 20,
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" READ 237,219 associations: C:\\Users\\note2\\Data\\git\\goatools\\notebooks\\gene_association.tair\n"
" READ 236,943 associations: C:\\Users\\note2\\Data\\git\\goatools\\notebooks\\tair.gaf\n"
]
}
],
"source": [
"# from goatools.associations import read_gaf\n",
"# associations = read_gaf(\"gene_association.tair\")\n",
"# associations = read_gaf(\"tair.gaf\")\n",
"\n",
"import os\n",
"from goatools.associations import dnld_assc\n",
"fin_gaf = os.path.join(os.getcwd(), \"gene_association.tair\")\n",
"fin_gaf = os.path.join(os.getcwd(), \"tair.gaf\")\n",
"associations = dnld_assc(fin_gaf, godag)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"GO:0008150\n"
]
}
],
"source": [
"# Find deepest common ancestor\n",
"from goatools.semantic import deepest_common_ancestor\n",
"go_root = deepest_common_ancestor([go_id3, go_id4], godag)\n",
"print(go_root)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Plot the two terms of interest and highlight their deepest common ancestor\n",
"\n",
"\n",
"|color |hex | GO Term | Description\n",
"|------|-------|------------|------------------------\n",
"|blue |#d5ffff| GO:0008150 | deepest common ancestor\n",
"|green |#d1ffbd| GO:0048364 | User GO Term\n",
"|green |#d1ffdb| GO:0032501 | User GO Term\n",
"\n",
"```\n",
"$ scripts/go_plot.py GO:0008150#d5ffff GO:0048364#d1ffbd GO:0032501#d1ffdb -o aaa_lin.png --gaf=tair.gaf\n",
"\n",
"go-basic.obo: fmt(1.2) rel(2019-02-07) 47,387 GO Terms\n",
" READ 236,943 associations: tair.gaf\n",
"#d5ffff GO:0008150 # BP 29699 3.30 L00 D00 biological_process\n",
"#f1fbfd GO:0032502 # BP 3220 5.02 L01 D01 A developmental process\n",
"#d1ffdb GO:0032501 # BP 1003 5.48 L01 D01 B multicellular organismal process\n",
" GO:0048856 # BP 1040 5.46 L02 D02 A anatomical structure development\n",
" GO:0099402 # BP 17 6.90 L03 D03 A plant organ development\n",
"#d1ffbd GO:0048364 # BP 4 7.56 L04 D04 A root development\n",
"```\n",
"\n",
"<img src=\"images/nb_lin_go.png\" width=\"600\">\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
Expand All @@ -107,21 +151,21 @@
},
{
"cell_type": "code",
"execution_count": 21,
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"The semantic similarity between terms GO:0048364 and GO:0044707 is 0.2.\n"
"The semantic similarity between terms GO:0048364 and GO:0032501 is 0.2.\n"
]
}
],
"source": [
"from goatools.semantic import semantic_similarity\n",
"\n",
"sim = semantic_similarity(go_id3, go_id4, go)\n",
"sim = semantic_similarity(go_id3, go_id4, godag)\n",
"print('The semantic similarity between terms {} and {} is {}.'.format(go_id3, go_id4, sim))"
]
},
Expand All @@ -134,22 +178,22 @@
},
{
"cell_type": "code",
"execution_count": 22,
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Information content (GO:0048364) = 7.563008960488486\n"
"Information content (GO:0048364) = 7.561470454226915\n"
]
}
],
"source": [
"from goatools.semantic import TermCounts, get_info_content\n",
"\n",
"# First get the counts of each GO term.\n",
"termcounts = TermCounts(go, associations)\n",
"termcounts = TermCounts(godag, associations)\n",
"\n",
"# Calculate the information content\n",
"go_id = \"GO:0048364\"\n",
Expand All @@ -166,21 +210,21 @@
},
{
"cell_type": "code",
"execution_count": 23,
"execution_count": 9,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Resnik similarity score (GO:0048364, GO:0044707) = 3.2948111361632906\n"
"Resnik similarity score (GO:0048364, GO:0032501) = 3.2954984390672677\n"
]
}
],
"source": [
"from goatools.semantic import resnik_sim\n",
"\n",
"sim_r = resnik_sim(go_id3, go_id4, go, termcounts)\n",
"sim_r = resnik_sim(go_id3, go_id4, godag, termcounts)\n",
"print('Resnik similarity score ({}, {}) = {}'.format(go_id3, go_id4, sim_r))"
]
},
Expand All @@ -201,21 +245,21 @@
},
{
"cell_type": "code",
"execution_count": 24,
"execution_count": 11,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Lin similarity score (GO:0048364, GO:0044707) = -0.5053524651601573\n"
"Lin similarity score (GO:0048364, GO:0032501) = -0.5055771852871896\n"
]
}
],
"source": [
"from goatools.semantic import lin_sim\n",
"\n",
"sim_l = lin_sim(go_id3, go_id4, go, termcounts)\n",
"sim_l = lin_sim(go_id3, go_id4, godag, termcounts)\n",
"print('Lin similarity score ({}, {}) = {}'.format(go_id3, go_id4, sim_l))"
]
}
Expand Down
18 changes: 10 additions & 8 deletions tests/semantic_i88.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,11 @@

import os
from goatools import obo_parser
from goatools.gosubdag.gosubdag import GoSubDag
from goatools.associations import dnld_assc
from goatools.semantic import semantic_similarity
from goatools.semantic import TermCounts, get_info_content
from goatools.semantic import deepest_common_ancestor
from goatools.semantic import resnik_sim
from goatools.semantic import lin_sim

Expand All @@ -18,26 +20,26 @@
def test_semantic_i88():
"""Computing basic semantic similarities between GO terms."""
godag = obo_parser.GODag("go-basic.obo")
goids = set(go for go, o in godag.items() if go == o.id)
goids = set(godag.keys())
# Get all the annotations from arabidopsis.
fin_gaf = os.path.join(REPO, "tair.gaf")
# dnld_assc includes read_gaf
associations = dnld_assc(fin_gaf, godag, prt=None)

# First get the counts and information content for each GO term.
termcounts = TermCounts(godag, associations)
gosubdag = GoSubDag(goids, godag, tcntobj=termcounts)

# Now we can calculate the semantic distance and semantic similarity, as so:
# "The semantic similarity between terms GO:0048364 and GO:0044707 is 0.25.
go_id3 = 'GO:0048364' # BP level-03 depth-04 root development
go_id4 = 'GO:0044707' # BP level-02 depth-02 single-multicellular organism process
go_root = deepest_common_ancestor([go_id3, go_id4], godag)
sim = semantic_similarity(go_id3, go_id4, godag)
print('\nThe semantic similarity between terms {GO1} and {GO2} is {VAL}.'.format(
GO1=go_id3, GO2=go_id4, VAL=sim))
print(godag[go_id3])
print(godag[go_id4])

# Then we can calculate the information content of the single term, <code>GO:0048364</code>.
# "Information content (GO:0048364) = 7.75481392334

# First get the counts of each GO term.
termcounts = TermCounts(godag, associations)
gosubdag.prt_goids([go_root, go_id3, go_id4])

# Calculate the information content
go_id = "GO:0048364"
Expand Down

0 comments on commit 8269c2c

Please sign in to comment.