From 21ef80e0fc54e4c35328b8a308bcdca251fb8058 Mon Sep 17 00:00:00 2001
From: Haibao Tang
Date: Sat, 19 Mar 2016 17:05:59 -0700
Subject: [PATCH] [goatools] add base and semantic; rename notebook to
 notebooks

---
 README.rst                                    |  14 +-
 goatools/associations.py                      |   7 +-
 goatools/base.py                              | 105 +++++++
 goatools/gaf_reader.py                        |  35 +--
 goatools/obo_parser.py                        |   2 -
 goatools/semantic.py                          | 166 +++++++++++
 .../annotation_coverage.ipynb                 |   0
 {notebook => notebooks}/cell_cycle.ipynb      |   0
 {notebook => notebooks}/goea_nbt3102.ipynb    |   0
 {notebook => notebooks}/images/README.md      |   0
 {notebook => notebooks}/images/nbt3102_CC.png | Bin
 .../images/nbt3102_MF_RNA_Symbols.png         | Bin
 .../images/nbt3102_MF_RNA_genecnt.png         | Bin
 {notebook => notebooks}/makefile              |   0
 .../report_depth_level.ipynb                  |   0
 notebooks/semantic_similarity.ipynb           | 268 ++++++++++++++++++
 16 files changed, 570 insertions(+), 27 deletions(-)
 create mode 100644 goatools/base.py
 create mode 100644 goatools/semantic.py
 rename {notebook => notebooks}/annotation_coverage.ipynb (100%)
 rename {notebook => notebooks}/cell_cycle.ipynb (100%)
 rename {notebook => notebooks}/goea_nbt3102.ipynb (100%)
 rename {notebook => notebooks}/images/README.md (100%)
 rename {notebook => notebooks}/images/nbt3102_CC.png (100%)
 rename {notebook => notebooks}/images/nbt3102_MF_RNA_Symbols.png (100%)
 rename {notebook => notebooks}/images/nbt3102_MF_RNA_genecnt.png (100%)
 rename {notebook => notebooks}/makefile (100%)
 rename {notebook => notebooks}/report_depth_level.ipynb (100%)
 create mode 100644 notebooks/semantic_similarity.ipynb

diff --git a/README.rst b/README.rst
index 869a80b7..dd6262d2 100755
--- a/README.rst
+++ b/README.rst
@@ -38,7 +38,7 @@ Description
 This package contains a Python library to
 - process over- and under-representation of certain GO terms, based on Fisher's
-  exact test. With numerous multiple correction routines including locally
+  exact test, with numerous multiple correction routines including locally
   implemented routines for Bonferroni, Sidak, Holm, and false discovery rate.
 Also included are multiple test corrections from `statsmodels `_:
 FDR Benjamini/Hochberg, FDR Benjamini/Yekutieli, Holm-Sidak, Simes-Hochberg,
@@ -218,19 +218,23 @@ iPython Notebooks
 Run a Gene Ontology Enrichment Analysis (GOEA)
 ::::::::::::::::::::::::::::::::::::::::::::::
-https://github.com/tanghaibao/goatools/blob/master/notebook/goea_nbt3102.ipynb
+https://github.com/tanghaibao/goatools/blob/master/notebooks/goea_nbt3102.ipynb
 
 Report level and depth counts of a set of GO terms
 ::::::::::::::::::::::::::::::::::::::::::::::::::
-https://github.com/tanghaibao/goatools/blob/master/notebook/report_depth_level.ipynb
+https://github.com/tanghaibao/goatools/blob/master/notebooks/report_depth_level.ipynb
 
 Find all human protein-coding genes associated with cell cycle
 ::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
-https://github.com/tanghaibao/goatools/blob/master/notebook/cell_cycle.ipynb
+https://github.com/tanghaibao/goatools/blob/master/notebooks/cell_cycle.ipynb
 
 Calculate annotation coverage of GO terms on various species
 ::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
-https://github.com/tanghaibao/goatools/blob/master/notebook/annotation_coverage.ipynb
+https://github.com/tanghaibao/goatools/blob/master/notebooks/annotation_coverage.ipynb
+
+Determine the semantic similarities between GO terms
+::::::::::::::::::::::::::::::::::::::::::::::::::::
+https://github.com/tanghaibao/goatools/blob/master/notebooks/semantic_similarity.ipynb
 
 Reference
diff --git a/goatools/associations.py b/goatools/associations.py
index 4be34c7e..38101acc 100755
--- a/goatools/associations.py
+++ b/goatools/associations.py
@@ -78,7 +78,7 @@ def read_ncbi_gene2go(fin_gene2go, taxids=None, **kws):
     """Read NCBI's gene2go. Return gene2go data for user-specified taxids."""
     # Written by DV Klopfenstein
     # kws: taxid2asscs evidence_set
-    # Simple associations 
+    # Simple associations
     id2gos = defaultdict(set)
     # Optional detailed associations split by taxid and having both ID2GOs & GO2IDs
     # e.g., taxid2asscs = defaultdict(lambda: defaultdict(lambda: defaultdict(set))
@@ -117,12 +117,11 @@ def read_gaf(fin_gaf, **kws):
     # Written by DV Klopfenstein
     # kws: taxid2asscs evidence_set
     from goatools.gaf_reader import GafReader
-    # Simple associations 
+    # Simple associations
     id2gos = defaultdict(set)
     # Optional detailed associations split by taxid and having both ID2GOs & GO2IDs
     taxid2asscs = kws['taxid2asscs'] if 'taxid2asscs' in kws else None
-    gafobj = GafReader()
-    gafnts = gafobj.read_gaf(fin_gaf)
+    gafnts = GafReader(fin_gaf).associations
     # Optionally specify a subset of GOs based on their evidence.
     evs = kws['evidence_set'] if 'evidence_set' in kws else None
     for nt in gafnts:
diff --git a/goatools/base.py b/goatools/base.py
new file mode 100644
index 00000000..1c6ced73
--- /dev/null
+++ b/goatools/base.py
@@ -0,0 +1,105 @@
+# Stolen from brentp:
+#
+
+import bz2
+import gzip
+import os
+import os.path as op
+import sys
+import urllib
+from signal import signal, SIGPIPE, SIG_DFL
+from subprocess import Popen, PIPE
+
+
+if sys.version_info[0] < 3:
+    int_types = (int, long)
+    urlopen = urllib.urlopen
+else:
+    int_types = (int,)
+    basestring = str
+    from urllib.request import urlopen
+
+
+def nopen(f, mode="r"):
+    r"""
+    Open a file that's gzipped, or return stdin for '-'.
+    If f is a number, the result of nopen(sys.argv[f]) is returned.
+
+    >>> nopen('-') == sys.stdin, nopen('-', 'w') == sys.stdout
+    (True, True)
+
+    >>> nopen(sys.argv[0])
+    <...file...>
+
+    # expands user and vars ($HOME)
+    >>> nopen("~/.bashrc").name == nopen("$HOME/.bashrc").name
+    True
+
+    # an already open file.
+    >>> nopen(open(sys.argv[0]))
+    <...file...>
+
+    >>> nopen(0)
+    <...file...>
+
+    Or provide nicer access to Popen.stdout:
+    >>> files = list(nopen("|ls"))
+    >>> assert 'setup.py\n' in files or b'setup.py\n' in files, files
+    """
+    if isinstance(f, int_types):
+        return nopen(sys.argv[f], mode)
+
+    if not isinstance(f, basestring):
+        return f
+    if f.startswith("|"):
+        # using shell explicitly makes things like process substitution work:
+        # http://stackoverflow.com/questions/7407667/python-subprocess-subshells-and-redirection
+        # use sys.stderr so we don't have to worry about checking it...
+        p = Popen(f[1:], stdout=PIPE, stdin=PIPE,
+                  stderr=sys.stderr if mode == "r" else PIPE,
+                  shell=True, bufsize=-1,  # use system default for buffering
+                  preexec_fn=prefunc,
+                  close_fds=False, executable=os.environ.get('SHELL'))
+        if sys.version_info[0] > 2:
+            import io
+            p.stdout = io.TextIOWrapper(p.stdout)
+            p.stdin = io.TextIOWrapper(p.stdin)
+            if mode != "r":
+                p.stderr = io.TextIOWrapper(p.stderr)
+
+        if mode and mode[0] == "r":
+            return process_iter(p, f[1:])
+        return p
+
+    if f.startswith(("http://", "https://", "ftp://")):
+        fh = urlopen(f)
+        if f.endswith(".gz"):
+            return ungzipper(fh)
+        if sys.version_info[0] < 3:
+            return fh
+        import io
+        return io.TextIOWrapper(fh)
+    f = op.expanduser(op.expandvars(f))
+    if f.endswith((".gz", ".Z", ".z")):
+        fh = gzip.open(f, mode)
+        if sys.version_info[0] < 3:
+            return fh
+        import io
+        return io.TextIOWrapper(fh)
+    elif f.endswith((".bz", ".bz2", ".bzip2")):
+        fh = bz2.BZ2File(f, mode)
+        if sys.version_info[0] < 3:
+            return fh
+        import io
+        return io.TextIOWrapper(fh)
+
+    return {"r": sys.stdin, "w": sys.stdout}[mode[0]] if f == "-" \
+        else open(f, mode)
+
+
+def prefunc():
+    # Minimal stand-in for the toolshed helper of the same name: restore
+    # default SIGPIPE handling in the child so early-exiting consumers of
+    # the pipe do not raise broken-pipe errors here.
+    signal(SIGPIPE, SIG_DFL)
+
+
+def process_iter(proc, cmd=""):
+    # Minimal stand-in for the toolshed helper of the same name: yield
+    # lines from a subprocess's stdout, then wait for the process to end.
+    try:
+        for line in proc.stdout:
+            yield line
+    finally:
+        proc.wait()
+
+
+def ungzipper(fh, blocksize=16384):
+    """
+    Work-around to get streaming download of http://.../some.gz
+    """
+    import zlib
+    uzip = zlib.decompressobj(16 + zlib.MAX_WBITS)
+    data = uzip.decompress(fh.read(blocksize)).split("\n")
+
+    while len(data[0]):
+        # last chunk might not be a full line.
+        save = data.pop()
+        for line in data:
+            yield line
+        data = uzip.decompress(fh.read(blocksize)).split("\n")
+        # first line is prepended with saved chunk from end of last set.
+        data[0] = save + data[0]
diff --git a/goatools/gaf_reader.py b/goatools/gaf_reader.py
index 25860825..fe059913 100755
--- a/goatools/gaf_reader.py
+++ b/goatools/gaf_reader.py
@@ -10,6 +10,7 @@
 import sys
 import re
 from collections import namedtuple
+from goatools.base import nopen
 
 __copyright__ = "Copyright (C) 2016, DV Klopfenstein, H Tang. All rights reserved."
 __author__ = "DV Klopfenstein"
@@ -55,8 +56,10 @@ class GafReader(object):
     # Expected values for a Qualifier
     exp_qualifiers = set(['NOT', 'contributes_to', 'colocalizes_with'])
 
-    def __init__(self, log=sys.stdout):
+    def __init__(self, filename, log=sys.stdout):
+        self.filename = filename
         self.log = log
+        self.associations = self.read_gaf(filename)
 
     def _get_ntgaf(self, ntgafobj, flds, ver):
         """Convert fields from string to preferred format for GAF ver 2.1 and 2.0."""
@@ -114,22 +117,22 @@ def _rd_fld_vals(name, val, set_list_ft=True, qty_min=0, qty_max=None):
         return vals if set_list_ft else set(vals)
 
     def read_gaf(self, fin_gaf):
-        """Read GAF file."""
+        """Read GAF file. HTTP address okay. GZIPPED/BZIPPED file okay."""
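+        # Usage sketch (the GAF below is the one used in the notebooks; local,
+        # gzipped, bzipped, and HTTP sources are all handled by nopen):
+        #     nts = GafReader("gene_association.tair.gz").associations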
         ga_lst = []
-        with open(fin_gaf) as ifstrm:
-            ver = None
-            ntgafobj = None
-            exp_numcol = None
-            for line in ifstrm:
-                if ntgafobj is not None and not line.startswith('!'):
-                    flds = self._split_line(line, exp_numcol)
-                    ntgaf = self._get_ntgaf(ntgafobj, flds, ver)
-                    ga_lst.append(ntgaf)
-                elif ntgafobj is None and line.startswith('!gaf-version:'):
-                    ver = line[13:].strip()
-                    ntgafobj = namedtuple("ntgafobj", " ".join(self.gaf_columns[ver]))
-                    exp_numcol = self.gaf_numcol[ver]
-        self.log.write("  READ {N} items: {FIN}\n".format(N=len(ga_lst), FIN=fin_gaf))
+        ifstrm = nopen(fin_gaf)
+        ver = None
+        ntgafobj = None
+        exp_numcol = None
+        for line in ifstrm:
+            if ntgafobj is not None and not line.startswith('!'):
+                flds = self._split_line(line, exp_numcol)
+                ntgaf = self._get_ntgaf(ntgafobj, flds, ver)
+                ga_lst.append(ntgaf)
+            elif ntgafobj is None and line.startswith('!gaf-version:'):
+                ver = line[13:].strip()
+                ntgafobj = namedtuple("ntgafobj", " ".join(self.gaf_columns[ver]))
+                exp_numcol = self.gaf_numcol[ver]
+        self.log.write("  READ {N} items: {FIN}\n".format(N=len(ga_lst), FIN=fin_gaf))
         return ga_lst
 
     @staticmethod
diff --git a/goatools/obo_parser.py b/goatools/obo_parser.py
index ba40e795..daf0e138 100755
--- a/goatools/obo_parser.py
+++ b/goatools/obo_parser.py
@@ -11,8 +11,6 @@
 import os
 import re
 
-import collections as cx
-
 GraphEngines = ("pygraphviz", "pydot")
 
 __copyright__ = "Copyright (C) 2010-2016, H Tang et al., All rights reserved."
diff --git a/goatools/semantic.py b/goatools/semantic.py
new file mode 100644
index 00000000..16341775
--- /dev/null
+++ b/goatools/semantic.py
@@ -0,0 +1,166 @@
+#!/usr/bin/env python
+# -*- coding: UTF-8 -*-
+
+"""
+Compute semantic similarities between GO terms. Adapted from a book chapter
+by Christophe Dessimoz (thanks). For details, please check out:
+notebooks/semantic_similarity.ipynb
+"""
+
+import math
+from collections import Counter
+
+
+class TermCounts:
+    '''
+    TermCounts tallies how often each GO term appears in a set of
+    annotations, with parent terms propagated.
+    '''
+    def __init__(self, go, annots):
+        '''
+        Initialise the counters and fill them from the annotations.
+        '''
+        # Backup
+        self._go = go
+
+        # Initialise the counters
+        self._counts = Counter()
+        self._aspect_counts = Counter()
+
+        # Fill the counters...
+        self._count_terms(go, annots)
+
+    def _count_terms(self, go, annots):
+        '''
+        Fills in the counts and overall aspect counts.
+        '''
+        for gene, terms in annots.items():
+            # Take the union of each gene's terms and all of their parents,
+            # so parent terms are propagated but each term is counted at
+            # most once per gene.
+            allterms = set(terms)
+            for go_id in terms:
+                allterms |= go[go_id].get_all_parents()
+            for p in allterms:
+                self._counts[p] += 1
+
+        for go_id, c in self._counts.items():
+            # Group by namespace
+            namespace = go[go_id].namespace
+            self._aspect_counts[namespace] += c
+
+    def get_count(self, go_id):
+        '''
+        Returns the count of that GO term observed in the annotations.
+        '''
+        return self._counts[go_id]
+
+    def get_total_count(self, aspect):
+        '''
+        Gets the total count that's been precomputed.
+        '''
+        return self._aspect_counts[aspect]
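+
+    # Note: get_term_freq below divides a term's count by the total count of
+    # the term's namespace (aspect); this relative annotation frequency is
+    # what the information-content calculation (ic) builds on.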
+
+    def get_term_freq(self, go_id):
+        '''
+        Returns the frequency at which a particular GO term has
+        been observed in the annotations.
+        '''
+        try:
+            namespace = self._go[go_id].namespace
+            freq = float(self.get_count(go_id)) / float(self.get_total_count(namespace))
+        except ZeroDivisionError:
+            freq = 0
+
+        return freq
+
+
+def ic(go_id, termcounts):
+    '''
+    Calculates the information content of a GO term.
+    '''
+    # Get the observed frequency of the GO term
+    freq = termcounts.get_term_freq(go_id)
+
+    # Calculate the information content (i.e., -log("freq of GO term"))
+    return -1.0 * math.log(freq) if freq else 0
+
+
+def resnik_sim(go_id1, go_id2, go, termcounts):
+    '''
+    Computes Resnik's similarity measure: the information content of the
+    most specific common ancestor (here, the deepest common ancestor).
+    '''
+    msca = deepest_common_ancestor([go_id1, go_id2], go)
+    return ic(msca, termcounts)
+
+
+def lin_sim(go_id1, go_id2, go, termcounts):
+    '''
+    Computes Lin's similarity measure.
+    '''
+    sim_r = resnik_sim(go_id1, go_id2, go, termcounts)
+
+    return (2*sim_r)/(ic(go_id1, termcounts) + ic(go_id2, termcounts))
+
+
+def common_parent_go_ids(terms, go):
+    '''
+    This function finds the common ancestors in the GO
+    tree of the list of terms in the input.
+    '''
+    # Find candidates from first
+    rec = go[terms[0]]
+    candidates = rec.get_all_parents()
+    candidates.update({terms[0]})
+
+    # Find intersection with second to nth term
+    for term in terms[1:]:
+        rec = go[term]
+        parents = rec.get_all_parents()
+        parents.update({term})
+
+        # Find the intersection with the candidates, and update.
+        candidates.intersection_update(parents)
+
+    return candidates
+
+
+def deepest_common_ancestor(terms, go):
+    '''
+    Gets the deepest common ancestor of the given terms, using
+    common_parent_go_ids above. Returns only the single deepest
+    term and assumes one exists.
+    '''
+    # Take the element at maximum depth.
+    return max(common_parent_go_ids(terms, go), key=lambda t: go[t].depth)
+
+
+def min_branch_length(go_id1, go_id2, go):
+    '''
+    Finds the minimum branch length between two terms in the GO DAG.
+    '''
+    # First get the deepest common ancestor
+    dca = deepest_common_ancestor([go_id1, go_id2], go)
+
+    # Then get the distance from the DCA to each term
+    dca_depth = go[dca].depth
+    d1 = go[go_id1].depth - dca_depth
+    d2 = go[go_id2].depth - dca_depth
+
+    # Return the total distance - i.e., to the deepest common ancestor and back.
+    return d1 + d2
+
+
+def semantic_distance(go_id1, go_id2, go):
+    '''
+    Finds the semantic distance (minimum number of connecting branches)
+    between two GO terms.
+    '''
+    return min_branch_length(go_id1, go_id2, go)
+
+
+def semantic_similarity(go_id1, go_id2, go):
+    '''
+    Finds the semantic similarity (inverse of the semantic distance)
+    between two GO terms.
+    '''
+    return 1.0 / float(semantic_distance(go_id1, go_id2, go))
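+
+
+if __name__ == '__main__':
+    # Minimal usage sketch, not part of the library API. It assumes a local
+    # copy of go-basic.obo; the two term IDs are the notebook's examples.
+    from goatools.obo_parser import GODag
+    godag = GODag("go-basic.obo")
+    print(semantic_distance('GO:0048364', 'GO:0044707', godag))
+    print(semantic_similarity('GO:0048364', 'GO:0044707', godag))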
diff --git a/notebook/annotation_coverage.ipynb b/notebooks/annotation_coverage.ipynb
similarity index 100%
rename from notebook/annotation_coverage.ipynb
rename to notebooks/annotation_coverage.ipynb
diff --git a/notebook/cell_cycle.ipynb b/notebooks/cell_cycle.ipynb
similarity index 100%
rename from notebook/cell_cycle.ipynb
rename to notebooks/cell_cycle.ipynb
diff --git a/notebook/goea_nbt3102.ipynb b/notebooks/goea_nbt3102.ipynb
similarity index 100%
rename from notebook/goea_nbt3102.ipynb
rename to notebooks/goea_nbt3102.ipynb
diff --git a/notebook/images/README.md b/notebooks/images/README.md
similarity index 100%
rename from notebook/images/README.md
rename to notebooks/images/README.md
diff --git a/notebook/images/nbt3102_CC.png b/notebooks/images/nbt3102_CC.png
similarity index 100%
rename from notebook/images/nbt3102_CC.png
rename to notebooks/images/nbt3102_CC.png
diff --git a/notebook/images/nbt3102_MF_RNA_Symbols.png b/notebooks/images/nbt3102_MF_RNA_Symbols.png
similarity index 100%
rename from notebook/images/nbt3102_MF_RNA_Symbols.png
rename to notebooks/images/nbt3102_MF_RNA_Symbols.png
diff --git a/notebook/images/nbt3102_MF_RNA_genecnt.png b/notebooks/images/nbt3102_MF_RNA_genecnt.png
similarity index 100%
rename from notebook/images/nbt3102_MF_RNA_genecnt.png
rename to notebooks/images/nbt3102_MF_RNA_genecnt.png
diff --git a/notebook/makefile b/notebooks/makefile
similarity index 100%
rename from notebook/makefile
rename to notebooks/makefile
diff --git a/notebook/report_depth_level.ipynb b/notebooks/report_depth_level.ipynb
similarity index 100%
rename from notebook/report_depth_level.ipynb
rename to notebooks/report_depth_level.ipynb
diff --git a/notebooks/semantic_similarity.ipynb b/notebooks/semantic_similarity.ipynb
new file mode 100644
index 00000000..9d826a86
--- /dev/null
+++ b/notebooks/semantic_similarity.ipynb
@@ -0,0 +1,268 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Computing basic semantic similarities between GO terms"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Adapted from a book chapter written by _Christophe Dessimoz_"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "In this section we look at how to compute the semantic similarity between GO terms. First we need a function that calculates the minimum number of branches connecting two GO terms; `goatools.semantic` provides this, along with information-content based measures. Let's start by loading the GO DAG."
+   ]
+  },
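+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Loading assumes `go-basic.obo` sits in the parent directory; if it is missing, a cell along these lines can fetch it first (illustrative, hence left commented out):"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "# import urllib\n",
+    "# urllib.urlretrieve(\"http://geneontology.org/ontology/go-basic.obo\", \"../go-basic.obo\")"
+   ]
+  },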
+ ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "load obo file ../go-basic.obo\n", + "46268" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "../go-basic.obo: format-version(1.2) data-version(releases/2016-03-19)\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " nodes imported\n" + ] + } + ], + "source": [ + "%load_ext autoreload\n", + "%autoreload 2\n", + "\n", + "import sys\n", + "\n", + "sys.path.insert(0, \"..\")\n", + "from goatools import obo_parser\n", + "\n", + "go = obo_parser.GODag(\"../go-basic.obo\")" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "GO:0048364\tlevel-03\tdepth-04\troot development [biological_process] \n", + "GO:0044707\tlevel-02\tdepth-02\tsingle-multicellular organism process [biological_process] \n" + ] + } + ], + "source": [ + "go_id3 = 'GO:0048364'\n", + "go_id4 = 'GO:0044707'\n", + "print(go[go_id3])\n", + "print(go[go_id4])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's get all the annotations from arabidopsis." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " READ 211727 items: http://geneontology.org/gene-associations/gene_association.tair.gz\n" + ] + } + ], + "source": [ + "from goatools.associations import read_gaf\n", + "\n", + "associations = read_gaf(\"http://geneontology.org/gene-associations/gene_association.tair.gz\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now we can calculate the semantic distance and semantic similarity, as so:" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The semantic similarity between terms GO:0048364 and GO:0044707 is 0.25.\n" + ] + } + ], + "source": [ + "from goatools.semantic import semantic_similarity\n", + "\n", + "sim = semantic_similarity(go_id3, go_id4, go)\n", + "print('The semantic similarity between terms {} and {} is {}.'.format(go_id3, go_id4, sim))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Then we can calculate the information content of the single term, GO:0048364." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Information content (GO:0048364) = 7.75481392334\n" + ] + } + ], + "source": [ + "from goatools.semantic import TermCounts, ic\n", + "\n", + "# First get the counts of each GO term.\n", + "termcounts = TermCounts(go, associations)\n", + "\n", + "# Calculate the information content\n", + "go_id = \"GO:0048364\"\n", + "infocontent = ic(go_id, termcounts)\n", + "print('Information content ({}) = {}'.format(go_id, infocontent))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Resnik's similarity measure is defined as the information content of the most informative common ancestor. That is, the most specific common parent-term in the GO. 
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Resnik's similarity measure is defined as the information content of the most informative common ancestor, i.e., the most specific common parent-term in the GO (computed here as the deepest common ancestor). We can calculate it as follows:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Resnik similarity score (GO:0048364, GO:0044707) = 4.0540784252\n"
+     ]
+    }
+   ],
+   "source": [
+    "from goatools.semantic import resnik_sim\n",
+    "\n",
+    "sim_r = resnik_sim(go_id3, go_id4, go, termcounts)\n",
+    "print('Resnik similarity score ({}, {}) = {}'.format(go_id3, go_id4, sim_r))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Lin's similarity measure is defined as:\n",
+    "$$ \\textrm{sim}_{\\textrm{Lin}}(t_{1}, t_{2}) = \\frac{2*\\textrm{sim}_{\\textrm{Resnik}}(t_1, t_2)}{IC(t_1) + IC(t_2)} $$"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Then we can calculate this as follows:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Lin similarity score (GO:0048364, GO:0044707) = 0.607721957763\n"
+     ]
+    }
+   ],
+   "source": [
+    "from goatools.semantic import lin_sim\n",
+    "\n",
+    "sim_l = lin_sim(go_id3, go_id4, go, termcounts)\n",
+    "print('Lin similarity score ({}, {}) = {}'.format(go_id3, go_id4, sim_l))"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 2",
+   "language": "python",
+   "name": "python2"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 2
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython2",
+   "version": "2.7.11"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 0
+}