[goatools] add base and semantic; rename notebook to notebooks

tanghaibao · Mar 20, 2016 · 21ef80e · 21ef80e
1 parent ceaeb44
commit 21ef80e
Show file tree

Hide file tree

Showing 16 changed files with 570 additions and 27 deletions.
diff --git a/README.rst b/README.rst
@@ -38,7 +38,7 @@ Description
 This package contains a Python library to
 
 - process over- and under-representation of certain GO terms, based on Fisher's
- exact test. With numerous multiple correction routines including locally 
+ exact test. With numerous multiple correction routines including locally
  implemented routines for Bonferroni, Sidak, Holm, and false discovery rate. Also included are
  multiple test corrections from `statsmodels <http://www.statsmodels.org/stable/index.html>`_:
  FDR Benjamini/Hochberg, FDR Benjamini/Yekutieli, Holm-Sidak, Simes-Hochberg,
@@ -218,19 +218,23 @@ iPython Notebooks
 
 Run a Gene Ontology Enrichment Analysis (GOEA)
 ::::::::::::::::::::::::::::::::::::::::::::::
-https://github.com/tanghaibao/goatools/blob/master/notebook/goea_nbt3102.ipynb
+https://github.com/tanghaibao/goatools/blob/master/notebooks/goea_nbt3102.ipynb
 
 Report level and depth counts of a set of GO terms
 ::::::::::::::::::::::::::::::::::::::::::::::::::
-https://github.com/tanghaibao/goatools/blob/master/notebook/report_depth_level.ipynb
+https://github.com/tanghaibao/goatools/blob/master/notebooks/report_depth_level.ipynb
 
 Find all human protein-coding genes associated with cell cycle
 ::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
-https://github.com/tanghaibao/goatools/blob/master/notebook/cell_cycle.ipynb
+https://github.com/tanghaibao/goatools/blob/master/notebooks/cell_cycle.ipynb
 
 Calculate annotation coverage of GO terms on various species
 ::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
-https://github.com/tanghaibao/goatools/blob/master/notebook/annotation_coverage.ipynb
+https://github.com/tanghaibao/goatools/blob/master/notebooks/annotation_coverage.ipynb
+
+Determine the semantic similarities between GO terms
+::::::::::::::::::::::::::::::::::::::::::::::::::::
+https://github.com/tanghaibao/goatools/blob/master/notebooks/semantic_similarity.ipynb
 
 
 Reference

diff --git a/goatools/associations.py b/goatools/associations.py
@@ -78,7 +78,7 @@ def read_ncbi_gene2go(fin_gene2go, taxids=None, **kws):
  """Read NCBI's gene2go. Return gene2go data for user-specified taxids."""
  # Written by DV Klopfenstein
  # kws: taxid2asscs evidence_set
- # Simple associations 
+ # Simple associations
  id2gos = defaultdict(set)
  # Optional detailed associations split by taxid and having both ID2GOs & GO2IDs
  # e.g., taxid2asscs = defaultdict(lambda: defaultdict(lambda: defaultdict(set))
@@ -117,12 +117,11 @@ def read_gaf(fin_gaf, **kws):
  # Written by DV Klopfenstein
  # kws: taxid2asscs evidence_set
  from goatools.gaf_reader import GafReader
- # Simple associations 
+ # Simple associations
  id2gos = defaultdict(set)
  # Optional detailed associations split by taxid and having both ID2GOs & GO2IDs
  taxid2asscs = kws['taxid2asscs'] if 'taxid2asscs' in kws else None
- gafobj = GafReader()
- gafnts = gafobj.read_gaf(fin_gaf)
+ gafnts = GafReader(fin_gaf).associations
  # Optionaly specify a subset of GOs based on their evidence.
  evs = kws['evidence_set'] if 'evidence_set' in kws else None
  for nt in gafnts:

diff --git a/goatools/base.py b/goatools/base.py
@@ -0,0 +1,105 @@
+# Stolen from brentp:
+# <https://github.com/brentp/toolshed/blob/master/toolshed/files.py>
+
+import bz2
+import gzip
+import sys
+import urllib
+
+
+if sys.version_info[0] < 3:
+ int_types = (int, long)
+ urlopen = urllib.urlopen
+else:
+ int_types = (int,)
+ basestring = str
+ from urllib.request import urlopen
+
+
+def nopen(f, mode="r"):
+ r"""
+ open a file that's gzipped or return stdin for '-'
+ if f is a number, the result of nopen(sys.argv[f]) is returned.
+ >>> nopen('-') == sys.stdin, nopen('-', 'w') == sys.stdout
+ (True, True)
+ >>> nopen(sys.argv[0])
+ <...file...>
+ # expands user and vars ($HOME)
+ >>> nopen("~/.bashrc").name == nopen("$HOME/.bashrc").name
+ True
+ # an already open file.
+ >>> nopen(open(sys.argv[0]))
+ <...file...>
+ >>> nopen(0)
+ <...file...>
+ Or provide nicer access to Popen.stdout
+ >>> files = list(nopen("|ls"))
+ >>> assert 'setup.py\n' in files or b'setup.py\n' in files, files
+ """
+ if isinstance(f, int_types):
+ return nopen(sys.argv[f], mode)
+
+ if not isinstance(f, basestring):
+ return f
+ if f.startswith("|"):
+ # using shell explicitly makes things like process substitution work:
+ # http://stackoverflow.com/questions/7407667/python-subprocess-subshells-and-redirection
+ # use sys.stderr so we dont have to worry about checking it...
+ p = Popen(f[1:], stdout=PIPE, stdin=PIPE,
+ stderr=sys.stderr if mode == "r" else PIPE,
+ shell=True, bufsize=-1, # use system default for buffering
+ preexec_fn=prefunc,
+ close_fds=False, executable=os.environ.get('SHELL'))
+ if sys.version_info[0] > 2:
+ import io
+ p.stdout = io.TextIOWrapper(p.stdout)
+ p.stdin = io.TextIOWrapper(p.stdin)
+ if mode != "r":
+ p.stderr = io.TextIOWrapper(p.stderr)
+
+ if mode and mode[0] == "r":
+ return process_iter(p, f[1:])
+ return p
+
+ if f.startswith(("http://", "https://", "ftp://")):
+ fh = urlopen(f)
+ if f.endswith(".gz"):
+ return ungzipper(fh)
+ if sys.version_info[0] < 3:
+ return fh
+ import io
+ return io.TextIOWrapper(fh)
+ f = op.expanduser(op.expandvars(f))
+ if f.endswith((".gz", ".Z", ".z")):
+ fh = gzip.open(f, mode)
+ if sys.version_info[0] < 3:
+ return fh
+ import io
+ return io.TextIOWrapper(fh)
+ elif f.endswith((".bz", ".bz2", ".bzip2")):
+ fh = bz2.BZ2File(f, mode)
+ if sys.version_info[0] < 3:
+ return fh
+ import io
+ return io.TextIOWrapper(fh)
+
+ return {"r": sys.stdin, "w": sys.stdout}[mode[0]] if f == "-" \
+ else open(f, mode)
+
+
+def ungzipper(fh, blocksize=16384):
+ """
+ work-around to get streaming download of http://.../some.gz
+ """
+ import zlib
+ uzip = zlib.decompressobj(16 + zlib.MAX_WBITS)
+ data = uzip.decompress(fh.read(blocksize)).split("\n")
+
+ while len(data[0]):
+ # last chunk might not be a full line.
+ save = data.pop()
+ for line in data:
+ yield line
+ data = uzip.decompress(fh.read(blocksize)).split("\n")
+ # first line is prepended with saved chunk from end of last set.
+ data[0] = save + data[0]
diff --git a/goatools/gaf_reader.py b/goatools/gaf_reader.py
@@ -10,6 +10,7 @@
 import sys
 import re
 from collections import namedtuple
+from base import nopen
 
 __copyright__ = "Copyright (C) 2016, DV Klopfenstein, H Tang. All rights reserved."
 __author__ = "DV Klopfenstein"
@@ -55,8 +56,10 @@ class GafReader(object):
  # Expected values for a Qualifier
  exp_qualifiers = set(['NOT', 'contributes_to', 'colocalizes_with'])
 
- def __init__(self, log=sys.stdout):
+ def __init__(self, filename, log=sys.stdout):
+ self.filename = filename
  self.log = log
+ self.associations = self.read_gaf(filename)
 
  def _get_ntgaf(self, ntgafobj, flds, ver):
  """Convert fields from string to preferred format for GAF ver 2.1 and 2.0."""
@@ -114,22 +117,22 @@ def _rd_fld_vals(name, val, set_list_ft=True, qty_min=0, qty_max=None):
  return vals if set_list_ft else set(vals)
 
  def read_gaf(self, fin_gaf):
- """Read GAF file."""
+ """Read GAF file. HTTP address okay. GZIPPED/BZIPPED file okay."""
  ga_lst = []
- with open(fin_gaf) as ifstrm:
-  ver = None
-  ntgafobj = None
-  exp_numcol = None
-  for line in ifstrm:
-  if ntgafobj is not None and not line.startswith('!'):
-  flds = self._split_line(line, exp_numcol)
-  ntgaf = self._get_ntgaf(ntgafobj, flds, ver)
-  ga_lst.append(ntgaf)
-  elif ntgafobj is None and line.startswith('!gaf-version:'):
-  ver = line[13:].strip()
-  ntgafobj = namedtuple("ntgafobj", " ".join(self.gaf_columns[ver]))
-  exp_numcol = self.gaf_numcol[ver]
-  self.log.write(" READ {N} items: {FIN}\n".format(N=len(ga_lst), FIN=fin_gaf))
+ ifstrm = nopen(fin_gaf)
+ ver = None
+ ntgafobj = None
+ exp_numcol = None
+ for line in ifstrm:
+ if ntgafobj is not None and not line.startswith('!'):
+ flds = self._split_line(line, exp_numcol)
+ ntgaf = self._get_ntgaf(ntgafobj, flds, ver)
+ ga_lst.append(ntgaf)
+ elif ntgafobj is None and line.startswith('!gaf-version:'):
+ ver = line[13:].strip()
+ ntgafobj = namedtuple("ntgafobj", " ".join(self.gaf_columns[ver]))
+ exp_numcol = self.gaf_numcol[ver]
+ self.log.write(" READ {N} items: {FIN}\n".format(N=len(ga_lst), FIN=fin_gaf))
  return ga_lst
 
  @staticmethod

diff --git a/goatools/obo_parser.py b/goatools/obo_parser.py
@@ -11,8 +11,6 @@
 import os
 import re
 
-import collections as cx
-
 GraphEngines = ("pygraphviz", "pydot")
 
 __copyright__ = "Copyright (C) 2010-2016, H Tang et al., All rights reserved."