Skip to content

Commit

Permalink
[goatools] add base and semantic; rename notebook to notebooks
Browse files Browse the repository at this point in the history
  • Loading branch information
tanghaibao committed Mar 20, 2016
1 parent ceaeb44 commit 21ef80e
Show file tree
Hide file tree
Showing 16 changed files with 570 additions and 27 deletions.
14 changes: 9 additions & 5 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ Description
This package contains a Python library to

- process over- and under-representation of certain GO terms, based on Fisher's
exact test. With numerous multiple correction routines including locally
exact test. With numerous multiple correction routines including locally
implemented routines for Bonferroni, Sidak, Holm, and false discovery rate. Also included are
multiple test corrections from `statsmodels <http://www.statsmodels.org/stable/index.html>`_:
FDR Benjamini/Hochberg, FDR Benjamini/Yekutieli, Holm-Sidak, Simes-Hochberg,
Expand Down Expand Up @@ -218,19 +218,23 @@ iPython Notebooks

Run a Gene Ontology Enrichment Analysis (GOEA)
::::::::::::::::::::::::::::::::::::::::::::::
https://github.com/tanghaibao/goatools/blob/master/notebook/goea_nbt3102.ipynb
https://github.com/tanghaibao/goatools/blob/master/notebooks/goea_nbt3102.ipynb

Report level and depth counts of a set of GO terms
::::::::::::::::::::::::::::::::::::::::::::::::::
https://github.com/tanghaibao/goatools/blob/master/notebook/report_depth_level.ipynb
https://github.com/tanghaibao/goatools/blob/master/notebooks/report_depth_level.ipynb

Find all human protein-coding genes associated with cell cycle
::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
https://github.com/tanghaibao/goatools/blob/master/notebook/cell_cycle.ipynb
https://github.com/tanghaibao/goatools/blob/master/notebooks/cell_cycle.ipynb

Calculate annotation coverage of GO terms on various species
::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
https://github.com/tanghaibao/goatools/blob/master/notebook/annotation_coverage.ipynb
https://github.com/tanghaibao/goatools/blob/master/notebooks/annotation_coverage.ipynb

Determine the semantic similarities between GO terms
::::::::::::::::::::::::::::::::::::::::::::::::::::
https://github.com/tanghaibao/goatools/blob/master/notebooks/semantic_similarity.ipynb


Reference
Expand Down
7 changes: 3 additions & 4 deletions goatools/associations.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,7 @@ def read_ncbi_gene2go(fin_gene2go, taxids=None, **kws):
"""Read NCBI's gene2go. Return gene2go data for user-specified taxids."""
# Written by DV Klopfenstein
# kws: taxid2asscs evidence_set
# Simple associations
# Simple associations
id2gos = defaultdict(set)
# Optional detailed associations split by taxid and having both ID2GOs & GO2IDs
# e.g., taxid2asscs = defaultdict(lambda: defaultdict(lambda: defaultdict(set))
Expand Down Expand Up @@ -117,12 +117,11 @@ def read_gaf(fin_gaf, **kws):
# Written by DV Klopfenstein
# kws: taxid2asscs evidence_set
from goatools.gaf_reader import GafReader
# Simple associations
# Simple associations
id2gos = defaultdict(set)
# Optional detailed associations split by taxid and having both ID2GOs & GO2IDs
taxid2asscs = kws['taxid2asscs'] if 'taxid2asscs' in kws else None
gafobj = GafReader()
gafnts = gafobj.read_gaf(fin_gaf)
gafnts = GafReader(fin_gaf).associations
# Optionally specify a subset of GOs based on their evidence.
evs = kws['evidence_set'] if 'evidence_set' in kws else None
for nt in gafnts:
Expand Down
105 changes: 105 additions & 0 deletions goatools/base.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
# Stolen from brentp:
# <https://github.com/brentp/toolshed/blob/master/toolshed/files.py>

import bz2
import gzip
import sys
import urllib


# Python 2/3 compatibility shim: expose one set of names (int_types,
# basestring, urlopen) that the rest of this module can use on either version.
if sys.version_info[0] >= 3:
    from urllib.request import urlopen
    basestring = str
    int_types = (int,)
else:
    urlopen = urllib.urlopen
    int_types = (int, long)


def _restore_sigpipe():
    """Reset SIGPIPE to its default action; run in the child before exec (POSIX)."""
    import signal
    signal.signal(signal.SIGPIPE, signal.SIG_DFL)


def _process_iter(proc, cmd=""):
    """Yield each line of ``proc.stdout``, then raise if ``cmd`` exited non-zero."""
    from subprocess import CalledProcessError
    for line in proc.stdout:
        yield line
    returncode = proc.wait()
    if returncode != 0:
        raise CalledProcessError(returncode, cmd)


def nopen(f, mode="r"):
    r"""
    Open a plain, gzipped or bzipped file, a URL, a shell pipe, or stdin/stdout.

    Parameters:
        f: what to open. One of
           - an integer ``i``: the result of ``nopen(sys.argv[i])`` is returned;
           - a non-string object (e.g. an already open file): returned as-is;
           - ``"|cmd"``: ``cmd`` is run through the shell;
           - a ``http://``, ``https://`` or ``ftp://`` URL;
           - a path (``~`` and ``$VARS`` are expanded); ``.gz``/``.Z``/``.z``
             and ``.bz``/``.bz2``/``.bzip2`` suffixes select decompression;
           - ``"-"``: ``sys.stdin`` (mode ``r``) or ``sys.stdout`` (mode ``w``).
        mode: file mode, ``"r"`` by default.

    Returns:
        A file-like object or line iterator. For ``"|cmd"`` in a read mode an
        iterator over the command's output lines is returned; in any other
        mode the ``Popen`` object itself is returned.

    Raises:
        subprocess.CalledProcessError: once the output of ``"|cmd"`` has been
            fully consumed, if the command exited with a non-zero status.

    >>> nopen('-') == sys.stdin, nopen('-', 'w') == sys.stdout
    (True, True)
    >>> nopen(sys.argv[0])
    <...file...>

    # expands user and vars ($HOME)
    >>> nopen("~/.bashrc").name == nopen("$HOME/.bashrc").name
    True

    # an already open file.
    >>> nopen(open(sys.argv[0]))
    <...file...>
    >>> nopen(0)
    <...file...>

    Or provide nicer access to Popen.stdout
    >>> files = list(nopen("|ls"))
    >>> assert 'setup.py\n' in files or b'setup.py\n' in files, files
    """
    # Imported here because the module header does not import them.
    import io
    import os

    if isinstance(f, int_types):
        return nopen(sys.argv[f], mode)

    if not isinstance(f, basestring):
        return f

    if f.startswith("|"):
        from subprocess import PIPE, Popen
        # NOTE(security): the command string is handed to the shell verbatim;
        # never pass untrusted input as "|...".
        # using shell explicitly makes things like process substitution work:
        # https://stackoverflow.com/questions/7407667/python-subprocess-subshells-and-redirection
        # use sys.stderr so we dont have to worry about checking it...
        p = Popen(f[1:], stdout=PIPE, stdin=PIPE,
                  stderr=sys.stderr if mode == "r" else PIPE,
                  shell=True, bufsize=-1,  # use system default for buffering
                  preexec_fn=_restore_sigpipe,
                  close_fds=False, executable=os.environ.get('SHELL'))
        if sys.version_info[0] > 2:
            # Python 3 pipes are binary; wrap them so callers get text.
            p.stdout = io.TextIOWrapper(p.stdout)
            p.stdin = io.TextIOWrapper(p.stdin)
            if mode != "r":
                p.stderr = io.TextIOWrapper(p.stderr)

        if mode and mode[0] == "r":
            return _process_iter(p, f[1:])
        return p

    if f.startswith(("http://", "https://", "ftp://")):
        fh = urlopen(f)
        if f.endswith(".gz"):
            return ungzipper(fh)
        if sys.version_info[0] < 3:
            return fh
        return io.TextIOWrapper(fh)

    f = os.path.expanduser(os.path.expandvars(f))
    if f.endswith((".gz", ".Z", ".z")):
        fh = gzip.open(f, mode)
    elif f.endswith((".bz", ".bz2", ".bzip2")):
        fh = bz2.BZ2File(f, mode)
    elif f == "-":
        return {"r": sys.stdin, "w": sys.stdout}[mode[0]]
    else:
        return open(f, mode)

    # Compressed handles are binary on Python 3; wrap them to yield text.
    if sys.version_info[0] < 3:
        return fh
    return io.TextIOWrapper(fh)


def ungzipper(fh, blocksize=16384, encoding="utf-8"):
    """
    Work-around to get streaming download of http://.../some.gz

    Reads gzip-compressed data from ``fh`` in chunks of ``blocksize`` bytes
    and yields the decompressed content one line at a time, without the
    trailing newline.

    Parameters:
        fh: binary file-like object whose ``read(n)`` returns gzip data.
        blocksize: number of compressed bytes to read per chunk.
        encoding: used to decode each line to text when the decompressed
            data is not already ``str`` (i.e. on Python 3).

    Yields:
        Each line as ``str``. A final line that lacks a trailing newline is
        yielded as well.
    """
    import zlib
    # 16 + MAX_WBITS tells zlib to expect a gzip header and trailer.
    uzip = zlib.decompressobj(16 + zlib.MAX_WBITS)

    def _text(raw):
        # On Python 2 bytes is str, so the line is passed through untouched.
        return raw if isinstance(raw, str) else raw.decode(encoding)

    pending = b""
    while True:
        chunk = fh.read(blocksize)
        if not chunk:
            # Stop on end of input, not on an empty decompressed chunk: a
            # small read may contain only the gzip header and produce nothing.
            break
        lines = (pending + uzip.decompress(chunk)).split(b"\n")
        # last piece might not be a full line; carry it into the next chunk.
        pending = lines.pop()
        for line in lines:
            yield _text(line)

    pending += uzip.flush()
    if pending:
        lines = pending.split(b"\n")
        if not lines[-1]:
            lines.pop()
        for line in lines:
            yield _text(line)
35 changes: 19 additions & 16 deletions goatools/gaf_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
import sys
import re
from collections import namedtuple
from base import nopen

__copyright__ = "Copyright (C) 2016, DV Klopfenstein, H Tang. All rights reserved."
__author__ = "DV Klopfenstein"
Expand Down Expand Up @@ -55,8 +56,10 @@ class GafReader(object):
# Expected values for a Qualifier
exp_qualifiers = set(['NOT', 'contributes_to', 'colocalizes_with'])

def __init__(self, log=sys.stdout):
def __init__(self, filename, log=sys.stdout):
self.filename = filename
self.log = log
self.associations = self.read_gaf(filename)

def _get_ntgaf(self, ntgafobj, flds, ver):
"""Convert fields from string to preferred format for GAF ver 2.1 and 2.0."""
Expand Down Expand Up @@ -114,22 +117,22 @@ def _rd_fld_vals(name, val, set_list_ft=True, qty_min=0, qty_max=None):
return vals if set_list_ft else set(vals)

def read_gaf(self, fin_gaf):
"""Read GAF file."""
"""Read GAF file. HTTP address okay. GZIPPED/BZIPPED file okay."""
ga_lst = []
with open(fin_gaf) as ifstrm:
ver = None
ntgafobj = None
exp_numcol = None
for line in ifstrm:
if ntgafobj is not None and not line.startswith('!'):
flds = self._split_line(line, exp_numcol)
ntgaf = self._get_ntgaf(ntgafobj, flds, ver)
ga_lst.append(ntgaf)
elif ntgafobj is None and line.startswith('!gaf-version:'):
ver = line[13:].strip()
ntgafobj = namedtuple("ntgafobj", " ".join(self.gaf_columns[ver]))
exp_numcol = self.gaf_numcol[ver]
self.log.write(" READ {N} items: {FIN}\n".format(N=len(ga_lst), FIN=fin_gaf))
ifstrm = nopen(fin_gaf)
ver = None
ntgafobj = None
exp_numcol = None
for line in ifstrm:
if ntgafobj is not None and not line.startswith('!'):
flds = self._split_line(line, exp_numcol)
ntgaf = self._get_ntgaf(ntgafobj, flds, ver)
ga_lst.append(ntgaf)
elif ntgafobj is None and line.startswith('!gaf-version:'):
ver = line[13:].strip()
ntgafobj = namedtuple("ntgafobj", " ".join(self.gaf_columns[ver]))
exp_numcol = self.gaf_numcol[ver]
self.log.write(" READ {N} items: {FIN}\n".format(N=len(ga_lst), FIN=fin_gaf))
return ga_lst

@staticmethod
Expand Down
2 changes: 0 additions & 2 deletions goatools/obo_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,6 @@
import os
import re

import collections as cx

GraphEngines = ("pygraphviz", "pydot")

__copyright__ = "Copyright (C) 2010-2016, H Tang et al., All rights reserved."
Expand Down

0 comments on commit 21ef80e

Please sign in to comment.