Merge pull request #221 from daler/v0.12rc

v0.12
daler · Jul 4, 2023 · 955fb41 · 955fb41
2 parents 1e2571f + ca68ef2
commit 955fb41
Show file tree

Hide file tree

Showing 9 changed files with 274 additions and 86 deletions.
diff --git a/README.rst b/README.rst
@@ -1,19 +1,12 @@
-
-.. image:: https://travis-ci.org/daler/gffutils.png?branch=master
- :target: https://travis-ci.org/daler/gffutils
-
-.. image:: https://badge.fury.io/py/gffutils.svg
- :target: http:https://badge.fury.io/py/gffutils
-
-.. image:: https://pypip.in/d/gffutils/badge.png
- :target: https://pypi.python.org/pypi/gffutils
-
-
+gffutils
+========
 
 ``gffutils`` is a Python package for working with and manipulating the GFF and
-GTF format files typically used for genomic annotations. Files are loaded into
-a sqlite3 database, allowing much more complex manipulation of hierarchical
-features (e.g., genes, transcripts, and exons) than is possible with plain-text
-methods alone.
+GTF format files typically used for genomic annotations.
+
+Files are loaded into a sqlite3 database, allowing much more complex
+manipulation of hierarchical features (e.g., genes, transcripts, and exons)
+than is possible with plain-text methods alone.
 
-See documentation at **http:https://daler.github.io/gffutils**.
+See documentation at https://daler.github.io/gffutils, and GitHub repo at
+https://github.com/daler/gffutils.
diff --git a/doc/source/changelog.rst b/doc/source/changelog.rst
@@ -3,8 +3,22 @@
 Change log
 ==========
 
-Changes in v0.11.1
-------------------
+v0.12
+-----
+
+- Fix `#216 <https://github.com/daler/gffutils/issues/216>`_ (remove deprecated OptimizedUnicode text factory)
+- When interfeatures (like when creating introns) results in features with
+ multiple IDs, concatenate them (`#219
+ <https://github.com/daler/gffutils/pull/219>`_, thanks @Juke34)
+- Handle corner cases observed in GRCh38 annotations where a quoted comma in
+ the attributes causes the dialect inference to incorrectly conclude that
+ repeated keys are not present. See `PR #208 <https://github.com/daler/gffutils/pull/208>`_ for details.
+- Refactor tests to use pytest instead of the deprecated nosetests (`PR #201
+ <https://github.com/daler/gffutils/pull/201>`_, thanks @mr-c)
+- New method, `FeatureDB.create_splice_sites` (`PR #220 <https://github.com/daler/gffutils/pull/220>`_, thanks @Juke34)
+
+v0.11.1
+-------
 
 Bugfix: This fixes `#197 <https://github.com/daler/gffutils/issues/197>`_,
 where the :meth:`FeatureDB.interfeatures` function was not behaving correctly
@@ -18,8 +32,8 @@ This also makes a minor maintenance change, replacing
 been an alias to the latter, but this alias will be removed in Python 3.12.
 Making the change now avoids a deprecation warning.
 
-Changes in v0.11
-----------------
+v0.11
+-----
 
 This is largely a bugfix release, many thanks to contributors Rory Kirchner,
 Stefano Rivera, Daniel Lowengrub, Nolan Woods, Stefen Moeller, and Husen Umer.
@@ -69,14 +83,14 @@ Stefano Rivera, Daniel Lowengrub, Nolan Woods, Stefen Moeller, and Husen Umer.
  issue with a custom id spec. This addresses `#181
  <https://github.com/daler/gffutils/issues/181>`_.
 
-Changes in v0.10.1
-------------------
+v0.10.1
+-------
 
 - Fix issue with new merge routine (`#152
  <https://github.com/daler/gffutils/issues/152>`_)
 
-Changes in v0.10
-----------------
+v0.10
+-----
 
 - Support very large chromosomes (fixed issues `#94
  <https://github.com/daler/gffutils/issues/94>`_ and `#112
@@ -123,8 +137,8 @@ Changes in v0.10
  <https://github.com/daler/gffutils/pull/149>`_)
 
 
-Changes in v0.9
----------------
+v0.9
+----
 Long-overdue release with performance improvements and better handling of
 corner-case GFF and GTF files.
 
@@ -155,26 +169,26 @@ corner-case GFF and GTF files.
  encoding/decoding behavior.
 - improved testing framework
 
-Changes in v0.8.7.1
--------------------
+v0.8.7.1
+--------
 Fixes bug in `gffutils.pybedtools_integration.tsses` where iterating over large
 databases and using the `as_bed6=True` argument could cause a deadlock.
 
-Changes in v0.8.7
------------------
+v0.8.7
+------
 New module, :mod:`gffutils.pybedtools_integration`. In particular, the
 :func:`gffutils.pybedtools_integration.tsses` function provides many options
 for creating a GTF, GFF, or BED file of transcription start sites (TSSes) from
 an annotation.
 
-Changes in v0.8.6.1
--------------------
+v0.8.6.1
+--------
 Only a warning -- and not an ImportError -- is raised if BioPython is not installed.
 
 Lots of updates in the testing framework to use docker containers on travis-ci.org.
 
-Changes in v0.8.4
------------------
+v0.8.4
+------
 This version addresses issues `#48
 <https://github.com/daler/gffutils/issues/48>`_ and `#20
 <https://github.com/daler/gffutils/issues/20>`_. It only affects database
@@ -213,15 +227,15 @@ transcripts if genes exist, or infer genes if transcripts exist (rather than
 the previous all-or-nothing approach).
 
 
-Changes in v0.8.3.1
--------------------
+v0.8.3.1
+--------
 Thanks to Sven-Eric Schelhorn (@schellhorn on github), this version fixes a bug
 where, if multiple gffutils processes try to create databases from GTF files
 simultaneously, the resulting databases would be incomplete and incorrect.
 
 
-Changes in v0.8.3
------------------
+v0.8.3
+------
 New features
 ~~~~~~~~~~~~
 - New :func:`inspect.inspect` function for examining the contents of

diff --git a/gffutils/interface.py b/gffutils/interface.py
@@ -102,7 +102,7 @@ def __init__(
  keep_order=False,
  pragmas=constants.default_pragmas,
  sort_attribute_values=False,
- text_factory=sqlite3.OptimizedUnicode,
+ text_factory=str
  ):
  """
  Connect to a database created by :func:`gffutils.create_db`.
@@ -117,8 +117,7 @@ def __init__(
  text_factory : callable
 
  Optionally set the way sqlite3 handles strings. Default is
- sqlite3.OptimizedUnicode, which returns ascii when possible,
- unicode otherwise
+ str
 
  default_encoding : str
 
@@ -895,7 +894,14 @@ def _prep_for_yield(d):
  if d['start'] > d['end']:
  return None
 
- return self._feature_returner(**d)
+ new_feature = self._feature_returner(**d)
+
+ # Concatenate multiple IDs into a single unique ID, because features
+ # with multiple values for their ID are no longer permitted since v0.11.
+ if "ID" in new_feature.attributes and len(new_feature.attributes["ID"]) > 1:
+ new_id = '-'.join(new_feature.attributes["ID"])
+ new_feature.attributes["ID"] = [new_id]
+ return new_feature
 
  # If not provided, use a no-op function instead.
  if not attribute_func:
@@ -1267,6 +1273,128 @@ def child_gen():
  ):
  yield intron
 
+ def create_splice_sites(
+ self,
+ exon_featuretype="exon",
+ grandparent_featuretype="gene",
+ parent_featuretype=None,
+ merge_attributes=True,
+ numeric_sort=False,
+ ):
+ """
+ Create splice sites from existing annotations.
+
+
+ Parameters
+ ----------
+ exon_featuretype : string
+ Feature type to use in order to infer splice sites. Typically `"exon"`.
+
+ grandparent_featuretype : string
+ If `grandparent_featuretype` is not None, then group exons by
+ children of this featuretype. If `grandparent_featuretype` is
+ "gene" (default), then splice sites will be created for all first-level
+ children of genes. This may include mRNA, rRNA, ncRNA, etc. If
+ you only want to infer splice sites from one of these featuretypes
+ (e.g., mRNA), then use the `parent_featuretype` kwarg which is
+ mutually exclusive with `grandparent_featuretype`.
+
+ parent_featuretype : string
+ If `parent_featuretype` is not None, then only use this featuretype
+ to infer splice sites. Use this if you only want a subset of
+ featuretypes to have splice sites (e.g., "mRNA" only, and not ncRNA or
+ rRNA). Mutually exclusive with `grandparent_featuretype`.
+
+ merge_attributes : bool
+ Whether or not to merge attributes from all exons. If False then no
+ attributes will be created for the splice sites.
+
+ numeric_sort : bool
+ If True, then merged attributes that can be cast to float will be
+ sorted by their numeric values (but will still be returned as
+ string). This is useful, for example, when creating splice sites between
+ exons and the exons have exon_number attributes as an integer.
+ Using numeric_sort=True will ensure that the returned splice sites have
+ merged exon_number attribute of ['9', '10'] (numerically sorted)
+ rather than ['10', '9'] (alphabetically sorted).
+
+ Returns
+ -------
+ A generator object that yields :class:`Feature` objects representing
+ new splice sites
+
+ Notes
+ -----
+ The returned generator can be passed directly to the
+ :meth:`FeatureDB.update` method to permanently add them to the
+ database, e.g., ::
+
+ db.update(db.create_splice_sites())
+
+ """
+ if (grandparent_featuretype and parent_featuretype) or (
+ grandparent_featuretype is None and parent_featuretype is None
+ ):
+ raise ValueError(
+ "exactly one of `grandparent_featuretype` or "
+ "`parent_featuretype` should be provided"
+ )
+
+ if grandparent_featuretype:
+
+ def child_gen():
+ for gene in self.features_of_type(grandparent_featuretype):
+ for child in self.children(gene, level=1):
+ yield child
+
+ elif parent_featuretype:
+
+ def child_gen():
+ for child in self.features_of_type(parent_featuretype):
+ yield child
+
+ # Two splice features need to be created for each interleave
+ for side in ["left", "right"]:
+ for child in child_gen():
+ exons = self.children(
+ child, level=1, featuretype=exon_featuretype, order_by="start"
+ )
+
+ # get strand
+ strand = child.strand
+
+ new_featuretype = "splice_site"
+ if side == "left":
+ if strand == "+":
+ new_featuretype = "five_prime_cis_splice_site"
+ elif strand == "-":
+ new_featuretype = "three_prime_cis_splice_site"
+
+ if side == "right":
+ if strand == "+":
+ new_featuretype = "three_prime_cis_splice_site"
+ elif strand == "-":
+ new_featuretype = "five_prime_cis_splice_site"
+
+ for splice_site in self.interfeatures(
+ exons,
+ new_featuretype=new_featuretype,
+ merge_attributes=merge_attributes,
+ numeric_sort=numeric_sort,
+ dialect=self.dialect,
+ ):
+
+ if side == "left":
+ splice_site.end = splice_site.start + 1
+ if side == "right":
+ splice_site.start = splice_site.end - 1
+
+ # make ID unique by prepending the new featuretype as a prefix
+ splice_site.attributes["ID"] = [new_featuretype + "_" + splice_site.attributes["ID"][0]]
+
+ yield splice_site
+
+
  def _old_merge(self, features, ignore_strand=False):
  """
  DEPRECATED, only retained here for backwards compatibility. Please use

diff --git a/gffutils/parser.py b/gffutils/parser.py
@@ -346,7 +346,11 @@ def _unquote_quals(quals, dialect):
  # strings
  # quals[key].extend([v for v in val.split(',') if v])
 
- # See issue #198, where 
+ # See issue #198, where commas within a description can incorrectly
+ # cause the dialect inference to conclude that there are not
+ # repeated keys.
+ #
+ # More description in PR #208.
  if dialect["repeated keys"]:
  quals[key].append(val)
  else:

diff --git a/gffutils/test/data/issue181.gff b/gffutils/test/data/issue181.gff