Skip to content

Commit

Permalink
Merge pull request #231 from daler/v0.13rc
Browse files Browse the repository at this point in the history
V0.13rc
  • Loading branch information
daler committed Apr 13, 2024
2 parents 32e48a1 + cce76a6 commit 34c9c6a
Show file tree
Hide file tree
Showing 21 changed files with 310 additions and 215 deletions.
6 changes: 3 additions & 3 deletions .github/workflows/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ jobs:
build-and-test:
strategy:
matrix:
python-version: ["3.7", "3.8", "3.9", "3.10"]
python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"]
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3
Expand All @@ -30,7 +30,7 @@ jobs:
conda config --system --set channel_priority strict
mamba create -y -n gffutils-env \
python=${{ matrix.python-version }} \
--file requirements.txt
bedtools
conda activate gffutils-env
python setup.py clean sdist
Expand All @@ -43,9 +43,9 @@ jobs:
run: |
source "${HOME}/conda/etc/profile.d/conda.sh"
source "${HOME}/conda/etc/profile.d/mamba.sh"
mamba install -y -n gffutils-env --file optional-requirements.txt pytest hypothesis
conda activate gffutils-env
pip install pytest hypothesis biopython pybedtools
pytest -v --doctest-modules gffutils
conda deactivate
Expand Down
11 changes: 11 additions & 0 deletions doc/source/changelog.rst
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,17 @@
Change log
==========

v0.13
-----

- Document options for avoiding deadlocks when simultaneously reading/writing
to a db on disk (fixes `#227
<https://github.com/daler/gffutils/issues/227>`__).
- Support later versions of BioPython (fixes `#228
<https://github.com/daler/gffutils/issues/228>`__).
- Drop support for Python 3.7 and remove the unused ``six`` dependency; add
  support for Python 3.11 and 3.12 (fixes `#223 <https://github.com/daler/gffutils/issues/223>`__).

v0.12
-----

Expand Down
3 changes: 1 addition & 2 deletions gffutils/attributes.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
import six
import collections

try:
Expand Down Expand Up @@ -95,7 +94,7 @@ def __str__(self):
return "\n".join(s)

def update(self, *args, **kwargs):
    """
    Update attributes from a dict (or any ``dict()``-style initializer
    arguments), assigning each pair through ``self[k] = v`` so that any
    normalization done by ``__setitem__`` is applied to incoming values.
    """
    # dict(*args, **kwargs) accepts a mapping, an iterable of pairs, or
    # keyword arguments -- same call signature as dict.update().
    for k, v in dict(*args, **kwargs).items():
        self[k] = v


Expand Down
17 changes: 9 additions & 8 deletions gffutils/biopython_integration.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@
Module for integration with BioPython, specifically SeqRecords and SeqFeature
objects.
"""
import six

try:
from Bio.SeqFeature import SeqFeature, FeatureLocation
Expand All @@ -15,7 +14,8 @@
# Map GFF strand characters to BioPython's integer strand convention:
#   "+" -> 1 (forward), "-" -> -1 (reverse),
#   "." -> None (feature is unstranded), "?" -> 0 (strand unknown).
# NOTE: the duplicate '"." : 0' entry left over from the old mapping was
# removed; a dict literal with duplicate keys silently keeps only the last.
_biopython_strand = {
    "+": 1,
    "-": -1,
    ".": None,
    "?": 0,
}
# Inverse mapping, used when converting BioPython strands back to GFF text.
_feature_strand = dict((v, k) for k, v in _biopython_strand.items())

Expand All @@ -33,7 +33,7 @@ def to_seqfeature(feature):
If string, assume it is a GFF or GTF-format line; otherwise just use
the provided feature directly.
"""
if isinstance(feature, six.string_types):
if isinstance(feature, str):
feature = feature_from_line(feature)

qualifiers = {
Expand All @@ -46,10 +46,11 @@ def to_seqfeature(feature):
return SeqFeature(
# Convert from GFF 1-based to standard Python 0-based indexing used by
# BioPython
FeatureLocation(feature.start - 1, feature.stop),
FeatureLocation(
feature.start - 1, feature.stop, strand=_biopython_strand[feature.strand]
),
id=feature.id,
type=feature.featuretype,
strand=_biopython_strand[feature.strand],
qualifiers=qualifiers,
)

Expand All @@ -66,12 +67,12 @@ def from_seqfeature(s, **kwargs):
score = s.qualifiers.get("score", ".")[0]
seqid = s.qualifiers.get("seqid", ".")[0]
frame = s.qualifiers.get("frame", ".")[0]
strand = _feature_strand[s.strand]
strand = _feature_strand[s.location.strand]

# BioPython parses 1-based GenBank positions into 0-based for use within
# Python. We need to convert back to 1-based GFF format here.
start = s.location.start.position + 1
stop = s.location.end.position
start = s.location.start + 1
stop = s.location.end
featuretype = s.type
id = s.id
attributes = dict(s.qualifiers)
Expand Down
4 changes: 1 addition & 3 deletions gffutils/convert.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,6 @@
Conversion functions that operate on :class:`FeatureDB` classes.
"""

import six


def to_bed12(f, db, child_type="exon", name_field="ID"):
"""
Expand All @@ -22,7 +20,7 @@ def to_bed12(f, db, child_type="exon", name_field="ID"):
Attribute to be used in the "name" field of the BED12 entry. Usually
"ID" for GFF; "transcript_id" for GTF.
"""
if isinstance(f, six.string_types):
if isinstance(f, str):
f = db[f]
children = list(db.children(f, featuretype=child_type, order_by="start"))
sizes = [len(i) for i in children]
Expand Down
14 changes: 7 additions & 7 deletions gffutils/create.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@
import sys
import os
import sqlite3
import six
from textwrap import dedent
from gffutils import constants
from gffutils import version
Expand Down Expand Up @@ -119,7 +118,7 @@ def __init__(
os.unlink(dbfn)
self.dbfn = dbfn
self.id_spec = id_spec
if isinstance(dbfn, six.string_types):
if isinstance(dbfn, str):
conn = sqlite3.connect(dbfn)
else:
conn = dbfn
Expand Down Expand Up @@ -171,7 +170,7 @@ def _id_handler(self, f):
"""

# If id_spec is a string or callable, convert to iterable for later
if isinstance(self.id_spec, six.string_types):
if isinstance(self.id_spec, str):
id_key = [self.id_spec]
elif hasattr(self.id_spec, "__call__"):
id_key = [self.id_spec]
Expand All @@ -181,7 +180,7 @@ def _id_handler(self, f):
elif isinstance(self.id_spec, dict):
try:
id_key = self.id_spec[f.featuretype]
if isinstance(id_key, six.string_types):
if isinstance(id_key, str):
id_key = [id_key]

# Otherwise, use default auto-increment.
Expand Down Expand Up @@ -217,7 +216,8 @@ def _id_handler(self, f):
"a single value is required for a primary key in the "
"database. Consider using a custom id_spec to "
"convert these multiple values into a single "
"value".format(k))
"value".format(k)
)
except KeyError:
pass
try:
Expand Down Expand Up @@ -684,7 +684,7 @@ def _update_relations(self):
# c.execute('CREATE INDEX childindex ON relations (child)')
# self.conn.commit()

if isinstance(self._keep_tempfiles, six.string_types):
if isinstance(self._keep_tempfiles, str):
suffix = self._keep_tempfiles
else:
suffix = ".gffutils"
Expand Down Expand Up @@ -883,7 +883,7 @@ def _update_relations(self):
msg = "transcript"
logger.info("Inferring %s extents " "and writing to tempfile" % msg)

if isinstance(self._keep_tempfiles, six.string_types):
if isinstance(self._keep_tempfiles, str):
suffix = self._keep_tempfiles
else:
suffix = ".gffutils"
Expand Down
12 changes: 4 additions & 8 deletions gffutils/feature.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
from pyfaidx import Fasta
import six
import simplejson as json
from gffutils import constants
from gffutils import helpers
Expand Down Expand Up @@ -166,7 +165,7 @@ def __init__(
# for testing.
attributes = attributes or dict_class()

if isinstance(attributes, six.string_types):
if isinstance(attributes, str):
try:
attributes = helpers._unjsonify(attributes, isattributes=True)

Expand All @@ -182,7 +181,7 @@ def __init__(
# If string, then try un-JSONifying it into a list; if that doesn't
# work then assume it's tab-delimited and convert to a list.
extra = extra or []
if isinstance(extra, six.string_types):
if isinstance(extra, str):
try:
extra = helpers._unjsonify(extra)
except json.JSONDecodeError:
Expand Down Expand Up @@ -254,10 +253,7 @@ def __setitem__(self, key, value):
self.attributes[key] = value

def __str__(self):
    """
    Return the GFF/GTF line for this feature.

    The Python 2 branch (``six.PY3`` check and ``unicode(...).encode``)
    was dead/undefined after dropping six, so __str__ now simply
    delegates to __unicode__, which builds the tab-delimited line.
    """
    return self.__unicode__()

def __unicode__(self):

Expand Down Expand Up @@ -387,7 +383,7 @@ def sequence(self, fasta, use_strand=True):
-------
string
"""
if isinstance(fasta, six.string_types):
if isinstance(fasta, str):
fasta = Fasta(fasta, as_raw=False)

# recall GTF/GFF is 1-based closed; pyfaidx uses Python slice notation
Expand Down
3 changes: 1 addition & 2 deletions gffutils/gffwriter.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
##
## GFF Writer (writer): serializing gffutils records as GFF text files.
##
import six
import tempfile
import shutil
from time import strftime, localtime
Expand Down Expand Up @@ -41,7 +40,7 @@ def __init__(self, out, with_header=True, in_place=False):
self.temp_file = None
# Output stream to write to
self.out_stream = None
if isinstance(out, six.string_types):
if isinstance(out, str):
if self.in_place:
# Use temporary file
self.temp_file = tempfile.NamedTemporaryFile(delete=False)
Expand Down
38 changes: 27 additions & 11 deletions gffutils/helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@
import simplejson as json
import time
import tempfile
import six
from gffutils import constants
from gffutils import bins
import gffutils
Expand Down Expand Up @@ -202,7 +201,7 @@ def make_query(
# e.g., "featuretype = 'exon'"
#
# or, "featuretype IN ('exon', 'CDS')"
if isinstance(featuretype, six.string_types):
if isinstance(featuretype, str):
d["FEATURETYPE"] = "features.featuretype = ?"
args.append(featuretype)
else:
Expand All @@ -218,7 +217,7 @@ def make_query(
# `limit` is a string or a tuple of (chrom, start, stop)
#
# e.g., "seqid = 'chr2L' AND start > 1000 AND end < 5000"
if isinstance(limit, six.string_types):
if isinstance(limit, str):
seqid, startstop = limit.split(":")
start, end = startstop.split("-")
else:
Expand Down Expand Up @@ -257,7 +256,7 @@ def make_query(
# Default is essentially random order.
#
# e.g. "ORDER BY seqid, start DESC"
if isinstance(order_by, six.string_types):
if isinstance(order_by, str):
_order_by.append(order_by)

else:
Expand Down Expand Up @@ -387,7 +386,7 @@ def merge_attributes(attr1, attr2, numeric_sort=False):
if not isinstance(v, list):
new_d[k] = [v]

for k, v in six.iteritems(attr1):
for k, v in attr1.items():
if k in attr2:
if not isinstance(v, list):
v = [v]
Expand Down Expand Up @@ -507,9 +506,9 @@ def is_gff_db(db_fname):


def to_unicode(obj, encoding="utf-8"):
    """
    Return *obj* as a ``str``.

    ``bytes`` input is decoded with *encoding*; any other object
    (including ``str``) is returned unchanged.

    The mechanical six-removal left this as a no-op: the inner
    ``if not isinstance(obj, str)`` check could never be true inside an
    ``if isinstance(obj, str)`` branch, so bytes were silently passed
    through undecoded. Decoding bytes restores the original intent of
    the Python 2 ``six.text_type(obj, encoding)`` conversion.
    """
    if isinstance(obj, bytes):
        return obj.decode(encoding)
    return obj


Expand All @@ -520,7 +519,6 @@ def canonical_transcripts(db, fasta_filename):
"""
import pyfaidx


fasta = pyfaidx.Fasta(fasta_filename, as_raw=False)
for gene in db.features_of_type("gene"):

Expand All @@ -536,7 +534,20 @@ def canonical_transcripts(db, fasta_filename):
cds_len += exon_length
total_len += exon_length

exon_list.append((cds_len, total_len, transcript, exons if cds_len == 0 else [e for e in exons if e.featuretype in ['CDS', 'five_prime_UTR', 'three_prime_UTR']]))
exon_list.append(
(
cds_len,
total_len,
transcript,
exons
if cds_len == 0
else [
e
for e in exons
if e.featuretype in ["CDS", "five_prime_UTR", "three_prime_UTR"]
],
)
)

# If we have CDS, then use the longest coding transcript
if max(i[0] for i in exon_list) > 0:
Expand All @@ -549,7 +560,12 @@ def canonical_transcripts(db, fasta_filename):

canonical_exons = best[-1]
transcript = best[-2]
seqs = [i.sequence(fasta) for i in sorted(canonical_exons, key=lambda x: x.start, reverse=transcript.strand != '+')]
seqs = [
i.sequence(fasta)
for i in sorted(
canonical_exons, key=lambda x: x.start, reverse=transcript.strand != "+"
)
]
yield transcript, "".join(seqs)


Expand Down
Loading

0 comments on commit 34c9c6a

Please sign in to comment.