Skip to content

Commit

Permalink
Merge pull request #220 from Juke34/create_splice_sites
Browse files Browse the repository at this point in the history
add a function to create splice sites similar to create_introns
  • Loading branch information
daler committed Jul 4, 2023
2 parents 185c676 + 3826571 commit e480e11
Show file tree
Hide file tree
Showing 2 changed files with 165 additions and 8 deletions.
122 changes: 122 additions & 0 deletions gffutils/interface.py
Original file line number Diff line number Diff line change
Expand Up @@ -1273,6 +1273,128 @@ def child_gen():
):
yield intron

def create_splice_sites(
self,
exon_featuretype="exon",
grandparent_featuretype="gene",
parent_featuretype=None,
merge_attributes=True,
numeric_sort=False,
):
"""
Create splice sites from existing annotations.
Parameters
----------
exon_featuretype : string
Feature type to use in order to infer splice sites. Typically `"exon"`.
grandparent_featuretype : string
If `grandparent_featuretype` is not None, then group exons by
children of this featuretype. If `granparent_featuretype` is
"gene" (default), then splice sites will be created for all first-level
children of genes. This may include mRNA, rRNA, ncRNA, etc. If
you only want to infer splice sites from one of these featuretypes
(e.g., mRNA), then use the `parent_featuretype` kwarg which is
mutually exclusive with `grandparent_featuretype`.
parent_featuretype : string
If `parent_featuretype` is not None, then only use this featuretype
to infer splice sites. Use this if you only want a subset of
featuretypes to have splice sites (e.g., "mRNA" only, and not ncRNA or
rRNA). Mutually exclusive with `grandparent_featuretype`.
merge_attributes : bool
Whether or not to merge attributes from all exons. If False then no
attributes will be created for the splice sites.
numeric_sort : bool
If True, then merged attributes that can be cast to float will be
sorted by their numeric values (but will still be returned as
string). This is useful, for example, when creating splice sites between
exons and the exons have exon_number attributes as an integer.
Using numeric_sort=True will ensure that the returned exons have
merged exon_number attribute of ['9', '10'] (numerically sorted)
rather than ['10', '9'] (alphabetically sorted).
Returns
-------
A generator object that yields :class:`Feature` objects representing
new splice sites
Notes
-----
The returned generator can be passed directly to the
:meth:`FeatureDB.update` method to permanently add them to the
database, e.g., ::
db.update(db.create_splice sites())
"""
if (grandparent_featuretype and parent_featuretype) or (
grandparent_featuretype is None and parent_featuretype is None
):
raise ValueError(
"exactly one of `grandparent_featuretype` or "
"`parent_featuretype` should be provided"
)

if grandparent_featuretype:

def child_gen():
for gene in self.features_of_type(grandparent_featuretype):
for child in self.children(gene, level=1):
yield child

elif parent_featuretype:

def child_gen():
for child in self.features_of_type(parent_featuretype):
yield child

# Two splice features need to be created for each interleave
for side in ["left", "right"]:
for child in child_gen():
exons = self.children(
child, level=1, featuretype=exon_featuretype, order_by="start"
)

# get strand
strand = child.strand

new_featuretype = "splice_site"
if side == "left":
if strand == "+":
new_featuretype = "five_prime_cis_splice_site"
elif strand == "-":
new_featuretype = "three_prime_cis_splice_site"

if side == "right":
if strand == "+":
new_featuretype = "three_prime_cis_splice_site"
elif strand == "-":
new_featuretype = "five_prime_cis_splice_site"

for splice_site in self.interfeatures(
exons,
new_featuretype=new_featuretype,
merge_attributes=merge_attributes,
numeric_sort=numeric_sort,
dialect=self.dialect,
):

if side == "left":
splice_site.end = splice_site.start + 1
if side == "right":
splice_site.start = splice_site.end - 1

# make ID uniq by adding suffix
splice_site.attributes["ID"] = [new_featuretype + "_" + splice_site.attributes["ID"][0]]

yield splice_site


def _old_merge(self, features, ignore_strand=False):
"""
DEPRECATED, only retained here for backwards compatibility. Please use
Expand Down
51 changes: 43 additions & 8 deletions gffutils/test/test.py → gffutils/test/test_1.py
Original file line number Diff line number Diff line change
Expand Up @@ -934,17 +934,17 @@ def _transform(f):
[(i.start, i.stop) for i in db.features_of_type("exon")]
)

def clean_tempdir():
tempfile.tempdir = tempdir
if os.path.exists(tempdir):
shutil.rmtree(tempdir)
os.makedirs(tempdir)

def test_tempfiles():
# specify a writeable temp dir for testing
tempdir = "/tmp/gffutils-test"

# specifiy a writeable temp dir for testing
tempdir = "/tmp/gffutils-test"
def test_tempfiles():

def clean_tempdir():
tempfile.tempdir = tempdir
if os.path.exists(tempdir):
shutil.rmtree(tempdir)
os.makedirs(tempdir)

clean_tempdir()

Expand Down Expand Up @@ -992,6 +992,10 @@ def clean_tempdir():
assert len(filelist) == 1, filelist
assert filelist[0].endswith(".GFFtmp")

@pytest.mark.skip(reason="Unclear if still needed; currently failing")
def test_parallel_db():
# DISABLING in v0.12

# Test n parallel instances of gffutils across PROCESSES processes.
#
# Note that travis-ci doesn't like it when you use multiple cores, so the
Expand All @@ -1010,6 +1014,7 @@ def clean_tempdir():
res = pool.map(make_db, range(n))
finally:
pool.close()

assert sorted(list(res)) == list(range(n))
filelist = os.listdir(tempdir)
assert len(filelist) == n, len(filelist)
Expand Down Expand Up @@ -1232,6 +1237,36 @@ def test_db_unquoting():
assert db["f"]["Note"] == [","]


def test_create_splice_sites():
fn = gffutils.example_filename("gff_example1.gff3")
db = gffutils.create_db(fn, ":memory:")
db = db.update(db.create_splice_sites())
observed = "\n".join(str(feature) for feature in db.all_features())
expected = dedent("""\
chr1 ensGene gene 4763287 4775820 . - . Name=ENSMUSG00000033845;ID=ENSMUSG00000033845;Alias=ENSMUSG00000033845;gid=ENSMUSG00000033845
chr1 ensGene mRNA 4764517 4775779 . - . Name=ENSMUST00000045689;Parent=ENSMUSG00000033845;ID=ENSMUST00000045689;Alias=ENSMUSG00000033845;gid=ENSMUSG00000033845
chr1 ensGene CDS 4775654 4775758 . - 0 Name=ENSMUST00000045689.cds0;Parent=ENSMUST00000045689;ID=ENSMUST00000045689.cds0;gid=ENSMUSG00000033845
chr1 ensGene CDS 4772761 4772814 . - 0 Name=ENSMUST00000045689.cds1;Parent=ENSMUST00000045689;ID=ENSMUST00000045689.cds1;gid=ENSMUSG00000033845
chr1 ensGene exon 4775654 4775779 . - . Name=ENSMUST00000045689.exon0;Parent=ENSMUST00000045689;ID=ENSMUST00000045689.exon0;gid=ENSMUSG00000033845
chr1 ensGene exon 4772649 4772814 . - . Name=ENSMUST00000045689.exon1;Parent=ENSMUST00000045689;ID=ENSMUST00000045689.exon1;gid=ENSMUSG00000033845
chr1 ensGene exon 4767606 4767729 . - . Name=ENSMUST00000045689.exon2;Parent=ENSMUST00000045689;ID=ENSMUST00000045689.exon2;gid=ENSMUSG00000033845
chr1 ensGene exon 4764517 4764597 . - . Name=ENSMUST00000045689.exon3;Parent=ENSMUST00000045689;ID=ENSMUST00000045689.exon3;gid=ENSMUSG00000033845
chr1 ensGene five_prime_UTR 4775759 4775779 . - . Name=ENSMUST00000045689.utr0;Parent=ENSMUST00000045689;ID=ENSMUST00000045689.utr0;gid=ENSMUSG00000033845
chr1 ensGene three_prime_UTR 4772649 4772760 . - . Name=ENSMUST00000045689.utr1;Parent=ENSMUST00000045689;ID=ENSMUST00000045689.utr1;gid=ENSMUSG00000033845
chr1 ensGene three_prime_UTR 4767606 4767729 . - . Name=ENSMUST00000045689.utr2;Parent=ENSMUST00000045689;ID=ENSMUST00000045689.utr2;gid=ENSMUSG00000033845
chr1 ensGene three_prime_UTR 4764517 4764597 . - . Name=ENSMUST00000045689.utr3;Parent=ENSMUST00000045689;ID=ENSMUST00000045689.utr3;gid=ENSMUSG00000033845
chr1 gffutils_derived three_prime_cis_splice_site 4764598 4764599 . - . Name=ENSMUST00000045689.exon2,ENSMUST00000045689.exon3;Parent=ENSMUST00000045689;ID=three_prime_cis_splice_site_ENSMUST00000045689.exon2-ENSMUST00000045689.exon3;gid=ENSMUSG00000033845
chr1 gffutils_derived three_prime_cis_splice_site 4767730 4767731 . - . Name=ENSMUST00000045689.exon1,ENSMUST00000045689.exon2;Parent=ENSMUST00000045689;ID=three_prime_cis_splice_site_ENSMUST00000045689.exon1-ENSMUST00000045689.exon2;gid=ENSMUSG00000033845
chr1 gffutils_derived three_prime_cis_splice_site 4772815 4772816 . - . Name=ENSMUST00000045689.exon0,ENSMUST00000045689.exon1;Parent=ENSMUST00000045689;ID=three_prime_cis_splice_site_ENSMUST00000045689.exon0-ENSMUST00000045689.exon1;gid=ENSMUSG00000033845
chr1 gffutils_derived five_prime_cis_splice_site 4767604 4767605 . - . Name=ENSMUST00000045689.exon2,ENSMUST00000045689.exon3;Parent=ENSMUST00000045689;ID=five_prime_cis_splice_site_ENSMUST00000045689.exon2-ENSMUST00000045689.exon3;gid=ENSMUSG00000033845
chr1 gffutils_derived five_prime_cis_splice_site 4772647 4772648 . - . Name=ENSMUST00000045689.exon1,ENSMUST00000045689.exon2;Parent=ENSMUST00000045689;ID=five_prime_cis_splice_site_ENSMUST00000045689.exon1-ENSMUST00000045689.exon2;gid=ENSMUSG00000033845
chr1 gffutils_derived five_prime_cis_splice_site 4775652 4775653 . - . Name=ENSMUST00000045689.exon0,ENSMUST00000045689.exon1;Parent=ENSMUST00000045689;ID=five_prime_cis_splice_site_ENSMUST00000045689.exon0-ENSMUST00000045689.exon1;gid=ENSMUSG00000033845""")

assert observed == expected




if __name__ == "__main__":
# this test case fails
# test_attributes_modify()
Expand Down

0 comments on commit e480e11

Please sign in to comment.