Skip to content

Commit

Permalink
format
Browse files Browse the repository at this point in the history
  • Loading branch information
saketkc committed Sep 25, 2020
1 parent 6973208 commit 8b11862
Show file tree
Hide file tree
Showing 67 changed files with 258 additions and 160 deletions.
32 changes: 20 additions & 12 deletions GTF.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,11 +42,19 @@
import re


# Names given, in order, to the leading tab-separated fields of each record;
# parse() pairs GTF_HEADER[i] with fields[i].
GTF_HEADER = [
    "seqname",
    "source",
    "feature",
    "start",
    "end",
    "score",
    "strand",
    "frame",
]

# A semicolon with any surrounding whitespace.
R_SEMICOLON = re.compile(r"\s*;\s*")
# A comma with any surrounding whitespace; used to split multi-valued fields.
R_COMMA = re.compile(r"\s*,\s*")
# Either a run of whitespace or an "=" with optional surrounding whitespace.
# The group is capturing, so re.split() also returns the separator itself.
R_KEYVALUE = re.compile(r"(\s+|\s*=\s*)")


def dataframe(filename):
Expand All @@ -72,11 +80,11 @@ def dataframe(filename):
def lines(filename):
    """Open an optionally gzipped GTF file and generate a dict for each line.

    Args:
        filename: Path of the file to read. A name ending in ``.gz`` is
            opened with ``gzip.open``; any other name with the built-in
            ``open``.

    Yields:
        The value returned by ``parse(line)`` for every line that does not
        start with ``#``.
    """
    fn_open = gzip.open if filename.endswith(".gz") else open

    # Ask for text mode explicitly. gzip.open defaults to binary ("rb"), which
    # on Python 3 yields bytes and makes the str comparison below raise
    # TypeError. "rt" is accepted by both gzip.open and the built-in open.
    with fn_open(filename, "rt") as fh:
        for line in fh:
            # Lines starting with "#" are skipped, not parsed.
            if line.startswith("#"):
                continue
            yield parse(line)
Expand All @@ -87,7 +95,7 @@ def parse(line):
"""
result = {}

fields = line.rstrip().split('\t')
fields = line.rstrip().split("\t")

for i, col in enumerate(GTF_HEADER):
result[col] = _get_value(fields[i])
Expand All @@ -101,7 +109,7 @@ def parse(line):
key, _, value = re.split(R_KEYVALUE, info, 1)
# But sometimes it is just "value".
except ValueError:
key = 'INFO{}'.format(i)
key = "INFO{}".format(i)
value = info
# Ignore the field if there is no value.
if value:
Expand All @@ -115,13 +123,13 @@ def _get_value(value):
return None

# Strip double and single quotes.
value = value.strip('"\'')
value = value.strip("\"'")

# Return a list if the value has a comma.
if ',' in value:
if "," in value:
value = re.split(R_COMMA, value)
# These values are equivalent to None.
elif value in ['', '.', 'NA']:
elif value in ["", ".", "NA"]:
return None

return value
Empty file modified data/BDGP6/v96/cds.bed.gz
100644 → 100755
Empty file.
Empty file modified data/BDGP6/v96/exon.bed.gz
100644 → 100755
Empty file.
Empty file modified data/BDGP6/v96/gene.bed.gz
100644 → 100755
Empty file.
Empty file modified data/BDGP6/v96/intron.bed.gz
100644 → 100755
Empty file.
Empty file modified data/BDGP6/v96/start_codon.bed.gz
100644 → 100755
Empty file.
Empty file modified data/BDGP6/v96/stop_codon.bed.gz
100644 → 100755
Empty file.
Empty file modified data/BDGP6/v96/utr3.bed.gz
100644 → 100755
Empty file.
Empty file modified data/BDGP6/v96/utr5.bed.gz
100644 → 100755
Empty file.
Empty file modified data/GRCg6/v96/cds.bed.gz
100644 → 100755
Empty file.
Empty file modified data/GRCg6/v96/exon.bed.gz
100644 → 100755
Empty file.
Empty file modified data/GRCg6/v96/gene.bed.gz
100644 → 100755
Empty file.
Empty file modified data/GRCg6/v96/intron.bed.gz
100644 → 100755
Empty file.
Empty file modified data/GRCg6/v96/start_codon.bed.gz
100644 → 100755
Empty file.
Empty file modified data/GRCg6/v96/stop_codon.bed.gz
100644 → 100755
Empty file.
Empty file modified data/GRCg6/v96/utr3.bed.gz
100644 → 100755
Empty file.
Empty file modified data/GRCg6/v96/utr5.bed.gz
100644 → 100755
Empty file.
Empty file modified data/GRCz11/v96/cds.bed.gz
100644 → 100755
Empty file.
Empty file modified data/GRCz11/v96/exon.bed.gz
100644 → 100755
Empty file.
Empty file modified data/GRCz11/v96/gene.bed.gz
100644 → 100755
Empty file.
Empty file modified data/GRCz11/v96/intron.bed.gz
100644 → 100755
Empty file.
Empty file modified data/GRCz11/v96/start_codon.bed.gz
100644 → 100755
Empty file.
Empty file modified data/GRCz11/v96/stop_codon.bed.gz
100644 → 100755
Empty file.
Empty file modified data/GRCz11/v96/utr3.bed.gz
100644 → 100755
Empty file.
Empty file modified data/GRCz11/v96/utr5.bed.gz
100644 → 100755
Empty file.
Empty file modified data/Rnor6.0/v96/cds.bed.gz
100644 → 100755
Empty file.
Empty file modified data/Rnor6.0/v96/exon.bed.gz
100644 → 100755
Empty file.
Empty file modified data/Rnor6.0/v96/gene.bed.gz
100644 → 100755
Empty file.
Empty file modified data/Rnor6.0/v96/intron.bed.gz
100644 → 100755
Empty file.
Empty file modified data/Rnor6.0/v96/start_codon.bed.gz
100644 → 100755
Empty file.
Empty file modified data/Rnor6.0/v96/stop_codon.bed.gz
100644 → 100755
Empty file.
Empty file modified data/Rnor6.0/v96/utr3.bed.gz
100644 → 100755
Empty file.
Empty file modified data/Rnor6.0/v96/utr5.bed.gz
100644 → 100755
Empty file.
Empty file modified data/WBcel235/v96/cds.bed.gz
100644 → 100755
Empty file.
Empty file modified data/WBcel235/v96/exon.bed.gz
100644 → 100755
Empty file.
Empty file modified data/WBcel235/v96/gene.bed.gz
100644 → 100755
Empty file.
Empty file modified data/WBcel235/v96/intron.bed.gz
100644 → 100755
Empty file.
Empty file modified data/WBcel235/v96/utr3.bed.gz
100644 → 100755
Empty file.
Empty file modified data/WBcel235/v96/utr5.bed.gz
100644 → 100755
Empty file.
Empty file modified data/mm10/v96/cds.bed.gz
100644 → 100755
Empty file.
Empty file modified data/mm10/v96/exon.bed.gz
100644 → 100755
Empty file.
Empty file modified data/mm10/v96/gene.bed.gz
100644 → 100755
Empty file.
Empty file modified data/mm10/v96/intron.bed.gz
100644 → 100755
Empty file.
Empty file modified data/mm10/v96/start_codon.bed.gz
100644 → 100755
Empty file.
Empty file modified data/mm10/v96/stop_codon.bed.gz
100644 → 100755
Empty file.
Empty file modified data/mm10/v96/utr3.bed.gz
100644 → 100755
Empty file.
Empty file modified data/mm10/v96/utr5.bed.gz
100644 → 100755
Empty file.
26 changes: 14 additions & 12 deletions extract_lincRNA.py
Original file line number Diff line number Diff line change
@@ -1,29 +1,31 @@
#!/usr/bin/env python
'''
"""
Extract lincRNA coordinates from GTF
'''
"""
import sys
import GTF
import numpy as np
import pandas as pd


def main(GENCODE):
    """Write BED files of lincRNA transcripts and lincRNA genes from a GTF.

    Two tab-separated files, without header or index column, are written
    using relative paths:

    * ``lincRNA.bed``: rows whose ``feature`` is ``"transcript"`` and whose
      ``transcript_type`` is ``"lincRNA"``.
    * ``lincRNA_genes.bed``: rows whose ``feature`` is ``"gene"`` and whose
      ``gene_type`` is ``"lincRNA"``.

    Each file holds the columns seqname, start, end, gene_id, gene_name and
    strand, sorted by (seqname, start, end).

    Args:
        GENCODE: Path of the GTF file, passed to ``GTF.dataframe``.
    """
    gc = GTF.dataframe(GENCODE)
    # Remove every ".<digits>" run from the gene ids (presumably the version
    # suffix of GENCODE identifiers; confirm against the input).
    gc.gene_id = gc.gene_id.replace(to_replace=r"\.[0-9]+", value="", regex=True)

    columns = ["seqname", "start", "end", "gene_id", "gene_name", "strand"]
    # (feature value, column holding the biotype, output path)
    passes = (
        ("transcript", "transcript_type", "lincRNA.bed"),
        ("gene", "gene_type", "lincRNA_genes.bed"),
    )

    for feature, type_column, bed_path in passes:
        idx = (gc.feature == feature) & (gc[type_column] == "lincRNA")
        # DataFrame.ix was removed in pandas 1.0; .loc is the label/boolean
        # indexer. Copy so the casts below act on an independent frame.
        lincRNA = gc.loc[idx, columns].copy()
        lincRNA["start"] = lincRNA["start"].astype(int)
        lincRNA["end"] = lincRNA["end"].astype(int)
        lincRNA = lincRNA.sort_values(by=["seqname", "start", "end"])
        lincRNA.to_csv(bed_path, sep="\t", header=False, index=False)


if __name__ == "__main__":
    main(sys.argv[1])
29 changes: 16 additions & 13 deletions extract_rRNA.py
Original file line number Diff line number Diff line change
@@ -1,30 +1,33 @@
#!/usr/bin/env python
'''
"""
Extract rRNA coordinates from GTF
'''
"""
import sys
import GTF
import numpy as np
import pandas as pd


def main(GENCODE):
    """Write BED files of rRNA transcripts and rRNA genes from a GTF.

    Two tab-separated files, without header or index column, are written
    using relative paths:

    * ``rRNA_transcripts.bed``: rows whose ``feature`` is ``"transcript"``
      and whose ``transcript_type`` contains ``"rRNA"``; the fourth column is
      ``transcript_id``.
    * ``rRNA_genes.bed``: rows whose ``feature`` is ``"gene"`` and whose
      ``gene_type`` contains ``"rRNA"``; the fourth column is ``gene_id``.

    Each file holds seqname, start, end, the id column, gene_name and strand,
    sorted by (seqname, start, end).

    Args:
        GENCODE: Path of the GTF file, passed to ``GTF.dataframe``.
    """
    gc = GTF.dataframe(GENCODE)
    # Remove every ".<digits>" run from the gene ids (presumably the version
    # suffix of GENCODE identifiers; confirm against the input).
    gc.gene_id = gc.gene_id.replace(to_replace=r"\.[0-9]+", value="", regex=True)

    # (feature value, column holding the biotype, id column, output path)
    passes = (
        ("transcript", "transcript_type", "transcript_id", "rRNA_transcripts.bed"),
        ("gene", "gene_type", "gene_id", "rRNA_genes.bed"),
    )

    for feature, type_column, id_column, bed_path in passes:
        # na=False: rows with no value in the biotype column count as
        # non-matches instead of putting NaN into the boolean mask.
        idx = (gc.feature == feature) & gc[type_column].str.contains("rRNA", na=False)
        # DataFrame.ix was removed in pandas 1.0; .loc is the label/boolean
        # indexer. Copy so the casts below act on an independent frame.
        rRNA = gc.loc[
            idx, ["seqname", "start", "end", id_column, "gene_name", "strand"]
        ].copy()
        rRNA["start"] = rRNA["start"].astype(int)
        rRNA["end"] = rRNA["end"].astype(int)
        rRNA = rRNA.sort_values(by=["seqname", "start", "end"])
        rRNA.to_csv(bed_path, sep="\t", header=False, index=False)


if __name__ == "__main__":
    main(sys.argv[1])
29 changes: 16 additions & 13 deletions extract_tRNA.py
Original file line number Diff line number Diff line change
@@ -1,30 +1,33 @@
#!/usr/bin/env python
'''
"""
Extract tRNA coordinates from GTF
'''
"""
import sys
import GTF
import numpy as np
import pandas as pd


def main(GENCODE):
    """Write BED files of tRNA transcripts and tRNA genes from a GTF.

    Two tab-separated files, without header or index column, are written
    using relative paths:

    * ``tRNA_transcripts.bed``: rows whose ``feature`` is ``"transcript"``
      and whose ``transcript_type`` contains ``"tRNA"``; the fourth column is
      ``transcript_id``.
    * ``tRNA_genes.bed``: rows whose ``feature`` is ``"gene"`` and whose
      ``gene_type`` contains ``"tRNA"``; the fourth column is ``gene_id``.

    Each file holds seqname, start, end, the id column, gene_name and strand,
    sorted by (seqname, start, end).

    Args:
        GENCODE: Path of the GTF file, passed to ``GTF.dataframe``.
    """
    gc = GTF.dataframe(GENCODE)
    # Remove every ".<digits>" run from the gene ids (presumably the version
    # suffix of GENCODE identifiers; confirm against the input).
    gc.gene_id = gc.gene_id.replace(to_replace=r"\.[0-9]+", value="", regex=True)

    # (feature value, column holding the biotype, id column, output path)
    passes = (
        ("transcript", "transcript_type", "transcript_id", "tRNA_transcripts.bed"),
        ("gene", "gene_type", "gene_id", "tRNA_genes.bed"),
    )

    for feature, type_column, id_column, bed_path in passes:
        # na=False: rows with no value in the biotype column count as
        # non-matches instead of putting NaN into the boolean mask.
        idx = (gc.feature == feature) & gc[type_column].str.contains("tRNA", na=False)
        # DataFrame.ix was removed in pandas 1.0; .loc is the label/boolean
        # indexer. Copy so the casts below act on an independent frame.
        tRNA = gc.loc[
            idx, ["seqname", "start", "end", id_column, "gene_name", "strand"]
        ].copy()
        tRNA["start"] = tRNA["start"].astype(int)
        tRNA["end"] = tRNA["end"].astype(int)
        tRNA = tRNA.sort_values(by=["seqname", "start", "end"])
        tRNA.to_csv(bed_path, sep="\t", header=False, index=False)


if __name__ == "__main__":
    main(sys.argv[1])
Loading

0 comments on commit 8b11862

Please sign in to comment.