Skip to content

Commit

Permalink
Add trna,rrna extractor
Browse files Browse the repository at this point in the history
  • Loading branch information
saketkc committed Mar 10, 2017
1 parent 9f6a475 commit 6c589a4
Show file tree
Hide file tree
Showing 2 changed files with 60 additions and 0 deletions.
30 changes: 30 additions & 0 deletions extract_rRNA.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
#!/usr/bin/env python
'''
Extract rRNA coordinates from GTF
'''
import sys
import GTF
import numpy as np
import pandas as pd

def main(GENCODE):
    """Write BED-like tables of rRNA transcripts and rRNA genes from a GTF.

    Parameters
    ----------
    GENCODE : str
        Path to the annotation GTF; handed unchanged to ``GTF.dataframe``.

    Two tab-separated files (no header, no index) are written to the current
    working directory:

    * ``rRNA_transcripts.bed`` -- ``transcript`` features whose
      ``transcript_type`` contains ``rRNA``; name column is ``transcript_id``.
    * ``rRNA_genes.bed`` -- ``gene`` features whose ``gene_type`` contains
      ``rRNA``; name column is ``gene_id``.
    """
    gc = GTF.dataframe(GENCODE)
    # Remove every '.<digits>' run from gene_id (presumably the version
    # suffix, e.g. 'ENSG00000000003.14' -> 'ENSG00000000003').
    gc.gene_id = gc.gene_id.replace(to_replace=r'\.[0-9]+', value='', regex=True)

    _write_rrna_bed(gc, 'transcript', 'transcript_type', 'transcript_id',
                    'rRNA_transcripts.bed')
    _write_rrna_bed(gc, 'gene', 'gene_type', 'gene_id', 'rRNA_genes.bed')


def _write_rrna_bed(gc, feature, type_column, id_column, out_path):
    """Select rRNA rows of one feature kind from *gc* and write them sorted.

    Parameters
    ----------
    gc : pandas.DataFrame
        Annotation table as returned by ``GTF.dataframe``.
    feature : str
        Value of the ``feature`` column to keep (``'transcript'``/``'gene'``).
    type_column : str
        Column whose text must contain ``'rRNA'``.
    id_column : str
        Column written as the fourth (name) field.
    out_path : str
        Destination file.
    """
    # na=False: rows that lack the biotype attribute must count as
    # non-matches instead of injecting missing values into the mask.
    is_rrna = gc[type_column].str.contains('rRNA', na=False)
    idx = (gc.feature == feature) & is_rrna

    columns = ['seqname', 'start', 'end', id_column, 'gene_name', 'strand']
    # .loc replaces the removed DataFrame.ix indexer; .copy() gives an
    # independent frame so the column assignments below are well-defined.
    bed = gc.loc[idx, columns].copy()

    # Coordinates must be integers so the sort is numeric, not lexical.
    # NOTE(review): start/end are written exactly as found in the GTF table;
    # BED starts are conventionally 0-based -- confirm what consumers expect.
    bed['start'] = bed['start'].astype(int)
    bed['end'] = bed['end'].astype(int)
    bed = bed.sort_values(by=['seqname', 'start', 'end'])
    bed.to_csv(out_path, sep='\t', header=False, index=False)

if __name__ == '__main__':
    # The GTF path is the first command-line argument; fail with a usage
    # message instead of an IndexError traceback when it is missing.
    if len(sys.argv) < 2:
        sys.exit('Usage: {} <annotation.gtf>'.format(sys.argv[0]))
    main(sys.argv[1])

30 changes: 30 additions & 0 deletions extract_tRNA.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
#!/usr/bin/env python
'''
Extract tRNA coordinates from GTF
'''
import sys
import GTF
import numpy as np
import pandas as pd

def main(GENCODE):
    """Write BED-like tables of tRNA transcripts and tRNA genes from a GTF.

    Parameters
    ----------
    GENCODE : str
        Path to the annotation GTF; handed unchanged to ``GTF.dataframe``.

    Two tab-separated files (no header, no index) are written to the current
    working directory:

    * ``tRNA_transcripts.bed`` -- ``transcript`` features whose
      ``transcript_type`` contains ``tRNA``; name column is ``transcript_id``.
    * ``tRNA_genes.bed`` -- ``gene`` features whose ``gene_type`` contains
      ``tRNA``; name column is ``gene_id``.
    """
    gc = GTF.dataframe(GENCODE)
    # Remove every '.<digits>' run from gene_id (presumably the version
    # suffix, e.g. 'ENSG00000000003.14' -> 'ENSG00000000003').
    gc.gene_id = gc.gene_id.replace(to_replace=r'\.[0-9]+', value='', regex=True)

    _write_trna_bed(gc, 'transcript', 'transcript_type', 'transcript_id',
                    'tRNA_transcripts.bed')
    _write_trna_bed(gc, 'gene', 'gene_type', 'gene_id', 'tRNA_genes.bed')


def _write_trna_bed(gc, feature, type_column, id_column, out_path):
    """Select tRNA rows of one feature kind from *gc* and write them sorted.

    Parameters
    ----------
    gc : pandas.DataFrame
        Annotation table as returned by ``GTF.dataframe``.
    feature : str
        Value of the ``feature`` column to keep (``'transcript'``/``'gene'``).
    type_column : str
        Column whose text must contain ``'tRNA'``.
    id_column : str
        Column written as the fourth (name) field.
    out_path : str
        Destination file.
    """
    # na=False: rows that lack the biotype attribute must count as
    # non-matches instead of injecting missing values into the mask.
    is_trna = gc[type_column].str.contains('tRNA', na=False)
    idx = (gc.feature == feature) & is_trna

    columns = ['seqname', 'start', 'end', id_column, 'gene_name', 'strand']
    # .loc replaces the removed DataFrame.ix indexer; .copy() gives an
    # independent frame so the column assignments below are well-defined.
    bed = gc.loc[idx, columns].copy()

    # Coordinates must be integers so the sort is numeric, not lexical.
    # NOTE(review): start/end are written exactly as found in the GTF table;
    # BED starts are conventionally 0-based -- confirm what consumers expect.
    bed['start'] = bed['start'].astype(int)
    bed['end'] = bed['end'].astype(int)
    bed = bed.sort_values(by=['seqname', 'start', 'end'])
    bed.to_csv(out_path, sep='\t', header=False, index=False)

if __name__ == '__main__':
    # The GTF path is the first command-line argument; fail with a usage
    # message instead of an IndexError traceback when it is missing.
    if len(sys.argv) < 2:
        sys.exit('Usage: {} <annotation.gtf>'.format(sys.argv[0]))
    main(sys.argv[1])

0 comments on commit 6c589a4

Please sign in to comment.