-
Notifications
You must be signed in to change notification settings - Fork 53
/
extract_tRNA.py
30 lines (25 loc) · 1.03 KB
/
extract_tRNA.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
#!/usr/bin/env python
'''
Extract tRNA coordinates from GTF
'''
import sys
import GTF
import numpy as np
import pandas as pd
def _write_tRNA_bed(gc, feature, type_column, id_column, out_path):
    '''
    Write the tRNA rows of one feature level as a six-column tab-separated file.

    Rows are kept when ``gc.feature`` equals ``feature`` and the text in
    ``type_column`` contains 'tRNA'. The columns written are, in order:
    seqname, start, end, ``id_column``, gene_name, strand. Rows are sorted by
    (seqname, start, end); no header and no index are written.

    gc          -- annotation table (presumably a pandas DataFrame as returned
                   by GTF.dataframe; verify against that module)
    feature     -- value of the 'feature' column to keep, e.g. 'gene'
    type_column -- column holding the biotype text, e.g. 'gene_type'
    id_column   -- column used as the fourth output field
    out_path    -- path of the file to write
    '''
    # na=False: a row with no value in type_column is simply not a tRNA row,
    # instead of injecting NaN into the boolean mask.
    is_tRNA = gc[type_column].str.contains('tRNA', na=False)
    idx = (gc['feature'] == feature) & is_tRNA
    columns = ['seqname', 'start', 'end', id_column, 'gene_name', 'strand']

    # .loc replaces the removed DataFrame.ix indexer; .copy() makes the casts
    # below act on an independent frame rather than on a slice of gc.
    tRNA = gc.loc[idx, columns].copy()

    # NOTE(review): start/end are written exactly as found in the table. GTF
    # coordinates are 1-based while BED starts are 0-based; confirm whether
    # downstream consumers of the .bed files expect a start - 1 shift.
    tRNA['start'] = tRNA['start'].astype(int)
    tRNA['end'] = tRNA['end'].astype(int)

    tRNA = tRNA.sort_values(by=['seqname', 'start', 'end'])
    tRNA.to_csv(out_path, sep='\t', header=False, index=False)


def main(GENCODE):
    '''
    Extract tRNA transcript and gene coordinates from a GTF annotation.

    GENCODE -- path handed to GTF.dataframe (presumably a GENCODE GTF file).

    Writes two files into the current working directory:
      tRNA_transcripts.bed -- 'transcript' rows whose transcript_type contains
                              'tRNA', identified by transcript_id
      tRNA_genes.bed       -- 'gene' rows whose gene_type contains 'tRNA',
                              identified by gene_id
    '''
    gc = GTF.dataframe(GENCODE)

    # Remove every '.<digits>' run from gene_id (presumably the version
    # suffix). transcript_id is left untouched.
    gc['gene_id'] = gc['gene_id'].replace(to_replace=r'\.[0-9]+', value='', regex=True)

    _write_tRNA_bed(gc, 'transcript', 'transcript_type', 'transcript_id',
                    'tRNA_transcripts.bed')
    _write_tRNA_bed(gc, 'gene', 'gene_type', 'gene_id',
                    'tRNA_genes.bed')
if __name__ == '__main__':
    # The GTF path is required as the first argument; exit with a usage
    # message (written to stderr, non-zero status) instead of an IndexError
    # traceback when it is missing. Extra arguments are ignored, as before.
    if len(sys.argv) < 2:
        sys.exit('Usage: {0} <annotation.gtf>'.format(sys.argv[0]))
    main(sys.argv[1])