{ "cells": [ { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [], "source": [ "from collections import defaultdict, OrderedDict\n", "import warnings\n", "import gffutils\n", "import pybedtools\n", "import pandas as pd\n", "import copy\n", "import os\n", "import re\n", "from gffutils.pybedtools_integration import tsses\n", "from copy import deepcopy\n", "from collections import OrderedDict, Callable\n", "import errno\n", "\n", "def mkdir_p(path):\n", " try:\n", " os.makedirs(path)\n", " except OSError as exc: # Python >2.5\n", " if exc.errno == errno.EEXIST and os.path.isdir(path):\n", " pass\n", " else:\n", " raise\n", " \n", "class DefaultOrderedDict(OrderedDict):\n", " # Source: http://stackoverflow.com/a/6190500/562769\n", " def __init__(self, default_factory=None, *a, **kw):\n", " if (default_factory is not None and\n", " not isinstance(default_factory, Callable)):\n", " raise TypeError('first argument must be callable')\n", " OrderedDict.__init__(self, *a, **kw)\n", " self.default_factory = default_factory\n", "\n", " def __getitem__(self, key):\n", " try:\n", " return OrderedDict.__getitem__(self, key)\n", " except KeyError:\n", " return self.__missing__(key)\n", "\n", " def __missing__(self, key):\n", " if self.default_factory is None:\n", " raise KeyError(key)\n", " self[key] = value = self.default_factory()\n", " return value\n", "\n", " def __reduce__(self):\n", " if self.default_factory is None:\n", " args = tuple()\n", " else:\n", " args = self.default_factory,\n", " return type(self), args, None, None, self.items()\n", "\n", " def copy(self):\n", " return self.__copy__()\n", "\n", " def __copy__(self):\n", " return type(self)(self.default_factory, self)\n", "\n", " def __deepcopy__(self, memo):\n", " import copy\n", " return type(self)(self.default_factory,\n", " copy.deepcopy(self.items()))\n", "\n", " def __repr__(self):\n", " return 'OrderedDefaultDict(%s, %s)' % (self.default_factory,\n", " OrderedDict.__repr__(self))\n" ] }, { "cell_type": "raw", "metadata": {}, "source": [ "gtf = '/home/cmb-panasas2/skchoudh/genomes/Mmul8/annotation/Macaca_mulatta.Mmul_8.0.1.94.gtf'\n", "gtf_db = '/home/cmb-panasas2/skchoudh/genomes/Mmul8/annotation/Macaca_mulatta.Mmul_8.0.1.94.gtf.db'\n", "prefix = '/home/cmb-panasas2/skchoudh/github_projects/riboraptor/riboraptor/annotation/Mmul8/v94/'\n", "chrsizes = '/home/cmb-panasas2/skchoudh/genomes/Mmul8/fasta/Macaca_mulatta.Mmul_8.0.1.dna.toplevel.sizes'\n", "mkdir_p(prefix)" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [], "source": [ "gtf = '/home/cmb-panasas2/skchoudh/genomes/Mmul8/annotation/Macaca_mulatta.Mmul_8.0.1.96.gtf'\n", "gtf_db = '/home/cmb-panasas2/skchoudh/genomes/Mmul8/annotation/Macaca_mulatta.Mmul_8.0.1.96.gtf.db'\n", "prefix = '/home/cmb-panasas2/skchoudh/github_projects/riboraptor/riboraptor/annotation/Mmul8/v96/'\n", "chrsizes = '/home/cmb-panasas2/skchoudh/genomes/Mmul8/fasta/Macaca_mulatta.Mmul_8.0.1.dna.toplevel.sizes'\n", "mkdir_p(prefix)" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "def create_gene_dict(db):\n", " '''\n", " Store each feature line db.all_features() as a dict of dicts\n", " '''\n", " gene_dict = DefaultOrderedDict(lambda: DefaultOrderedDict(lambda: DefaultOrderedDict(list)))\n", " for line_no, feature in enumerate(db.all_features()):\n", " gene_ids = feature.attributes['gene_id']\n", " feature_type = feature.featuretype\n", " if feature_type == 'gene':\n", " if len(gene_ids)!=1:\n", " logging.warning('Found multiple gene_ids on line {} in gtf'.format(line_no))\n", " break\n", " else:\n", " gene_id = gene_ids[0]\n", " gene_dict[gene_id]['gene'] = feature\n", " else:\n", " transcript_ids = feature.attributes['transcript_id']\n", "\n", " for gene_id in gene_ids:\n", " for transcript_id in transcript_ids:\n", " gene_dict[gene_id][transcript_id][feature_type].append(feature)\n", " return gene_dict" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "db = gffutils.create_db(gtf, dbfn=gtf_db, merge_strategy='merge', force=True, disable_infer_transcripts=True, disable_infer_genes=True)" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "db = gffutils.FeatureDB(gtf_db, keep_order=True)\n", "gene_dict = create_gene_dict(db)" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CDS\n", "exon\n", "five_prime_utr\n", "gene\n", "start_codon\n", "stop_codon\n", "three_prime_utr\n", "transcript\n" ] } ], "source": [ "for x in db.featuretypes():\n", " print(x)" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "def get_gene_list(gene_dict):\n", " return list(set(gene_dict.keys()))\n", "\n", "def get_UTR_regions(gene_dict, gene_id, transcript, cds):\n", " if len(cds)==0:\n", " return [], []\n", " utr5_regions = []\n", " utr3_regions = []\n", " utrs = gene_dict[gene_id][transcript]['UTR']\n", " first_cds = cds[0]\n", " last_cds = cds[-1]\n", " for utr in utrs:\n", " ## Push all cds at once\n", " ## Sort later to remove duplicates\n", " strand = utr.strand\n", " if strand == '+':\n", " if utr.stop < first_cds.start:\n", " utr.feature_type = 'five_prime_UTR'\n", " utr5_regions.append(utr)\n", " elif utr.start > last_cds.stop:\n", " utr.feature_type = 'three_prime_UTR'\n", " utr3_regions.append(utr)\n", " else:\n", " raise RuntimeError('Error with cds')\n", " elif strand == '-':\n", " if utr.stop < first_cds.start:\n", " utr.feature_type = 'three_prime_UTR'\n", " utr3_regions.append(utr)\n", " elif utr.start > last_cds.stop:\n", " utr.feature_type = 'five_prime_UTR'\n", " utr5_regions.append(utr) \n", " else:\n", " raise RuntimeError('Error with cds') \n", " return utr5_regions, utr3_regions\n", " \n", "def create_bed(regions, bedtype='0'):\n", " '''Create bed from list of regions\n", " bedtype: 0 or 1\n", " 0-Based or 1-based coordinate of the BED\n", " '''\n", " bedstr = ''\n", " for region in regions:\n", " assert len(region.attributes['gene_id']) == 1\n", " ## GTF start is 1-based, so shift by one while writing \n", " ## to 0-based BED format\n", " if bedtype == '0':\n", " start = region.start - 1\n", " else:\n", " start = region.start\n", " bedstr += '{}\\t{}\\t{}\\t{}\\t{}\\t{}\\n'.format(region.chrom,\n", " start,\n", " region.stop,\n", " re.sub('\\.\\d+', '', region.attributes['gene_id'][0]),\n", " '.',\n", " region.strand)\n", " return bedstr\n", "\n", "def rename_regions(regions, gene_id):\n", " regions = list(regions)\n", " if len(regions) == 0:\n", " return []\n", " for region in regions:\n", " region.attributes['gene_id'] = gene_id\n", " return regions\n", "\n", "def merge_regions(db, regions):\n", " if len(regions) == 0:\n", " return []\n", " merged = db.merge(sorted(list(regions), key=lambda x: x.start))\n", " return merged\n", "\n", "def merge_regions_nostrand(db, regions):\n", " if len(regions) == 0:\n", " return []\n", " merged = db.merge(sorted(list(regions), key=lambda x: x.start), ignore_strand=True)\n", " return merged" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "utr5_bed = ''\n", "utr3_bed = ''\n", "gene_bed = ''\n", "exon_bed = ''\n", "intron_bed = ''\n", "start_codon_bed = ''\n", "stop_codon_bed = ''\n", "cds_bed = ''\n", "\n", "gene_list = []\n", "\n", "for gene_id in get_gene_list(gene_dict):\n", " gene_list.append(gene_dict[gene_id]['gene'])\n", " \n", " utr5_regions, utr3_regions = [], []\n", " exon_regions, intron_regions = [], []\n", " star_codon_regions, stop_codon_regions = [], []\n", " cds_regions = []\n", " \n", " for feature in gene_dict[gene_id].keys():\n", " if feature == 'gene':\n", " continue\n", " cds = list(gene_dict[gene_id][feature]['CDS'])\n", " exons = list(gene_dict[gene_id][feature]['exon'])\n", " merged_exons = merge_regions(db, exons)\n", " introns = db.interfeatures(merged_exons)\n", " #utr5_region, utr3_region = get_UTR_regions(gene_dict, gene_id, feature, cds)\n", " utr5_region = list(gene_dict[gene_id][feature]['five_prime_utr'])\n", " utr3_region = list(gene_dict[gene_id][feature]['three_prime_utr'])\n", " utr5_regions += utr5_region\n", " utr3_regions += utr3_region\n", " exon_regions += exons\n", " intron_regions += introns\n", " cds_regions += cds\n", " \n", " merged_utr5 = merge_regions(db, utr5_regions)\n", " renamed_utr5 = rename_regions(merged_utr5, gene_id)\n", " \n", " merged_utr3 = merge_regions(db, utr3_regions)\n", " renamed_utr3 = rename_regions(merged_utr3, gene_id)\n", " \n", " merged_exons = merge_regions(db, exon_regions)\n", " renamed_exons = rename_regions(merged_exons, gene_id)\n", " \n", " merged_introns = merge_regions(db, intron_regions)\n", " renamed_introns = rename_regions(merged_introns, gene_id)\n", " \n", " merged_cds = merge_regions(db, cds_regions)\n", " renamed_cds = rename_regions(merged_cds, gene_id)\n", " \n", " utr3_bed += create_bed(renamed_utr3)\n", " utr5_bed += create_bed(renamed_utr5)\n", " exon_bed += create_bed(renamed_exons)\n", " intron_bed += create_bed(renamed_introns)\n", " cds_bed += create_bed(renamed_cds)\n", " \n", " \n", "gene_bed = create_bed(gene_list)\n", "gene_bedtool = pybedtools.BedTool(gene_bed, from_string=True)\n", "utr5_bedtool = pybedtools.BedTool(utr5_bed, from_string=True)\n", "utr3_bedtool = pybedtools.BedTool(utr3_bed, from_string=True)\n", "exon_bedtool = pybedtools.BedTool(exon_bed, from_string=True)\n", "intron_bedtool = pybedtools.BedTool(intron_bed, from_string=True)\n", "cds_bedtool = pybedtools.BedTool(cds_bed, from_string=True)\n", "\n", "utr5_cds_subtracted = utr5_bedtool.subtract(cds_bedtool)\n", "utr3_cds_subtracted = utr3_bedtool.subtract(cds_bedtool)\n", "utr5_cds_subtracted.remove_invalid().sort().saveas(os.path.join(prefix, 'utr5.bed.gz'))\n", "utr3_cds_subtracted.remove_invalid().sort().saveas(os.path.join(prefix, 'utr3.bed.gz'))\n", "gene_bedtool.remove_invalid().sort().saveas(os.path.join(prefix, 'gene.bed.gz'))\n", "exon_bedtool.remove_invalid().sort().saveas(os.path.join(prefix, 'exon.bed.gz'))\n", "intron_bedtool.remove_invalid().sort().saveas(os.path.join(prefix, 'intron.bed.gz'))\n", "cds_bedtool.remove_invalid().sort().saveas(os.path.join(prefix, 'cds.bed.gz'))" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "for gene_id in get_gene_list(gene_dict):\n", " start_codons = []\n", " stop_codons = []\n", " for start_codon in db.children(gene_id, featuretype='start_codon'):\n", " ## 1 -based stop\n", " ## 0-based start handled while converting to bed\n", " start_codon.stop = start_codon.start\n", " start_codons.append(start_codon)\n", " for stop_codon in db.children(gene_id, featuretype='stop_codon'):\n", " stop_codon.start = stop_codon.stop\n", " stop_codon.stop = stop_codon.stop+1\n", " stop_codons.append(stop_codon)\n", " merged_start_codons = merge_regions(db, start_codons)\n", " renamed_start_codons = rename_regions(merged_start_codons, gene_id)\n", " merged_stop_codons = merge_regions(db, stop_codons)\n", " renamed_stop_codons = rename_regions(merged_stop_codons, gene_id)\n", " \n", " start_codon_bed += create_bed(renamed_start_codons) \n", " stop_codon_bed += create_bed(renamed_stop_codons)\n", "\n", " \n", "start_codon_bedtool = pybedtools.BedTool(start_codon_bed, from_string=True)\n", "stop_codon_bedtool = pybedtools.BedTool(stop_codon_bed, from_string=True)\n", "start_codon_bedtool.remove_invalid().sort().saveas(os.path.join(prefix, 'start_codon.bed.gz'))\n", "stop_codon_bedtool.remove_invalid().sort().saveas(os.path.join(prefix, 'stop_codon.bed.gz'))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Check GTF regions do not overshoot fasta" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "from collections import defaultdict\n", "import gzip\n", "import pandas as pd\n", "import re\n", "\n", "\n", "GTF_HEADER = ['seqname', 'source', 'feature', 'start', 'end', 'score',\n", " 'strand', 'frame']\n", "R_SEMICOLON = re.compile(r'\\s*;\\s*')\n", "R_COMMA = re.compile(r'\\s*,\\s*')\n", "R_KEYVALUE = re.compile(r'(\\s+|\\s*=\\s*)')\n", "\n", "\n", "def dataframe(filename):\n", " \"\"\"Open an optionally gzipped GTF file and return a pandas.DataFrame.\n", " \"\"\"\n", " # Each column is a list stored as a value in this dict.\n", " result = defaultdict(list)\n", "\n", " for i, line in enumerate(lines(filename)):\n", " for key in line.keys():\n", " # This key has not been seen yet, so set it to None for all\n", " # previous lines.\n", " if key not in result:\n", " result[key] = [None] * i\n", "\n", " # Ensure this row has some value for each column.\n", " for key in result.keys():\n", " result[key].append(line.get(key, None))\n", "\n", " return pd.DataFrame(result)\n", "\n", "\n", "def lines(filename):\n", " \"\"\"Open an optionally gzipped GTF file and generate a dict for each line.\n", " \"\"\"\n", " fn_open = gzip.open if filename.endswith('.gz') else open\n", "\n", " with fn_open(filename) as fh:\n", " for line in fh:\n", " if line.startswith('#'):\n", " continue\n", " else:\n", " yield parse(line)\n", "\n", "\n", "def parse(line):\n", " \"\"\"Parse a single GTF line and return a dict.\n", " \"\"\"\n", " result = {}\n", "\n", " fields = line.rstrip().split('\\t')\n", "\n", " for i, col in enumerate(GTF_HEADER):\n", " result[col] = _get_value(fields[i])\n", "\n", " # INFO field consists of \"key1=value;key2=value;...\".\n", " infos = [x for x in re.split(R_SEMICOLON, fields[8]) if x.strip()]\n", "\n", " for i, info in enumerate(infos, 1):\n", " # It should be key=\"value\".\n", " try:\n", " key, _, value = re.split(R_KEYVALUE, info, 1)\n", " # But sometimes it is just \"value\".\n", " except ValueError:\n", " key = 'INFO{}'.format(i)\n", " value = info\n", " # Ignore the field if there is no value.\n", " if value:\n", " result[key] = _get_value(value)\n", "\n", " return result\n", "\n", "\n", "def _get_value(value):\n", " if not value:\n", " return None\n", "\n", " # Strip double and single quotes.\n", " value = value.strip('\"\\'')\n", "\n", " # Return a list if the value has a comma.\n", " if ',' in value:\n", " value = re.split(R_COMMA, value)\n", " # These values are equivalent to None.\n", " elif value in ['', '.', 'NA']:\n", " return None\n", "\n", " return value\n", "\n", "def get_seq(row):\n", " chrom = row['seqname']\n", " start = row['start']\n", " end = row['end']\n", " strand = row['strand']\n", " \n", " if strand == '+':\n", " return fasta.get_seq(chrom, start, end)\n", " else:\n", " seq = fasta.get_seq(chrom, start, end)\n", " return str(Seq(seq, generic_dna).reverse_complement())" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
chromsize
01225584828
12204787373
23185818997
34172585720
45190429646
56180051392
67169600520
78144306982
89129882849
91092844088
1011133663169
1112125506784
1213108979918
1314127894412
1415111343173
151677216781
161795684472
171870235451
181953671032
192074971481
20X149150640
21Y11753682
22MT16564
23KQ739485.1919841
24KQ739657.1805525
25KQ732613.1643885
26KQ733369.1610816
27KQ742746.1602698
28KQ732984.1576867
29KQ734207.1516943
.........
284698JSUE03175605.1202
284699JSUE03263931.1201
284700JSUE03160683.1200
284701JSUE03246639.1200
284702JSUE03096016.1199
284703JSUE03334604.1198
284704JSUE03056228.1196
284705JSUE03246815.1194
284706JSUE03263583.1193
284707JSUE03174083.1192
284708JSUE03297290.1191
284709JSUE03092936.1188
284710JSUE03155852.1186
284711JSUE03055955.1186
284712JSUE03279751.1184
284713JSUE03318858.1179
284714JSUE03238914.1179
284715JSUE03154903.1179
284716JSUE03136600.1174
284717JSUE03107000.1173
284718JSUE03174048.1170
284719JSUE03272611.1163
284720JSUE03140296.1157
284721JSUE03165938.1155
284722JSUE03289583.1147
284723JSUE03178651.1141
284724JSUE03100647.1134
284725JSUE03333562.1134
284726JSUE03109914.1130
284727JSUE03199803.1116
\n", "

284728 rows × 2 columns

\n", "
" ], "text/plain": [ " chrom size\n", "0 1 225584828\n", "1 2 204787373\n", "2 3 185818997\n", "3 4 172585720\n", "4 5 190429646\n", "5 6 180051392\n", "6 7 169600520\n", "7 8 144306982\n", "8 9 129882849\n", "9 10 92844088\n", "10 11 133663169\n", "11 12 125506784\n", "12 13 108979918\n", "13 14 127894412\n", "14 15 111343173\n", "15 16 77216781\n", "16 17 95684472\n", "17 18 70235451\n", "18 19 53671032\n", "19 20 74971481\n", "20 X 149150640\n", "21 Y 11753682\n", "22 MT 16564\n", "23 KQ739485.1 919841\n", "24 KQ739657.1 805525\n", "25 KQ732613.1 643885\n", "26 KQ733369.1 610816\n", "27 KQ742746.1 602698\n", "28 KQ732984.1 576867\n", "29 KQ734207.1 516943\n", "... ... ...\n", "284698 JSUE03175605.1 202\n", "284699 JSUE03263931.1 201\n", "284700 JSUE03160683.1 200\n", "284701 JSUE03246639.1 200\n", "284702 JSUE03096016.1 199\n", "284703 JSUE03334604.1 198\n", "284704 JSUE03056228.1 196\n", "284705 JSUE03246815.1 194\n", "284706 JSUE03263583.1 193\n", "284707 JSUE03174083.1 192\n", "284708 JSUE03297290.1 191\n", "284709 JSUE03092936.1 188\n", "284710 JSUE03155852.1 186\n", "284711 JSUE03055955.1 186\n", "284712 JSUE03279751.1 184\n", "284713 JSUE03318858.1 179\n", "284714 JSUE03238914.1 179\n", "284715 JSUE03154903.1 179\n", "284716 JSUE03136600.1 174\n", "284717 JSUE03107000.1 173\n", "284718 JSUE03174048.1 170\n", "284719 JSUE03272611.1 163\n", "284720 JSUE03140296.1 157\n", "284721 JSUE03165938.1 155\n", "284722 JSUE03289583.1 147\n", "284723 JSUE03178651.1 141\n", "284724 JSUE03100647.1 134\n", "284725 JSUE03333562.1 134\n", "284726 JSUE03109914.1 130\n", "284727 JSUE03199803.1 116\n", "\n", "[284728 rows x 2 columns]" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "chrom_sizes" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "gtf_df = dataframe('/home/cmb-panasas2/skchoudh/genomes/Mmul8/annotation/Macaca_mulatta.Mmul_8.0.1.96.gtf')" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/home/cmb-06/as/skchoudh/software_frozen/anaconda37/envs/riboraptor/lib/python3.6/site-packages/ipykernel_launcher.py:2: FutureWarning: read_table is deprecated, use read_csv instead, passing sep='\\t'.\n", " \n" ] } ], "source": [ "chrom_sizes = pd.read_table('/home/cmb-06/as/skchoudh/genomes/Mmul8/fasta/Macaca_mulatta.Mmul_8.0.1.dna.toplevel.sizes',\n", " names=['chrom', 'size'])\n", "chrom_sizes = dict(chrom_sizes.values.tolist())" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "def is_within_chrom_size(chrom, end):\n", " is_true = int(int(chrom_sizes[str(chrom)]) >= int(end))\n", " #print(chrom, end,int(chrom_sizes[str(chrom)]), is_true)\n", " return is_true\n", "\n", "gtf_df['is_within_chrom_size'] = gtf_df.apply(lambda row: is_within_chrom_size(row.seqname, row.end), axis=1)" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
seqnamesourcefeaturestartendscorestrandframegene_idgene_version...transcript_versiontranscript_nametranscript_sourcetranscript_biotypeexon_numberexon_idexon_versionprotein_idprotein_versionis_within_chrom_size
1164394JSUE03241461.1ensemblgene8611038None-NoneENSMMUG000000445181...NoneNoneNoneNoneNoneNoneNoneNoneNone0
1164395JSUE03241461.1ensembltranscript8611038None-NoneENSMMUG000000445181...1RF01241-201ensemblsnoRNANoneNoneNoneNoneNone0
1164396JSUE03241461.1ensemblexon8611038None-NoneENSMMUG000000445181...1RF01241-201ensemblsnoRNA1ENSMMUE000003919841NoneNone0
1164453JSUE03318976.1ensemblgene9161039None+NoneENSMMUG000000485521...NoneNoneNoneNoneNoneNoneNoneNoneNone0
1164454JSUE03318976.1ensembltranscript9161039None+NoneENSMMUG000000485521...1RF00003-201ensemblsnRNANoneNoneNoneNoneNone0
1164455JSUE03318976.1ensemblexon9161039None+NoneENSMMUG000000485521...1RF00003-201ensemblsnRNA1ENSMMUE000003845621NoneNone0
1164628JSUE03158362.1ensemblgene908995None+NoneENSMMUG000000464521...NoneNoneNoneNoneNoneNoneNoneNoneNone0
1164629JSUE03158362.1ensembltranscript908995None+NoneENSMMUG000000464521...1RF01518-201ensemblmisc_RNANoneNoneNoneNoneNone0
1164630JSUE03158362.1ensemblexon908995None+NoneENSMMUG000000464521...1RF01518-201ensemblmisc_RNA1ENSMMUE000003829891NoneNone0
1165906JSUE03156690.1ensemblgene546751None+NoneENSMMUG000000450551...NoneNoneNoneNoneNoneNoneNoneNoneNone0
1165907JSUE03156690.1ensembltranscript546751None+NoneENSMMUG000000450551...1RF00100-201ensemblmisc_RNANoneNoneNoneNoneNone0
1165908JSUE03156690.1ensemblexon546751None+NoneENSMMUG000000450551...1RF00100-201ensemblmisc_RNA1ENSMMUE000004066631NoneNone0
1166392JSUE03144147.1ensemblgene207526None+NoneENSMMUG000000390971...NoneNoneNoneNoneNoneNoneNoneNoneNone0
1166393JSUE03144147.1ensembltranscript207526None+NoneENSMMUG000000390971...1RF02104-201ensemblmisc_RNANoneNoneNoneNoneNone0
1166394JSUE03144147.1ensemblexon207526None+NoneENSMMUG000000390971...1RF02104-201ensemblmisc_RNA1ENSMMUE000003920751NoneNone0
\n", "

15 rows × 24 columns

\n", "
" ], "text/plain": [ " seqname source feature start end score strand frame \\\n", "1164394 JSUE03241461.1 ensembl gene 861 1038 None - None \n", "1164395 JSUE03241461.1 ensembl transcript 861 1038 None - None \n", "1164396 JSUE03241461.1 ensembl exon 861 1038 None - None \n", "1164453 JSUE03318976.1 ensembl gene 916 1039 None + None \n", "1164454 JSUE03318976.1 ensembl transcript 916 1039 None + None \n", "1164455 JSUE03318976.1 ensembl exon 916 1039 None + None \n", "1164628 JSUE03158362.1 ensembl gene 908 995 None + None \n", "1164629 JSUE03158362.1 ensembl transcript 908 995 None + None \n", "1164630 JSUE03158362.1 ensembl exon 908 995 None + None \n", "1165906 JSUE03156690.1 ensembl gene 546 751 None + None \n", "1165907 JSUE03156690.1 ensembl transcript 546 751 None + None \n", "1165908 JSUE03156690.1 ensembl exon 546 751 None + None \n", "1166392 JSUE03144147.1 ensembl gene 207 526 None + None \n", "1166393 JSUE03144147.1 ensembl transcript 207 526 None + None \n", "1166394 JSUE03144147.1 ensembl exon 207 526 None + None \n", "\n", " gene_id gene_version ... transcript_version \\\n", "1164394 ENSMMUG00000044518 1 ... None \n", "1164395 ENSMMUG00000044518 1 ... 1 \n", "1164396 ENSMMUG00000044518 1 ... 1 \n", "1164453 ENSMMUG00000048552 1 ... None \n", "1164454 ENSMMUG00000048552 1 ... 1 \n", "1164455 ENSMMUG00000048552 1 ... 1 \n", "1164628 ENSMMUG00000046452 1 ... None \n", "1164629 ENSMMUG00000046452 1 ... 1 \n", "1164630 ENSMMUG00000046452 1 ... 1 \n", "1165906 ENSMMUG00000045055 1 ... None \n", "1165907 ENSMMUG00000045055 1 ... 1 \n", "1165908 ENSMMUG00000045055 1 ... 1 \n", "1166392 ENSMMUG00000039097 1 ... None \n", "1166393 ENSMMUG00000039097 1 ... 1 \n", "1166394 ENSMMUG00000039097 1 ... 1 \n", "\n", " transcript_name transcript_source transcript_biotype exon_number \\\n", "1164394 None None None None \n", "1164395 RF01241-201 ensembl snoRNA None \n", "1164396 RF01241-201 ensembl snoRNA 1 \n", "1164453 None None None None \n", "1164454 RF00003-201 ensembl snRNA None \n", "1164455 RF00003-201 ensembl snRNA 1 \n", "1164628 None None None None \n", "1164629 RF01518-201 ensembl misc_RNA None \n", "1164630 RF01518-201 ensembl misc_RNA 1 \n", "1165906 None None None None \n", "1165907 RF00100-201 ensembl misc_RNA None \n", "1165908 RF00100-201 ensembl misc_RNA 1 \n", "1166392 None None None None \n", "1166393 RF02104-201 ensembl misc_RNA None \n", "1166394 RF02104-201 ensembl misc_RNA 1 \n", "\n", " exon_id exon_version protein_id protein_version \\\n", "1164394 None None None None \n", "1164395 None None None None \n", "1164396 ENSMMUE00000391984 1 None None \n", "1164453 None None None None \n", "1164454 None None None None \n", "1164455 ENSMMUE00000384562 1 None None \n", "1164628 None None None None \n", "1164629 None None None None \n", "1164630 ENSMMUE00000382989 1 None None \n", "1165906 None None None None \n", "1165907 None None None None \n", "1165908 ENSMMUE00000406663 1 None None \n", "1166392 None None None None \n", "1166393 None None None None \n", "1166394 ENSMMUE00000392075 1 None None \n", "\n", " is_within_chrom_size \n", "1164394 0 \n", "1164395 0 \n", "1164396 0 \n", "1164453 0 \n", "1164454 0 \n", "1164455 0 \n", "1164628 0 \n", "1164629 0 \n", "1164630 0 \n", "1165906 0 \n", "1165907 0 \n", "1165908 0 \n", "1166392 0 \n", "1166393 0 \n", "1166394 0 \n", "\n", "[15 rows x 24 columns]" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "gtf_df[gtf_df['is_within_chrom_size']!=1]" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [], "source": [ "db = gffutils.FeatureDB(gtf_db, keep_order=True)\n" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [], "source": [ "with open('/home/cmb-06/as/skchoudh/genomes/Mmul8/annotation/Macaca_mulatta.Mmul_8.0.1.96_fixed.gtf', 'w') as fout:\n", " for feature in db.all_features():\n", " max_size = chrom_sizes[feature.chrom]\n", " # If end is longer than the chromosome\n", " if feature.end > max_size:\n", " feature.end = max_size\n", " fout.write(str(feature) + '\\n')\n", " " ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python [conda env:riboraptor]", "language": "python", "name": "conda-env-riboraptor-py" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.7" } }, "nbformat": 4, "nbformat_minor": 2 }