{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": { "Collapsed": "false" }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/home/cmb-06/as/skchoudh/software_frozen/anaconda37/envs/riboraptor/lib/python3.7/site-packages/ipykernel_launcher.py:11: DeprecationWarning: Using or importing the ABCs from 'collections' instead of from 'collections.abc' is deprecated, and in 3.8 it will stop working\n", " # This is added back by InteractiveShellApp.init_path()\n" ] } ], "source": [ "from collections import defaultdict, OrderedDict\n", "import warnings\n", "import gffutils\n", "import pybedtools\n", "import pandas as pd\n", "import copy\n", "import os\n", "import re\n", "from gffutils.pybedtools_integration import tsses\n", "from copy import deepcopy\n", "from collections import OrderedDict, Callable\n", "import errno\n", "\n", "def mkdir_p(path):\n", " try:\n", " os.makedirs(path)\n", " except OSError as exc: # Python >2.5\n", " if exc.errno == errno.EEXIST and os.path.isdir(path):\n", " pass\n", " else:\n", " raise\n", " \n", "class DefaultOrderedDict(OrderedDict):\n", " # Source: http://stackoverflow.com/a/6190500/562769\n", " def __init__(self, default_factory=None, *a, **kw):\n", " if (default_factory is not None and\n", " not isinstance(default_factory, Callable)):\n", " raise TypeError('first argument must be callable')\n", " OrderedDict.__init__(self, *a, **kw)\n", " self.default_factory = default_factory\n", "\n", " def __getitem__(self, key):\n", " try:\n", " return OrderedDict.__getitem__(self, key)\n", " except KeyError:\n", " return self.__missing__(key)\n", "\n", " def __missing__(self, key):\n", " if self.default_factory is None:\n", " raise KeyError(key)\n", " self[key] = value = self.default_factory()\n", " return value\n", "\n", " def __reduce__(self):\n", " if self.default_factory is None:\n", " args = tuple()\n", " else:\n", " args = self.default_factory,\n", " return type(self), args, None, None, self.items()\n", "\n", " def copy(self):\n", " return self.__copy__()\n", "\n", " def __copy__(self):\n", " return type(self)(self.default_factory, self)\n", "\n", " def __deepcopy__(self, memo):\n", " import copy\n", " return type(self)(self.default_factory,\n", " copy.deepcopy(self.items()))\n", "\n", " def __repr__(self):\n", " return 'OrderedDefaultDict(%s, %s)' % (self.default_factory,\n", " OrderedDict.__repr__(self))\n" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "Collapsed": "false" }, "outputs": [], "source": [ "gtf = '/panfs/qcb-panasas/skchoudh/genomes/hg38/annotation/Homo_sapiens.GRCh38.96.gtf'\n", "gtf_db = '/panfs/qcb-panasas/skchoudh/genomes/hg38/annotation/Homo_sapiens.GRCh38.96.gtf.db'\n", "prefix = '/panfs/qcb-panasas/skchoudh/github_projects/riboraptor/riboraptor/annotation/hg38/v96/'\n", "chrsizes = '/panfs/qcb-panasas/skchoudh/genomes/hg38/fasta/hg38.chrom.sizes'\n", "mkdir_p(prefix)" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "Collapsed": "false" }, "outputs": [], "source": [ "def create_gene_dict(db):\n", " '''\n", " Store each feature line db.all_features() as a dict of dicts\n", " '''\n", " gene_dict = DefaultOrderedDict(lambda: DefaultOrderedDict(lambda: DefaultOrderedDict(list)))\n", " for line_no, feature in enumerate(db.all_features()):\n", " gene_ids = feature.attributes['gene_id']\n", " feature_type = feature.featuretype\n", " if feature_type == 'gene':\n", " if len(gene_ids)!=1:\n", " logging.warning('Found multiple gene_ids on line {} in gtf'.format(line_no))\n", " 
break\n", " else:\n", " gene_id = gene_ids[0]\n", " gene_dict[gene_id]['gene'] = feature\n", " else:\n", " transcript_ids = feature.attributes['transcript_id']\n", "\n", " for gene_id in gene_ids:\n", " for transcript_id in transcript_ids:\n", " gene_dict[gene_id][transcript_id][feature_type].append(feature)\n", " return gene_dict" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "Collapsed": "false" }, "outputs": [], "source": [ "db = gffutils.create_db(gtf, dbfn=gtf_db, \n", " merge_strategy='merge', \n", " force=True, \n", " disable_infer_transcripts=True,\n", " disable_infer_genes=True)" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "Collapsed": "false" }, "outputs": [], "source": [ "db = gffutils.FeatureDB(gtf_db, keep_order=True)\n", "gene_dict = create_gene_dict(db)\n" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "Collapsed": "false" }, "outputs": [], "source": [ "def get_gene_list(gene_dict):\n", " return list(set(gene_dict.keys()))\n", "\n", "def get_UTR_regions(utrs, cds):\n", " if len(cds)==0:\n", " return [], []\n", " utr5_regions = []\n", " utr3_regions = [] \n", " cds_sorted = sorted(list(cds), key=lambda x: x.start)\n", " first_cds = cds_sorted[0]\n", " last_cds = cds_sorted[-1]\n", " for orig_utr in utrs:\n", " utr = deepcopy(orig_utr)\n", " ## Push all cds at once\n", " ## Sort later to remove duplicates\n", " strand = utr.strand\n", " if utr.start < first_cds.start:\n", " if utr.stop >= first_cds.start:\n", " utr.stop = first_cds.start - 1\n", " if strand == '+':\n", " utr5_regions.append(utr)\n", " else:\n", " utr3_regions.append(utr)\n", " elif utr.stop > last_cds.stop:\n", " if utr.start <= last_cds.stop:\n", " utr.start = last_cds.stop + 1\n", " if strand == '+':\n", " utr3_regions.append(utr)\n", " else:\n", " utr5_regions.append(utr)\n", " \n", " return utr5_regions, utr3_regions\n", " \n", "def create_bed(regions, bedtype='0'):\n", " '''Create bed from list of regions\n", " bedtype: 0 or 1\n", " 0-Based or 1-based coordinate of the BED\n", " '''\n", " bedstr = ''\n", " for region in regions:\n", " assert len(region.attributes['gene_id']) == 1\n", " ## GTF start is 1-based, so shift by one while writing \n", " ## to 0-based BED format\n", " if bedtype == '0':\n", " start = region.start - 1\n", " else:\n", " start = region.start\n", " bedstr += '{}\\t{}\\t{}\\t{}\\t{}\\t{}\\n'.format(region.chrom,\n", " start,\n", " region.stop,\n", " re.sub('\\.\\d+', '', region.attributes['gene_id'][0]),\n", " '.',\n", " region.strand)\n", " return bedstr\n", "\n", "def rename_regions(regions, gene_id):\n", " regions = list(regions)\n", " if len(regions) == 0:\n", " return []\n", " for region in regions:\n", " region.attributes['gene_id'] = gene_id\n", " return regions\n", "\n", "def merge_regions(db, regions):\n", " if len(regions) == 0:\n", " return []\n", " merged = db.merge(sorted(list(regions), key=lambda x: x.start))\n", " return merged\n", "\n", "def merge_regions_nostrand(db, regions):\n", " if len(regions) == 0:\n", " return []\n", " merged = db.merge(sorted(list(regions), key=lambda x: x.start), ignore_strand=True)\n", " return merged" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "Collapsed": "false" }, "outputs": [], "source": [ "utr5_bed = ''\n", "utr3_bed = ''\n", "gene_bed = ''\n", "exon_bed = ''\n", "intron_bed = ''\n", "start_codon_bed = ''\n", "stop_codon_bed = ''\n", "cds_bed = ''\n", "\n", "gene_list = []\n", "\n", "for gene_id in get_gene_list(gene_dict):\n", " 
gene_list.append(gene_dict[gene_id]['gene'])\n",
"\n",
"    utr5_regions, utr3_regions = [], []\n",
"    exon_regions, intron_regions = [], []\n",
"    start_codon_regions, stop_codon_regions = [], []\n",
"    cds_regions = []\n",
"    utr_regions = []\n",
"    for feature in gene_dict[gene_id].keys():\n",
"        if feature == 'gene':\n",
"            continue\n",
"        cds = list(gene_dict[gene_id][feature]['CDS'])\n",
"        exons = list(gene_dict[gene_id][feature]['exon'])\n",
"        utrs = list(gene_dict[gene_id][feature]['UTR'])\n",
"\n",
"        cds = sorted(list(cds), key=lambda x: x.start)\n",
"        exons = sorted(list(exons), key=lambda x: x.start)\n",
"        utrs = sorted(list(utrs), key=lambda x: x.start)\n",
"\n",
"        merged_exons = merge_regions(db, exons)\n",
"        introns = db.interfeatures(merged_exons)\n",
"\n",
"        exon_regions += exons\n",
"        intron_regions += introns\n",
"        cds_regions += cds\n",
"        utr_regions += utrs\n",
"\n",
"    cds_regions = sorted(list(cds_regions), key=lambda x: x.start)\n",
"    utr_regions = sorted(list(utr_regions), key=lambda x: x.start)\n",
"\n",
"    utr5_regions, utr3_regions = get_UTR_regions(utr_regions, cds_regions)\n",
"\n",
"    merged_utr5 = merge_regions(db, utr5_regions)\n",
"    renamed_utr5 = rename_regions(merged_utr5, gene_id)\n",
"\n",
"    merged_utr3 = merge_regions(db, utr3_regions)\n",
"    renamed_utr3 = rename_regions(merged_utr3, gene_id)\n",
"\n",
"    merged_exons = merge_regions(db, exon_regions)\n",
"    renamed_exons = rename_regions(merged_exons, gene_id)\n",
"\n",
"    merged_introns = merge_regions(db, intron_regions)\n",
"    renamed_introns = rename_regions(merged_introns, gene_id)\n",
"\n",
"    merged_cds = merge_regions(db, cds_regions)\n",
"    renamed_cds = rename_regions(merged_cds, gene_id)\n",
"\n",
"    utr3_bed += create_bed(renamed_utr3)\n",
"    utr5_bed += create_bed(renamed_utr5)\n",
"    exon_bed += create_bed(renamed_exons)\n",
"    intron_bed += create_bed(renamed_introns)\n",
"    cds_bed += create_bed(renamed_cds)\n",
"\n",
"\n",
"gene_bed = create_bed(gene_list)\n",
"gene_bedtool = pybedtools.BedTool(gene_bed, from_string=True)\n",
"utr5_bedtool = pybedtools.BedTool(utr5_bed, from_string=True)\n",
"utr3_bedtool = pybedtools.BedTool(utr3_bed, from_string=True)\n",
"exon_bedtool = pybedtools.BedTool(exon_bed, from_string=True)\n",
"intron_bedtool = pybedtools.BedTool(intron_bed, from_string=True)\n",
"cds_bedtool = pybedtools.BedTool(cds_bed, from_string=True)\n",
"\n",
"utr5_cds_subtracted = utr5_bedtool.subtract(cds_bedtool)\n",
"utr3_cds_subtracted = utr3_bedtool.subtract(cds_bedtool)"
] }, { "cell_type": "code", "execution_count": null, "metadata": { "Collapsed": "false" }, "outputs": [], "source": [
"utr5_cds_subtracted.remove_invalid().sort().saveas(os.path.join(prefix, 'utr5.bed.gz'))\n",
"utr3_cds_subtracted.remove_invalid().sort().saveas(os.path.join(prefix, 'utr3.bed.gz'))\n",
"gene_bedtool.remove_invalid().sort().saveas(os.path.join(prefix, 'gene.bed.gz'))\n",
"exon_bedtool.remove_invalid().sort().saveas(os.path.join(prefix, 'exon.bed.gz'))\n",
"intron_bedtool.remove_invalid().sort().saveas(os.path.join(prefix, 'intron.bed.gz'))\n",
"cds_bedtool.remove_invalid().sort().saveas(os.path.join(prefix, 'cds.bed.gz'))"
] }, { "cell_type": "code", "execution_count": null, "metadata": { "Collapsed": "false" }, "outputs": [], "source": [
"for gene_id in get_gene_list(gene_dict):\n",
"    start_codons = []\n",
"    stop_codons = []\n",
"    for start_codon in db.children(gene_id, featuretype='start_codon'):\n",
"        ## Collapse the start codon to a single base (1-based coordinates);\n",
"        ## the shift to a 0-based BED start happens in create_bed\n",
"        start_codon.stop = 
start_codon.start\n", " start_codons.append(start_codon)\n", " for stop_codon in db.children(gene_id, featuretype='stop_codon'):\n", " stop_codon.start = stop_codon.stop\n", " stop_codon.stop = stop_codon.stop+1\n", " stop_codons.append(stop_codon)\n", " merged_start_codons = merge_regions(db, start_codons)\n", " renamed_start_codons = rename_regions(merged_start_codons, gene_id)\n", " merged_stop_codons = merge_regions(db, stop_codons)\n", " renamed_stop_codons = rename_regions(merged_stop_codons, gene_id)\n", " \n", " start_codon_bed += create_bed(renamed_start_codons) \n", " stop_codon_bed += create_bed(renamed_stop_codons)\n", "\n", " \n", "start_codon_bedtool = pybedtools.BedTool(start_codon_bed, from_string=True)\n", "stop_codon_bedtool = pybedtools.BedTool(stop_codon_bed, from_string=True)\n", "start_codon_bedtool.remove_invalid().sort().saveas(os.path.join(prefix, 'start_codon.bed.gz'))\n", "stop_codon_bedtool.remove_invalid().sort().saveas(os.path.join(prefix, 'stop_codon.bed.gz'))" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "Collapsed": "false" }, "outputs": [], "source": [ "stop_codon_bedtool" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "Collapsed": "false" }, "outputs": [ { "data": { "text/plain": [ "" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tss = tsses(db, as_bed6=True, merge_overlapping=True)\n", "tss.remove_invalid().sort().saveas(os.path.join(prefix, 'tss.bed'))\n" ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "Collapsed": "false" }, "outputs": [ { "data": { "text/plain": [ "" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "promoter = tss.slop(l=1500, r=1500, s=True, g=chrsizes)\n", "promoter.remove_invalid().sort().saveas(os.path.join(prefix, 'promoter.1500.bed.gz'))" ] }, { "cell_type": "code", "execution_count": 12, "metadata": { "Collapsed": "false" }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/home/cmb-06/as/skchoudh/software_frozen/anaconda37/envs/riboraptor/lib/python3.7/site-packages/IPython/core/interactiveshell.py:3249: DtypeWarning: Columns (0) have mixed types. Specify dtype option on import or set low_memory=False.\n", " if (await self.run_code(code, result, async_=asy)):\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
chromstartendnamescorestrand
011036813369ENSG00000223972.+
111050913510ENSG00000223972.+
211593518936ENSG00000278267.-
312805331054ENSG00000243485.+
412806931070ENSG00000227232.-
\n", "
" ], "text/plain": [ " chrom start end name score strand\n", "0 1 10368 13369 ENSG00000223972 . +\n", "1 1 10509 13510 ENSG00000223972 . +\n", "2 1 15935 18936 ENSG00000278267 . -\n", "3 1 28053 31054 ENSG00000243485 . +\n", "4 1 28069 31070 ENSG00000227232 . -" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "promoter.to_dataframe().head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "Collapsed": "false" }, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python [conda env:genomicutils]", "language": "python", "name": "conda-env-genomicutils-py" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.6" } }, "nbformat": 4, "nbformat_minor": 4 }