{ "cells": [ { "cell_type": "code", "execution_count": 51, "metadata": {}, "outputs": [], "source": [ "from collections import defaultdict, OrderedDict\n", "import warnings\n", "import gffutils\n", "import pybedtools\n", "import pandas as pd\n", "import copy\n", "import os\n", "import re\n", "from gffutils.pybedtools_integration import tsses\n", "from copy import deepcopy\n", "from collections import OrderedDict, Callable\n", "import errno\n", "\n", "def mkdir_p(path):\n", " try:\n", " os.makedirs(path)\n", " except OSError as exc: # Python >2.5\n", " if exc.errno == errno.EEXIST and os.path.isdir(path):\n", " pass\n", " else:\n", " raise\n", " \n", "class DefaultOrderedDict(OrderedDict):\n", " # Source: http://stackoverflow.com/a/6190500/562769\n", " def __init__(self, default_factory=None, *a, **kw):\n", " if (default_factory is not None and\n", " not isinstance(default_factory, Callable)):\n", " raise TypeError('first argument must be callable')\n", " OrderedDict.__init__(self, *a, **kw)\n", " self.default_factory = default_factory\n", "\n", " def __getitem__(self, key):\n", " try:\n", " return OrderedDict.__getitem__(self, key)\n", " except KeyError:\n", " return self.__missing__(key)\n", "\n", " def __missing__(self, key):\n", " if self.default_factory is None:\n", " raise KeyError(key)\n", " self[key] = value = self.default_factory()\n", " return value\n", "\n", " def __reduce__(self):\n", " if self.default_factory is None:\n", " args = tuple()\n", " else:\n", " args = self.default_factory,\n", " return type(self), args, None, None, self.items()\n", "\n", " def copy(self):\n", " return self.__copy__()\n", "\n", " def __copy__(self):\n", " return type(self)(self.default_factory, self)\n", "\n", " def __deepcopy__(self, memo):\n", " import copy\n", " return type(self)(self.default_factory,\n", " copy.deepcopy(self.items()))\n", "\n", " def __repr__(self):\n", " return 'OrderedDefaultDict(%s, %s)' % (self.default_factory,\n", " OrderedDict.__repr__(self))\n" ] }, { "cell_type": "raw", "metadata": {}, "source": [ "gtf = '/home/cmb-panasas2/skchoudh/genomes/panTro3/annotation/Pan_troglodytes.Pan_tro_3.0.94.gtf'\n", "gtf_db = '/home/cmb-panasas2/skchoudh/genomes/panTro3/annotation/Pan_troglodytes.Pan_tro_3.0.94.gtf.db'\n", "prefix = '/home/cmb-panasas2/skchoudh/github_projects/riboraptor/riboraptor/annotation/panTro3/v94/'\n", "chrsizes = '/home/cmb-panasas2/skchoudh/genomes/panTro3/fasta/Pan_troglodytes.Pan_tro_3.0.dna.toplevel.sizes'\n", "mkdir_p(prefix)" ] }, { "cell_type": "code", "execution_count": 52, "metadata": {}, "outputs": [], "source": [ "gtf = '/home/cmb-panasas2/skchoudh/genomes/panTro3/annotation/Pan_troglodytes.Pan_tro_3.0.96.gtf'\n", "gtf_db = '/home/cmb-panasas2/skchoudh/genomes/panTro3/annotation/Pan_troglodytes.Pan_tro_3.0.96.gtf.db'\n", "prefix = '/home/cmb-panasas2/skchoudh/github_projects/riboraptor/riboraptor/annotation/panTro3/v96/'\n", "chrsizes = '/home/cmb-panasas2/skchoudh/genomes/panTro3/fasta/Pan_troglodytes.Pan_tro_3.0.dna.toplevel.sizes'\n", "mkdir_p(prefix)" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "def create_gene_dict(db):\n", " '''\n", " Store each feature line db.all_features() as a dict of dicts\n", " '''\n", " gene_dict = DefaultOrderedDict(lambda: DefaultOrderedDict(lambda: DefaultOrderedDict(list)))\n", " for line_no, feature in enumerate(db.all_features()):\n", " gene_ids = feature.attributes['gene_id']\n", " feature_type = feature.featuretype\n", " if feature_type == 'gene':\n", " if 
len(gene_ids)!=1:\n", "                # warn via the warnings module imported above (logging was never imported)\n", "                warnings.warn('Found multiple gene_ids on line {} in gtf'.format(line_no))\n", "                break\n", "            else:\n", "                gene_id = gene_ids[0]\n", "                gene_dict[gene_id]['gene'] = feature\n", "        else:\n", "            transcript_ids = feature.attributes['transcript_id']\n", "\n", "            for gene_id in gene_ids:\n", "                for transcript_id in transcript_ids:\n", "                    gene_dict[gene_id][transcript_id][feature_type].append(feature)\n", "    return gene_dict" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "db = gffutils.create_db(gtf, dbfn=gtf_db, merge_strategy='merge', force=True, disable_infer_transcripts=True, disable_infer_genes=True)" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "db = gffutils.FeatureDB(gtf_db, keep_order=True)\n", "gene_dict = create_gene_dict(db)" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CDS\n", "exon\n", "five_prime_utr\n", "gene\n", "start_codon\n", "stop_codon\n", "three_prime_utr\n", "transcript\n" ] } ], "source": [ "for x in db.featuretypes():\n", "    print(x)" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "def get_gene_list(gene_dict):\n", "    return list(set(gene_dict.keys()))\n", "\n", "def get_UTR_regions(gene_dict, gene_id, transcript, cds):\n", "    if len(cds)==0:\n", "        return [], []\n", "    utr5_regions = []\n", "    utr3_regions = []\n", "    utrs = gene_dict[gene_id][transcript]['UTR']\n", "    first_cds = cds[0]\n", "    last_cds = cds[-1]\n", "    for utr in utrs:\n", "        ## Push all cds at once\n", "        ## Sort later to remove duplicates\n", "        strand = utr.strand\n", "        if strand == '+':\n", "            if utr.stop < first_cds.start:\n", "                utr.feature_type = 'five_prime_UTR'\n", "                utr5_regions.append(utr)\n", "            elif utr.start > last_cds.stop:\n", "                utr.feature_type = 'three_prime_UTR'\n", "                utr3_regions.append(utr)\n", "            else:\n", "                raise RuntimeError('Error with cds')\n", "        elif strand == '-':\n", "            if utr.stop < first_cds.start:\n", "                utr.feature_type = 'three_prime_UTR'\n", "                utr3_regions.append(utr)\n", "            elif utr.start > last_cds.stop:\n", "                utr.feature_type = 'five_prime_UTR'\n", "                utr5_regions.append(utr)\n", "            else:\n", "                raise RuntimeError('Error with cds')\n", "    return utr5_regions, utr3_regions\n", "\n", "def create_bed(regions, bedtype='0'):\n", "    '''Create bed from list of regions\n", "    bedtype: 0 or 1\n", "        0-Based or 1-based coordinate of the BED\n", "    '''\n", "    bedstr = ''\n", "    for region in regions:\n", "        assert len(region.attributes['gene_id']) == 1\n", "        ## GTF start is 1-based, so shift by one while writing\n", "        ## to 0-based BED format\n", "        if bedtype == '0':\n", "            start = region.start - 1\n", "        else:\n", "            start = region.start\n", "        bedstr += '{}\\t{}\\t{}\\t{}\\t{}\\t{}\\n'.format(region.chrom,\n", "                                                      start,\n", "                                                      region.stop,\n", "                                                      re.sub('\\.\\d+', '', region.attributes['gene_id'][0]),\n", "                                                      '.',\n", "                                                      region.strand)\n", "    return bedstr\n", "\n", "def rename_regions(regions, gene_id):\n", "    regions = list(regions)\n", "    if len(regions) == 0:\n", "        return []\n", "    for region in regions:\n", "        region.attributes['gene_id'] = gene_id\n", "    return regions\n", "\n", "def merge_regions(db, regions):\n", "    if len(regions) == 0:\n", "        return []\n", "    merged = db.merge(sorted(list(regions), key=lambda x: x.start))\n", "    return merged\n", "\n", "def merge_regions_nostrand(db, regions):\n", "    if len(regions) == 0:\n", "        return []\n", "    merged = 
db.merge(sorted(list(regions), key=lambda x: x.start), ignore_strand=True)\n", " return merged" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "utr5_bed = ''\n", "utr3_bed = ''\n", "gene_bed = ''\n", "exon_bed = ''\n", "intron_bed = ''\n", "start_codon_bed = ''\n", "stop_codon_bed = ''\n", "cds_bed = ''\n", "\n", "gene_list = []\n", "\n", "for gene_id in get_gene_list(gene_dict):\n", " gene_list.append(gene_dict[gene_id]['gene'])\n", " \n", " utr5_regions, utr3_regions = [], []\n", " exon_regions, intron_regions = [], []\n", " star_codon_regions, stop_codon_regions = [], []\n", " cds_regions = []\n", " \n", " for feature in gene_dict[gene_id].keys():\n", " if feature == 'gene':\n", " continue\n", " cds = list(gene_dict[gene_id][feature]['CDS'])\n", " exons = list(gene_dict[gene_id][feature]['exon'])\n", " merged_exons = merge_regions(db, exons)\n", " introns = db.interfeatures(merged_exons)\n", " #utr5_region, utr3_region = get_UTR_regions(gene_dict, gene_id, feature, cds)\n", " utr5_region = list(gene_dict[gene_id][feature]['five_prime_utr'])\n", " utr3_region = list(gene_dict[gene_id][feature]['three_prime_utr'])\n", " utr5_regions += utr5_region\n", " utr3_regions += utr3_region\n", " exon_regions += exons\n", " intron_regions += introns\n", " cds_regions += cds\n", " \n", " merged_utr5 = merge_regions(db, utr5_regions)\n", " renamed_utr5 = rename_regions(merged_utr5, gene_id)\n", " \n", " merged_utr3 = merge_regions(db, utr3_regions)\n", " renamed_utr3 = rename_regions(merged_utr3, gene_id)\n", " \n", " merged_exons = merge_regions(db, exon_regions)\n", " renamed_exons = rename_regions(merged_exons, gene_id)\n", " \n", " merged_introns = merge_regions(db, intron_regions)\n", " renamed_introns = rename_regions(merged_introns, gene_id)\n", " \n", " merged_cds = merge_regions(db, cds_regions)\n", " renamed_cds = rename_regions(merged_cds, gene_id)\n", " \n", " utr3_bed += create_bed(renamed_utr3)\n", " utr5_bed += create_bed(renamed_utr5)\n", " exon_bed += create_bed(renamed_exons)\n", " intron_bed += create_bed(renamed_introns)\n", " cds_bed += create_bed(renamed_cds)\n", " \n", " \n", "gene_bed = create_bed(gene_list)\n", "gene_bedtool = pybedtools.BedTool(gene_bed, from_string=True)\n", "utr5_bedtool = pybedtools.BedTool(utr5_bed, from_string=True)\n", "utr3_bedtool = pybedtools.BedTool(utr3_bed, from_string=True)\n", "exon_bedtool = pybedtools.BedTool(exon_bed, from_string=True)\n", "intron_bedtool = pybedtools.BedTool(intron_bed, from_string=True)\n", "cds_bedtool = pybedtools.BedTool(cds_bed, from_string=True)\n", "\n", "utr5_cds_subtracted = utr5_bedtool.subtract(cds_bedtool)\n", "utr3_cds_subtracted = utr3_bedtool.subtract(cds_bedtool)\n", "utr5_cds_subtracted.remove_invalid().sort().saveas(os.path.join(prefix, 'utr5.bed.gz'))\n", "utr3_cds_subtracted.remove_invalid().sort().saveas(os.path.join(prefix, 'utr3.bed.gz'))\n", "gene_bedtool.remove_invalid().sort().saveas(os.path.join(prefix, 'gene.bed.gz'))\n", "exon_bedtool.remove_invalid().sort().saveas(os.path.join(prefix, 'exon.bed.gz'))\n", "intron_bedtool.remove_invalid().sort().saveas(os.path.join(prefix, 'intron.bed.gz'))\n", "cds_bedtool.remove_invalid().sort().saveas(os.path.join(prefix, 'cds.bed.gz'))" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "" ] }, "execution_count": 9, "metadata": {}, "output_type": 
"execute_result" } ], "source": [ "for gene_id in get_gene_list(gene_dict):\n", " start_codons = []\n", " stop_codons = []\n", " for start_codon in db.children(gene_id, featuretype='start_codon'):\n", " ## 1 -based stop\n", " ## 0-based start handled while converting to bed\n", " start_codon.stop = start_codon.start\n", " start_codons.append(start_codon)\n", " for stop_codon in db.children(gene_id, featuretype='stop_codon'):\n", " stop_codon.start = stop_codon.stop\n", " stop_codon.stop = stop_codon.stop+1\n", " stop_codons.append(stop_codon)\n", " merged_start_codons = merge_regions(db, start_codons)\n", " renamed_start_codons = rename_regions(merged_start_codons, gene_id)\n", " merged_stop_codons = merge_regions(db, stop_codons)\n", " renamed_stop_codons = rename_regions(merged_stop_codons, gene_id)\n", " \n", " start_codon_bed += create_bed(renamed_start_codons) \n", " stop_codon_bed += create_bed(renamed_stop_codons)\n", "\n", " \n", "start_codon_bedtool = pybedtools.BedTool(start_codon_bed, from_string=True)\n", "stop_codon_bedtool = pybedtools.BedTool(stop_codon_bed, from_string=True)\n", "start_codon_bedtool.remove_invalid().sort().saveas(os.path.join(prefix, 'start_codon.bed.gz'))\n", "stop_codon_bedtool.remove_invalid().sort().saveas(os.path.join(prefix, 'stop_codon.bed.gz'))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Check if gtf violates fasta" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/home/cmb-06/as/skchoudh/software_frozen/anaconda37/envs/riboraptor/lib/python3.6/site-packages/ipykernel_launcher.py:4: FutureWarning: read_table is deprecated, use read_csv instead, passing sep='\\t'.\n", " after removing the cwd from sys.path.\n" ] } ], "source": [ "import pyfaidx\n", "import pandas as pd\n", "fasta = pyfaidx.Fasta('/home/cmb-06/as/skchoudh/genomes/panTro3/fasta/Pan_troglodytes.Pan_tro_3.0.dna.toplevel.fa')\n", "chrom_sizes = pd.read_table('/home/cmb-06/as/skchoudh/genomes/panTro3/fasta/Pan_troglodytes.Pan_tro_3.0.dna.toplevel.sizes', names=['chrom', 'size'])" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": [ "from collections import defaultdict\n", "import gzip\n", "import pandas as pd\n", "import re\n", "\n", "\n", "GTF_HEADER = ['seqname', 'source', 'feature', 'start', 'end', 'score',\n", " 'strand', 'frame']\n", "R_SEMICOLON = re.compile(r'\\s*;\\s*')\n", "R_COMMA = re.compile(r'\\s*,\\s*')\n", "R_KEYVALUE = re.compile(r'(\\s+|\\s*=\\s*)')\n", "\n", "\n", "def dataframe(filename):\n", " \"\"\"Open an optionally gzipped GTF file and return a pandas.DataFrame.\n", " \"\"\"\n", " # Each column is a list stored as a value in this dict.\n", " result = defaultdict(list)\n", "\n", " for i, line in enumerate(lines(filename)):\n", " for key in line.keys():\n", " # This key has not been seen yet, so set it to None for all\n", " # previous lines.\n", " if key not in result:\n", " result[key] = [None] * i\n", "\n", " # Ensure this row has some value for each column.\n", " for key in result.keys():\n", " result[key].append(line.get(key, None))\n", "\n", " return pd.DataFrame(result)\n", "\n", "\n", "def lines(filename):\n", " \"\"\"Open an optionally gzipped GTF file and generate a dict for each line.\n", " \"\"\"\n", " fn_open = gzip.open if filename.endswith('.gz') else open\n", "\n", " with fn_open(filename) as fh:\n", " for line in fh:\n", " if line.startswith('#'):\n", " continue\n", " else:\n", " yield parse(line)\n", "\n", 
"\n", "def parse(line):\n", " \"\"\"Parse a single GTF line and return a dict.\n", " \"\"\"\n", " result = {}\n", "\n", " fields = line.rstrip().split('\\t')\n", "\n", " for i, col in enumerate(GTF_HEADER):\n", " result[col] = _get_value(fields[i])\n", "\n", " # INFO field consists of \"key1=value;key2=value;...\".\n", " infos = [x for x in re.split(R_SEMICOLON, fields[8]) if x.strip()]\n", "\n", " for i, info in enumerate(infos, 1):\n", " # It should be key=\"value\".\n", " try:\n", " key, _, value = re.split(R_KEYVALUE, info, 1)\n", " # But sometimes it is just \"value\".\n", " except ValueError:\n", " key = 'INFO{}'.format(i)\n", " value = info\n", " # Ignore the field if there is no value.\n", " if value:\n", " result[key] = _get_value(value)\n", "\n", " return result\n", "\n", "\n", "def _get_value(value):\n", " if not value:\n", " return None\n", "\n", " # Strip double and single quotes.\n", " value = value.strip('\"\\'')\n", "\n", " # Return a list if the value has a comma.\n", " if ',' in value:\n", " value = re.split(R_COMMA, value)\n", " # These values are equivalent to None.\n", " elif value in ['', '.', 'NA']:\n", " return None\n", "\n", " return value\n", "\n", "def get_seq(row):\n", " chrom = row['seqname']\n", " start = row['start']\n", " end = row['end']\n", " strand = row['strand']\n", " \n", " if strand == '+':\n", " return fasta.get_seq(chrom, start, end)\n", " else:\n", " seq = fasta.get_seq(chrom, start, end)\n", " return str(Seq(seq, generic_dna).reverse_complement())" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [], "source": [ "gtf_df = dataframe('/home/cmb-06/as/skchoudh/genomes/panTro3/annotation/Pan_troglodytes.Pan_tro_3.0.96.gtf')" ] }, { "cell_type": "code", "execution_count": 33, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/home/cmb-06/as/skchoudh/software_frozen/anaconda37/envs/riboraptor/lib/python3.6/site-packages/ipykernel_launcher.py:3: FutureWarning: read_table is deprecated, use read_csv instead, passing sep='\\t'.\n", " This is separate from the ipykernel package so we can avoid doing imports until\n" ] } ], "source": [ "#/home/cmb-06/as/skchoudh/genomes/panTro3/fasta_check\n", "chrom_sizes = pd.read_table('/home/cmb-06/as/skchoudh/genomes/panTro3/fasta_check/Pan_troglodytes.Pan_tro_3.0.dna.toplevel.sizes',\n", " names=['chrom', 'size'])\n", "chrom_sizes = dict(chrom_sizes.values.tolist())" ] }, { "cell_type": "code", "execution_count": 42, "metadata": {}, "outputs": [], "source": [ "def is_within_chrom_size(chrom, end):\n", " is_true = int(int(chrom_sizes[str(chrom)]) >= int(end))\n", " #print(chrom, end,int(chrom_sizes[str(chrom)]), is_true)\n", " return is_true\n", "\n", "gtf_df['is_within_chrom_size'] = gtf_df.apply(lambda row: is_within_chrom_size(row.seqname, row.end), axis=1)" ] }, { "cell_type": "code", "execution_count": 43, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "count 1.296530e+06\n", "mean 9.999931e-01\n", "std 2.634685e-03\n", "min 0.000000e+00\n", "25% 1.000000e+00\n", "50% 1.000000e+00\n", "75% 1.000000e+00\n", "max 1.000000e+00\n", "Name: is_within_chrom_size, dtype: float64" ] }, "execution_count": 43, "metadata": {}, "output_type": "execute_result" } ], "source": [ "gtf_df['is_within_chrom_size'].describe()" ] }, { "cell_type": "code", "execution_count": 34, "metadata": {}, "outputs": [], "source": [ "gtf_df['is_within_chrom_size_check'] = gtf_df.apply(lambda row: is_within_chrom_size(row.seqname, row.end), axis=1)" ] }, { "cell_type": "code", 
"execution_count": 36, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
seqnamesourcefeaturestartendscorestrandframegene_idgene_version...exon_numberexon_idexon_versionprotein_idprotein_versionprojection_parent_transcriptgene_nametranscript_nameis_within_chrom_sizeis_within_chrom_size_check
\n", "

0 rows × 26 columns

\n", "
" ], "text/plain": [ "Empty DataFrame\n", "Columns: [seqname, source, feature, start, end, score, strand, frame, gene_id, gene_version, gene_source, gene_biotype, transcript_id, transcript_version, transcript_source, transcript_biotype, exon_number, exon_id, exon_version, protein_id, protein_version, projection_parent_transcript, gene_name, transcript_name, is_within_chrom_size, is_within_chrom_size_check]\n", "Index: []\n", "\n", "[0 rows x 26 columns]" ] }, "execution_count": 36, "metadata": {}, "output_type": "execute_result" } ], "source": [ "gtf_df[gtf_df.is_within_chrom_size_check!=gtf_df.is_within_chrom_size]" ] }, { "cell_type": "code", "execution_count": 38, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "228573443" ] }, "execution_count": 38, "metadata": {}, "output_type": "execute_result" } ], "source": [ "chrom_sizes['1']" ] }, { "cell_type": "code", "execution_count": 48, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
seqnamesourcefeaturestartendscorestrandframegene_idgene_version...exon_numberexon_idexon_versionprotein_idprotein_versionprojection_parent_transcriptgene_nametranscript_nameis_within_chrom_sizeis_within_chrom_size_check
1293468KV421959.1ensemblgene1066110764None+NoneENSPTRG000000443311...NoneNoneNoneNoneNoneNoneRF00026None01
1293469KV421959.1ensembltranscript1066110764None+NoneENSPTRG000000443311...NoneNoneNoneNoneNoneNoneRF00026RF00026-20101
1293470KV421959.1ensemblexon1066110764None+NoneENSPTRG000000443311...1ENSPTRE000005393031NoneNoneNoneRF00026RF00026-20101
1295990AACZ04043439.1ensemblgene13101416None+NoneENSPTRG000000511611...NoneNoneNoneNoneNoneNoneRF00026None01
1295991AACZ04043439.1ensembltranscript13101416None+NoneENSPTRG000000511611...NoneNoneNoneNoneNoneNoneRF00026RF00026-20101
1295992AACZ04043439.1ensemblexon13101416None+NoneENSPTRG000000511611...1ENSPTRE000004821491NoneNoneNoneRF00026RF00026-20101
1296074AACZ04052657.1ensemblgene12201326None+NoneENSPTRG000000528121...NoneNoneNoneNoneNoneNoneRF00026None01
1296075AACZ04052657.1ensembltranscript12201326None+NoneENSPTRG000000528121...NoneNoneNoneNoneNoneNoneRF00026RF00026-20101
1296076AACZ04052657.1ensemblexon12201326None+NoneENSPTRG000000528121...1ENSPTRE000005199581NoneNoneNoneRF00026RF00026-20101
\n", "

9 rows × 26 columns

\n", "
" ], "text/plain": [ " seqname source feature start end score strand frame \\\n", "1293468 KV421959.1 ensembl gene 10661 10764 None + None \n", "1293469 KV421959.1 ensembl transcript 10661 10764 None + None \n", "1293470 KV421959.1 ensembl exon 10661 10764 None + None \n", "1295990 AACZ04043439.1 ensembl gene 1310 1416 None + None \n", "1295991 AACZ04043439.1 ensembl transcript 1310 1416 None + None \n", "1295992 AACZ04043439.1 ensembl exon 1310 1416 None + None \n", "1296074 AACZ04052657.1 ensembl gene 1220 1326 None + None \n", "1296075 AACZ04052657.1 ensembl transcript 1220 1326 None + None \n", "1296076 AACZ04052657.1 ensembl exon 1220 1326 None + None \n", "\n", " gene_id gene_version ... exon_number exon_id \\\n", "1293468 ENSPTRG00000044331 1 ... None None \n", "1293469 ENSPTRG00000044331 1 ... None None \n", "1293470 ENSPTRG00000044331 1 ... 1 ENSPTRE00000539303 \n", "1295990 ENSPTRG00000051161 1 ... None None \n", "1295991 ENSPTRG00000051161 1 ... None None \n", "1295992 ENSPTRG00000051161 1 ... 1 ENSPTRE00000482149 \n", "1296074 ENSPTRG00000052812 1 ... None None \n", "1296075 ENSPTRG00000052812 1 ... None None \n", "1296076 ENSPTRG00000052812 1 ... 1 ENSPTRE00000519958 \n", "\n", " exon_version protein_id protein_version projection_parent_transcript \\\n", "1293468 None None None None \n", "1293469 None None None None \n", "1293470 1 None None None \n", "1295990 None None None None \n", "1295991 None None None None \n", "1295992 1 None None None \n", "1296074 None None None None \n", "1296075 None None None None \n", "1296076 1 None None None \n", "\n", " gene_name transcript_name is_within_chrom_size \\\n", "1293468 RF00026 None 0 \n", "1293469 RF00026 RF00026-201 0 \n", "1293470 RF00026 RF00026-201 0 \n", "1295990 RF00026 None 0 \n", "1295991 RF00026 RF00026-201 0 \n", "1295992 RF00026 RF00026-201 0 \n", "1296074 RF00026 None 0 \n", "1296075 RF00026 RF00026-201 0 \n", "1296076 RF00026 RF00026-201 0 \n", "\n", " is_within_chrom_size_check \n", "1293468 1 \n", "1293469 1 \n", "1293470 1 \n", "1295990 1 \n", "1295991 1 \n", "1295992 1 \n", "1296074 1 \n", "1296075 1 \n", "1296076 1 \n", "\n", "[9 rows x 26 columns]" ] }, "execution_count": 48, "metadata": {}, "output_type": "execute_result" } ], "source": [ "gtf_df[gtf_df.is_within_chrom_size!=1]\n" ] }, { "cell_type": "code", "execution_count": 47, "metadata": {}, "outputs": [], "source": [ "gtf_df[gtf_df.is_within_chrom_size!=1].to_csv('/home/cmb-06/as/skchoudh/genomes/panTro3/annotation/Pan_troglodytes.Pan_tro_3.0.96_wrong_coordinates.tsv', sep='\\t', index=False, header=True)" ] }, { "cell_type": "code", "execution_count": 45, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "10756" ] }, "execution_count": 45, "metadata": {}, "output_type": "execute_result" } ], "source": [ "chrom_sizes['KV421959.1']" ] }, { "cell_type": "code", "execution_count": 46, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "1400" ] }, "execution_count": 46, "metadata": {}, "output_type": "execute_result" } ], "source": [ "chrom_sizes['AACZ04043439.1']" ] }, { "cell_type": "code", "execution_count": 49, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "1310" ] }, "execution_count": 49, "metadata": {}, "output_type": "execute_result" } ], "source": [ "chrom_sizes['AACZ04052657.1']" ] }, { "cell_type": "code", "execution_count": 53, "metadata": {}, "outputs": [], "source": [ "db = gffutils.FeatureDB(gtf_db, keep_order=True)\n" ] }, { "cell_type": "code", "execution_count": 57, "metadata": {}, "outputs": [], "source": [ "with 
open('/home/cmb-06/as/skchoudh/genomes/panTro3/annotation/Pan_troglodytes.Pan_tro_3.0.96_fixed.gtf', 'w') as fout:\n", " for feature in db.all_features():\n", " max_size = chrom_sizes[feature.chrom]\n", " # If end is longer than the chromosome\n", " if feature.end > max_size:\n", " feature.end = max_size\n", " fout.write(str(feature) + '\\n')\n", " " ] }, { "cell_type": "code", "execution_count": 55, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "57629" ] }, "execution_count": 55, "metadata": {}, "output_type": "execute_result" } ], "source": [ "feature.chrom\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python [conda env:riboraptor]", "language": "python", "name": "conda-env-riboraptor-py" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.7" } }, "nbformat": 4, "nbformat_minor": 2 }