# inst/extdata/config/nongithub/nongithub_meta.toml

# Document-level title (root-table key; it precedes the first table header).
title = "Non Github resources"
# Metadata about the configuration file(s) that accompany this catalogue.
[nongithub.cfg_meta]
# NOTE(review): "avaliable" is a misspelling of "available", but the key name is
# presumably looked up verbatim by the consumer — do not rename without checking.
avaliable_cfg = ["nongithub.toml"]
# Raw-content GitHub URL of the nongithub config directory.
prefix_url = "https://raw.githubusercontent.com/JhuangLab/BioInstaller/master/inst/extdata/config/nongithub"
# The value embeds an R expression between @>@ and @<@ markers. TOML itself does
# no interpolation, so any evaluation is presumably done by the consuming
# application — confirm before changing the marker syntax.
cfg_dir = "@>@system.file('extdata', 'config/nongithub', package = 'BioInstaller')@<@"

# GMAP / GSNAP entry.
# Long values are folded with line-ending backslashes; TOML drops the backslash
# and the following newline, so every parsed string is still a single line.
[nongithub.item.gmap]
title = """\
GMAP: A Genomic Mapping and Alignment Program for mRNA and EST Sequences, \
and GSNAP: Genomic Short-read Nucleotide Alignment Program"""
description = """\
The programs GMAP and GSNAP, for aligning RNA-Seq and DNA-Seq datasets to genomes, have evolved along with advances in biological methodology to handle longer reads, larger volumes of data, and new types of biological assays. \
The genomic representation has been improved to include linear genomes that can compare sequences using single-instruction multiple-data (SIMD) instructions, compressed genomic hash tables with fast access using SIMD instructions, handling of large genomes with more than four billion bp, and enhanced suffix arrays (ESAs) with novel data structures for fast access. \
Improvements to the algorithms have included a greedy match-and-extend algorithm using suffix arrays, segment chaining using genomic hash tables, diagonalization using segmental hash tables, and nucleotide-level dynamic programming procedures that use SIMD instructions and eliminate the need for F-loop calculations. \
Enhancements to the functionality of the programs include standardization of indel positions, handling of ambiguous splicing, clipping and merging of overlapping paired-end reads, and alignments to circular chromosomes and alternate scaffolds. \
The programs have been adapted for use in pipelines by integrating their usage into R/Bioconductor packages such as gmapR and HTSeqGenie, and these pipelines have facilitated the discovery of numerous biological phenomena."""
# Two citations, separated by "; " in the parsed value.
publication = """\
Wu T D, Watanabe C K. GMAP: a genomic mapping and alignment program for mRNA and EST sequences[J]. Bioinformatics, 2005, 21(9): 1859-1875. doi: 10.1093/bioinformatics/bti310; \
Wu T D, Reeder J, Lawrence M, et al. GMAP and GSNAP for genomic sequence alignment: enhancements to speed, accuracy, and functionality[J]. Statistical Genomics: Methods and Protocols, 2016: 283-334. doi: 10.1007/978-1-4939-3578-9_15"""
tag = [
    "Genomics",
    "NGS",
    "Genomic alignment",
    "DNA-seq",
    "RNA-seq",
    "mRNA",
    "Whole Transcriptome Sequencing",
    "EST",
]

# GRIDSS entry.
# The description is folded one sentence per line with line-ending backslashes;
# the parsed value remains a single line.
[nongithub.item.gridss]
title = "GRIDSS: sensitive and specific genomic rearrangement detection using positional de Bruijn graph assembly."
description = """\
The identification of genomic rearrangements with high sensitivity and specificity using massively parallel sequencing remains a major challenge, particularly in precision medicine and cancer research. \
Here, we describe a new method for detecting rearrangements, GRIDSS (Genome Rearrangement IDentification Software Suite). \
GRIDSS is a multithreaded structural variant (SV) caller that performs efficient genome-wide break-end assembly prior to variant calling using a novel positional de Bruijn graph-based assembler. \
By combining assembly, split read, and read pair evidence using a probabilistic scoring, GRIDSS achieves high sensitivity and specificity on simulated, cell line, and patient tumor data, recently winning SV subchallenge #5 of the ICGC-TCGA DREAM8.5 Somatic Mutation Calling Challenge. \
On human cell line data, GRIDSS halves the false discovery rate compared to other recent methods while matching or exceeding their sensitivity. \
GRIDSS identifies nontemplate sequence insertions, microhomologies, and large imperfect homologies, estimates a quality score for each breakpoint, stratifies calls into high or low confidence, and supports multisample analysis."""
publication = """\
Cameron D L, Schroeder J, Penington J S, et al. \
GRIDSS: sensitive and specific genomic rearrangement detection using positional de Bruijn graph assembly[J]. \
Genome Research, 2017, 27(12). doi: 10.1101/gr.222109.117"""
tag = ["NGS", "SV"]

# InterProScan entry.
[nongithub.item.interproscan]
title = "Protein sequence analysis & classification"
description = "InterProScan is the software package that allows sequences (protein and nucleic) to be scanned against InterPro's signatures. Signatures are predictive models, provided by several different databases, that make up the InterPro consortium."
# The citation carries both identifiers, each under its own label (DOI first,
# then the PubMed ID).
publication = "Zdobnov E M, Apweiler R. InterProScan – an integration platform for the signature-recognition methods in InterPro[J]. Bioinformatics, 2001, 17(9):847-848. doi: 10.1093/bioinformatics/17.9.847; PMID: 11590104"
tag = ["Protein", "Classification"]

# Subread entry.
[nongithub.item.subread]
title = "High-performance read alignment, quantification and mutation discovery"
description = "The Subread software package is a tool kit for processing next-gen sequencing data. It includes Subread aligner, Subjunc exon-exon junction detector and featureCounts read summarization program. Subread aligner can be used to align both gDNA-seq and RNA-seq reads. Subjunc aligner was specifically designed for the detection of exon-exon junction. For the mapping of RNA-seq reads, Subread performs local alignments and Subjunc performs global alignments."
# Multi-line basic string delimiters are used because the citation itself
# contains double quotes around the article title.
publication = """Yang Liao, Gordon K Smyth and Wei Shi. "The Subread aligner: fast, accurate and scalable read mapping by seed-and-vote", Nucleic Acids Research, 2013, 41(10):e108"""
tag = ["NGS", "aligner"]

# vcfanno entry.
# Folded strings use line-ending backslashes; parsed values are single-line.
[nongithub.item.vcfanno]
title = "annotate a VCF with other VCFs/BEDs/tabixed files"
description = """\
vcfanno allows you to quickly annotate your VCF with any number of INFO fields from any number of VCFs or BED files. \
It uses a simple conf file to allow the user to specify the source annotation files and fields and how they will be added to the info of the query VCF."""
publication = """\
Pedersen B S, Layer R M, Quinlan A R. \
Vcfanno: fast, flexible annotation of genetic variants[J]. \
Genome Biology, 2016, 17(1):1-9."""
tag = [
    "NGS",
    "annotation",
]

# ABSOLUTE entry.
# The description is folded one sentence per line. The two spaces that follow
# each sentence in the value are kept in front of the line-ending backslash.
[nongithub.item.absolute]
title = "ABSOLUTE can estimate purity/ploidy, and from that compute absolute copy-number and mutation multiplicities."
description = """\
When DNA is extracted from an admixed population of cancer and normal cells, the information on absolute copy number per cancer cell is lost in the mixing.  \
The purpose of ABSOLUTE is to re-extract these data from the mixed DNA population.  \
This process begins by generation of segmented copy number data, which is input to the ABSOLUTE algorithm together with pre-computed models of recurrent cancer karyotypes and, optionally, allelic fraction values for somatic point mutations.  \
The output of ABSOLUTE then provides re-extracted information on the absolute cellular copy number of local DNA segments and, for point mutations, the number of mutated alleles."""
publication = """\
Carter S L, Cibulskis K, Helman E, et al. \
Absolute quantification of somatic DNA alterations in human cancer[J]. \
Nature biotechnology, 2012, 30(5): 413-421."""

# HAPSEG entry.
# Folded with line-ending backslashes; whitespace before each backslash is part
# of the value (the first sentence is followed by two spaces).
[nongithub.item.hapseg]
title = "A probabilistic method to interpret bi-allelic marker data in cancer samples."
description = """\
The HAPSEG module takes single nucleotide polymorphism (SNP) microarray data and outputs copy number data segmented by haplotype.  \
The output data is suitable for use as input data for the ABSOLUTE module. \
More detail see https://software.broadinstitute.org/cancer/software/genepattern/modules/docs/HAPSEG/1"""
publication = """\
Carter SL, Meyerson M, Getz G. \
Accurate estimation of homologue-specific DNA concentration-ratios in cancer samples allows long-range haplotyping. \
Available from Nature Precedings; 2011."""

# Atlas2 entry.
# Folded strings use line-ending backslashes; parsed values are single-line.
[nongithub.item.atlas2]
title = """\
Atlas2, next-generation sequencing suite of variant analysis tools \
specializing in the separation of true SNPs and insertions and deletions (indels)"""
description = """\
Atlas2 is a next-generation sequencing suite of variant analysis tools \
specializing in the separation of true SNPs and insertions and deletions (indels) \
from sequencing and mapping errors in Whole Exome Capture Sequencing (WECS) data."""
publication = """\
Challis D. etc. \
An integrative variant analysis suite for whole exome next-generation sequencing data. \
BMC Bioinformatics 2012, 13:8 doi:10.1186/1471-2105-13-8"""

# Beagle entry.
# Description: one sentence per folded line. Publication: one citation per
# folded line ("; " separates them in the parsed single-line value).
[nongithub.item.beagle]
title = "Beagle, a software package that performs genotype calling, genotype phasing, imputation of ungenotyped markers, and identity-by-descent segment detection."
description = """\
Beagle version 4.1 has a more accurate genotype phasing algorithm and a very fast and accurate genotype imputation algorithm. \
Version 4.1 also has several changes to the command line arguments which are described in the release notes. \
The "ped" argument has no effect in version 4.1. \
If your data contains nuclear families and you want to model the parent-offspring relationships when phasing genotypes, please use version 4.0."""
publication = """\
S R Browning and B L Browning (2007) Rapid and accurate haplotype phasing and missing data inference for whole genome association studies by use of localized haplotype clustering. Am J Hum Genet 81:1084-1097. doi:10.1086/521987; \
B L Browning and S R Browning (2013). Improving the accuracy and efficiency of identity-by-descent detection in population data. Genetics 194(2):459-71. doi:10.1534/genetics.113.150029; \
B L Browning and S R Browning (2016). Genotype imputation with millions of reference samples. Am J Hum Genet 98:116-126. doi:10.1016/j.ajhg.2015.11.020"""

# ContEst entry.
# Folded with line-ending backslashes. The title's first sentence is followed
# by two spaces in the value, kept in front of the backslash.
[nongithub.item.contest]
title = """\
ContEst is a tool (and method) for estimating the amount of cross-sample contamination in next generation sequencing data.  \
Using a Bayesian framework, contamination levels are estimated from array based genotypes and sequencing reads."""
description = """\
Here, we present ContEst, a tool for estimating the level of cross-individual contamination in next-generation sequencing data. \
We demonstrate the accuracy of ContEst across a range of contamination levels, sources and read depths using sequencing data mixed in silico at known concentrations. \
We applied our tool to published cancer sequencing datasets and report their estimated contamination levels."""
publication = """\
Cibulskis K, Mckenna A, Fennell T, et al. \
ContEst: estimating cross-contamination of human samples in next-generation sequencing data[J]. \
Bioinformatics, 2011, 27(18):2601-2602."""

# rMATS / MATS entry.
# Description: one sentence per folded line. Publication: one citation per
# folded line ("; " separates them in the parsed single-line value).
[nongithub.item.rmats]
title = "Multivariate Analysis of Transcript Splicing (MATS)"
description = """\
MATS is a computational tool to detect differential alternative splicing events from RNA-Seq data. \
The statistical model of MATS calculates the P-value and false discovery rate that the difference in the isoform ratio of a gene between two conditions exceeds a given user-defined threshold. \
From the RNA-Seq data, MATS can automatically detect and analyze alternative splicing events corresponding to all major types of alternative splicing patterns. \
MATS handles replicate RNA-Seq data from both paired and unpaired study design."""
publication = """\
Shen S., Park JW., Lu ZX., Lin L., Henry MD., Wu YN., Zhou Q., Xing Y. rMATS: Robust and Flexible Detection of Differential Alternative Splicing from Replicate RNA-Seq Data. PNAS, 111(51):E5593-601. doi: 10.1073/pnas.1419161111; \
Park JW., Tokheim C., Shen S., Xing Y. Identifying differential alternative splicing events from RNA sequencing data using RNASeq-MATS. Methods in Molecular Biology: Deep Sequencing Data Analysis, 2013;1038:171-179 doi: 10.1007/978-1-62703-514-9_10; \
Shen S., Park JW., Huang J., Dittmar KA., Lu ZX., Zhou Q., Carstens RP., Xing Y. MATS: A Bayesian Framework for Flexible Detection of Differential Alternative Splicing from RNA-Seq Data. Nucleic Acids Research, 2012;40(8):e61 doi: 10.1093/nar/gkr1291"""

# PRADA entry.
[nongithub.item.prada]
title = "PRADA : Pipeline for RNA-Sequencing Data Analysis"
# Multi-line basic string with no line-ending backslashes: every line break
# below is a real newline in the parsed value, so the overview paragraph is
# followed by one line per listed module.
description = """Massively parallel sequencing of cDNA reverse transcribed from RNA (RNASeq) provides an accurate estimate of the quantity and composition of mRNAs. To characterize the transcriptome through the analysis of RNA-seq data, we developed PRADA. PRADA focuses on the processing and analysis of gene expression estimates, supervised and unsupervised gene fusion identification, and supervised intragenic deletion identification.
PRADA currently supports 7 modules to process and identify abnormalities from RNAseq data:
preprocess: Generates aligned and recalibrated BAM files.
expression: Generates gene expression (RPKM) and quality metrics.
fusion: Identifies candidate gene fusions.
guess-ft: Supervised search for fusion transcripts.
guess-if: Supervised search for intragenic fusions.
homology: Calculates homology between given two genes.
frame: Predicts functional consequence of fusion transcript"""
publication = "PRADA: pipeline for RNA sequencing data analysis[J]. Bioinformatics, 2014, 30(15): 2224-2226. https://doi.org/10.1093/bioinformatics/btu169"

# Integrative Genomics Viewer entry.
# Folded with line-ending backslashes; parsed values are single-line.
[nongithub.item.igv]
title = "The Integrative Genomics Viewer (IGV)"
description = """\
The Integrative Genomics Viewer (IGV) is a high-performance visualization tool for interactive exploration of large, integrated genomic datasets. \
It supports a wide variety of data types, including array-based and next-generation sequence data, and genomic annotations."""
# Two citations, separated by "; " in the parsed value.
publication = """\
Integrative Genomics Viewer. Nature Biotechnology 29, 24–26 (2011); \
Integrative Genomics Viewer (IGV): high-performance genomics data visualization and exploration. Briefings in Bioinformatics 14, 178-192 (2013)."""


# MARINa (Master Regulator Inference Algorithm) entry.
[nongithub.item.marina]
title = "Master Regulator Inference Algorithm"
description = "MARINa (MAster Regulator INference algorithm) is designed to infer transcription factors (TFs) controlling the transition between the two phenotypes, A and B, and the maintenance of the latter phenotype. Expression at the mRNA level is often a poor predictor of a TF's regulatory activity and an even worse predictor of its biological relevance in regulating phenotype-specific programs. To obviate this problem, MARINa infers TF activity from the global transcriptional activation of its regulon (i.e. its activated and repressed targets) and its biological relevance by TF-regulon overlap with phenotype-specific programs."
publication = "Lefebvre C, Rajbhandari P, Alvarez MJ, Bandaru P, Lim WK, Sato M, Wang K, Sumazin P, Kustagi M, Bisikirska BC, Basso K, Beltrao P, Krogan N, Gautier J, Dalla-Favera R, Califano A. A human B-cell interactome identifies MYB and FOXM1 as master regulators of proliferation in germinal centers. Mol Syst Biol. 2010 Jun 8;6:377."

# PARADIGM entry.
# The description is folded one sentence per line with line-ending backslashes;
# the parsed value remains a single line.
[nongithub.item.paradigm]
title = "PAthway Representation and Analysis by Direct Inference on Graphical Models"
description = """\
High-dimensional ‘-omics’ profiling provides a detailed molecular view of individual cancers; however, understanding the mechanisms by which tumors evade cellular defenses requires deep knowledge of the underlying cellular pathways within each cancer sample. \
We extended the PARADIGM algorithm (Vaske et al., 2010, Bioinformatics, 26, i237–i245), a pathway analysis method for combining multiple ‘-omics’ data types, to learn the strength and direction of 9139 gene and protein interactions curated from the literature. \
Using genomic and mRNA expression data from 1936 samples in The Cancer Genome Atlas (TCGA) cohort, we learned interactions that provided support for and relative strength of 7138 (78%) of the curated links. \
Gene set enrichment found that genes involved in the strongest interactions were significantly enriched for transcriptional regulation, apoptosis, cell cycle regulation and response to tumor cells. \
Within the TCGA breast cancer cohort, we assessed different interaction strengths between breast cancer subtypes, and found interactions associated with the MYC pathway and the ER alpha network to be among the most differential between basal and luminal A subtypes. \
PARADIGM with the Naive Bayesian assumption produced gene activity predictions that, when clustered, found groups of patients with better separation in survival than both the original version of PARADIGM and a version without the assumption. \
We found that this Naive Bayes assumption was valid for the vast majority of co-regulators, indicating that most co-regulators act independently on their shared target."""
publication = """\
Sedgewick A J, Benz S C, Rabizadeh S, et al. \
Learning subgroup-specific regulatory interactions and regulator independence with PARADIGM[J]. \
Bioinformatics, 2013, 29(13): i62-i70. https://doi.org/10.1093/bioinformatics/btt229"""

# Meerkat entry.
[nongithub.item.meerkat]
# The title is the title of the cited paper; the paper's DOI link lives at the
# end of `publication`.
title = "Diverse Mechanisms of Somatic Structural Variations in Human Cancer Genomes"
description = "Identification of somatic rearrangements in cancer genomes has accelerated through analysis of high-throughput sequencing data. However, characterization of complex structural alterations and their underlying mechanisms remains inadequate. Here, applying an algorithm to predict structural variations from short reads, we report a comprehensive catalog of somatic structural variations and the mechanisms generating them, using high-coverage whole-genome sequencing data from 140 patients across ten tumor types. We characterize the relative contributions of different types of rearrangements and their mutational mechanisms, find that ∼20% of the somatic deletions are complex deletions formed by replication errors, and describe the differences between the mutational mechanisms in somatic and germline alterations. Importantly, we provide detailed reconstructions of the events responsible for loss of CDKN2A/B and gain of EGFR in glioblastoma, revealing that these alterations can result from multiple mechanisms even in a single genome and that both DNA double-strand breaks and replication errors drive somatic rearrangements."
publication = "Yang L, Luquette L J, Gehlenborg N, et al. Diverse Mechanisms of Somatic Structural Variations in Human Cancer Genomes[J]. Cell, 2013, 153(4):919-29. https://dx.doi.org/10.1016/j.cell.2013.04.010"

# VaDiR entry.
[nongithub.item.vadir]
title = "VaDiR: an integrated approach to Variant Detection in RNA"
# Multi-line basic string with no line-ending backslashes: the line breaks
# below are real newlines in the parsed value. Single double-quote characters
# are allowed unescaped inside a multi-line basic string.
description = """Advances in next-generation DNA sequencing technologies are now enabling detailed
characterization of sequence variations in cancer genomes. With whole genome sequencing, variations in
coding and non-coding sequences can be discovered. But the cost associated with it is currently limiting its
general use in research. Whole exome sequencing is used to characterize sequence variations in coding regions,
but the cost associated with capture reagents and biases in capture rate limit its full use in research. Additional
limitations include uncertainty in assigning the functional significance of the mutations when these mutations
are observed in the non-coding region or in genes that are not expressed in cancer tissue.
We investigated the feasibility of uncovering mutations from expressed genes using RNA sequencing
datasets with a method called "VaDiR: Variant Detection in RNA" that integrate three variant callers, namely:
SNPiR, RVBoost and MuTect2. The combination of all three methods, which we called Tier1 variants,
produced the highest precision with true positive mutations from RNA-seq that could be validated at the DNA
level. We also found that the integration of Tier1 variants with those called by MuTect2 and SNPiR produced
the highest recall with acceptable precision. Finally, we observed higher rate of mutation discovery in genes
that are expressed at higher levels."""
publication = "Neums L, Suenaga S, Beyerlein P, et al. VaDiR: an integrated approach to Variant Detection in RNA[J]. GigaScience, 2017. https://doi.org/10.1093/gigascience/gix122"

# MutSig entry.
[nongithub.item.mutsig]
title = "Mutational heterogeneity in cancer and the search for new cancer-associated genes"
# Multi-line basic string: TOML trims the newline directly after the opening
# delimiter, but the newline before the closing delimiter is kept, so the
# parsed value ends with a trailing newline.
description = """
MutSig (for "Mutation Significance") is a package of tools for analyzing mutation data.  It operates on a cohort of patients and identifies mutations, genes, and other genomic elements predicted to be driver candidates.
"""
publication = "Lawrence, M. et al. Mutational heterogeneity in cancer and the search for new cancer-associated genes. Nature 499, 214-218 (2013) https://dx.doi.org/10.1038/nature12213"

# Effusion entry.
# Folded with line-ending backslashes; parsed values are single-line.
[nongithub.item.effusion]
title = "Effusion: Prediction of Protein Function from Sequence Similarity Networks"
description = """\
A method for predicting protein function, Effusion, that uses a sequence similarity network to add context for homology transfer, a probabilistic model to account for the uncertainty in labels and function propagation, and the structure of the Gene Ontology (GO) to best utilize sparse input labels and make consistent output predictions. \
Effusion's model makes it practical to integrate rare experimental data and abundant primary sequence and sequence similarity."""
publication = """\
Effusion: Prediction of Protein Function from Sequence Similarity Networks. \
Bioinformatics. 2018 Aug 01, PMID: 30084920 DOI: 10.1093/bioinformatics/bty672"""

# HGT-ID entry.
# The description is folded one sentence per line with line-ending backslashes;
# the parsed value remains a single line.
[nongithub.item.hgtid]
title = "HGT-ID: an efficient and sensitive workflow to detect human-viral insertion sites using next-generation sequencing data"
description = """\
Identifies viral insertion sequences for the genome of human cancers. \
HGT-ID incorporates a sample dataset, references limited to chromosome 18 and the software package. \
This software detects viral insertion sequences from known viral reference genome of human cancers."""
publication = "HGT-ID: an efficient and sensitive workflow to detect human-viral insertion sites using next-generation sequencing data, BMC Bioinformatics 2018, 10.1186/s12859-018-2260-9"

# Cromwell entry.
# Folded with line-ending backslashes; parsed values are single-line.
[nongithub.item.cromwell]
title = """\
Scientific workflow engine designed for simplicity & scalability. \
Trivially transition between one off use cases to massive scale production environments https://cromwell.readthedocs.io/"""
description = """\
Cromwell is a Workflow Management System geared towards scientific workflows. \
Cromwell is open sourced under the BSD 3-Clause license."""
# Placeholder text: no citation is recorded for this item.
publication = "Not yet"