This repository has been archived by the owner on Dec 7, 2021. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Cyril Matthey-Doret
committed
Mar 25, 2019
1 parent
5c04ac3
commit 4a762e4
Showing
1 changed file
with
211 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,211 @@ | ||
--- | ||
title: "Number of genes in CSD regions" | ||
author: "Cyril Matthey-Doret" | ||
date: "March 21, 2019" | ||
output: pdf_document | ||
--- | ||
|
||
```{r setup, include=FALSE} | ||
library(tidyverse); library(reshape2); library(kableExtra) | ||
library(gridExtra); library(ggforce); library(GenomicRanges) | ||
library(GO.db) | ||
setwd('../../data/') | ||
annot <- read_tsv("annotations/ord_simple_gene_chrom.bed", col_names = F) | ||
hits <- read_tsv("assoc_mapping/case_control_hits.tsv") | ||
snps <- read_tsv("assoc_mapping/case_control_all.tsv") | ||
tigs <- read_tsv("ref_genome/ordered_genome/karyotype.txt", col_names = F) | ||
go_terms <- read_tsv("annotations/blast2go.txt", col_names = c("ID", "GO", "function")) | ||
regions <- tibble(chr =hits %>% pull(Chr), | ||
start = rep(0, nrow(hits)), | ||
end = rep(0, nrow(hits))) | ||
go_terms <- go_terms %>% mutate(ID = str_replace(ID, "-PA", "")) | ||
``` | ||
|
||
## Regions definition | ||
Here, CSD regions are defined as the regions surrounding significant SNPs until a non-significant SNP is encountered. Low SNP density will likely cause an overestimation of region sizes, but this is the most objective definition with available data. There are `r nrow(hits)` significant SNPs out of `r nrow(snps)` in total, and a total of `r nrow(annot)` annotations found on the `r round(sum(tigs$X2)/1000000, 1)`Mb of anchored genome. | ||
|
||
This generates multiple identical (overlapping) regions for consecutive significant SNPs ( 1/SNP) and directly adjacent regions in case there a single non-significant SNP between two significant ones (Figure 2). Overlapping and directly adjacent intervals are merged. | ||
|
||
## Annotations | ||
The number of annotated genes (i.e. not included isoforms or other features) contained in each (non-overlapping, non-adjacent) region is shown in table 1. The density of annotations is highly variable between regions (Figure 3). In total, there are 381 annotated genes among all 6 regions. | ||
|
||
## GO terms | ||
Performing a functional enrichment test reveals nothing interesting; the predicted functions of genes in CSD regions are too diverse to compute an enrichment for one particular term (Table 2). | ||
|
||
|
||
```{r define_regions, include=T, echo=F} | ||
for (row in 1:nrow(hits)){ | ||
chr_r <- pull(hits[row, "Chr"]) | ||
pos_r <- pull(hits[row, "BP"]) | ||
rel <- snps %>% | ||
filter(Chr == chr_r & fisher < 5) %>% | ||
mutate(dist = BP - pos_r) | ||
if(pos_r <= min(rel %>% pull(BP))){ | ||
start_r <- 0 | ||
} | ||
else{ | ||
start_r <- rel %>% | ||
filter(dist < 0) %>% | ||
filter(dist == max(dist)) %>% | ||
dplyr::select(BP) %>% | ||
pull | ||
} | ||
if(pos_r >= max(rel %>% pull(BP))){ | ||
end_r = tigs %>% filter(X1 == chr_r) %>% pull(X2) | ||
} | ||
else{ | ||
end_r <- rel %>% | ||
filter(dist > 0) %>% | ||
filter(dist == min(dist)) %>% | ||
dplyr::select(BP) %>% | ||
pull | ||
} | ||
regions[row,"start"] <- start_r | ||
regions[row,"end"] <- end_r | ||
} | ||
``` | ||
|
||
|
||
|
||
```{r viz_regions, include=T, echo=F, message=F, warning=F, fig.height=2, fig.cap="Visual representation of genomic ranges for each region. Each blue interval represent the span of a region. Regions are dodged vertically to show overlap and are separated in different panels by chromosomes."} | ||
# Visualise regions sizes and overlap | ||
ggplot(data=regions) + | ||
geom_linerange(aes(x=1, ymin=start/1000000, ymax=end/1000000), | ||
position=position_dodge2(0.6), | ||
lwd=2, col="blue") + | ||
facet_grid(~chr, scales="free_x") + | ||
xlim(c(0.5,1.5)) + | ||
theme_bw() + | ||
theme(axis.text.y = element_blank(), axis.ticks.y = element_blank()) + | ||
scale_x_continuous(breaks=NULL) + | ||
xlab("") + ylab("Position [Mb]") + | ||
coord_flip() | ||
``` | ||
|
||
```{r merge_regions, include=T, echo=F, error=F} | ||
# Merge overlapping regions | ||
# 1. Define an "overlap group" variable (g) as a | ||
# 2. Summarise each overlap group by smallest start and largest end | ||
regions <- regions %>% | ||
arrange(start) %>% | ||
group_by(chr, g = cumsum(cummax(lag(end, default = dplyr::first(end))) < start)) %>% | ||
summarise(start = dplyr::first(start), end = max(end)) %>% | ||
dplyr::select(-g) | ||
regions$index=rownames(regions) | ||
# Add length of regions | ||
regions$length_bp <- regions$end - regions$start | ||
regions$n_genes <- rep(0) | ||
for (row in 1:nrow(regions)){ | ||
chr_r <- pull(regions[row, "chr"]) | ||
start_r <- pull(regions[row, "start"]) | ||
end_r <- pull(regions[row, "end"]) | ||
ngenes_r <- annot %>% | ||
filter(X1 == chr_r & (X3 >= start_r & X2 <= end_r)) %>% | ||
nrow | ||
regions[row, "n_genes"] <- ngenes_r | ||
} | ||
regions %>% | ||
dplyr::select(-index) %>% | ||
mutate(length_bp = format(length_bp, digits = 3, scientific = T)) %>% | ||
kable(caption = "Summary of the different CSD regions after merging overlapping or directly adjacent regions. Note the first region on chromosome 2 starts at 0 since the first SNP on the chromosome is significant") %>% | ||
kable_styling(bootstrap_options = "striped", position="center") | ||
``` | ||
|
||
|
||
```{r viz_merged, include=T, echo=F, message=F, fig.height=2, fig.cap="Visual representation of genomic ranges for each region after merging overlapping and adjacent regions. Each blue interval represent the span of a region. Regions are dodged vertically to show overlap and are separated in different panels by chromosomes."} | ||
# Visualise regions sizes and overlap | ||
ggplot(data=regions) + | ||
geom_linerange(aes(x=1, ymin=start/1000000, ymax=end/1000000), | ||
position=position_dodge2(0.6), | ||
lwd=2, col="blue") + | ||
facet_grid(~chr, scales="free_x") + | ||
xlim(c(0.5,1.5)) + | ||
theme_bw() + | ||
theme(axis.text.y = element_blank(), axis.ticks.y = element_blank()) + | ||
scale_x_continuous(breaks=NULL) + | ||
xlab("") + ylab("Position [Mb]") + | ||
coord_flip() | ||
``` | ||
|
||
```{r viz_annot, eval=T, echo=F, include=T, echo=F, fig.height=8, fig.cap="Top: Number of annotated features in each region compared to their size, in basepairs. Bottom: Density of annotations in each CSD region per megaase in each CSD regions. Colors represent chromosomes in both panels."} | ||
# Show quantity of annotations in each region | ||
n_annot <- ggplot(data=regions, aes(x=length_bp, y=n_genes, col=chr)) + | ||
geom_point(size=3) + | ||
xlab("Region size [bp]") + | ||
ylab("Number of annotations") + | ||
theme_bw() | ||
annot_dens <- ggplot(data=regions, aes(y=n_genes/(length_bp/1000000), | ||
x=paste0(chr, ":", | ||
round((start+end)/(2*1000000)), | ||
"Mb"), | ||
fill=chr)) + | ||
xlab("") + | ||
ylab("Gene density in CSD regions [annotation/Mb]") + | ||
theme_bw() + | ||
geom_bar(stat = "identity") + | ||
coord_flip() | ||
grid.arrange(n_annot, annot_dens, nrow=2) | ||
``` | ||
|
||
|
||
|
||
```{r go_terms, echo=F} | ||
go_coords <- go_terms %>% | ||
right_join(annot, by=c("ID" = "X4")) %>% | ||
dplyr::rename(chr = X1, start = X2, end = X3) | ||
go_ranges <- makeGRangesFromDataFrame(go_coords) | ||
csd_ranges <- makeGRangesFromDataFrame(regions) | ||
csd_go_inter <- GenomicRanges::findOverlaps(go_ranges, csd_ranges) | ||
go_hits <- queryHits(csd_go_inter) | ||
csd_hits <- subjectHits(csd_go_inter) | ||
csd_go <- bind_cols(go_coords[go_hits,], regions[csd_hits,]) | ||
csd_go <- csd_go %>% | ||
dplyr::select(-chr1, -start1, -end1, -n_genes, -length_bp) %>% | ||
dplyr::rename(region = index) | ||
# Store each gene only once (not once for each GO term) | ||
csd_genes <- csd_go %>% | ||
dplyr::select(-GO) %>% | ||
dplyr::distinct_all() | ||
``` | ||
|
||
```{r enrich_annot, echo=F} | ||
genome_go_freq <- go_coords %>% | ||
count(GO) %>% | ||
arrange(-n) | ||
csd_go_freq <- csd_go %>% | ||
count(GO) %>% | ||
arrange(-n) | ||
annot_counts <- merge(csd_go_freq, genome_go_freq, by="GO", suffixes=c("_csd", "_genome")) | ||
annot_counts <- annot_counts %>% mutate(tot_genome=sum(n_genome), | ||
tot_csd=sum(n_csd)) | ||
annot_counts$pval <- p.adjust( | ||
apply(annot_counts, 1, function(x) fisher.test(matrix(as.numeric(x[c(2, 5, 3, 4)]), ncol = 2))$p.value), | ||
method="BH" | ||
) | ||
top10 <- annot_counts %>% | ||
arrange(pval) %>% | ||
dplyr::rename(`q-value` = pval) %>% | ||
dplyr::select(-tot_genome, -tot_csd) %>% | ||
head(10) | ||
top10$term = unname(Term(top10$GO)) | ||
top10 %>% | ||
kable(caption = "Top 10 most significantly enriched GO terms in CSD regions") %>% | ||
kable_styling(bootstrap_options = "striped", position="left", font_size = 8) | ||
``` |