Figure 6 Clonal expansion and clonal sharing between B cell states.Rmd

---
title: "Figure 6 for amp_phase2_vdj project"
author: "Wagner, McDavid and Wang"
date: '`r Sys.Date()`'
params:
   all_contigs_ccdb: "refined/B_contigs_ccdb.rds"
   celltype: "B"
   min_cdr3_length: 5
   min_cdr3_dna_length: 15
   cdr3_identity: 0.965
   ident_etc_csv: "refined/02clust_30PC_res05_harmonized__noSCT_Bcells_cdat.csv"
   knitr_cache: FALSE
output:
  html_document:
    code_folding: hide
    toc: yes
    toc_float: yes
editor_options: 
  chunk_output_type: console
---

```{r, include = FALSE}
# Caching is driven by the document parameters; the same flag is reused by
# chunk headers further down.
knitr_cache <- params$knitr_cache

# Global chunk defaults: show code, silence messages/warnings, emit each figure
# as both PNG and PDF, and tie cache/autodep to the caching flag.
knitr::opts_chunk$set(
  echo = TRUE,
  message = FALSE,
  warning = FALSE,
  fig.height = 3.5,
  dev = c('png', 'pdf'),
  cache = knitr_cache,
  autodep = knitr_cache,
  cache.lazy = FALSE
)
#setwd("./Bcell/")
```

```{r setup, include = FALSE}
library(CellaRepertorium)
library(ggbeeswarm)
library(dplyr)
library(ggplot2)
library(readr)
library(tidyr)
library(stringr)
library(purrr)
library(knitr)
library(ggraph)
library(igraph)
library(rmdformats)
library(broom.mixed)
library(vegan)
library(ggridges)
library(scales)
library(circlize)
#setwd("./Bcell")
```

This file was rendered with the following parameters:
```{r}
print(params)

# Colour assigned to each B cell population label (name = label, value = hex
# colour). The label and colour vectors are kept as separate objects as well.
category_colors <- c(
  "B-Naïve(IgD_high)" = "#A6CEE3",
  "B-Naïve(IgD_low)"  = "#5399C6",
  "B-Naïve(HSPA1B+)"  = "#3E9F32",
  "B-Mito_high"       = "#99CD91",
  "B-Activated"       = "#FDB762",
  "B-ABC"             = "#CAC848",
  "B-Memory"          = "#F47676",
  "B-Clonal(LILRA4+)" = "#5C0905",
  "Plasmacells"       = "#CCAFCA",
  "Plasmablasts"      = "#825D99",
  "BT-doublets"       = "grey"
)
ident <- names(category_colors)
colors <- unname(category_colors)

# Display order of the populations (BT-doublets intentionally absent).
cluster_order <- c(
  "B-Naïve(IgD_high)", "B-Naïve(IgD_low)", "B-Naïve(HSPA1B+)",
  "B-Mito_high", "B-Activated", "B-ABC", "B-Memory",
  "B-Clonal(LILRA4+)", "Plasmablasts", "Plasmacells"
)
```

# Load filtered contig files
We begin with a dataframe of contigs from the `r params$celltype` cells that have passed through initial quality control.
```{r read_data}
# Load the ContigCellDB object named in the document parameters.
cdb <- readRDS(params$all_contigs_ccdb)

# When a per-cell annotation CSV is supplied, attach cluster identity and UMAP
# coordinates to the cell table.
if (!is.null(params$ident_etc_csv)) {
  ident_etc <- readr::read_csv(params$ident_etc_csv) %>%
    select(has_vdj, discard, keep, Barcode, ident, umap_1, umap_2)
  # Rows without a label are tagged "BT-doublets"; a stand-alone "Plasma" label
  # is renamed to "Plasmacells".
  ident_etc$ident[is.na(ident_etc$ident)] <- "BT-doublets"
  ident_etc$ident <- gsub("\\bPlasma\\b", "Plasmacells", ident_etc$ident)
  # No `by` is given, so the join uses whatever columns the two tables share.
  cdb$cell_tbl <- left_join(cdb$cell_tbl, ident_etc)
}

# Reorder idents to match other plots. Any label not listed here (for example
# "BT-doublets") turns into NA.
cdb$cell_tbl$ident <- factor(
  cdb$cell_tbl$ident,
  levels = c(
    "B-Naïve(IgD_high)", "B-Naïve(IgD_low)", "B-Naïve(HSPA1B+)", "B-Mito_high",
    "B-Activated", "B-ABC", "B-Memory", "Plasmablasts", "Plasmacells",
    "B-Clonal(LILRA4+)"
  )
)

# Per-contig somatic hypermutation: mismatches against the V and J germline
# alignments, and their sum divided by the combined V + J aligned length.
cdb$contig_tbl <- cdb$contig_tbl %>%
  mutate(
    v_germ_length = v_germline_end - v_germline_start,
    j_germ_length = j_germline_end - j_germline_start,
    vshm = (1 - v_identity) * v_germ_length,
    jshm = (1 - j_identity) * j_germ_length,
    shm = vshm + jshm,
    len_without_d = v_germ_length + j_germ_length,
    vj_shm_rate = (shm / len_without_d)
  )
```

```{r sample_Id_recode}
# Map raw sample identifiers to the RA01-RA12 labels used in the figures.
# Defined once so that the cell table and the contig table are always recoded
# with exactly the same mapping.
sample_id_map <- c(
  '300_0150' = 'RA01',
  '300_0171' = 'RA02',
  '300_0173' = 'RA03',
  '300_0174' = 'RA04',
  '300_0392' = 'RA05',
  '300_0410' = 'RA06',
  '300_0414' = 'RA07',
  '300_0416' = 'RA08',
  '300_1883' = 'RA09',
  '300_1930' = 'RA10',
  '301_0174' = 'RA11',
  '301_0270' = 'RA12'
)

# Original author's note: ContigCellDB does not play well with dplyr::recode --
# contig_tbl loses basically all recoded values when updated directly with
# cdb$contig_tbl = cdb$contig_tbl %>% mutate(recode()). The recoded tables are
# therefore built separately and written straight into the slots with `@`.
cell_tbl_recode <- cdb$cell_tbl %>%
  mutate(sample_ID = recode(sample_ID, !!!sample_id_map))

contig_tbl_recode <- cdb$contig_tbl %>%
  mutate(sample_ID = recode(sample_ID, !!!sample_id_map))

cdb@cell_tbl <- cell_tbl_recode
cdb@contig_tbl <- contig_tbl_recode
```

Initially we start with `r nrow(cdb)` cells and `r nrow(cdb$contig_tbl)` contigs. We keep contigs that are

* full-length
* productive
* high-confidence
* and with a sufficiently long CDR3; in this report, contigs with CDR3 string length greater than `r params$min_cdr3_length`

```{r filtering}
# Keep full-length, productive, high-confidence contigs that are not flagged as
# 'Multi' chain and whose CDR3 strings exceed the configured minimum lengths.
cdb$contig_tbl <- cdb$contig_tbl %>%
  dplyr::filter(
    full_length,
    productive == 'True',
    high_confidence,
    chain != 'Multi',
    str_length(cdr3) > params$min_cdr3_length,
    str_length(cdr3_nt) > params$min_cdr3_dna_length
  )

# Add in cell-level SHM rates: the mean V+J SHM rate over each cell's contigs.
cell_shm <- cdb$contig_tbl %>%
  group_by(!!!syms(cdb$cell_pk)) %>%
  summarize(shm = mean(vj_shm_rate, na.rm = TRUE))
cdb$cell_tbl <- left_join(cdb$cell_tbl, cell_shm, by = cdb$cell_pk)
```

After filtering, there are `r nrow(cdb)` cells, `r nrow(cdb$contig_tbl)` contigs and `r nrow(cdb$cell_tbl)` cell-table rows.
To find equivalence classes of CDR3 sequences, we use the program [CD-HIT](https://weizhongli-lab.org/cdhit_suite/cgi-bin/index.cgi?cmd=cd-hit).

Plot displaying the distribution of clones across each tissue and clones shared between tissues

To find equivalence classes of CDR3 sequences, we use the program [CD-HIT](https://weizhongli-lab.org/cdhit_suite/cgi-bin/index.cgi?cmd=cd-hit).

# DNA CDR3 clusters filter with `r params$cdr3_identity`

Here we partition contigs into sets with > `r params$cdr3_identity` similarity in the DNA sequence.
```{r cdr3_dna_clustering, results='hide', cache = knitr_cache}
# Cluster CDR3 nucleotide sequences with CD-HIT at the configured identity,
# storing the cluster id in `DNA97`, then run fine clustering on the result.
CDR3_DNA <- cdhit_ccdb(
  cdb,
  'cdr3_nt',
  type = 'DNA',
  cluster_pk = 'DNA97',
  identity = params$cdr3_identity,
  min_length = params$min_cdr3_dna_length,
  G = 1
)
CDR3_DNA <- fine_clustering(CDR3_DNA, sequence_key = 'cdr3_nt', type = 'DNA')

# Restrict to heavy-chain contigs, copy DNA97 onto the cell table, and keep
# only cells that carry a cluster identity.
clone_ident <- CDR3_DNA %>%
  filter_cdb(chain == 'IGH', tbl = 'contig_tbl') %>%
  canonicalize_cell(contig_fields = c('DNA97'))
clone_ident <- filter_cdb(clone_ident, !is.na(ident), tbl = 'cell_tbl')
```

Fig 6.A and Fig 6.B
```{r clone_expand_umap1, fig.width = 6, fig.height=4}
# Size of each IGH clone within a sample, binned into expansion categories.
# `cut()` intervals are right-closed, so the bins are (-1,1], (1,5], (5,20],
# (20,100] and (100,Inf]; the labels below spell out those bounds.
clonality_umap <- clone_ident$cell_tbl %>%
  group_by(DNA97, sample_ID) %>%
  mutate(n_cluster = n(), n_cluster = ifelse(is.na(DNA97), NA, n_cluster)) %>% # don't count NAs as unified group
  ungroup() %>%
  mutate(
    clonal_expansion = cut(
      n_cluster,
      breaks = c(-1, 1, 5, 20, 100, Inf),
      labels = c('Singleton', 'Small (1<X<=5)', 'Medium (5<X<=20)',
                 'Large (20<X<=100)', 'Hyperexpanded (100<X)')
    ),
    # Largest category first; cells without a clone size are shown as 'No BCR'.
    clonal_expansion = forcats::fct_explicit_na(forcats::fct_rev(clonal_expansion), 'No BCR')
  )

# Cell counts per population and expansion category.
clonality_umap_wide <- clonality_umap %>%
  group_by(ident, clonal_expansion) %>%
  summarize(value = n())

# UMAP coloured by expansion category.
clonality_umapplot <- ggplot(clonality_umap, aes(x = umap_1, y = umap_2, color = clonal_expansion)) +
  geom_point(size = .1) +
  theme_classic() +
  scale_color_brewer("IGH Expansion", palette = 'Reds', direction = -1) +
  guides(colour = guide_legend(override.aes = list(size = 5))) +
  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())

clonality_umapplot
write.csv(clonality_umap, "figure_plots/clonality_umap.csv")
saveRDS(clonality_umapplot, 'figure_plots/clonality_umapplot.rds')

# The bar plot excludes the B-Clonal(LILRA4+) population.
clonality_umap <- clonality_umap %>% filter(ident != 'B-Clonal(LILRA4+)')
clonality_barplot_tissue <- ggplot(
  clonality_umap %>%
    group_by(ident, clonal_expansion, tissue_source) %>%
    summarize(value = n()),
  aes(x = ident, fill = clonal_expansion, y = value)
) +
  geom_bar(position = "fill", stat = "identity") +
  coord_flip() +
  theme_minimal() +
  ylab('proportion') +
  xlab('ident') +
  scale_fill_brewer("IGH Expansion", palette = 'Reds', direction = -1) +
  facet_wrap(~tissue_source) +
  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank()) +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1), legend.position = "none")
clonality_barplot_tissue
```

### SYN shared clone
Fig 6.D
```{r}
# Synovial clones observed in at least two different populations, with the
# number of cells per (clone, population). Despite the variable name, no
# top-50 cut is applied in this chunk.
top50clones <- clone_ident$cell_tbl %>%
  filter(tissue_source == 'Syn', !is.na(DNA97)) %>%
  group_by(DNA97, ident) %>%
  summarise(n_clones = n()) %>%
  arrange(desc(n_clones)) %>%
  group_by(DNA97) %>%
  filter(n_distinct(ident) >= 2)

# All synovial cells that belong to one of those clones, with per-population
# counts attached.
tmp <- clone_ident$cell_tbl %>%
  filter(tissue_source == 'Syn', DNA97 %in% top50clones$DNA97) %>%
  left_join(top50clones, by = c('DNA97', 'ident'))

# One stacked bar per clone, coloured by population.
ggplot(tmp, aes(x = reorder(as.factor(DNA97), -n_clones), fill = ident)) +
  geom_bar() +
  ylab('Cells in clone') +
  xlab('SYN shared clones') +
  theme_classic() +
  scale_fill_manual(values = category_colors) +
  labs(fill = "Cluster") +
  theme(axis.text.x = element_blank(), axis.ticks.x = element_blank())

write.csv(tmp, "figure_plots/top50clones_shared_clone.csv")
```


### Top 50 clones in SYN
Fig 6.E
```{r}
# The 50 largest synovial clones by cell count (cells without a clone id are
# ignored).
top50clones <- clone_ident$cell_tbl %>%
  filter(tissue_source == 'Syn', !is.na(DNA97)) %>%
  group_by(DNA97) %>%
  summarise(n_clones = n()) %>%
  arrange(desc(n_clones)) %>%
  head(50)

# Synovial cells belonging to those clones, with the clone size attached.
tmp <- clone_ident$cell_tbl %>%
  filter(tissue_source == 'Syn', DNA97 %in% top50clones$DNA97) %>%
  left_join(top50clones, by = 'DNA97')

# Stacked bars ordered by decreasing clone size, coloured by population.
ggplot(tmp, aes(x = reorder(as.factor(DNA97), -n_clones), fill = ident)) +
  geom_bar() +
  ylab('Cells in clone') +
  xlab('Top 50 Synovial Clones') +
  theme_classic() +
  scale_fill_manual(values = category_colors) +
  labs(fill = "Cluster") +
  theme(axis.text.x = element_blank(), axis.ticks.x = element_blank())

write.csv(tmp, "figure_plots/top50clones_Syn.csv")
```

### Top 50 clones in PBL
Fig 6.F
```{r}
# The 50 largest blood clones by cell count, leaving out the
# B-Clonal(LILRA4+) population and cells without a clone id.
top50clones <- clone_ident$cell_tbl %>%
  filter(tissue_source == 'PBL', ident != "B-Clonal(LILRA4+)", !is.na(DNA97)) %>%
  group_by(DNA97) %>%
  summarise(n_clones = n()) %>%
  arrange(desc(n_clones)) %>%
  head(50)

# Blood cells (same exclusion) belonging to those clones, with clone size.
tmp <- clone_ident$cell_tbl %>%
  filter(tissue_source == 'PBL', ident != "B-Clonal(LILRA4+)") %>%
  filter(DNA97 %in% top50clones$DNA97) %>%
  left_join(top50clones, by = 'DNA97')

# Stacked bars ordered by decreasing clone size, coloured by population.
ggplot(tmp, aes(x = reorder(as.factor(DNA97), -n_clones), fill = ident)) +
  geom_bar() +
  ylab('Cells in clone') +
  xlab('Top 50 Blood Clones') +
  theme_classic() +
  scale_fill_manual(values = category_colors) +
  labs(fill = "Cluster") +
  theme(axis.text.x = element_blank(), axis.ticks.x = element_blank())
write.csv(tmp, "figure_plots/top50clones_PBL.csv")
```

```{r clones_plot, fig.width=7}
# Count contigs per clone within each sample and factor combination.
#
# cdb  : object exposing `$contig_tbl` and `$cluster_pk` (the cluster id
#        column name).
# fcts : character vector of extra grouping columns in the contig table.
#
# Only IGH / TRB contigs are counted. Returns an ungrouped table with one row
# per (sample_ID, fcts, cluster) holding `num_clones`, `cluster_abund`
# (`num_clones` divided by the number of clusters in that sample/fcts group)
# and `cut_clust` (binned `num_clones`). `fcts` and the cluster key are stored
# as attributes on the result.
summarize_clones <- function(cdb, fcts) {
  cluster_key <- cdb$cluster_pk
  size_breaks <- c(-1, 0, 1, 10, 100, 1000)

  heavy_or_beta <- filter(cdb$contig_tbl, chain %in% c('IGH', 'TRB'))

  per_cluster <- heavy_or_beta %>%
    group_by(sample_ID, !!!syms(fcts), !!sym(cluster_key)) %>%
    summarise(num_clones = n())

  per_cluster <- per_cluster %>%
    group_by(sample_ID, !!!syms(fcts)) %>%
    mutate(cluster_abund = num_clones / n()) %>%
    mutate(cut_clust = cut(num_clones, breaks = size_breaks))

  structure(ungroup(per_cluster), fcts = fcts, cluster_pk = cluster_key)
}

# Clone sizes per sample and tissue for the heavy-chain clusters.
dna_clones <- summarize_clones(clone_ident, 'tissue_source')
```

Fig 6.G
```{r DNA_clones_2}
# Spread clone counts into one column per factor level and bin them.
#
# clones_ : a table carrying `fcts` and `cluster_pk` attributes plus a
#           `num_clones` column (as produced by summarize_clones()).
#
# Returns one row per (sample_ID, cluster) with one column per level of the
# `fcts` column(s); absent combinations are filled with 0 and every count
# column is then binned with cut().
cross_tab_clones <- function(clones_) {
  key_cols <- c('sample_ID', attr(clones_, 'cluster_pk'))
  count_breaks <- c(-1, 0, 1, 10, 100, 1000)

  wide <- pivot_wider(
    clones_,
    id_cols = all_of(key_cols),
    names_from = attr(clones_, 'fcts'),
    values_from = num_clones,
    values_fill = 0
  )

  # Everything after the leading key columns is a count column.
  mutate(
    wide,
    across(.cols = -seq_along(key_cols), .fns = function(n) cut(n, breaks = count_breaks))
  )
}

dna_clones_wider <- cross_tab_clones(dna_clones)

# Drop clones in the largest size bin in either tissue, then label a clone as
# non-shared when exactly one of the two tissues is in the zero bin.
dna_clones_wider <- dna_clones_wider %>%
  filter(!(Syn == "(100,1e+03]" | PBL == "(100,1e+03]")) %>%
  mutate(
    Clone_Type = ifelse(
      xor(Syn == "(-1,0]", PBL == "(-1,0]"),
      "Non-shared clones",
      "Shared clones"
    )
  )

# Scatter of PBL versus SYN where each point is a cluster within a sample_ID
dna_pblsyn_plot <- ggplot(dna_clones_wider, aes(x = Syn, y = PBL, colour = Clone_Type)) +
  geom_jitter(width = .3, height = .3) +
  theme_minimal() +
  scale_color_grey(start = 0.8, end = 0.2)
dna_pblsyn_plot

write.csv(dna_clones_wider, "figure_plots/dna_clones_wider.csv")
```

Fig 6.H
```{r Plot of cell population composition of the tissue-trafficked clones}
# Contigs belonging to clones flagged as shared between tissues, joined to the
# cell table to pick up each cell's population.
shared_clones <- filter(dna_clones_wider, Clone_Type == "Shared clones")
tmp <- clone_ident@contig_tbl %>%
  filter(DNA97 %in% shared_clones$DNA97) %>%
  left_join(clone_ident$cell_tbl, by = c('barcode', "Barcode"))
tmp <- tmp[, c("DNA97.x", "ident", "tissue_source.x")]

# Number of contigs per tissue, clone and population.
dat <- tmp %>%
  group_by(tissue_source.x, DNA97.x, ident) %>%
  summarise(num_clones = n())

# Calculate percentage of num_clones in each DNA97 for each tissue_source.x
result_Syn <- dat %>%
  filter(tissue_source.x == "Syn") %>%
  group_by(DNA97.x) %>%
  summarise(total_num_clones = sum(num_clones)) %>%
  left_join(filter(dat, tissue_source.x == "Syn"), by = c('DNA97.x')) %>%
  mutate(percentage = (num_clones / total_num_clones) * 100)

result_PBL <- dat %>%
  filter(tissue_source.x == "PBL") %>%
  group_by(DNA97.x) %>%
  summarise(total_num_clones = sum(num_clones)) %>%
  left_join(filter(dat, tissue_source.x == "PBL"), by = c('DNA97.x')) %>%
  mutate(percentage = (num_clones / total_num_clones) * 100)

result_filtered <- rbind(result_PBL, result_Syn)

# Mirrored bars: synovial percentages are drawn downwards (negative), blood
# percentages upwards, separated by a white line at zero.
syn_rows <- result_filtered$tissue_source.x == "Syn"
pbl_rows <- result_filtered$tissue_source.x == "PBL"
ggplot(result_filtered, aes(x = as.factor(DNA97.x))) +
  geom_bar(data = result_filtered[syn_rows, ], aes(y = -percentage, fill = ident), stat = "identity") +
  geom_bar(data = result_filtered[pbl_rows, ], aes(y = percentage, fill = ident), stat = "identity") +
  geom_hline(yintercept = 0, colour = "white", lwd = 1) +
  labs(y = "Tissue source of cell", x = "Tissue-trafficked Clones") +
  labs(fill = "Cluster") +
  scale_fill_manual(values = category_colors) +
  theme_classic() +
  theme(axis.text.x = element_blank(), axis.ticks.x = element_blank()) +
  scale_y_continuous(breaks = seq(-50, 50, by = 100), labels = c("SYN", "PBL"))

write.csv(result_filtered, "figure_plots/tissue_trafficked_clones.csv")
```

# Seurat and CellaRepertorium objects used to make plots:
```{r}
# Load and preprocess BCR data: drop sample 300_0415, cells/contigs without a
# population label, and the B-Clonal(LILRA4+) population, applying the same
# conditions to both the cell table and the contig table.
B_contigs_DNA <- readRDS("./refined/B_contigs_DNA.rds") %>%
  filter_cdb(
    sample_ID != '300_0415',
    !is.na(ident),
    ident != 'B-Clonal(LILRA4+)',
    tbl = 'cell_tbl'
  ) %>%
  filter_cdb(
    sample_ID != '300_0415',
    !is.na(ident),
    ident != 'B-Clonal(LILRA4+)',
    tbl = 'contig_tbl'
  )

# Convert `col` to a factor whose levels follow `levs` (by default the
# document-wide `cluster_order`); values not present in `levs` become NA.
enforce_cluster_order <- function(col, levs = cluster_order) {
  factor(col, levels = levs)
}
# Put population labels into the shared display order on both tables.
B_contigs_DNA@cell_tbl$ident <- enforce_cluster_order(B_contigs_DNA@cell_tbl$ident)
B_contigs_DNA@contig_tbl$ident <- enforce_cluster_order(B_contigs_DNA@contig_tbl$ident)

# Upper-case the tissue labels on both tables.
B_contigs_DNA@cell_tbl$tissue_source <- toupper(B_contigs_DNA@cell_tbl$tissue_source)
B_contigs_DNA@contig_tbl$tissue_source <- toupper(B_contigs_DNA@contig_tbl$tissue_source)

# Heavy-chain-only object with the DNA97 cluster id copied to the cell table.
heavy_only <- filter_cdb(B_contigs_DNA, chain == 'IGH', tbl = "contig_tbl")
B_contigs_heavy <- canonicalize_cell(heavy_only, contig_fields = 'DNA97')
```

```{r subset_downstream}
##Filter out BT doublets and B-clonal(LILRA4+) [All downstream analysis does not include BT doublets, clonal, or cells without 5']
# NOTE(review): `cdb` is the ContigCellDB read at the top of this document, and
# a top-level character vector named `ident` is also defined earlier. Confirm
# that `subset()` evaluates `ident` against the cell table here rather than
# against that global vector -- TODO verify. `ssce` is not used again in the
# visible part of this document.
ssce = subset(cdb, subset = ident != "B-Clonal(LILRA4+)" & !is.na(ident))
# Display order without the B-Clonal(LILRA4+) population.
cluster_order_new = cluster_order[which(cluster_order != 'B-Clonal(LILRA4+)')]
# Re-level the heavy-chain tables so their `ident` factors use the reduced order.
B_contigs_heavy@cell_tbl$ident = enforce_cluster_order(B_contigs_heavy@cell_tbl$ident,levs = cluster_order_new)
B_contigs_heavy@contig_tbl$ident = enforce_cluster_order(B_contigs_heavy@contig_tbl$ident,levs = cluster_order_new)
```

# Circos plot
Fig 6.I
```{r circosplots}
library(varhandle)
# Count the clones found in both of two populations.
#
# pop1, pop2  : population labels compared against the `ident` column.
# clone_field : name of the column holding the clone id. The original code
#               ignored this argument and always read `DNA97`; it is honoured
#               now, and the default keeps the previous behaviour.
# data        : table with an `ident` column and the `clone_field` column.
#               NOTE(review): the default is the `B_contigs_heavy` object
#               itself, whereas every call in this document passes a contig
#               table -- confirm the default is ever usable.
#
# Returns a one-row data.frame with columns `from`, `to` and `value` (number
# of distinct clone ids present in both populations). A missing clone id that
# occurs in both populations is counted like any other id, as before.
count_shared_clones <- function(pop1, pop2, clone_field = 'DNA97', data = B_contigs_heavy) {
  clones_p1 <- filter(data, ident == pop1)[[clone_field]]
  clones_p2 <- filter(data, ident == pop2)[[clone_field]]
  n_shared <- length(intersect(clones_p1, clones_p2))
  data.frame(from = pop1, to = pop2, value = n_shared)
}

# Every unordered pair of populations present in the heavy-chain contig table
# (one pair per row).
pop_combinations <- t(combn(B_contigs_heavy@contig_tbl$ident %>% unique %>% na.omit, 2))

# Sector colours for the chord diagrams (populations excluded downstream are
# removed).
sect_colors <- category_colors[!(names(category_colors) %in% c("B-Clonal(LILRA4+)", "BT-doublets"))]

# Split the contigs by tissue once instead of re-filtering inside the loop.
contigs_pbl <- B_contigs_heavy@contig_tbl %>% filter(tissue_source == 'PBL')
contigs_syn <- B_contigs_heavy@contig_tbl %>% filter(tissue_source == 'SYN')

# seq_len() is safe when there are no pairs, unlike 1:nrow().
pair_idx <- seq_len(nrow(pop_combinations))

####PBL###
share_df_pbl <- lapply(pair_idx, function(i) {
  count_shared_clones(pop1 = pop_combinations[i, 1], pop2 = pop_combinations[i, 2], data = contigs_pbl)
})
share_df_syn <- lapply(pair_idx, function(i) {
  count_shared_clones(pop1 = pop_combinations[i, 1], pop2 = pop_combinations[i, 2], data = contigs_syn)
})

# Keep only population pairs that share at least one clone.
share_df_pbl <- dplyr::bind_rows(share_df_pbl) %>% filter(value > 0)
share_df_syn <- dplyr::bind_rows(share_df_syn) %>% filter(value > 0)

#Ordering that impart biological meaning in each compartment:
# Each entry corresponds positionally to one row of share_df_pbl / share_df_syn
# (23 and 8 entries respectively); 1 means "swap from and to" for that row.
# NOTE(review): these vectors are hard-coded, so they presumably have to match
# the number and order of rows left after the `value > 0` filter -- re-check
# them whenever the input data or clustering changes.
switch_vec_pbl = c(rep(0,8),rep(0,3),rep(1,5),1,0,1,0,0,1,1)
switch_vec_syn = c(rep(1,4),0,1,1,0)

# Convert from/to with unfactor(), apply the swap flags, and keep only the
# re-oriented pair plus the shared-clone count.
share_df_pbl = share_df_pbl %>% mutate(from = unfactor(from), to = unfactor(to),
                        switch = switch_vec_pbl,
                        from_ordered = ifelse(switch == 1, to, from),
                        to_ordered = ifelse(switch == 1, from, to)) %>% dplyr::select(from_ordered,to_ordered,value)

# Same re-orientation for the synovial table.
share_df_syn = share_df_syn %>% mutate(from = unfactor(from), to = unfactor(to),
                        switch = switch_vec_syn,
                        from_ordered = ifelse(switch == 1, to, from),
                        to_ordered = ifelse(switch == 1, from, to)) %>% dplyr::select(from_ordered,to_ordered,value)


#Joint plot
circos.clear()

# Blood: chord diagram of clone sharing between populations, then a label and
# an axis drawn on the grid track of every sector.
chordDiagram(
  share_df_pbl,
  grid.col = sect_colors,
  annotationTrack = c("grid"),
  annotationTrackHeight = mm_h(12),
  transparency = 0
)
for (sector in get.all.sector.index()) {
  x_range <- get.cell.meta.data("xlim", sector.index = sector, track.index = 1)
  y_range <- get.cell.meta.data("ylim", sector.index = sector, track.index = 1)
  circos.text(
    mean(x_range), mean(y_range) + 7, sector,
    sector.index = sector, track.index = 1,
    facing = "bending.inside", niceFacing = TRUE, col = "black"
  )
  circos.axis(h = "top", labels.cex = 2, sector.index = sector, track.index = 1)
}
title(main = 'PBL', cex = 22, line = -20)

# Synovium: same layout for the synovial sharing table.
chordDiagram(
  share_df_syn,
  grid.col = sect_colors,
  annotationTrack = c("grid"),
  annotationTrackHeight = mm_h(12),
  transparency = 0
)
for (sector in get.all.sector.index()) {
  x_range <- get.cell.meta.data("xlim", sector.index = sector, track.index = 1)
  y_range <- get.cell.meta.data("ylim", sector.index = sector, track.index = 1)
  circos.text(
    mean(x_range), mean(y_range) + 7, sector,
    sector.index = sector, track.index = 1,
    facing = "bending.inside", niceFacing = TRUE, col = "black"
  )
  circos.axis(h = "top", labels.cex = 2, sector.index = sector, track.index = 1)
}
title(main = 'SYN', cex = 22, line = -20)
```

The test statistic is what we'll call the "log concentration index" (LCI) -- the entropy relative to a uniform distribution. Smaller values indicate more oligoclonality; exponentiate to get, roughly, the ratio between the number of distinct clones and the number of barcodes.
CDR3_DNA
The B-Naïve(IgD_high) log concentration index is compared against 0, and the remaining populations are compared against the B-Naïve LCI. Ultimately the permutation is testing:

$$\text{perm\_resid} = \bigl(\mathrm{LCI}(\text{pop}) - \mathrm{LCI}(\text{Naive})\bigr) - \mathbb{E}\bigl[\mathrm{LCI}(\text{pop})\bigr] + \mathbb{E}\bigl[\mathrm{LCI}(\text{Naive})\bigr]$$

This is the log-ratio of excess concentration.

Note there's something paradoxical about cases where the expected value is positive, like HSPA1B+.

# AA properties of CDR3
```{r cdr3_props, fig.width = 8, fig.height=8}
# Get the medoid contig to represent each clone for AA analyses, keeping only
# cells that carry a population label.
CDR3_canon <- canonicalize_cluster(CDR3_DNA)
CDR3_canon@cell_tbl <- filter(CDR3_canon@cell_tbl, !is.na(ident))

# Get amino acid properties from the CDR3 DNA sequence of every medoid contig
# (contigs whose cdr3_nt is 'None' are skipped).
medoid_contigs <- filter(CDR3_canon$contig_tbl, is_medoid, cdr3_nt != 'None')
tmp <- alakazam::aminoAcidProperties(medoid_contigs, seq = 'cdr3_nt', nt = TRUE)

# Attach the population label; all = TRUE keeps unmatched rows from both sides.
tmp <- merge(
  tmp,
  CDR3_canon@cell_tbl[, c("barcode", "Barcode", "ident")],
  by = c("barcode", "Barcode"),
  all = TRUE
)

# Pivot on cdr_property: one row per contig and property, with a short
# property name.
props <- tmp %>%
  tidyr::gather('cdr_prop', 'value', starts_with("cdr3_nt_aa")) %>%
  mutate(cdr_prop_short = str_remove(cdr_prop, 'cdr3_nt_aa_'))

# Get average SHM rates to order plots by and force levels of ident to be in
# order of increasing average SHM.
shm_ordering <- cdb$cell_tbl %>%
  filter(!is.na(ident), !is.na(shm)) %>%
  group_by(ident) %>%
  summarize(avg_shm = 100 * mean(shm)) %>%
  arrange(avg_shm) %>%
  pull(ident)

props$ident <- factor(props$ident, levels = shm_ordering)
#subtitle = 'ordered by increasing average SHM rate per subpopulation'

tmp_theme <- theme_bw() + theme(legend.position = "bottom")
```

## Test for cell population effect on amino acid properties
SFig 10.A
B-Naïve(IgD_high) used as intercept in mixed effect model

# Model on ident
```{r cdr3_tests, fig.width = 8, fig.height=8, warning=FALSE}
# Get cell population with lowest shm rate (first level of `props$ident`),
# which will be set as intercept in models below:
plt_intercept <- levels(props$ident)[1]

#subtitle_intercept = paste0(subtitle,'\n','\n',"Baseline population set as: ",plt_intercept)

### Testing for differences in cdr3_AA_properties via linear mixed effect model
### in heavy chains: one model per property, population as fixed effect and
### sample as random intercept; keep the fixed-effect table.
foo <- props %>%
  filter(chain == 'IGH') %>%
  group_by(cdr_prop_short) %>%
  do(
    lme4::lmer(value ~ ident + (1 | sample_ID), data = .) %>%
      tidy(effects = 'fixed') %>%
      mutate(ident = gsub('ident', "", term))
  )

foo$ident <- factor(foo$ident, levels = shm_ordering)

# Normal-approximation 95% interval; `significant` is 1 when it excludes zero.
# The intercept row is dropped so only population contrasts are plotted.
foo <- foo %>%
  mutate(
    ymin = estimate - std.error * 1.96,
    ymax = estimate + std.error * 1.96,
    significant = ifelse(ymin < 0 & 0 < ymax, 0, 1)
  ) %>%
  filter(term != '(Intercept)')

# One panel per property. `scales` is spelled out in full (the original relied
# on partial matching of `scale`), and an x-axis text rotation that was later
# overridden by element_blank() has been removed; the rendered plot is the same.
AA_plot_simple <- ggplot(foo, aes(x = ident, y = estimate, ymin = ymin, ymax = ymax)) +
  geom_pointrange(position = position_dodge(width = .3), aes(colour = ident)) +
  facet_wrap(~cdr_prop_short, scales = 'free_y') +
  geom_hline(yintercept = 0, lty = 2) +
  tmp_theme +
  theme(axis.text.x = element_blank()) +
  ggtitle('IgH CDR3 amino acid properties') +
  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())

AA_plot_simple

write.csv(foo, "figure_plots/Effect size and 95% confidence intervals of population on IgH CDR3 amino acid properties.csv")
```

SFig 10.C 
```{r}
# Cells per population, stacked by chain pairing. Unlabelled cells and the
# B-Clonal(LILRA4+) population are left out.
labelled_cells <- filter(cdb$cell_tbl, !is.na(ident) & ident != "B-Clonal(LILRA4+)")

chain_pair_plot <- ggplot(labelled_cells, aes(y = ident, fill = pairing)) +
  geom_bar() +
  theme_classic() +
  labs(fill = 'Pairing') +
  ylab(NULL) +
  xlab(NULL) +
  coord_flip() +
  scale_fill_brewer(palette = 'Set1', type = 'qual') +
  guides(colour = guide_legend(override.aes = list(size = 5))) +
  theme(
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1)
  )
chain_pair_plot
```