-
Notifications
You must be signed in to change notification settings - Fork 0
/
microbial_diversity.R
executable file
·412 lines (409 loc) · 20.1 KB
/
microbial_diversity.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
#' ---
#' title: "Microbial data: overview of data, diversity statistics"
#' author: "Beau Larkin\n"
#' date: "Last updated: `r format(Sys.time(), '%d %B, %Y')`"
#' output:
#' github_document:
#' toc: true
#' toc_depth: 3
#' df_print: paged
#' ---
#'
#' # Description
#' Microbial data analyzed here include site-species tables derived from high-throughput sequencing of
#' ITS and 18S genes and clustering into 97% similar OTUs.
#' This report presents basic statistics and visualizations of species richness, Shannon's
#' diversity/evenness, and Simpson's diversity/evenness in the microbial species data across field types.
#'
#' - Diversity and evenness of microbial communities
#' - Interpretation of differences in diversity among regions and field types, and over years
#'
#' # Packages and libraries
packages_needed = c(
"rsq",
"lme4",
"multcomp",
"tidyverse",
"vegan",
"ggbeeswarm",
"knitr",
"conflicted",
"colorspace"
)
packages_installed = packages_needed %in% rownames(installed.packages())
#+ packages,message=FALSE
if (any(!packages_installed)) {
install.packages(packages_needed[!packages_installed])
}
#+ libraries,message=FALSE
for (i in 1:length(packages_needed)) {
library(packages_needed[i], character.only = T)
}
#+ conflicts,message=FALSE
conflict_prefer("filter", "dplyr")
conflict_prefer("select", "dplyr")
#'
#' # Data
#' ## Sites-species tables
#' List *spe* holds average sequence abundances for the top 6 samples per field.
#' CSV files were produced in `process_data.R`
spe <- list(
its_raw = read_csv(paste0(getwd(), "/clean_data/spe_ITS_raw.csv"),
show_col_types = FALSE),
its_rfy = read_csv(paste0(getwd(), "/clean_data/spe_ITS_rfy.csv"),
show_col_types = FALSE),
amf_raw = read_csv(paste0(getwd(), "/clean_data/spe_18S_raw.csv"),
show_col_types = FALSE),
amf_rfy = read_csv(paste0(getwd(), "/clean_data/spe_18S_rfy.csv"),
show_col_types = FALSE)
)
#'
#' ## Site metadata and design
sites <- read_csv(paste0(getwd(), "/clean_data/sites.csv"), show_col_types = FALSE) %>%
mutate(field_type = factor(field_type, ordered = TRUE, levels = c("corn", "restored", "remnant"))) %>%
select(-lat, -long, -yr_restore, -yr_rank)
#'
#' # Functions
#' The following functions are used to streamline code and reduce errors:
#'
#' ## Calculate Hill's series on a samples-species matrix
#+ calc_diversity_function
calc_diversity <- function(spe) {
spe_mat <- data.frame(spe, row.names = 1)
N0 <- apply(spe_mat > 0, MARGIN = 1, FUN = sum)
N1 <- exp(diversity(spe_mat))
N2 <- diversity(spe_mat, "inv")
E10 <- N1 / N0
E20 <- N2 / N0
return(
data.frame(N0, N1, N2, E10, E20) %>%
rownames_to_column(var = "field_key") %>%
mutate(field_key = as.integer(field_key)) %>%
left_join(sites, by = join_by(field_key)) %>%
pivot_longer(
cols = N0:E20,
names_to = "hill_index",
values_to = "value"
) %>%
mutate(hill_index = factor(
hill_index,
ordered = TRUE,
levels = c("N0", "N1", "N2", "E10", "E20")
))
)
}
#'
#' ## Test diversity measures across site types with mixed model
#+ test_diversity_function
test_diversity <- function(data) {
hills <- levels(data$hill_index)
for(i in 1:length(hills)) {
cat("---------------------------------\n")
print(hills[i])
cat("---------------------------------\n\n")
mod_data <- data %>%
filter(hill_index == hills[i]) %>%
mutate(field_type = factor(field_type, ordered = FALSE))
mmod <- lmer(value ~ field_type + (1 | region), data = mod_data, REML = FALSE)
print(mmod)
cat("\n---------------------------------\n\n")
mmod_null <- lmer(value ~ 1 + (1 | region), data = mod_data, REML = FALSE)
print(mmod_null)
cat("\n---------------------------------\n\n")
print(anova(mmod, mmod_null))
mod_tuk <- glht(mmod, linfct = mcp(field_type = "Tukey"), test = adjusted("holm"))
print(mod_tuk)
print(cld(mod_tuk))
cat("\n\n\n")
}
}
#' ## Change in diversity over time
#' Do Hill's numbers correlate with years since restoration?
#' This may only be appropriate to attempt in the Blue Mounds region, and even there, it may be difficult
#' to justify that the area meets the criteria for a chronosequence.
test_age <- function(data, caption=NULL) {
temp_df <-
data %>%
filter(field_type == "restored", region == "BM") %>%
pivot_wider(names_from = hill_index, values_from = value) %>%
select(-starts_with("field"),-region)
lapply(temp_df %>% select(-yr_since), function(z) {
test <-
cor.test(temp_df$yr_since,
z,
alternative = "two.sided",
method = "pearson")
return(data.frame(
cor = round(test$estimate, 2),
R2 = round(test$estimate^2, 2),
pval = round(test$p.value, 3)
))
}) %>%
bind_rows(.id = "hill_num") %>%
remove_rownames() %>%
mutate(sig = case_when(pval <= 0.05 ~ "*", TRUE ~ "")) %>%
kable(format = "pandoc", caption = caption)
}
#'
#' # Analysis and Results
#'
#' ## OTUs pre/post corrections
#'
#' Datasets were corrected for survey effort (min samples per field) and sequencing depth
#' (min sequences per field). What was the effect of these actions on the number of OTUs recovered?
#' After rarefying, zero-abundance OTUs were removed.
#' Few were lost due to rarefying, as we can see by counting columns (less column 1 because it
#' has field site keys):
Map(function(x) ncol(x)-1, spe)
#' It appears that little will be lost in terms of richness or diversity by rarefying.
#'
#' ## Microbial diversity
#'
#' Microbial diversity is considered for 18S or ITS gene
#' datasets. For each set, Hill's numbers are produced ([Hill 1973](http:https://doi.wiley.com/10.2307/1934352),
#' [Borcard and Legendere 2018, p. 373](http:https://link.springer.com/10.1007/978-3-319-71404-2)) and plotted,
#' with means differences tested using mixed-effects linear models in `lmer` ([Bates et al. 2015](https://doi.org/10.18637/jss.v067.i01)).
#' Correlations are then produced to visualize change in diversity trends over time,
#' with similar mixed-effects tests performed.
#'
#' The statistical tests are on shaky ground due to imbalance, but are presented here as an
#' attempt to at least explore some differences and think more later about how we could
#' present them in a more valid way.
#'
#' Hill's numbers, brief description:
#'
#' - $N_{0}$ = species richness
#' - $N_{1}$ = Shannon's diversity ($e^H$; excludes rarest species, considers the number of "functional" species)
#' - $N_{2}$ = Simpson's diversity ($1 / \lambda$; number of "codominant" species)
#' - $E_{10}$ = Shannon's evenness (Hill's ratio $N_{1} / N_{0}$)
#' - $E_{20}$ = Simpson's evenness (Hill's ratio $N_{2} / N_{0}$)
#'
#+ diversity_calculations
# Rarefied tables only
div <- Map(calc_diversity, spe[c(2,4)])
#'
#' ### Fungi (ITS gene)
#' #### Diversity across field types
#' Run the linear model and test differences among field types for diversity.
#+ test_div_its_otu
test_diversity(div$its_rfy)
#'
#' #### Result: ITS diversity
#'
#' - $N_{0}$: field type is significant by likelihood ratio test at p<0.001 with region as
#' a random effect. Species richness in corn fields was less than restored or remnants, which
#' didn't differ at p=0.05.
#' - $N_{1}$: field type is significant
#' by likelihood ratio test at p<0.05 with region as a random effect. Shannon's diversity in corn fields was less
#' than restored or remnants, which didn't differ at p=0.05.
#' - $N_{2}$, $E_{10}$, and $E_{20}$: model fits for both null and full models were singular and NS at p<0.05.
#'
#' Figure labels are generated and the diversity data are plotted below. An interaction plot follows,
#' and is useful to consider what the model can and cannot say about differences in regions and field types.
#+ div_its_otu_labs
labs_its <- data.frame(
hill_index = factor(c(rep("N0", 3), rep("N1", 3)),
ordered = TRUE,
levels = c("N0", "N1", "N2", "E10", "E20")),
lab = c("a", "b", "b", "a", "b", "b"),
xpos = rep(c(1,2,3), 2),
ypos = rep(c(590, 170), each = 3)
)
#+ plot_div_its_rfy,fig.width=9,fig.height=7,fig.align='center'
ggplot(div$its_rfy, aes(x = field_type, y = value)) +
facet_wrap(vars(hill_index), scales = "free_y") +
geom_boxplot(varwidth = TRUE, fill = "gray90", outlier.shape = NA) +
geom_beeswarm(aes(fill = region), shape = 21, size = 2, dodge.width = 0.2) +
geom_label(data = labs_its, aes(x = xpos, y = ypos, label = lab), label.size = NA) +
labs(x = "", y = "Index value", title = "TGP microbial diversity (Hill's), ITS, 97% OTU",
caption = "N0-richness, N1-e^Shannon, N2-Simpson, E10=N1/N0, E20=N2/N0, width=n,\nletters indicate significant differences at p<0.05") +
scale_fill_discrete_qualitative(palette = "Dark3") +
theme_bw()
#' Let's also make a figure to show just the diversity measures, and with better labels.
hill_labs <- c("Richness", "Shannon's", "Simpson's")
names(hill_labs) <- c("N0", "N1", "N2")
#+ div_its_box,fig.width=7,fig.height=3
div$its_rfy %>%
filter(hill_index %in% c("N0", "N1", "N2")) %>%
ggplot(aes(x = field_type, y = value)) +
facet_wrap(vars(hill_index), scales = "free_y", labeller = labeller(hill_index = hill_labs)) +
geom_boxplot(varwidth = TRUE, fill = "gray90", outlier.shape = NA) +
geom_beeswarm(aes(shape = region, fill = field_type), size = 2, dodge.width = 0.3) +
geom_label(data = labs_its, aes(x = xpos, y = ypos, label = lab), label.size = NA) +
labs(y = "Species equivalent (n)") +
scale_fill_discrete_qualitative(name = "Field Type", palette = "Harmonic") +
scale_shape_manual(name = "Region", values = c(21, 22, 23, 24)) +
theme_bw() +
theme(axis.title.x = element_blank()) +
guides(fill = guide_legend(override.aes = list(shape = 21)))
#' Richness and evenness parameters increase from corn, to restored, to remnant fields, and some
#' support exists for this pattern to occur across regions.
#+ plot_div_its_otu_interaction,fig.width=9,fig.height=7,fig.align='center'
ggplot(
div$its_rfy %>%
group_by(field_type, region, hill_index) %>%
summarize(avg_value = mean(value), .groups = "drop"),
aes(x = field_type, y = avg_value, group = region)) +
facet_wrap(vars(hill_index), scales = "free_y") +
geom_line(aes(linetype = region)) +
geom_point(aes(fill = region), size = 2, shape = 21) +
labs(x = "", y = "Average value", title = "Interaction plot of Hill's numbers, ITS, 97% OTU") +
scale_fill_discrete_qualitative(palette = "Dark3") +
theme_bw()
#'
#' #### Key observations:
#'
#' - The restored field at LP contains very high diversity, co-dominance, and evenness of fungi.
#' - The restored field at FG contains low diversity, co-dominance, and evenness.
#' - Interactions are less an issue with $N_{0}$ and $N_{1}$
#'
#' #### Diversity over time
#' Next, trends in diversity are correlated with years since restoration. This can only be attempted
#' with Fermi and Blue Mounds sites; elsewhere, blocks cannot be statistically accounted for because
#' treatments aren't replicated within them.
#+ plot_yrs_since_resto_FLBM,fig.width=9,fig.height=7,fig.align='center'
div$its_rfy %>%
filter(field_type == "restored",
hill_index %in% c("N0", "N1", "N2"),
region %in% c("BM", "FL")) %>%
ggplot(aes(x = yr_since, y = value)) +
facet_grid(cols = vars(region), rows = vars(hill_index), scales = "free_y") +
geom_point(shape = 21, fill = "gray50", size = 2) +
labs(x = "Years since restoration", y = "index value", title = "TGP microbial diversity (Hill's) over time, ITS, 97% OTU",
caption = "N0-richness, N1-e^Shannon, N2-Simpson") +
theme_bw()
#' If the relationship exists, it is in Blue Mounds only.
#'
#' It's probably justified to correlate diversity with field age in Blue Mounds's restored fields.
#' A Pearson's correlation is used:
#+ test_age_its_otu
test_age(div$its_rfy,
caption = "Correlation between Hill's numbers and field age in the Blue Mounds region: ITS, 97% OTU")
#'
#' Hill's $N_{1}$ decreases with age since restoration in the Blue Mounds area ($R$=-0.84, p<0.05).
#' It's driven primarily by Karla Ott's older restoration which differs in plant community.
#' Removing this field, the correlation is no longer significant, and it not even close.
#' This is odd and points to a confounding effect driven by difference in restoration strategy over time.
#' Karla Ott's field is heavily dominated by a C4 grass and has relatively poor plant species diversity.
#' It's possible that other site differences (soils, etc.) also confound this relationship. It's possible
#' that we cannot attempt to present this as a time-based result at all. Or, maybe the number of
#' functionally dominant species slowly declines over time due to lack of disturbance and substrate
#' diversity.
#'
#' That diversity metrics aren't changing over time, or are possibly declining over time
#' after restoration is concerning and worth mentioning.
#'
#' In any case, let's take a look at Shannon's diversity over time in Blue Mounds' restored fields.
#+ bm_test_age,message=FALSE,fig.width=7,fig.height=6,fig.align='center'
div$its_rfy %>%
filter(region == "BM", field_type == "restored", hill_index == "N1") %>%
ggplot(aes(x = yr_since, y = value)) +
geom_smooth(method = "lm", se = TRUE, linewidth = 0.8, fill = "gray70") +
geom_label(aes(label = field_name)) +
labs(x = "Years since restoration", y = expression("Shannon's diversity"~(N[1]))) +
theme_classic()
#' Karla Ott's field was almost exclusively dominated by big bluestem, possibly leading to
#' a simpler microbial community. That field has too much leverage on this plot.
#' Right now, my interpretation is that restoration strategies changed
#' over time and although restored plant communities persisted, microbial communities simplified over time.
#' Immediately after restoration, microbial diversity increased rapidly and was not sustained
#' because the soil properties ultimately didn't change very much.
#'
#' Site factors (soil type) are hard to tease out, but in later analyses we will try using measured
#' soil chemical properties.
#'
#'
#' ### AMF (18S gene)
#' Run the linear model and test differences among field types for diversity.
#+ test_div_amf_otu
test_diversity(div$amf_rfy)
#'
#' #### Result: AMF diversity
#' Despite apparent trends across field types, variances are large and interactions apparent. All
#' model fits are questionable due to [singularity](https://rdrr.io/cran/lme4/man/isSingular.html).
#' The following results and plots are provisional and included here for consideration only.
#'
#' - $N_{0}$: Species richness differences by field type are significant by likelihood ratio test at p<0.005 with region as
#' a random effect. Species richness in corn fields was less than restored, and neither differed
#' from remnants at p=0.05.
#' - $N_{1}$: field type is significant by likelihood ratio test at p<0.001 with region as a random effect.
#' Shannon's diversity in corn fields was less than restored or remnants, which didn't differ at p=0.05.
#' - $N_{2}$: field type is significant by likelihood ratio test at p<0.001 with region as a random effect.
#' Restored and remnant fields host a larger group of co-dominant AMF than are found in cornfields
#' (p=0.05).
#' - $E_{10}$: field type is significant by likelihood ratio test at p<0.05 with region as a random effect.
#' Weak support for higher evenness of functionally abundant species in remnant fields was found
#' (p<0.05).
#' - $E_{20}$: Similar trend as $E_{10}$ but NS (p>0.05).
#'
#' Figure labels are generated and the diversity data are plotted below. An interaction plot follows,
#' and is useful to consider what the model can and cannot say about differences in regions and field types.
#+ div_amf_otu_labs
labs_amf <- data.frame(
hill_index = factor(c(rep("N0", 3), rep("N1", 3), rep("N2", 3), rep("E10", 3)),
ordered = TRUE,
levels = c("N0", "N1", "N2", "E10", "E20")),
lab = c("a", "b", "b", "a", "b", "b", "a", "b", "b", "a", "a", "b"),
xpos = rep(c(1,2,3), 4),
ypos = rep(c(66, 35, 26, 0.59), each = 3)
)
#+ plot_div_amf_otu,fig.width=9,fig.height=7,fig.align='center'
ggplot(div$amf_rfy, aes(x = field_type, y = value)) +
facet_wrap(vars(hill_index), scales = "free_y") +
geom_boxplot(varwidth = TRUE, fill = "gray90", outlier.shape = NA) +
geom_beeswarm(aes(fill = region), shape = 21, size = 2, dodge.width = 0.2) +
geom_label(data = labs_amf, aes(x = xpos, y = ypos, label = lab), label.size = NA) +
labs(x = "", y = "Index value", title = "TGP microbial diversity (Hill's), AMF (18S), 97% OTU",
caption = "N0-richness, N1-e^Shannon, N2-Simpson, E10=N1/N0, E20=N2/N0, width=n,\nletters indicate significant differences at p<0.05") +
scale_fill_discrete_qualitative(palette = "Dark3") +
theme_bw()
#' Let's also make a figure to show just the diversity measures, and with better labels.
#+ div_amf_box,fig.width=7,fig.height=3
div$amf_rfy %>%
filter(hill_index %in% c("N0", "N1", "N2")) %>%
ggplot(aes(x = field_type, y = value)) +
facet_wrap(vars(hill_index), scales = "free_y", labeller = labeller(hill_index = hill_labs)) +
geom_boxplot(varwidth = TRUE, fill = "gray90", outlier.shape = NA) +
geom_beeswarm(aes(shape = region, fill = field_type), size = 2, dodge.width = 0.3) +
geom_label(data = labs_amf %>% filter(hill_index != "E10"), aes(x = xpos, y = ypos, label = lab), label.size = NA) +
labs(y = "Species equivalent (n)") +
scale_fill_discrete_qualitative(name = "Field Type", palette = "Harmonic") +
scale_shape_manual(name = "Region", values = c(21, 22, 23, 24)) +
theme_bw() +
theme(axis.title.x = element_blank()) +
guides(fill = guide_legend(override.aes = list(shape = 21)))
#' Richness increases from corn, to restored and remnant fields, and some
#' support exists for this pattern to occur across regions. The trend is weakest with $N_{0}$, suggesting
#' that both restored and remnant soils contain more functionally abundant and co-dominant species than are found in cornfields,
#' but some cornfields have "long tails" of rare species. Wide variances stifle inferences. The
#' trend detected in evenness suggests that a few weedy AMF species dominate cornfields but most restored
#' fields host more balanced communities that are more similar to remnants.
#+ plot_div_amf_otu_interaction,fig.width=9,fig.height=7,fig.align='center'
ggplot(
div$amf_rfy %>%
group_by(field_type, region, hill_index) %>%
summarize(avg_value = mean(value), .groups = "drop"),
aes(x = field_type, y = avg_value, group = region)) +
facet_wrap(vars(hill_index), scales = "free_y") +
geom_line(aes(linetype = region)) +
geom_point(aes(fill = region), size = 2, shape = 21) +
labs(x = "", y = "Average value", title = "Interaction plot of Hill's numbers, AMF (18S), 97% OTU") +
scale_fill_discrete_qualitative(palette = "Dark3") +
theme_bw()
#'
#' #### Key observations:
#'
#' - Cornfields differ from restored and remnant fields, with lower richness, fewer dominant species,
#' and greater dominance of those species.
#' - Differences between restored and remnant fields change directions based on region, with FL
#' and LP matching the hypothesized pattern but BM and FG reversing it.
#' - Particular species may be strong interactors here.
#'
#' #### Diversity over time at Blue Mounds (AMF)
#' It's probably justified to correlate diversity with field age in Blue Mounds's restored fields.
#' A Pearson's correlation is used:
#+ test_age_amf_otu
test_age(div$amf_rfy, caption = "Correlation between Hill's numbers and field age in the Blue Mounds region: AMF (18S), 97% OTU")
#'
#' The relationships are too weak to examine further.
#'