---
title: "S1 Appendix"
output:
  pdf_document:
    number_sections: true
header-includes:
  - \usepackage{booktabs}
urlcolor: blue
bibliography: bibliography.bib
---
\tableofcontents
```{r setup, include=FALSE, message=FALSE, echo=FALSE}
knitr::opts_chunk$set(echo = TRUE, warning = FALSE, message = FALSE)
```
\newpage
# Supporting tools
First, we load the libraries and custom functions that support the analysis.
```{r}
library(bayesplot)
library(cmdstanr)
library(dplyr)
library(extraDistr)
library(GGally)
library(ggplot2)
library(ggpubr)
library(ggridges)
library(gridExtra)
library(kableExtra)
library(lubridate)
library(Metrics)
library(patchwork)
library(posterior)
library(purrr)
library(readr)
library(readsdr)
library(scales)
library(stringr)
library(tidyr)
library(viridisLite)
# Custom functions
source("./R/helpers.R")
source("./R/plots.R")
source("./R/incidence_comparison.R")
# Backup folder
fldr <- "./backup_objs/main_text"
dir.create(fldr, showWarnings = FALSE, recursive = TRUE)
```
# Data
Drawing on the *Tidyverse* (readr, dplyr, lubridate), we read the _csv_ file
that contains the incidence data and transform it into a format suitable for the
analysis.
```{r, echo = TRUE, fig.height = 3.5}
flu_data <- read_csv("./data/Cumberland_data_1918.csv") %>%
  rename(time = Time, y = Cases) %>%
  mutate(Date = dmy(Date),
         Week = epiweek(Date))
```
## Daily data
For this graph and the remaining figures, *ggplot2* provides the framework for
creating visualisations. Here, we show the number of new flu cases in Cumberland
(Maryland) between September and October 1918.
```{r, fig.height = 3.5}
g <- ggplot(flu_data, aes(x = Date, y = y)) +
  geom_col(fill = "steelblue") +
  theme_pubclean() +
  labs(x = "Time (days)",
       y = "Incidence (People per day)")
print(g)
```
```{r, echo = FALSE}
ggsave("./plots/Fig1_incidence_report.pdf", plot = g, dpi = "print", height = 5,
       width = 8)
```
## Weekly data
Here, we convert the daily data into weekly data.
```{r, fig.height = 3.5}
weekly_data <- flu_data %>%
  group_by(Week) %>%
  summarise(y = sum(y)) %>%
  rename(time = Week) %>%
  mutate(time = time - min(time) + 1)

ggplot(weekly_data, aes(x = time, y = y)) +
  geom_col(fill = "slategray3") +
  theme_pubclean() +
  labs(title = "Weekly incidence", x = "Week", y = "Incidence [People / wk]")
```
# Calibration workflow
## Prior information - $\pi(\theta)$
The following list corresponds to the time-independent parameters and initial
conditions of the SEIR model that will be fitted to the Cumberland data. For
each quantity, we state the available prior knowledge. We construct prior
distributions for the parameters for which we cannot obtain direct estimates
and present them in a density plot.
* Recovery rate: $\gamma$ = 1 / 2 [1 / day]
* Rate of onset of infectiousness: $\sigma$ = 1 / 2 [1 / day]
* Initial recovered: $R(0)$ = 1570 [People] (30 % of the population)
* Initial exposed: $E(0)$ = 0 [People]
* Initial susceptible: $S(0)$ = 5234 - $R(0)$ - $I(0)$ [People]
* Initial infected: $I(0) \sim lognormal(0, 1)$ [People]
* Rate of effective contacts per infected individual: $\beta \sim lognormal(0, 1)$ [1 / day]
* Average reporting rate: $\rho \sim beta(2, 2)$ [dimensionless]
```{r, fig.height = 2}
g1 <- ggplot(NULL, aes(c(0, 10))) +
  geom_area(stat = "function", fun = dlnorm, fill = "grey95",
            colour = "grey60") +
  scale_x_continuous(breaks = c(0, 5, 10)) +
  theme_pubr() +
  labs(y = "",
       x = bquote(beta)) +
  theme(axis.line.y = element_blank(),
        axis.ticks = element_blank(),
        axis.text.y = element_blank())

g2 <- ggplot(NULL, aes(c(0, 1))) +
  geom_area(stat = "function", fun = dbeta, fill = "grey95",
            colour = "grey60", args = list(shape1 = 2, shape2 = 2)) +
  scale_x_continuous(breaks = c(0, 0.5, 1)) +
  theme_pubr() +
  labs(y = "",
       x = bquote(rho)) +
  theme(axis.line.y = element_blank(),
        axis.ticks = element_blank(),
        axis.text.y = element_blank())

# I(0) shares the lognormal(0, 1) prior with beta, so we reuse g1
g3 <- g1 + labs(y = "",
                x = "I(0)",
                title = "")

g <- g1 + g2 + g3
print(g)
```
```{r, echo = FALSE}
ggsave("./plots/Fig3_Prior_distribution.pdf", dpi = "print", height = 3, width = 7)
```
## Prior predictive checks
Here, we incorporate the SEIR model built in Stella. We parse the XML code to
R using the function **read_xmile** from the *readsdr* package. Then, we
draw 500 samples from the prior distribution and simulate the model with these
inputs. Given the computational burden of this process, we save the results
in an RDS file.
```{r}
file_path <- file.path(fldr, "prior_sims.rds")
n_sims <- 500
pop_size <- 5234
R0 <- round(5234 * 0.3, 0) # Initial value for the recovered stock
const_list <- list(N = pop_size, sigma = 0.5, par_gamma = 0.5)
filepath <- "./models/SEIR.stmx"
mdl <- read_xmile(filepath, const_list = const_list)

if(!file.exists(file_path)) {
  set.seed(300194)
  I_0_sims <- rlnorm(n_sims, 0, 1)
  rho_sims <- rbeta(n_sims, 2, 2)
  beta_sims <- rlnorm(n_sims, 0, 1)
  consts_df <- data.frame(par_beta = beta_sims,
                          rho = rho_sims)
  stocks_df <- data.frame(S = pop_size - R0 - I_0_sims,
                          I = I_0_sims,
                          C = I_0_sims)
  sens_o <- sd_sensitivity_run(mdl$deSolve_components, start_time = 0,
                               stop_time = 91, timestep = 1 / 32,
                               multicore = TRUE, n_cores = 4,
                               integ_method = "rk4", stocks_df = stocks_df,
                               consts_df = consts_df)
  saveRDS(sens_o, file_path)
} else {
  sens_o <- readRDS(file_path)
}
set.seed(346282)
sens_inc <- predictive_checks(n_sims, sens_o)
```
Here, we plot the results of the prior predictive checks. Dots indicate the
actual data.
```{r, fig.height = 3.5, fig.width = 7}
g1 <- ggplot(sens_inc, aes(x = time, y = y)) +
  geom_line(aes(group = iter), alpha = 0.1, colour = "grey50") +
  geom_point(data = flu_data, aes(x = time, y = y), size = 0.5,
             colour = "steelblue") +
  theme_pubclean() +
  labs(y = "Incidence [People / day]",
       x = "Time [Days]",
       subtitle = "A)")
print(g1)
```
The above simulations show that the prior distribution produces trajectories
far from the actual data. For a closer inspection, we filter out trajectories
whose peak exceeds 200 new cases per day.
```{r, fig.height = 3.5, fig.width = 7}
max_y_df <- sens_inc %>% group_by(iter) %>% summarise(max_y = max(y))
filtered_df <- filter(max_y_df, max_y < 200)
iters <- filtered_df$iter
sens_inc2 <- filter(sens_inc, iter %in% iters)

g2 <- ggplot(sens_inc2, aes(x = time, y = y)) +
  geom_line(aes(group = iter), alpha = 0.1, colour = "grey50") +
  geom_point(data = flu_data, aes(x = time, y = y), size = 0.5,
             colour = "steelblue") +
  theme_pubclean() +
  labs(y = "Incidence [People / day]",
       x = "Time [Days]",
       subtitle = "B)")
print(g2)
```
```{r, echo = FALSE}
g <- g1 / g2
ggsave("./plots/Fig4_prior_predictive_checks.pdf", plot = g, dpi = "print",
       height = 5, width = 7)
```
### Measurement model
In this section, we discuss the choice of the measurement model and how we
use prior predictive checks to guide such a decision. Initially, we opt for the
default choice, i.e., the normal distribution. This choice implies that, at
every time step, the difference between the measurement and the true value
(error) follows a normal distribution. Additionally, this distribution entails
that the error across all time steps is similar (homoscedasticity). In other
words, the magnitude of the error is indifferent to the magnitude of the true
value, an assumption that may seem unrealistic. As could be expected, the
normal distribution adds a new unknown, the standard deviation ($\delta$). To
test this model, we assume $\delta \sim Cauchy(0,1)$, and simulate 500
trajectories. We notice that this configuration yields continuous unconstrained
values, in contrast to Cumberland's discrete non-negative measurements. These
dissimilitudes hint that more suitable specifications should be found.
Next, we consider the Poisson distribution given that it has been used in the
empirical treatment of count data, particularly concerning counts of events per
unit of time. This distribution lifts the constraint of equal variance across
measurements as the error magnitude is proportional to the true number of
reported individuals at each time step. As with the normal distribution, we
generate 500 simulations. As we saw above, the Poisson model adequately captures
the given data.
Finally, to count data, the Negative Binomial distribution is an alternative to
the Poisson should the latter fail to capture overdispersion in the data. We
model such overdispersion in the Negative Binomial's scale parameter
($\phi \sim \operatorname{Half-normal}(0,1)$). We can see that the trajectories
generated by the Negative Binomial are more dispersed than the ones produced by
the Poisson distribution. Nevertheless, in this case, they do not provide a more
accurate representation of the data. Consequently, we adopt the Poisson
distribution as the measurement model.
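To make the three candidates concrete, the following minimal sketch (not the
packaged `predictive_checks()` helper, whose internals may differ) draws one
measured series under each model; the latent trajectory `x_true` is a
hypothetical stand-in for a single prior simulation:

```{r}
# A hypothetical latent trajectory of true reported incidence
x_true <- c(1, 3, 8, 19, 40, 70, 95, 88, 60, 32, 14, 5)
set.seed(1)

# Normal: continuous, unbounded errors with a common scale delta
delta <- rhcauchy(1, sigma = 1) # delta ~ half-Cauchy(0, 1), via extraDistr
y_norm <- rnorm(length(x_true), mean = x_true, sd = delta)

# Poisson: discrete, non-negative counts whose variance equals the mean
y_pois <- rpois(length(x_true), lambda = x_true)

# Negative binomial: discrete counts with overdispersion governed by phi
phi <- rhnorm(1, sigma = 1) # phi ~ half-normal(0, 1), via extraDistr
y_nbin <- rnbinom(length(x_true), size = phi, mu = x_true)
```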
```{r, fig.height = 6}
set.seed(102667)
pois_meas <- sens_inc %>% mutate(dist = "Poisson")
# Normally distributed measurements
norm_meas <- predictive_checks(n_sims, sens_o, "norm") %>%
  mutate(dist = "Normal")
# Negative binomial measurements
nbinom_meas <- predictive_checks(n_sims, sens_o, "nbinom") %>%
  mutate(dist = "Neg binom")
meas_df <- bind_rows(pois_meas, norm_meas, nbinom_meas)

n1 <- ggplot(meas_df, aes(x = time, y = y)) +
  geom_line(aes(group = iter), alpha = 0.1, colour = "grey50") +
  geom_point(data = flu_data, aes(x = time, y = y), size = 0.5,
             colour = "steelblue") +
  facet_wrap(~dist, ncol = 1, scales = "free") +
  theme_pubclean() +
  labs(y = "Incidence [People / day]",
       x = "Time [Days]",
       subtitle = "")
print(n1)
```
## Fitting
For calibrating SD models in Stan, it is necessary to create a file written in
Stan's own language. This kind of file structures the code in blocks. For this
example, five blocks are necessary: _Functions_, _data_, _parameters_,
_transformed parameters_, and _model_. Here model refers to prior distributions
and the likelihood (measurement model). The SD model is considered a function,
which can be constructed from the Stella file. This translation is possible
thanks to the function *stan_ode_function* from the *readsdr* package.
Similarly, *readsdr* supports the creation of the data block. The other blocks
must be built manually.
```{r}
pars_hat <- c("I0", "rho", "beta")
gamma <- 0.5
sigma <- 0.5
consts <- sd_constants(mdl)
ODE_fn <- "SEIR"
stan_fun <- stan_ode_function(filepath, ODE_fn, pars = consts$name[c(2, 1)],
                              const_list = list(N = pop_size,
                                                par_gamma = gamma,
                                                sigma = sigma))
fun_exe_line <- str_glue(" o = ode_rk45({ODE_fn}, x0, t0, ts, params);")
```
```{r}
stan_data <- stan_data("y", type = "int", inits = FALSE)

stan_params <- paste(
  "parameters {",
  " real<lower = 0> beta;",
  " real<lower = 0, upper = 1> rho;",
  " real<lower = 0> I0;",
  "}", sep = "\n")

# Initial conditions: x0 = (S, E, I, R, C); 1570 people (30% of 5234) start
# in the recovered stock, leaving 3664 = 5234 - 1570 for S and I
stan_tp <- paste(
  "transformed parameters{",
  " vector[n_difeq] o[n_obs]; // Output from the ODE solver",
  " real x[n_obs];",
  " vector[n_difeq] x0;",
  " real params[n_params];",
  " x0[1] = 3664 - I0;",
  " x0[2] = 0;",
  " x0[3] = I0;",
  " x0[4] = 1570;",
  " x0[5] = I0;",
  " params[1] = beta;",
  " params[2] = rho;",
  fun_exe_line,
  " x[1] = o[1, 5] - x0[5];",
  " for (i in 1:n_obs-1) {",
  " x[i + 1] = o[i + 1, 5] - o[i, 5] + 1e-5;",
  " }",
  "}", sep = "\n")

stan_model <- paste(
  "model {",
  " rho ~ beta(2, 2);",
  " beta ~ lognormal(0, 1);",
  " I0 ~ lognormal(0, 1);",
  " y ~ poisson(x);",
  "}",
  sep = "\n")

stan_gc <- paste(
  "generated quantities {",
  " real log_lik;",
  " log_lik = poisson_lpmf(y | x);",
  "}",
  sep = "\n")

stan_text <- paste(stan_fun, stan_data, stan_params,
                   stan_tp, stan_model, stan_gc, sep = "\n")

stan_fldr <- "./Stan_files/example"
dir.create(stan_fldr, showWarnings = FALSE, recursive = TRUE)
stan_filepath <- file.path(stan_fldr, "flu_poisson.stan")
create_stan_file(stan_text, stan_filepath)
```
We show below the code contained in the Stan file.
```{r}
cat(stan_text)
```
To perform the calibration via HMC, we must provide Stan with the sampler
settings. We specify 2,000 iterations (1,000 for warm-up and 1,000 for
sampling) and four chains. We also supply the number of parameters to be fitted
for the SD model, the number of stocks (_n_difeq_), the simulation time, and
Cumberland's data.
```{r fit_norm, results = 'hide'}
fldr <- "./backup_objs/main_text"
file_path <- file.path(fldr, "fit.rds")

if(!file.exists(file_path)) {
  stan_d <- list(n_obs = nrow(flu_data),
                 y = flu_data$y,
                 n_params = 2,
                 n_difeq = 5,
                 t0 = 0,
                 ts = 1:length(flu_data$y))
  mod <- cmdstan_model(stan_filepath)
  fit <- mod$sample(data = stan_d,
                    seed = 553616,
                    chains = 4,
                    parallel_chains = 4,
                    iter_warmup = 1000,
                    iter_sampling = 1000,
                    refresh = 5,
                    save_warmup = TRUE,
                    output_dir = fldr)
  fit$save_object(file_path)
} else {
  fit <- readRDS(file_path)
}
```
### Diagnostics
Before inspecting the samples, Stan returns global diagnostics to the user
about the sampling process. It is expected that the result is free from
divergent iterations or indications of pathological behaviour from the
Bayesian Fraction of Missing Information metric. Also, iterations that
saturate the maximum tree depth indicate a complex posterior surface. The
[Stan manual](https://mc-stan.org/misc/warnings.html#runtime-warnings) provides
intuitive interpretations of these metrics.
```{r, message = TRUE}
# We generated this file from fit$cmdstan_diagnose()
fileName <- file.path(fldr, "diags.txt")
if(!file.exists(fileName)) {
  diagnosis <- fit$cmdstan_diagnose()
  writeLines(diagnosis$stdout, fileName)
} else {
  readChar(fileName, file.info(fileName)$size) %>% cat()
}
```
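The same checks can be retrieved programmatically. A quick sketch, assuming a
*cmdstanr* version that provides the `diagnostic_summary()` method:

```{r}
diag_smy <- fit$diagnostic_summary()
diag_smy$num_divergent     # divergent iterations per chain (ideally all zero)
diag_smy$num_max_treedepth # tree-depth saturations per chain (ideally all zero)
diag_smy$ebfmi             # E-BFMI per chain (low values suggest pathologies)
```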
#### Trace plots
\hfill
A common approach to inspect calibration results is to check for convergence
in trace plots.
```{r, fig.height = 3}
a <- trace_plot(fit, n_samples = 100, "A)") +
  theme(axis.text = element_text(size = 5))
b <- trace_plot(fit, subtitle = "B)") +
  theme(axis.text = element_text(size = 5))
print(a + b)
```
```{r, echo = FALSE}
ggsave("./plots/Fig5_traceplot.pdf", plot = a + b, dpi = "print", height = 3,
       width = 8)
```
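*bayesplot* (loaded above) offers an off-the-shelf alternative to the custom
`trace_plot()` helper; a minimal sketch:

```{r, fig.height = 3}
mcmc_trace(fit$draws(variables = c("beta", "rho", "I0")))
```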
#### Potential scale reduction factor ($\widehat{R}$) & Effective Sample Size ($\hat{n}_{eff}$)
\hfill
$\widehat{R}$ is a convergence diagnostic, which compares the between- and
within-chain estimates for model parameters and other univariate quantities of
interest. If chains have not mixed well, $\widehat{R}$ is larger than 1. It is
recommended to run at least *four chains* by default and to use the sample only
if $\widehat{R}$ is less than 1.01 [@Vehtari_2021]. **Stan reports
$\widehat{R}$ as the maximum of the rank normalized split-$\widehat{R}$ and the
rank normalized folded-split-$\widehat{R}$, which works for thick-tailed
distributions and is also sensitive to differences in scale**.
For each parameter $\theta$, we split each chain from the *sampling phase* into
two halves. That is, from **four** chains of 1,000 draws each, we obtain
**eight** split chains of 500 draws each. Then, we label the simulations as
$\theta_{ij}\; (i = 1, \ldots, N;\; j = 1, \ldots, M)$, where $N$ is the number
of samples per split chain, $M$ is the number of split chains, and $S = NM$ is
the total number of draws from all chains. We subsequently transform these
simulations to their corresponding rank normalized values $z_{ij}$. Following
@Vehtari_2021, we first replace each value $\theta_{ij}$ by its rank $r_{ij}$
within the pooled draws from all chains. Second, we transform the ranks to
normal scores using the inverse normal transformation and a fractional offset
via Equation 1:
\begin{equation}
z_{ij} = \Phi^{-1}\left(\frac{r_{ij} - 3/8}{S + 1/4}\right)
\end{equation}
Using these normal scores, we calculate $\widehat{R}$ following the formulation
proposed by @gelman2013bayesian. Initially, we compute $B$ and $W$, the
between- and within-sequence variances, respectively:
\begin{equation}
B = \frac{N}{M-1}\sum^{M}_{j = 1}(\bar{z}_{.j} - \bar{z}_{..})^2,
\;\text{where}\; \bar{z}_{.j} = \frac{1}{N}\sum^{N}_{i = 1}z_{ij},\;
\bar{z}_{..} = \frac{1}{M} \sum^{M}_{j=1} \bar{z}_{.j}
\end{equation}
\begin{equation}
W = \frac{1}{M}\sum_{j = 1}^{M}s_{j}^2,\; \text{where}\;
s_{j}^2 = \frac{1}{N-1}\sum^{N}_{i = 1}(z_{ij} - \bar{z}_{.j})^2
\end{equation}
Then, we can estimate $\widehat{var}^{+}(\theta|y)$, the marginal posterior
variance of the parameter, by a weighted average of $W$ and $B$:
\begin{equation}
\widehat{var}^{+}(\theta|y) = \frac{N-1}{N}W + \frac{1}{N}B
\end{equation}
From (3) and (4), we obtain the rank normalized split $\widehat{R}$:
\begin{equation}
\widehat{R} = \sqrt{\frac{\widehat{var}^{+}(\theta|y)}{W}}
\end{equation}
To obtain the rank normalized folded-split $\widehat{R}$, we simply transform
the simulations (Equation 6) and then apply the procedure described above
(Equations 1-5).
\begin{equation}
\zeta_{ij} = |\theta_{ij} - median(\theta)|
\end{equation}
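To make Equations 1-5 concrete, the sketch below hand-rolls the rank normalized
split-$\widehat{R}$ for a single parameter, assuming `draws` is an $N \times M$
matrix of posterior draws (one column per chain). The *posterior* package used
later implements the full version, including the folding step of Equation 6.

```{r}
split_rhat_sketch <- function(draws) {
  N <- nrow(draws) %/% 2
  # Split each chain into two halves, doubling the number of chains
  z <- cbind(draws[1:N, , drop = FALSE],
             draws[(N + 1):(2 * N), , drop = FALSE])
  S <- length(z)
  # Equation 1: rank-normalise the pooled draws
  r <- matrix(rank(z, ties.method = "average"), nrow = N)
  z <- qnorm((r - 3 / 8) / (S + 1 / 4))
  # Equations 2-4: between- and within-chain variances and their blend
  B <- N * var(colMeans(z))
  W <- mean(apply(z, 2, var))
  var_plus <- (N - 1) / N * W + B / N
  # Equation 5
  sqrt(var_plus / W)
}
```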
For MCMC draws, we define the estimated effective sample size as
\begin{equation}
\hat{n}_{eff} = \frac{MN}{1 + 2 \sum_{t=1}^{T}\hat{\rho}_t}
\end{equation}
This quantity requires an estimate of the sum of the correlations $\rho$
up to lag T (the first odd positive integer for which
${\hat{\rho}}_{T+1} + {\hat{\rho}}_{T+2}$ is negative). The correlation at any
specific lag $t$ (Equation 8) depends upon the estimate $\widehat{var}^{+}$
and the *Variogram* at each $t$ (Equation 9).
\begin{equation}
\hat{\rho}_t = 1 - \frac{V_t}{2\widehat{var}^{+}}
\end{equation}
\begin{equation}
V_t = \frac{1}{M(N - t)} \sum_{j=1}^{M}\sum_{i=t+1}^{N}(z_{i,j} - z_{i -t, j})^2
\end{equation}
We use the term *bulk effective sample size* to refer to the effective sample
size based on the rank normalized draws. To ensure reliable estimates of the
variances and autocorrelations needed for $\widehat{R}$ and $\hat{n}_{eff}$,
@Vehtari_2021 recommend that the rank normalized effective sample size be
greater than 400 (100 per chain).
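As a complement, this sketch transcribes Equations 7-9 directly, assuming `z`
is an $N \times M$ matrix of rank normalized split-chain draws (as produced in
the previous sketch); `ess_bulk()` from the *posterior* package, used below, is
the recommended implementation.

```{r}
ess_sketch <- function(z) {
  N <- nrow(z)
  M <- ncol(z)
  W <- mean(apply(z, 2, var))
  B <- N * var(colMeans(z))
  var_plus <- (N - 1) / N * W + B / N
  # Equations 8-9: lag-t autocorrelations via the variogram
  rho <- sapply(seq_len(N - 1), function(t) {
    V_t <- mean((z[(t + 1):N, , drop = FALSE] -
                   z[1:(N - t), , drop = FALSE])^2)
    1 - V_t / (2 * var_plus)
  })
  # Truncate at the first odd lag T for which rho[T + 1] + rho[T + 2] < 0
  T_lag <- 1
  while (T_lag + 2 <= N - 1 && rho[T_lag + 1] + rho[T_lag + 2] > 0) {
    T_lag <- T_lag + 2
  }
  # Equation 7
  M * N / (1 + 2 * sum(rho[1:T_lag]))
}
```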
```{r}
unk_pars <- c("beta", "rho", "I0")
par_matrices <- lapply(unk_pars, function(par) {
  extract_variable_matrix(fit$draws(), par)
})
ess_vals <- map_dbl(par_matrices, ess_bulk)
r_hat_vals <- map_dbl(par_matrices, rhat)
smy_df <- data.frame(par = c("beta", "rho", "I(0)"),
                     ess_bulk = ess_vals,
                     rhat = r_hat_vals)
```
```{r, echo = FALSE}
kable(smy_df, booktab = TRUE)
```
#### Check deterministic fit
\hfill
Once the computation has been deemed satisfactory, we subsequently check that
the model's expected value captures the underlying trajectory of the data.
The reader should take into account that in each sampling iteration, Stan
generates a trajectory from the SD model. We compare these simulated time series
against the measured incidence (dots) to visually examine whether the trend is
consistent with the data. The solid blue line denotes the mean trajectory, and
the grey shaded area indicates the 95 % credible interval.
```{r, fig.height = 3.5}
posterior_df <- fit$draws() %>% as_draws_df()
samples_normal <- posterior_df[, pars_hat] %>%
  mutate(R0 = beta / gamma, ll = "Normal")
y_hat_df_norm <- extract_timeseries_var("x", posterior_df)

summary_df <- y_hat_df_norm %>% group_by(time) %>%
  summarise(lb = quantile(value, 0.025),
            ub = quantile(value, 0.975),
            y = mean(value))

ggplot(summary_df, aes(x = time, y = y)) +
  geom_ribbon(aes(ymin = lb, ymax = ub), fill = "grey90") +
  geom_line(colour = "steelblue") +
  geom_point(data = flu_data) +
  theme_pubclean() +
  labs(y = "Incidence [New cases / day]", x = "Time [Days]")
```
We also evaluate the fit against the weekly data. It can be seen that the
aggregation smooths out the day-to-day variation in the data.
```{r, fig.height = 3.5}
weekly_sum <- y_hat_df_norm %>% mutate(week = time %/% 7 + 1) %>%
  group_by(iter, week) %>%
  summarise(.groups = "drop", value = sum(value)) %>%
  group_by(week) %>%
  summarise(lb = quantile(value, 0.025),
            ub = quantile(value, 0.975),
            y = mean(value)) %>%
  rename(time = week)

ggplot(weekly_sum, aes(x = time, y = y)) +
  geom_ribbon(aes(ymin = lb, ymax = ub), fill = "grey90") +
  geom_line(colour = "steelblue") +
  geom_point(data = weekly_data) +
  theme_pubclean() +
  labs(x = "Time [Week]",
       y = "Incidence [New cases / week]")
```
### Posterior information
#### Prior & posterior comparison
\hfill
A critical goal of model calibration is to gain information from the data
about the parameters of interest. One way to accomplish such a purpose is
through the comparison of marginal prior and marginal posterior distributions.
Namely, we evaluate the knowledge acquired from the process.
```{r, fig.height = 3}
pars_df <- posterior_df[, c("beta", "rho", "I0")]
g <- prior_posterior_plot(pars_df)
print(g)
```
```{r, echo = FALSE}
ggsave("./plots/Fig6_prior_posterior_comparison.pdf", plot = g, dpi = "print",
       height = 3, width = 7)
```
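For a single parameter, the comparison amounts to overlaying the prior density
on the posterior draws. A minimal manual sketch for $\beta$, whose prior is
lognormal(0, 1):

```{r, fig.height = 3}
ggplot(data.frame(beta = posterior_df$beta), aes(x = beta)) +
  geom_density(fill = "grey90") +      # marginal posterior
  stat_function(fun = dlnorm, args = list(meanlog = 0, sdlog = 1),
                linetype = "dashed") + # lognormal(0, 1) prior
  theme_pubr() +
  labs(x = bquote(beta), y = "Density")
```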
#### Pair plots
\hfill
This plot helps us evaluate parameter interactions or joint distributions. The
lower triangular shows, through heat maps, the concentration of values in the
x-y plane among all possible parameter pairs. The upper triangular quantifies
such interactions by the correlation coefficients. The diagonal displays
marginal posterior distributions.
```{r}
g <- pairs_posterior(pars_df, strip_text = 10, axis_text_size = 8)
print(g)
```
```{r, echo = FALSE}
ggsave("./plots/Fig7_pairs_plot.pdf", plot = g, dpi = "print", height = 4,
       width = 8)
```
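*bayesplot* provides a comparable visualisation out of the box; a quick sketch,
similar in spirit to the custom `pairs_posterior()` helper:

```{r}
mcmc_pairs(fit$draws(variables = c("beta", "rho", "I0")),
           off_diag_fun = "hex") # hexagonal binning instead of heat maps
```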
#### Posterior predictive checks
\hfill
The other main goal of model calibration is related to the argument that this
procedure is a stringent validity test of SD models. Simply put, we ask, "Is the
structure capable of generating an approximation of the historical behavior?".
To answer this question, we draw 500 samples from the posterior distribution
that serve as inputs to the SD model. Unlike the check for the deterministic
fit, this step includes the measurement error, which we assumed was Poisson
distributed. Thus, we insert the expected reported incidences into the Poisson distribution to predict measured incidences. In A), we show the 500 predicted
measurements and the actual data (dots). The dotted line indicates the expected
reported incidence. In B), we present three examples of the predicted
measurements.
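The essence of this step is one Poisson draw per time point. A minimal sketch,
where `x_hat` stands in for a single posterior trajectory of expected reported
incidence:

```{r}
x_hat <- c(2, 5, 12, 30, 55, 70, 62, 40, 20, 8) # hypothetical trajectory
y_pred <- rpois(length(x_hat), lambda = x_hat)  # one predicted measurement series
```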
```{r}
file_path <- file.path(fldr, "posterior_sims.rds")
set.seed(1800)
row_ids <- sample.int(nrow(pars_df), 500, replace = TRUE)
post_samples <- pars_df[row_ids, ]

if(!file.exists(file_path)) {
  consts_df <- dplyr::select(post_samples, beta, rho) %>%
    rename(par_beta = beta)
  stocks_df <- dplyr::select(post_samples, I0) %>%
    rename(I = I0) %>%
    mutate(S = pop_size - I - R0,
           R = R0,
           C = I)
  sens_o <- sd_sensitivity_run(mdl$deSolve_components, start_time = 0,
                               stop_time = 91, timestep = 1 / 32,
                               multicore = TRUE, n_cores = 4,
                               integ_method = "rk4", stocks_df = stocks_df,
                               consts_df = consts_df)
  saveRDS(sens_o, file_path)
} else {
  sens_o <- readRDS(file_path)
}
# Posterior predictive checks
ppc <- predictive_checks(n_sims, sens_o)
```
```{r}
sample_traj <- sample.int(n_sims, 3) %>% sort()
ppc_rest <- ppc %>% filter(!iter %in% sample_traj)
ppc1 <- ppc %>% filter(iter == sample_traj[[1]])
ppc2 <- ppc %>% filter(iter == sample_traj[[2]])
ppc3 <- ppc %>% filter(iter == sample_traj[[3]])
line_cols <- viridisLite::viridis(3)

g1 <- ggplot(ppc_rest, aes(x = time, y = y)) +
  geom_line(aes(group = iter), alpha = 0.01, colour = "grey50") +
  geom_point(data = flu_data, aes(x = time, y = y), size = 1,
             colour = "steelblue", alpha = 0.75) +
  geom_line(data = ppc1, colour = line_cols[[1]], alpha = 0.9) +
  geom_line(data = ppc2, colour = line_cols[[2]], alpha = 0.9) +
  geom_line(data = ppc3, colour = line_cols[[3]], alpha = 0.9) +
  theme_pubclean() +
  labs(y = "Incidence [People / day]",
       x = "Days",
       subtitle = "A)")

summary_ppc <- ppc %>% group_by(time) %>%
  summarise(mean = mean(y),
            value = quantile(y, c(0.025, 0.975)),
            bound = c("ll", "ul")) %>%
  pivot_wider(names_from = bound, values_from = value)

g2 <- ggplot(summary_ppc, aes(x = time, y = mean)) +
  geom_line(colour = "grey30", linetype = "dashed") +
  geom_point(data = flu_data, aes(x = time, y = y), size = 1,
             colour = "steelblue", alpha = 0.75) +
  geom_ribbon(aes(ymin = ll, ymax = ul), fill = "grey70", alpha = 0.5) +
  theme_pubclean() +
  labs(y = "Incidence [People / day]",
       x = "Days",
       subtitle = "B)")

g <- g1 + g2
print(g)
```
```{r, echo = FALSE}
ggsave("./plots/Fig8_Posterior_predictive_checks.pdf", plot = g, dpi = "print",
       height = 5, width = 8)
```
#### Mean Absolute Scaled Error
\hfill
This definition is taken from [Hyndman (2006)](https://robjhyndman.com/papers/foresight.pdf):
*The MASE was proposed by Hyndman and Koehler (2006) as a generally applicable
measurement of forecast accuracy. They proposed scaling the errors based on the
in-sample Mean Absolute Error (MAE) from the naïve forecast method. Using the
naïve method, we generate one-period-ahead forecasts from each data point in the
sample. Accordingly, a scaled error is defined as:*
\begin{equation}
q_t = \frac{e_t}{\frac{1}{n-1} \sum\limits_{i=2}^n |y_i - y_{i-1}|}
\end{equation}
where the numerator $e_t$ is the forecast error ($\hat{y}_t - y_t$) for a
specific time; the absolute value is taken when averaging (Equation 11). Here,
$\hat{y}_t$ denotes the predicted data from the fitted model and $y_t$ the
actual data. Further, $n$ represents the number of data points. The denominator
is the in-sample MAE of the one-step naïve forecast method, which uses the
previous actual value ($y_{t-1}$) as the forecast of the current value ($y_t$)
for $t > 1$.
Thus, the mean absolute scaled error is:
\begin{equation}
MASE = mean(|q_t|)
\end{equation}
MASE is a scale-independent metric that remains defined even when the data
contain zero values. A scaled error is less than one if it arises from a better
forecast than the average one-step naïve forecast computed in-sample.
Conversely, it is greater than one if the forecast is worse than the average
one-step naïve forecast.
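A direct transcription of Equations 10 and 11, useful for cross-checking the
`mase()` function from the *Metrics* package employed below:

```{r}
mase_by_hand <- function(actual, predicted) {
  # In-sample MAE of the one-step naive forecast (the denominator of q_t)
  naive_mae <- mean(abs(diff(actual)))
  # MASE = mean(|q_t|)
  mean(abs(predicted - actual) / naive_mae)
}
```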
Based on the above, we estimate the MASE for each simulated trajectory and
present its distribution.
```{r}
df_list <- split(ppc, f = ppc$iter)
mases <- sapply(df_list, function(df) mase(flu_data$y, df$y))
MASE_df <- data.frame(MASE = mases)

g2 <- ggplot(MASE_df, aes(x = MASE)) +
  geom_density(fill = "grey95") +
  scale_x_continuous(limits = c(0.60, 1)) +
  geom_vline(xintercept = 1, linetype = "dotted") +
  theme_pubr() +
  labs(y = "Density") +
  theme(axis.line.y = element_blank(),
        axis.ticks = element_blank(),
        axis.text.y = element_blank())
print(g2)
```
# Point estimates vs neighbourhoods
Throughout this work, we have intentionally focused on probability distributions
rather than on point estimates. In this section, we provide the rationale for
such a choice. The reader should recall that another approach to performing
model calibration is via optimisation. That is, using non-linear optimisation
routines, one finds the maximum or mode of a likelihood function $\pi(y|\theta)$,
which is a point estimate. Around this estimate, we can construct confidence
intervals whose probability is defined in terms of coverage. For instance, a 95%
confidence interval means that, on average, 95 out of 100 intervals constructed
from 100 measurements obtained from an identical process will capture the
actual value.
In this context, a single measurement is the entire time series of case counts
reported in Cumberland.
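A small simulation helps build intuition for coverage. This sketch uses a
hypothetical normal process with a known mean of 10 (all values are
illustrative):

```{r}
set.seed(42)
covered <- replicate(1000, {
  y <- rnorm(25, mean = 10, sd = 2) # one "measurement" (a sample of size 25)
  ci <- mean(y) + c(-1, 1) * qt(0.975, df = 24) * sd(y) / sqrt(25)
  ci[1] <= 10 && 10 <= ci[2]        # does this interval capture the truth?
})
mean(covered) # close to the nominal 0.95
```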
We plot the likelihood function for each parameter, where the dashed line
denotes the mode or Maximum Likelihood Estimate (MLE). To construct this
function, we tap into the output returned by Stan. This software calculates the likelihood for each of the draws obtained from the _sampling_ phase.
```{r}
pars_df2 <- posterior_df[, c("beta", "rho", "I0", "log_lik")] %>%
  mutate(sample_id = row_number(),
         is_mode = log_lik == max(log_lik))
mode_df <- pars_df2 %>% filter(is_mode == TRUE) %>%
  dplyr::select(-is_mode, -log_lik) %>%
  mutate(Point = "Mode")
tidy_pars <- pars_df2 %>%
  select(-is_mode) %>%
  pivot_longer(c(-sample_id, -log_lik))
plot_log_lik(tidy_pars, mode_df)
```
However, we ponder whether a point close to the MLE is less important than the
MLE itself. As an example, the MLE for $\beta$ is 1.28879**7**... (note that,
since this is a point estimate on the real line, infinitely many digits follow
the last one shown). Does this imply that 1.28879**8**... is less useful, given
that this other point estimate yields a lower likelihood value? Certainly, this
is a value judgement. To answer these enquiries, we explore three point
estimates: the previously estimated mode, the mean of each marginal posterior,
and a random point from the posterior distribution. We indicate these estimates
as lines superimposed over the posterior distribution.
```{r}
mean_df <- summarise_all(pars_df, mean) %>%
  mutate(Point = "Mean")
set.seed(123)
random_row <- sample.int(nrow(pars_df), 1)
random_point <- slice(pars_df, random_row) %>%
  mutate(Point = "Random")
sim_points <- bind_rows(dplyr::select(mode_df, -sample_id),
                        mean_df, random_point) %>%
  arrange(Point)
plot_points_on_posterior(pars_df2, sim_points)
```
```{r, echo = FALSE}
kable_df <- sim_points
colnames(kable_df) <- c("$\\beta$", "$\\rho$", "I(0)", "Estimate")
knitr::kable(kable_df, "latex", booktabs = TRUE, escape = FALSE)
```
\hfill
We use these point estimates to simulate the SD model and compare the results
against the incidence data. It thus follows that _usefulness_ in this context is
defined as how accurately the simulated behaviour matches the observed data. To
measure this accuracy, we employ three different metrics: the Mean Absolute
Scaled Error (MASE), the Mean Square Error (MSE), and $R^2$. For the MASE and
MSE, lower values indicate better performance. Conversely, higher values of
$R^2$ indicate more accurate predictions. These metrics, though, tell a
contradictory story. On the one hand, the mean produces the most accurate
estimate according to the MASE and the MSE. On the other hand, the random point
results in the most accurate estimate according to $R^2$. Interestingly, the
mode underperforms on all three metrics.
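As a sketch, the three metrics can be wrapped in a small helper, taking $R^2$
as the squared correlation between data and simulation (the exact computation
behind the reported figures may differ):

```{r}
accuracy_metrics <- function(actual, simulated) {
  c(MASE = mase(actual, simulated), # Metrics::mase
    MSE = mse(actual, simulated),   # Metrics::mse
    R2 = cor(actual, simulated)^2)  # squared correlation
}
```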
By contrast, from a **Bayesian** perspective, ascertaining which point
estimate stands out above the others is uninteresting. The critical feature lies
in their location. That is, they form a common neighbourhood, which we
refer to as the posterior distribution $\pi(\theta|y)$. In this case, this
distribution is continuous, so the probability of any given point is **zero**.
Thus, from a Bayesian standpoint, we are interested in finding which regions
(collections of points) within the parameter space are consistent with the data.
This feature highlights a key difference between optimisation algorithms and
MCMC methods: their **targets**. Whereas the former aims for the point of
highest density, the latter aims for the concentration of probability mass, also
known as the *typical set*. In lower dimensions, the two tend to agree, but as
the number of parameters increases, the difference becomes evident. Colloquially,
this is known as the *curse of dimensionality*. To complement this argument, we
refer the reader to Betancourt’s paper [@betancourt2017conceptual] and
[Carpenter’s case study](https://mc-stan.org/users/documentation/case-studies/curse-dims.html) to build intuition about this complex issue.
Consequently, statistics such as means, medians, standard deviations, and
quantiles are used merely for descriptive purposes. Specifically, quantiles
describe where in the parameter space the collection of plausible point
estimates concentrates. These quantiles, known as *credible intervals*, quantify
the uncertainty of the unobserved parameter values. For instance, a 95 % credible
interval implies that we _believe_ there is a 95 % chance of finding the actual
value within those bounds. Nevertheless, given that we have defined model
calibration as a Bayesian inference process, which quantifies inferences with an
entire distribution, no single point estimate or interval characterises that
distribution.
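For reference, 95 % credible intervals for the three unknowns follow directly
from the posterior draws:

```{r}
sapply(pars_df, quantile, probs = c(0.025, 0.975))
```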
\hfill
```{r}
consts_df <- dplyr::select(sim_points, beta, rho) %>%
  rename(par_beta = beta)
stocks_df <- dplyr::select(sim_points, I0) %>%
  rename(I = I0) %>%
  mutate(S = pop_size - I - R0,
         R = R0,
         C = I)
sens_o <- sd_sensitivity_run(mdl$deSolve_components, start_time = 0,
                             stop_time = 91, timestep = 1 / 32,
                             multicore = FALSE, integ_method = "rk4",
                             stocks_df = stocks_df,
                             consts_df = consts_df)

sim_df <- sens_o %>% filter(time - trunc(time) == 0) %>%
  group_by(iter) %>%
  mutate(x = C - lag(C)) %>% # daily incidence from the cumulative stock C
  filter(time > 0) %>%
  ungroup() %>%
  mutate(Point = case_when(
    iter == 1 ~ "Mean",
    iter == 2 ~ "Mode",
    iter == 3 ~ "Random")) %>%
  select(Point, time, x)

ggplot(sim_df, aes(x = time, y = x)) +
  geom_line(aes(group = Point, linetype = Point), alpha = 0.8,
            colour = "steelblue") +
  geom_point(data = flu_data, aes(y = y), size = 0.25) +
  scale_linetype_manual(values = c("solid", "dashed", "dotted"),
                        name = "Estimate") +
  labs(y = "Reported incidence", x = "Days") +