-
Notifications
You must be signed in to change notification settings - Fork 6
/
fillGap.R
195 lines (164 loc) · 6.48 KB
/
fillGap.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
#' Fill gaps in the rainfall time series.
#'
#' @param dataset A dataframe with first column the time, the rest columns are rainfall data of different gauges
#' @param corPeriod A string showing the period used in the correlation computing,
#' e.g. daily, monthly, yearly.
#' @return The filled dataframe
#' @details
#' the gap filler follows the rules below:
#'
#' 1. The correlation coefficient of every two columns (except time column) is calculated.
#' the correlation coefficient calculation can be based on 'daily', 'monthly', 'annual',
#' in each case, the daily data, the monthly mean daily data and annual mean daily data of
#' each column will be taken in the correlation calculation.
#'
#' Then the correlation matrix is got, then based on the matrix, for each column,
#' the 1st, 2nd, 3rd,... correlated column will be got. So if there is missing value in the
#' column, it will get data from orderly 1st, 2nd, 3rd column.
#'
#' 2. The simple linear regress is calculated between every two columns. When generating the
#' linear coefficient, the incept should be force to 0. i.e. y = a*x + b should be forec to
#' y = a*x.
#'
#' 3. Gap filling. E.g., on a certain date, there is a missing value in column A, then the
#' correlation order is column B, column C, column D, which means A should take values from
#' B firstly, if B is also missing data, then C, then D.
#'
#' Assuming finally value from column C is taken. Then according to step 2, A = a*C, then the
#' final value filled in column A is missing_in_A = a*value_in_C, a is the linear coeffcient.
#'
#' @examples
#' b <- read.table(text = ' Date AAA BBB CCC DDD EEE
#' 49 1999-12-15 24.8 21.4 25.6 35.0 17.4
#' 50 1999-12-16 NA 0.6 1.5 6.3 2.5
#' 51 1999-12-17 NA 16.3 20.3 NA 19.2
#' 52 1999-12-18 13 1.6 NA 6.3 0.0
#' 53 1999-12-19 10 36.4 12.5 26.8 24.9
#' 54 1999-12-20 NA 0.0 0.0 0.2 0.0
#' 55 1999-12-21 0.2 0.0 0.0 0.0 0.0
#' 56 1999-12-22 0.0 0.0 0.0 0.0 0.0')
#'
#' b1 <- fillGap(b) # if corPeriod is missing, 'daily' is taken as default.
#'
#' data(testdl)
#' a <- extractPeriod(testdl, commonPeriod = TRUE)
#' a1 <- list2Dataframe(a)
#' a2 <- fillGap(a1)
#' a3 <- fillGap(a1, corPeriod = 'monthly')
#'
#'
#' # More examples can be found in the user manual on https://yuanchao-xu.github.io/hyfo/
#'
#' @references
#' Gap fiiling method based on correlation and linear regression.
#'
#' \itemize{
#' \item Hirsch, Robert M., et al. "Statistical analysis of hydrologic data." Handbook of hydrology. (1992): 17-1.
#' Salas, Jose D. "Analysis and modeling of hydrologic time series." Handbook of hydrology 19 (1993): 1-72.
#'
#' }
#'
#'
#' @export
fillGap <- function(dataset, corPeriod = 'daily') {
if (!grepl('-|/', dataset[1, 1])) {
stop('First column is not date or Wrong Date formate, check the format in ?as.Date{base}
and use as.Date to convert.')
}
Date <- as.Date(dataset[, 1])
data <- data.frame(dataset[, 2:dim(dataset)[2]])
names <- colnames(data)
corN <- fillGap_cor(data, corPeriod = corPeriod, Date = Date)
cat('\nCorrelation Coefficient\n')
print(corN)
corOrder <- apply(corN, MARGIN = 1, FUN = function(x) order(-x))
corOrder <- corOrder[2:dim(corOrder)[1], ]
corOrderName <- t(apply(corOrder, MARGIN = 2, FUN = function(x) names[x]))
cat('\nCorrelation Order\n')
colnames(corOrderName) <- seq(1 : dim(corOrderName)[2])
print(corOrderName)
lmCoef <- fillGap_lmCoef(data, corOrder)
cat('\nLinear Coefficients\n')
rownames(lmCoef) <- seq(1 : dim(corOrderName)[2])
print(t(lmCoef))
output <- lapply(1:dim(data)[2], fillGap_column, data = data,
corOrder = corOrder, lmCoef = lmCoef)
output <- data.frame(output)
colnames(output) <- names
output <- cbind(Date, output)
return(output)
}
#' Get monthly rainfall
#'
#' @param TS A rainfall time series.
#' @param year A list showing the year index of the time series.
#' @param mon A list showing the mon index of the time series.
#' @return the monthly rainfall matrix of the rainfall time series.
monthlyPreci <- function(TS, year, mon) {
# monthly daily mean is used in order not to affected by missing values.
monTS <- tapply(TS, INDEX = list(year, mon), FUN = mean, na.rm = TRUE)
output <- t(monTS)
dim(output) <- c(dim(monTS)[1] * dim(monTS)[2], 1)
return(output)
}
fillGap_column <- function(i, data, corOrder, lmCoef) {
TS <- data[, i] # extract target column
l <- dim(data)[2] # length
for (j in 1:l) {
if (!any(is.na(TS))) break
NAindex <- which(is.na(TS))
TS[NAindex] <- round(lmCoef[j, i] * data[NAindex, corOrder[j, i]], 3)
if (j == l) stop('Error: One time consists of all NA values')
}
return(TS)
}
#' @importFrom stats cor na.omit
#' @references
#'
#' \itemize{
#' \item R Core Team (2015). R: A language and environment for statistical computing. R Foundation for
#' Statistical Computing, Vienna, Austria. URL https://www.R-project.org/.
#' }
#'
#'
fillGap_cor <- function(data, corPeriod = 'daily', Date) {
names <- colnames(data)
year <- format(Date, '%Y')
if (corPeriod == 'monthly') {
#based on monthly rainfall
mon <- format(Date, '%m')
monthlyPreci <- lapply(data, FUN = monthlyPreci, year = year, mon = mon)
corData <- do.call('cbind', monthlyPreci)
} else if (corPeriod == 'yearly') {
year <- format(Date, '%Y')
# yearly daily mean is used in order not to affected by missing values.
annualPreci <- lapply(data, FUN = function(x) tapply(x, INDEX = year, FUN = mean, na.rm = TRUE))
corData <- do.call('cbind', annualPreci)
} else if (corPeriod == 'daily') {
corData <- data
} else {
stop('Pleas choose among "daily", "monthly", "yearly".')
}
corData <- data.frame(na.omit(corData))
colnames(corData) <- names
corN <- cor(corData)
return(corN)
}
#' @importFrom utils combn
#' @importFrom stats coef lm
#' @references
#' R Core Team (2015). R: A language and environment for statistical computing. R Foundation for
#' Statistical Computing, Vienna, Austria. URL https://www.R-project.org/.
#'
fillGap_lmCoef <- function(data, corOrder) {
l <- dim(data)[2]
m <- diag(l)# m is the coeficients matrix
m[lower.tri(m)] <- combn(data, 2, function(x) coef(lm(x[, 2] ~ x[, 1] + 0)))
tm <- t(m)
tm[lower.tri(tm)] <- combn(data, 2, function(x) coef(lm(x[, 1] ~ x[, 2] + 0)))
m <- t(tm)
lmCoef <- lapply(1 : l, function(x) m[x,corOrder[, x]])
lmCoef <- do.call('rbind', lmCoef)
rownames(lmCoef) <- colnames(data)
return(t(lmCoef))
}