-
Notifications
You must be signed in to change notification settings - Fork 25
/
format.timestamps.R
260 lines (248 loc) · 10.7 KB
/
format.timestamps.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
# Copyright 2019 Steven L. Scott. All rights reserved.
#
# Copyright 2018 Google LLC. All Rights Reserved.
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
TimestampInfo <- function(response, data = NULL, timestamps = NULL) {
## Args:
## response: A vector or matrix. If the response is a zoo object with
## timestamps then the timestamps will be processed.
## data: an optional data frame, list or environment (or object coercible by
## ‘as.data.frame’ to a data frame) containing the variables in the bsts
## model. This is only used as source of timestamps if they are not
## provided otherwise.
## timestamps: A vector of user-supplied timestamps (of the same length as
## 'response'), or NULL.
##
## Returns:
## An object of class TimestampInfo, which is a list containing the
## following elements:
##
## * timestamps.are.trivial: Logical. TRUE indicates that 'timestamps' are
## either NULL, or that there are no duplicates and no skipped values. If
## timestamps are trivial then the response is a simple uninterrupted
## sequence (like you'd expect at time series to be.
##
## * number.of.time.points: The number of unique time points contained in
## the data, including any skipped time points containing no
## observations.
##
## * timestamps: the time stamps taken from the original response or zoo
## data frame containing the data. This might be NULL. It might also
## contain duplicate values.
##
## * regular.timestamps: A regular grid of timestamps containing the
## smallest and largest and largest entries in 'timestamps', with an
## increment equal to the smallest nonzero increment in 'timestamps'.
## This will be NULL if 'timestamps' is NULL.
##
## * timestamp.mapping: This is only present if timestamps.are.trivial is
## FALSE. It is a numeric vector giving the index of the entry in
## 'regular.timestamps' to which each observation in 'response' is
## associated.
if (is.null(timestamps)) {
if (is.zoo(response)) {
timestamps <- index(response)
} else if (!is.null(data) && is.zoo(data)) {
timestamps <- index(data)
}
}
number.of.observations <-
if (is.matrix(response)) nrow(response) else length(response)
if (is.null(timestamps)) {
## Handle the trivial case when no timestamps are passed.
number.of.time.points <- number.of.observations
stopifnot(number.of.time.points > 0)
ans <- list(timestamps.are.trivial = TRUE,
number.of.time.points = as.integer(number.of.time.points),
timestamps = 1:number.of.observations,
regular.timestamps = 1:number.of.time.points)
} else {
## If timestamps were passed then process them.
stopifnot(number.of.observations == length(timestamps))
regular.timestamps <- RegularizeTimestamps(timestamps)
ans <- list(timestamps.are.trivial = IsRegular(timestamps),
number.of.time.points = as.integer(length(regular.timestamps)),
timestamps = timestamps,
regular.timestamps = regular.timestamps)
if (!ans$timestamps.are.trivial) {
## A hack to handle numeric timestamps appropriately.
class(timestamps) <- class(regular.timestamps)
ans$timestamp.mapping <- as.integer(
zoo::MATCH(timestamps, regular.timestamps))
}
if (length(ans$regular.timestamps) > 2 * length(ans$timestamps)) {
warning("Expanding the time series to a regular interval resulted ",
"in very large amounts of missing data.")
}
}
class(ans) <- "TimestampInfo"
return(ans)
}
NoDuplicates <- function(timestamps) {
## Returns TRUE iff the vector of timestamps contains no duplicate values.
return(length(timestamps) == length(unique(timestamps)))
}
HasDuplicateTimestamps <- function(bsts.object) {
# Returns TRUE iff the object has nontrivial timestamps and at least one time
# stamp is associated with more than one observation.
if (bsts.object$timestamp.info$timestamps.are.trivial) {
return(FALSE)
}
return(!NoDuplicates(bsts.object$timestamps))
}
NoGaps <- function(timestamps) {
## Returns TRUE iff there are no gaps in the series of time stamps. The
## deltas between time points do not need to be equally spaced. A distance
## between observations counts as a 'gap' if it is at least twice as large as
## the smallest duration between two time points. If we take "twice"
## literally then we are vulnerable to floating point junk, so take a factor
## of 1.8 instead of 2.
unique.timestamps <- unique(sort(timestamps))
dt <- diff(unique.timestamps)
min.dt <- min(dt)
return(all(dt < 1.8 * min.dt))
}
IsRegular <- function(timestamps) {
## A sequence of time stamps is regular if it contains no duplicates and no
## gaps.
return(NoDuplicates(timestamps) && NoGaps(timestamps))
}
RegularizeTimestamps <- function(timestamps) {
## Args:
## timestamps: A set of timestamps. This can be NULL, a numeric object
## (presumably time 1, 2, 3), a numeric set of timestamps that came from a
## ts object (like 1945.08333, 1945.127, ...), a Date object, or a POSIXt
## object.
##
## Returns:
## A vector of unique, regularly spaced time stamps containing the
## (deduplicated) entries in the argument, and maybe more, if there were
## some time steps passed over by the supplied time stamps.
##
## Note that 'regularly spaced' time steps do not always have the same time
## gap between them. Monthly data is the most obvious example, but daily
## POSIXt data passing over a DST boundary is another.
if (is.null(timestamps) || IsRegular(timestamps)) {
return(timestamps)
}
UseMethod("RegularizeTimestamps")
}
RegularizeTimestamps.default <- function(timestamps) {
## Args:
## timestamps: A set of timestamps. This can be NULL, a numeric object
## (presumably time 1, 2, 3), a Date object, or a POSIXt object.
##
## Returns:
## A vector of unique, regularly spaced time stamps containing the
## (deduplicated) entries in the argument, and maybe more, if there were
## some time steps passed over by the supplied time stamps.
unique.timestamps <- sort(unique(timestamps))
dt <- diff(unique.timestamps)
dt <- min(dt)
regular.timestamps <- seq(from = unique.timestamps[1],
to = tail(unique.timestamps, 1),
by = dt)
if (!all(timestamps %in% regular.timestamps)) {
stop("Time stamps must occur at regular intervals.")
}
return(regular.timestamps)
}
RegularizeTimestamps.numeric <- function(timestamps) {
## If the timestamps are numeric then we need to guard against floating point
## garbage.
unique.timestamps <- sort(unique(timestamps))
dt <- min(diff(unique.timestamps))
regular.timestamps <- seq(from = unique.timestamps[1],
to = tail(unique.timestamps, 1),
by = dt)
if (!all(signif(timestamps, digits = 8) %in%
signif(regular.timestamps, digits = 8))) {
stop("Time stamps must occur at regular intervals.")
}
class(regular.timestamps) <- "NumericTimestamps"
return(regular.timestamps)
}
MATCH.NumericTimestamps <- function(x, table, nomatch = NA, ...) {
## An S3 generic for the MATCH function provided by the zoo package. Numeric
## timestamps match if they agree on 8 significant digits.
match(signif(x, digits = 8),
signif(table, digits = 8),
nomatch = nomatch,
...)
}
RegularizeTimestamps.Date <- function(timestamps) {
unique.timestamps <- sort(unique(timestamps))
dt <- diff(unique.timestamps)
min.dt <- min(dt)
dt.table <- table(dt)
if (min.dt == 1 || min.dt == 7) {
## Daily or weekly data is regular.
return(RegularizeTimestamps.default(timestamps))
} else if (min.dt %in% 28:31 &&
any(dt.table[c("30", "31")] > 0)) {
## Monthly data
regular.timestamps <- seq(from = unique.timestamps[1],
to = tail(unique.timestamps, 1),
by = "month")
} else if (min.dt %in% 89:93) {
regular.timestamps <- seq(from = unique.timestamps[1],
to = tail(unique.timestamps, 1),
by = "quarter")
} else if (min.dt == 365) {
regular.timestamps <- seq(from = unique.timestamps[1],
to = tail(unique.timestamps, 1),
by = "year")
} else {
stop("Could not determine an appropriate time scale.")
}
if (!all(timestamps %in% regular.timestamps)) {
stop("Date time stamps must occur at regular intervals.")
}
return(regular.timestamps)
}
RegularizeTimestamps.POSIXt <- function(timestamps) {
dt <- diff(sort(unique(timestamps)))
if (attributes(dt)$units == "days") {
days <- as.POSIXct(RegularizeTimestamps(as.Date(timestamps)))
offset <- min(timestamps) - min(days)
times <- as.POSIXlt(days + offset)
dst <- as.logical(times$isdst)
if (dst[1]) {
## If the first time period was in daylight savings time then all the time
## points not in daylight savings time need to be shifted back by an hour.
times[!dst] <- times[!dst] + as.difftime(1, units = "hours")
} else {
## If the first time period was in standard time then all the time
## points in daylight savings time need to be shifted forward by an hour.
times[dst] <- times[dst] - as.difftime(1, units = "hours")
}
return(as.POSIXct(times))
} else if(attributes(dt)$units == "hours") {
## If the difference in times is on the hourly scale, it might be because
## we've got daily data with DST effects yielding differences of 23, 24, and
## 25 hours.
dt.table <- table(dt)
min.dt <- min(dt)
mode.dt <- as.numeric(names(dt.table))[which.max(dt.table)]
if (min.dt == 23 && mode.dt == 24) {
return(seq(from = min(timestamps), to = max(timestamps), by = "DSTday"))
} else {
return(RegularizeTimestamps.default(timestamps))
}
} else {
return(RegularizeTimestamps.default(timestamps))
}
}