-
Notifications
You must be signed in to change notification settings - Fork 66
/
gradient_boosted.py
378 lines (348 loc) · 15.5 KB
/
gradient_boosted.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
# Copyright (c) 2018 The Harmonica Developers.
# Distributed under the terms of the BSD 3-Clause License.
# SPDX-License-Identifier: BSD-3-Clause
#
# This code is part of the Fatiando a Terra project (https://www.fatiando.org)
#
"""
Gradient-boosted equivalent sources in Cartesian coordinates
"""
import warnings
import numpy as np
import verde.base as vdb
from sklearn import utils
from verde import get_region, rolling_window
from .cartesian import EquivalentSources
from .utils import predict_numba_parallel
class EquivalentSourcesGB(EquivalentSources):
r"""
Gradient-boosted equivalent sources for generic harmonic functions.
Gradient-boosted version of the :class:`harmonica.EquivalentSources`,
introduced in [Soler2021]_. These equivalent sources are intended to be
used to fit very large datasets, where the Jacobian matrices generated by
regular equivalent sources (like :class:`harmonica.EquivalentSources`) are
larger than the available memory. They fit the sources coefficients
iteratively using overlapping windows of equal size, greatly reducing the
memory requirements.
Smaller windows lower the memory requirements. Using very small windows may
impact the accuracy of the interpolations. We recommend using the larger
windows that generate Jacobian matrices that fit in the available memory.
Parameters
----------
damping : None or float
The positive damping regularization parameter. Controls how much
smoothness is imposed on the estimated coefficients.
If None, no regularization is used.
points : None or list of arrays (optional)
List containing the coordinates of the equivalent point sources.
Coordinates are assumed to be in the following order:
(``easting``, ``northing``, ``upward``).
If None, will place one point source below each observation point at
a fixed relative depth below the observation point [Cooper2000]_.
Defaults to None.
depth : float
Parameter used to control the depth at which the point sources will be
located.
Each source is located beneath each data point (or block-averaged
location) at a depth equal to its elevation minus the ``depth`` value.
This parameter is ignored if *points* is specified.
Defaults to 500.
block_size: float, tuple = (s_north, s_east) or None
Size of the blocks used on block-averaged equivalent sources.
If a single value is passed, the blocks will have a square shape.
Alternatively, the dimensions of the blocks in the South-North and
West-East directions can be specified by passing a tuple.
If None, no block-averaging is applied.
This parameter is ignored if *points* are specified.
Default to None.
window_size : float or "default"
Size of overlapping windows used during the gradient-boosting
algorithm. Smaller windows reduce the memory requirements of the source
coefficients fitting process. Very small windows may impact on the
accuracy of the interpolations.
Defaults to estimating a window size such that approximately 5000 data
points are in each window.
parallel : bool
If True any predictions and Jacobian building is carried out in
parallel through Numba's ``jit.prange``, reducing the computation time.
If False, these tasks will be run on a single CPU. Default to True.
dtype : dtype, optional
The desired data-type for the predictions and the Jacobian matrix.
Default to ``"float64"``.
Attributes
----------
points_ : 2d-array
Coordinates of the equivalent point sources.
coefs_ : array
Estimated coefficients of every point source.
region_ : tuple
The boundaries (``[W, E, S, N]``) of the data used to fit the
interpolator. Used as the default region for the
:meth:`~harmonica.EquivalentSources.grid` method.
window_size_ : float or None
Size of the overlapping windows used in gradient-boosting equivalent
point sources. It will be set to None if ``window_size = "default"``
and less than 5000 data points were used to fit the sources; a single
window will be used in such case.
References
----------
[Soler2021]_
"""
# Define amount of overlapping between adjacent windows to 50%.
overlapping = 0.5
def __init__(
self,
damping=None,
points=None,
depth=500,
block_size=None,
window_size="default",
parallel=True,
random_state=None,
dtype="float64",
):
if isinstance(window_size, str) and window_size != "default":
raise ValueError(
f"Found invalid 'window_size' value equal to '{window_size}'."
"It should be 'default' or a numeric value."
)
super().__init__(
damping=damping,
points=points,
depth=depth,
block_size=block_size,
parallel=parallel,
dtype=dtype,
)
self.random_state = random_state
self.window_size = window_size
def estimate_required_memory(self, coordinates):
"""
Estimate the memory required for storing the largest Jacobian matrix
Parameters
----------
coordinates : tuple of arrays
Arrays with the coordinates of each data point. Should be in the
following order: (``easting``, ``northing``, ``upward``, ...).
Only ``easting``, ``northing``, and ``upward`` will be used, all
subsequent coordinates will be ignored.
Returns
-------
memory_required : int
Amount of memory required to store the largest Jacobian matrix in
bytes.
Examples
--------
>>> import verde as vd
>>> coordinates = vd.scatter_points(
... region=(-1e3, 3e3, 2e3, 5e3),
... size=100,
... extra_coords=100,
... random_state=42,
... )
>>> eqs = EquivalentSourcesGB(window_size=2e3)
>>> n_bytes = eqs.estimate_required_memory(coordinates)
>>> int(n_bytes)
9800
"""
# Build the sources and assign the points_ attribute
coordinates = vdb.n_1d_arrays(coordinates, 3)
points = self._build_points(coordinates)
self.points_ = points
# Build the windows and get the indices
source_windows, data_windows = self._create_windows(coordinates)
# Get the number of sources and data for each window
source_sizes = np.array([w.size for w in source_windows])
data_sizes = np.array([w.size for w in data_windows])
# Compute the size of the Jacobian matrix for each window
jacobian_sizes = source_sizes * data_sizes
# Estimate size of a single element of the Jacobian matrix in bytes
return jacobian_sizes.max() * np.dtype(self.dtype).itemsize
def fit(self, coordinates, data, weights=None):
"""
Fit the coefficients of the equivalent sources.
The fitting process is carried out through the gradient-boosting
algorithm.
The data region is captured and used as default for the
:meth:`~harmonica.EquivalentSourcesGB.grid` method.
All input arrays must have the same shape.
Parameters
----------
coordinates : tuple of arrays
Arrays with the coordinates of each data point. Should be in the
following order: (``easting``, ``northing``, ``upward``, ...).
Only ``easting``, ``northing``, and ``upward`` will be used, all
subsequent coordinates will be ignored.
data : array
The data values of each data point.
weights : None or array
If not None, then the weights assigned to each data point.
Typically, this should be 1 over the data uncertainty squared.
Returns
-------
self
Returns this estimator instance for chaining operations.
"""
coordinates, data, weights = vdb.check_fit_input(coordinates, data, weights)
# Capture the data region to use as a default when gridding.
self.region_ = get_region(coordinates[:2])
# Ravel coordinates, data and weights to 1d-arrays
coordinates = vdb.n_1d_arrays(coordinates, 3)
data = data.ravel()
if weights is not None:
weights = weights.ravel()
# Build point sources
if self.points is None:
self.points_ = self._build_points(coordinates)
else:
self.points_ = vdb.n_1d_arrays(self.points, 3)
# Initialize coefficients
self.coefs_ = np.zeros_like(self.points_[0])
# Fit coefficients through gradient boosting
self._gradient_boosting(coordinates, data, weights)
return self
def _gradient_boosting(self, coordinates, data, weights):
"""
Fit source coefficients through gradient boosting
"""
# Create rolling windows
point_windows, data_windows = self._create_windows(coordinates)
# Get number of windows
n_windows = len(point_windows)
# Initialize RMSE array
errors = [np.sqrt(np.mean(data**2))]
# Set weights_chunk to None (will be changed unless weights is None)
weights_chunk = None
# Initialized the predicted and residue arrays
predicted = np.empty_like(data)
residue = data.copy()
# Iterate over the windows
for window in range(n_windows):
# Get source and data points indices for current window
point_window, data_window = point_windows[window], data_windows[window]
# Choose source and data points that fall inside the window
points_chunk = tuple(p[point_window] for p in self.points_)
coords_chunk = tuple(c[data_window] for c in coordinates)
# Choose weights for data points inside the window (if not None)
if weights is not None:
weights_chunk = weights[data_window]
# Compute Jacobian (for sources and data points in current window)
jacobian = self.jacobian(coords_chunk, points_chunk)
# Fit coefficients of sources with residue points inside window
coeffs_chunk = vdb.least_squares(
jacobian,
residue[data_window],
weights_chunk,
self.damping,
)
# Predict field of the sources in the window on every data point
predicted[:] = 0
predict_numba_parallel(
coordinates,
points_chunk,
coeffs_chunk,
predicted,
self.greens_function,
)
# Update the residue
residue -= predicted
# Add RMS of the residue to the RMSE
errors.append(np.sqrt(np.mean(residue**2)))
# Update source coefficients
self.coefs_[point_window] += coeffs_chunk
self.rmse_per_iteration_ = np.array(errors)
def _create_windows(self, coordinates, shuffle=True):
"""
Create indices of sources and data points for each overlapping window
Parameters
----------
coordinates : tuple
Arrays with the coordinates of each data point. Should be in the
following order: (``easting``, ``northing``, ``upward``).
shuffle : bool
Enable or disable the random shuffling of windows order. It's is
highly recommended to enable shuffling for better fitting results.
This argument is mainly included for testing purposes. Default to
True.
Returns
-------
source_windows_nonempty : list
List containing arrays with the indices of the sources that fall
under each window. The order of the windows is randomly shuffled if
``shuffle_windows`` is True, although the order of the windows is
the same as the one in ``data_windows_nonempty``.
data_windows_nonempty : list
List containing arrays with the indices of the data points that
fall under each window. The order of the windows is randomly
shuffled if ``shuffle_windows`` is True, although the order of the
windows is the same as the one in ``source_windows_nonempty``.
"""
# Get the region that contains every data point and every source
region = _get_region_data_sources(coordinates, self.points_)
# Calculate the window size such that there are approximately 5000 data
# points in each window. Otherwise use the given window size.
if self.window_size == "default":
area = (region[1] - region[0]) * (region[3] - region[2])
ndata = coordinates[0].size
if ndata <= 5e3:
warnings.warn(
f"Found {ndata} number of coordinates (<= 5e3). "
"Only one window will be used.",
stacklevel=1,
)
source_windows_nonempty = [np.arange(self.points_[0].size)]
data_windows_nonempty = [np.arange(ndata)]
self.window_size_ = None
return source_windows_nonempty, data_windows_nonempty
points_per_m2 = ndata / area
window_area = 5e3 / points_per_m2
self.window_size_ = np.sqrt(window_area)
else:
self.window_size_ = self.window_size
# Compute window spacing based on overlapping
window_spacing = self.window_size_ * (1 - self.overlapping)
# The windows for sources and data points are the same, but the
# verde.rolling_window function creates indices for the given
# coordinates. That's why we need to create two set of window indices:
# one for the sources and one for the data points.
# We pass the same region, size and spacing to be sure that both set of
# windows are the same.
kwargs = dict(region=region, size=self.window_size_, spacing=window_spacing)
_, source_windows = rolling_window(self.points_, **kwargs)
_, data_windows = rolling_window(coordinates, **kwargs)
# Ravel the indices
source_windows = [i[0] for i in source_windows.ravel()]
data_windows = [i[0] for i in data_windows.ravel()]
# Shuffle windows
if shuffle:
source_windows, data_windows = utils.shuffle(
source_windows, data_windows, random_state=self.random_state
)
# Remove empty windows
source_windows_nonempty = []
data_windows_nonempty = []
for src, data in zip(source_windows, data_windows):
if src.size > 0 and data.size > 0:
source_windows_nonempty.append(src)
data_windows_nonempty.append(data)
return source_windows_nonempty, data_windows_nonempty
def _get_region_data_sources(coordinates, points):
"""
Return the region that contains every observation and every source
Parameters
----------
coordinates : tuple
points : tuple
Returns
-------
region : tuple
"""
data_region = get_region(coordinates)
sources_region = get_region(points)
region = (
min(data_region[0], sources_region[0]),
max(data_region[1], sources_region[1]),
min(data_region[2], sources_region[2]),
max(data_region[3], sources_region[3]),
)
return region