diff --git a/harmonica/_equivalent_sources/gradient_boosted.py b/harmonica/_equivalent_sources/gradient_boosted.py index a50b287ec..7198423cc 100644 --- a/harmonica/_equivalent_sources/gradient_boosted.py +++ b/harmonica/_equivalent_sources/gradient_boosted.py @@ -7,6 +7,8 @@ """ Gradient-boosted equivalent sources in Cartesian coordinates """ +import warnings + import numpy as np import verde.base as vdb from sklearn import utils @@ -60,12 +62,13 @@ class EquivalentSourcesGB(EquivalentSources): If None, no block-averaging is applied. This parameter is ignored if *points* are specified. Default to None. - window_size : float + window_size : float or "default" Size of overlapping windows used during the gradient-boosting algorithm. Smaller windows reduce the memory requirements of the source coefficients fitting process. Very small windows may impact on the accuracy of the interpolations. - Defaults to 5000. + Defaults to estimating a window size such that approximately 5000 data + points are in each window. parallel : bool If True any predictions and Jacobian building is carried out in parallel through Numba's ``jit.prange``, reducing the computation time. @@ -84,6 +87,11 @@ class EquivalentSourcesGB(EquivalentSources): The boundaries (``[W, E, S, N]``) of the data used to fit the interpolator. Used as the default region for the :meth:`~harmonica.EquivalentSources.grid` method. + window_size_ : float or None + Size of the overlapping windows used in gradient-boosting equivalent + point sources. It will be set to None if ``window_size = "default"`` + and less than 5000 data points were used to fit the sources; a single + window will be used in such case. References ---------- @@ -99,11 +107,16 @@ def __init__( points=None, depth=500, block_size=None, - window_size=5e3, + window_size="default", parallel=True, random_state=None, dtype="float64", ): + if isinstance(window_size, str) and window_size != "default": + raise ValueError( + f"Found invalid 'window_size' value equal to '{window_size}'." + "It should be 'default' or a numeric value." + ) super().__init__( damping=damping, points=points, @@ -274,7 +287,7 @@ def _create_windows(self, coordinates, shuffle=True): coordinates : tuple Arrays with the coordinates of each data point. Should be in the following order: (``easting``, ``northing``, ``upward``). - shuffle_windows : bool + shuffle : bool Enable or disable the random shuffling of windows order. It's is highly recommended to enable shuffling for better fitting results. This argument is mainly included for testing purposes. Default to @@ -289,21 +302,40 @@ def _create_windows(self, coordinates, shuffle=True): the same as the one in ``data_windows_nonempty``. data_windows_nonempty : list List containing arrays with the indices of the data points that - under each window. The order of the windows is randomly shuffled if - ``shuffle_windows`` is True, although the order of the windows is - the same as the one in ``source_windows_nonempty``. + fall under each window. The order of the windows is randomly + shuffled if ``shuffle_windows`` is True, although the order of the + windows is the same as the one in ``source_windows_nonempty``. """ - # Compute window spacing based on overlapping - window_spacing = self.window_size * (1 - self.overlapping) + # Get the region that contains every data point and every source region = _get_region_data_sources(coordinates, self.points_) + # Calculate the window size such that there are approximately 5000 data + # points in each window. Otherwise use the given window size. + if self.window_size == "default": + area = (region[1] - region[0]) * (region[3] - region[2]) + ndata = coordinates[0].size + if ndata <= 5e3: + warnings.warn( + f"Found {ndata} number of coordinates (<= 5e3). Only one window will be used." + ) + source_windows_nonempty = [np.arange(self.points_[0].size)] + data_windows_nonempty = [np.arange(ndata)] + self.window_size_ = None + return source_windows_nonempty, data_windows_nonempty + points_per_m2 = ndata / area + window_area = 5e3 / points_per_m2 + self.window_size_ = np.sqrt(window_area) + else: + self.window_size_ = self.window_size + # Compute window spacing based on overlapping + window_spacing = self.window_size_ * (1 - self.overlapping) # The windows for sources and data points are the same, but the # verde.rolling_window function creates indices for the given # coordinates. That's why we need to create two set of window indices: # one for the sources and one for the data points. # We pass the same region, size and spacing to be sure that both set of # windows are the same. - kwargs = dict(region=region, size=self.window_size, spacing=window_spacing) + kwargs = dict(region=region, size=self.window_size_, spacing=window_spacing) _, source_windows = rolling_window(self.points_, **kwargs) _, data_windows = rolling_window(coordinates, **kwargs) # Ravel the indices diff --git a/harmonica/tests/test_gradient_boosted_eqs.py b/harmonica/tests/test_gradient_boosted_eqs.py index 320a99423..9492b1f1a 100644 --- a/harmonica/tests/test_gradient_boosted_eqs.py +++ b/harmonica/tests/test_gradient_boosted_eqs.py @@ -373,3 +373,40 @@ def test_error_ignored_args(coordinates_small, data_small, region): msg = "The 'bla' arguments are being ignored." with pytest.warns(FutureWarning, match=msg): eqs.grid(coordinates=grid_coords, bla="bla") + + +def test_window_size_less_than_5000(): + region = (0, 10e3, -5e3, 5e3) + grid_coords = vd.grid_coordinates(region=region, shape=(64, 64), extra_coords=0) + grid_coords = [c.ravel() for c in grid_coords] + eqs = EquivalentSourcesGB() + eqs.points_ = eqs._build_points( + grid_coords + ) # need to build sources first before creating windows. + with pytest.warns(UserWarning, match=f"Found {64**2} number of coordinates"): + source_windows, data_windows = eqs._create_windows(grid_coords) + assert eqs.window_size_ is None + assert len(source_windows) == 1 + assert len(data_windows) == 1 + # Check if all sources and data points are inside the window + for coord in eqs.points_: + npt.assert_allclose(coord, coord[source_windows[0]]) + for coord in grid_coords: + npt.assert_allclose(coord, coord[data_windows[0]]) + + +def test_window_size(): + region = (0, 10e3, -5e3, 5e3) + grid_coords = vd.grid_coordinates(region=region, shape=(100, 100), extra_coords=0) + eqs = EquivalentSourcesGB() + eqs.points_ = eqs._build_points( + grid_coords + ) # need to build sources first before creating windows. + eqs._create_windows(grid_coords) + expected_window_size = np.sqrt(5e3 / (100**2 / 10e3**2)) + npt.assert_allclose(eqs.window_size_, expected_window_size) + + +def test_invalid_window_size(): + with pytest.raises(ValueError, match="Found invalid 'window_size' value equal to"): + EquivalentSourcesGB(window_size="Chuckie took my soul!")