From 7bd1a4049a917bf9355c977f24e2b762dff878ef Mon Sep 17 00:00:00 2001 From: indiauppal Date: Tue, 2 Apr 2024 19:20:38 +0100 Subject: [PATCH 01/10] Change window size in EquivalentSourcesGB to calculate a window size such that approximately 5000 data points are in each window. --- .../_equivalent_sources/gradient_boosted.py | 22 ++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/harmonica/_equivalent_sources/gradient_boosted.py b/harmonica/_equivalent_sources/gradient_boosted.py index 7800add12..22efdc811 100644 --- a/harmonica/_equivalent_sources/gradient_boosted.py +++ b/harmonica/_equivalent_sources/gradient_boosted.py @@ -73,7 +73,8 @@ class EquivalentSourcesGB(EquivalentSources): algorithm. Smaller windows reduce the memory requirements of the source coefficients fitting process. Very small windows may impact on the accuracy of the interpolations. - Defaults to 5000. + Defaults to estimating a window size such that approximately 5000 data + points are in each window. parallel : bool If True any predictions and Jacobian building is carried out in parallel through Numba's ``jit.prange``, reducing the computation time. @@ -108,7 +109,7 @@ def __init__( depth=500, depth_type="relative", block_size=None, - window_size=5e3, + window_size="default", parallel=True, random_state=None, dtype="float64", @@ -303,17 +304,28 @@ def _create_windows(self, coordinates, shuffle=True): ``shuffle_windows`` is True, although the order of the windows is the same as the one in ``source_windows_nonempty``. """ - # Compute window spacing based on overlapping - window_spacing = self.window_size * (1 - self.overlapping) + # Get the region that contains every data point and every source region = _get_region_data_sources(coordinates, self.points_) + # Calculate the window size such that there are approximately 5000 data + # points in each window. Otherwise use the given window size. + if self.window_size == "default": + area = (region[1] - region[0]) * (region[3] - region[2]) + ndata = coordinates[0].size + points_per_m2 = ndata / area + window_area = 5e3 / points_per_m2 + self.window_size_ = np.sqrt(window_area) + else: + self.window_size_ = self.window_size + # Compute window spacing based on overlapping + window_spacing = self.window_size_ * (1 - self.overlapping) # The windows for sources and data points are the same, but the # verde.rolling_window function creates indices for the given # coordinates. That's why we need to create two set of window indices: # one for the sources and one for the data points. # We pass the same region, size and spacing to be sure that both set of # windows are the same. - kwargs = dict(region=region, size=self.window_size, spacing=window_spacing) + kwargs = dict(region=region, size=self.window_size_, spacing=window_spacing) _, source_windows = rolling_window(self.points_, **kwargs) _, data_windows = rolling_window(coordinates, **kwargs) # Ravel the indices From b97064acbe8ccafdb5e5b1e98da8aec935a9fa8d Mon Sep 17 00:00:00 2001 From: indiauppal Date: Tue, 23 Apr 2024 17:56:20 +0100 Subject: [PATCH 02/10] Add a test for the window size. Test for window size less than 5000 raising an error. Need to raise an error for data points less than and equal to 5e3. --- harmonica/tests/test_gradient_boosted_eqs.py | 23 ++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/harmonica/tests/test_gradient_boosted_eqs.py b/harmonica/tests/test_gradient_boosted_eqs.py index 320a99423..46ccc507c 100644 --- a/harmonica/tests/test_gradient_boosted_eqs.py +++ b/harmonica/tests/test_gradient_boosted_eqs.py @@ -373,3 +373,26 @@ def test_error_ignored_args(coordinates_small, data_small, region): msg = "The 'bla' arguments are being ignored." with pytest.warns(FutureWarning, match=msg): eqs.grid(coordinates=grid_coords, bla="bla") + + +def test_window_size_less_than_5000(): + region = (0, 10e3, -5e3, 5e3) + grid_coords = vd.grid_coordinates(region=region, shape=(64, 64), extra_coords=0) + eqs = EquivalentSourcesGB() + eqs.points_ = eqs._build_points( + grid_coords + ) # need to build sources first before creating windows. + eqs._create_windows(grid_coords) + expected_window_size = np.sqrt(5e3 / (64**2 / 10e3**2)) + npt.assert_allclose(eqs.window_size_, expected_window_size) + +def test_window_size(): + region = (0, 10e3, -5e3, 5e3) + grid_coords = vd.grid_coordinates(region=region, shape=(100, 100), extra_coords=0) + eqs = EquivalentSourcesGB() + eqs.points_ = eqs._build_points( + grid_coords + ) # need to build sources first before creating windows. + eqs._create_windows(grid_coords) + expected_window_size = np.sqrt(5e3 / (100**2 / 10e3**2)) + npt.assert_allclose(eqs.window_size_, expected_window_size) From 3f0beeaf4c0481849df0d4551d35e48074fb51fa Mon Sep 17 00:00:00 2001 From: indiauppal Date: Tue, 7 May 2024 16:54:50 +0100 Subject: [PATCH 03/10] Raise warning for data points less than or equal to 5e3. Update test to check for data points less than 5e3 and the associated warning. --- harmonica/_equivalent_sources/gradient_boosted.py | 11 +++++++++-- harmonica/tests/test_gradient_boosted_eqs.py | 6 +++--- 2 files changed, 12 insertions(+), 5 deletions(-) diff --git a/harmonica/_equivalent_sources/gradient_boosted.py b/harmonica/_equivalent_sources/gradient_boosted.py index 1fde00248..dc2fb0a49 100644 --- a/harmonica/_equivalent_sources/gradient_boosted.py +++ b/harmonica/_equivalent_sources/gradient_boosted.py @@ -14,6 +14,7 @@ from .cartesian import EquivalentSources from .utils import cast_fit_input, predict_numba_parallel +import warnings class EquivalentSourcesGB(EquivalentSources): @@ -275,7 +276,7 @@ def _create_windows(self, coordinates, shuffle=True): coordinates : tuple Arrays with the coordinates of each data point. Should be in the following order: (``easting``, ``northing``, ``upward``). - shuffle_windows : bool + shuffle : bool Enable or disable the random shuffling of windows order. It's is highly recommended to enable shuffling for better fitting results. This argument is mainly included for testing purposes. Default to @@ -290,7 +291,7 @@ def _create_windows(self, coordinates, shuffle=True): the same as the one in ``data_windows_nonempty``. data_windows_nonempty : list List containing arrays with the indices of the data points that - under each window. The order of the windows is randomly shuffled if + fall under each window. The order of the windows is randomly shuffled if ``shuffle_windows`` is True, although the order of the windows is the same as the one in ``source_windows_nonempty``. """ @@ -302,6 +303,12 @@ def _create_windows(self, coordinates, shuffle=True): if self.window_size == "default": area = (region[1] - region[0]) * (region[3] - region[2]) ndata = coordinates[0].size + if ndata <= 5e3: + warnings.warn(f"Found {ndata} number of coordinates (<= 5e3). Only one window will be used.") + source_windows_nonempty = [np.arange(self.points_[0].size)] + data_windows_nonempty = [np.arange(ndata)] + self.window_size_ = None + return source_windows_nonempty, data_windows_nonempty points_per_m2 = ndata / area window_area = 5e3 / points_per_m2 self.window_size_ = np.sqrt(window_area) diff --git a/harmonica/tests/test_gradient_boosted_eqs.py b/harmonica/tests/test_gradient_boosted_eqs.py index 46ccc507c..f9d1b89e2 100644 --- a/harmonica/tests/test_gradient_boosted_eqs.py +++ b/harmonica/tests/test_gradient_boosted_eqs.py @@ -382,9 +382,9 @@ def test_window_size_less_than_5000(): eqs.points_ = eqs._build_points( grid_coords ) # need to build sources first before creating windows. - eqs._create_windows(grid_coords) - expected_window_size = np.sqrt(5e3 / (64**2 / 10e3**2)) - npt.assert_allclose(eqs.window_size_, expected_window_size) + with pytest.warns(UserWarning, match=f"Found {64**2} number of coordinates"): + eqs._create_windows(grid_coords) + assert eqs.window_size_ is None def test_window_size(): region = (0, 10e3, -5e3, 5e3) From 0474691676914f1119602f385a657a3ac5bd19fd Mon Sep 17 00:00:00 2001 From: indiauppal Date: Tue, 7 May 2024 17:01:06 +0100 Subject: [PATCH 04/10] Run black. --- harmonica/_equivalent_sources/gradient_boosted.py | 13 ++++++++----- harmonica/tests/test_gradient_boosted_eqs.py | 1 + 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/harmonica/_equivalent_sources/gradient_boosted.py b/harmonica/_equivalent_sources/gradient_boosted.py index dc2fb0a49..629d77411 100644 --- a/harmonica/_equivalent_sources/gradient_boosted.py +++ b/harmonica/_equivalent_sources/gradient_boosted.py @@ -7,6 +7,8 @@ """ Gradient-boosted equivalent sources in Cartesian coordinates """ +import warnings + import numpy as np import verde.base as vdb from sklearn import utils @@ -14,7 +16,6 @@ from .cartesian import EquivalentSources from .utils import cast_fit_input, predict_numba_parallel -import warnings class EquivalentSourcesGB(EquivalentSources): @@ -291,9 +292,9 @@ def _create_windows(self, coordinates, shuffle=True): the same as the one in ``data_windows_nonempty``. data_windows_nonempty : list List containing arrays with the indices of the data points that - fall under each window. The order of the windows is randomly shuffled if - ``shuffle_windows`` is True, although the order of the windows is - the same as the one in ``source_windows_nonempty``. + fall under each window. The order of the windows is randomly + shuffled if ``shuffle_windows`` is True, although the order of the + windows is the same as the one in ``source_windows_nonempty``. """ # Get the region that contains every data point and every source @@ -304,7 +305,9 @@ def _create_windows(self, coordinates, shuffle=True): area = (region[1] - region[0]) * (region[3] - region[2]) ndata = coordinates[0].size if ndata <= 5e3: - warnings.warn(f"Found {ndata} number of coordinates (<= 5e3). Only one window will be used.") + warnings.warn( + f"Found {ndata} number of coordinates (<= 5e3). Only one window will be used." + ) source_windows_nonempty = [np.arange(self.points_[0].size)] data_windows_nonempty = [np.arange(ndata)] self.window_size_ = None diff --git a/harmonica/tests/test_gradient_boosted_eqs.py b/harmonica/tests/test_gradient_boosted_eqs.py index f9d1b89e2..306dc0e47 100644 --- a/harmonica/tests/test_gradient_boosted_eqs.py +++ b/harmonica/tests/test_gradient_boosted_eqs.py @@ -386,6 +386,7 @@ def test_window_size_less_than_5000(): eqs._create_windows(grid_coords) assert eqs.window_size_ is None + def test_window_size(): region = (0, 10e3, -5e3, 5e3) grid_coords = vd.grid_coordinates(region=region, shape=(100, 100), extra_coords=0) From 14b1f6534b5d2fdf08aeb9a07ff740abf8562bc0 Mon Sep 17 00:00:00 2001 From: indiauppal Date: Wed, 8 May 2024 12:12:51 +0100 Subject: [PATCH 05/10] Add India Uppal to AUTHORS.md Add India Uppal to the author list. --- AUTHORS.md | 1 + 1 file changed, 1 insertion(+) diff --git a/AUTHORS.md b/AUTHORS.md index 38023f8f3..42039f754 100644 --- a/AUTHORS.md +++ b/AUTHORS.md @@ -12,4 +12,5 @@ order by last name) and are considered "The Harmonica Developers": * [Santiago Soler](https://github.com/santisoler) - Department of Earth, Ocean and Atmospheric Sciences, University of British Columbia (ORCID: 0000-0001-9202-5317) * [Matthew Tankersley](https://github.com/mdtanker) - Antarctic Research Centre, Victoria University of Wellington, New Zealand (ORCID: 0000-0003-4266-8554) * [Leonardo Uieda](https://github.com/leouieda) - Universidade de São Paulo, Brazil (ORCID: 0000-0001-6123-9515) +* [India Uppal](https://github.com/indiauppal) - The University of Liverpool, UK (ORCID:0000-0003-3531-2656) * [ziebam](https://github.com/ziebam) (not included in Zenodo) From e33afa3957e859786409f2b273a06b5e2e0de21d Mon Sep 17 00:00:00 2001 From: India Uppal <89797209+indiauppal@users.noreply.github.com> Date: Wed, 8 May 2024 12:22:02 +0100 Subject: [PATCH 06/10] Revert "Add India Uppal to AUTHORS.md" This reverts commit 14b1f6534b5d2fdf08aeb9a07ff740abf8562bc0. --- AUTHORS.md | 1 - 1 file changed, 1 deletion(-) diff --git a/AUTHORS.md b/AUTHORS.md index 42039f754..38023f8f3 100644 --- a/AUTHORS.md +++ b/AUTHORS.md @@ -12,5 +12,4 @@ order by last name) and are considered "The Harmonica Developers": * [Santiago Soler](https://github.com/santisoler) - Department of Earth, Ocean and Atmospheric Sciences, University of British Columbia (ORCID: 0000-0001-9202-5317) * [Matthew Tankersley](https://github.com/mdtanker) - Antarctic Research Centre, Victoria University of Wellington, New Zealand (ORCID: 0000-0003-4266-8554) * [Leonardo Uieda](https://github.com/leouieda) - Universidade de São Paulo, Brazil (ORCID: 0000-0001-6123-9515) -* [India Uppal](https://github.com/indiauppal) - The University of Liverpool, UK (ORCID:0000-0003-3531-2656) * [ziebam](https://github.com/ziebam) (not included in Zenodo) From 3eec5ac6b74f3630e70030598d507875ce99ceeb Mon Sep 17 00:00:00 2001 From: indiauppal Date: Fri, 31 May 2024 16:50:29 +0100 Subject: [PATCH 07/10] Add to test_window_size_less_than_5000 Check only 1 data and source window is created. Check if all sources and data points are inside the window. --- harmonica/tests/test_gradient_boosted_eqs.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/harmonica/tests/test_gradient_boosted_eqs.py b/harmonica/tests/test_gradient_boosted_eqs.py index 306dc0e47..dbe900f5a 100644 --- a/harmonica/tests/test_gradient_boosted_eqs.py +++ b/harmonica/tests/test_gradient_boosted_eqs.py @@ -378,13 +378,21 @@ def test_error_ignored_args(coordinates_small, data_small, region): def test_window_size_less_than_5000(): region = (0, 10e3, -5e3, 5e3) grid_coords = vd.grid_coordinates(region=region, shape=(64, 64), extra_coords=0) + grid_coords = [c.ravel() for c in grid_coords] eqs = EquivalentSourcesGB() eqs.points_ = eqs._build_points( grid_coords ) # need to build sources first before creating windows. with pytest.warns(UserWarning, match=f"Found {64**2} number of coordinates"): - eqs._create_windows(grid_coords) + source_windows, data_windows = eqs._create_windows(grid_coords) assert eqs.window_size_ is None + assert len(source_windows) == 1 + assert len(data_windows) == 1 + # Check if all sources and data points are inside the window + for coord in eqs.points_: + npt.assert_allclose(coord, coord[source_windows[0]]) + for coord in grid_coords: + npt.assert_allclose(coord, coord[data_windows[0]]) def test_window_size(): From 793234f5b6f6c2fab79d46e5fd2bf48da3aea56d Mon Sep 17 00:00:00 2001 From: indiauppal Date: Fri, 31 May 2024 17:15:13 +0100 Subject: [PATCH 08/10] Add test to check invalid window_size. --- harmonica/_equivalent_sources/gradient_boosted.py | 5 +++++ harmonica/tests/test_gradient_boosted_eqs.py | 5 +++++ 2 files changed, 10 insertions(+) diff --git a/harmonica/_equivalent_sources/gradient_boosted.py b/harmonica/_equivalent_sources/gradient_boosted.py index 629d77411..6accf9779 100644 --- a/harmonica/_equivalent_sources/gradient_boosted.py +++ b/harmonica/_equivalent_sources/gradient_boosted.py @@ -107,6 +107,11 @@ def __init__( random_state=None, dtype="float64", ): + if isinstance(window_size, str) and window_size != "default": + raise ValueError( + f"Found invalid 'window_size' value equal to '{window_size}'." + "It should be 'default' or a numeric value." + ) super().__init__( damping=damping, points=points, diff --git a/harmonica/tests/test_gradient_boosted_eqs.py b/harmonica/tests/test_gradient_boosted_eqs.py index dbe900f5a..31b18db10 100644 --- a/harmonica/tests/test_gradient_boosted_eqs.py +++ b/harmonica/tests/test_gradient_boosted_eqs.py @@ -405,3 +405,8 @@ def test_window_size(): eqs._create_windows(grid_coords) expected_window_size = np.sqrt(5e3 / (100**2 / 10e3**2)) npt.assert_allclose(eqs.window_size_, expected_window_size) + + +def test_invalid_window_size(): + with pytest.raises(ValueError, match=f"Found invalid 'window_size' value equal to"): + EquivalentSourcesGB(window_size="Chuckie took my soul!") From 34392e617c883b26d8381e6f5df1c66847d88290 Mon Sep 17 00:00:00 2001 From: indiauppal Date: Fri, 31 May 2024 17:31:18 +0100 Subject: [PATCH 09/10] Add window_size_ to Attributes. --- harmonica/_equivalent_sources/gradient_boosted.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/harmonica/_equivalent_sources/gradient_boosted.py b/harmonica/_equivalent_sources/gradient_boosted.py index 6accf9779..4fa7966e4 100644 --- a/harmonica/_equivalent_sources/gradient_boosted.py +++ b/harmonica/_equivalent_sources/gradient_boosted.py @@ -62,7 +62,7 @@ class EquivalentSourcesGB(EquivalentSources): If None, no block-averaging is applied. This parameter is ignored if *points* are specified. Default to None. - window_size : float + window_size : float or "default" Size of overlapping windows used during the gradient-boosting algorithm. Smaller windows reduce the memory requirements of the source coefficients fitting process. Very small windows may impact on the @@ -87,6 +87,11 @@ class EquivalentSourcesGB(EquivalentSources): The boundaries (``[W, E, S, N]``) of the data used to fit the interpolator. Used as the default region for the :meth:`~harmonica.EquivalentSources.grid` method. + window_size_ : float or None + Size of the overlapping windows used in gradient-boosting equivalent + point sources. It will be set to None if ``window_size = "default"`` + and less than 5000 data points were used to fit the sources; a single + window will be used in such case. References ---------- From 82efb7564624f49b465c65177ba6fb64619aa9f0 Mon Sep 17 00:00:00 2001 From: indiauppal Date: Fri, 31 May 2024 17:35:32 +0100 Subject: [PATCH 10/10] Fix style. --- harmonica/_equivalent_sources/gradient_boosted.py | 4 ++-- harmonica/tests/test_gradient_boosted_eqs.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/harmonica/_equivalent_sources/gradient_boosted.py b/harmonica/_equivalent_sources/gradient_boosted.py index 4fa7966e4..7198423cc 100644 --- a/harmonica/_equivalent_sources/gradient_boosted.py +++ b/harmonica/_equivalent_sources/gradient_boosted.py @@ -88,8 +88,8 @@ class EquivalentSourcesGB(EquivalentSources): interpolator. Used as the default region for the :meth:`~harmonica.EquivalentSources.grid` method. window_size_ : float or None - Size of the overlapping windows used in gradient-boosting equivalent - point sources. It will be set to None if ``window_size = "default"`` + Size of the overlapping windows used in gradient-boosting equivalent + point sources. It will be set to None if ``window_size = "default"`` and less than 5000 data points were used to fit the sources; a single window will be used in such case. diff --git a/harmonica/tests/test_gradient_boosted_eqs.py b/harmonica/tests/test_gradient_boosted_eqs.py index 31b18db10..9492b1f1a 100644 --- a/harmonica/tests/test_gradient_boosted_eqs.py +++ b/harmonica/tests/test_gradient_boosted_eqs.py @@ -408,5 +408,5 @@ def test_window_size(): def test_invalid_window_size(): - with pytest.raises(ValueError, match=f"Found invalid 'window_size' value equal to"): + with pytest.raises(ValueError, match="Found invalid 'window_size' value equal to"): EquivalentSourcesGB(window_size="Chuckie took my soul!")