From 65ecf068a88b0a8f1ce86d5d52f0cf568a556447 Mon Sep 17 00:00:00 2001 From: grantbuster Date: Wed, 6 Dec 2023 14:25:24 -0700 Subject: [PATCH 01/15] refactor: sub-pixel aggregation for bias correction instead of NN distance lookup --- sup3r/bias/bias_calc.py | 116 +++++++++++++++++------------ tests/bias/test_bias_correction.py | 104 +++++++++++++++----------- 2 files changed, 128 insertions(+), 92 deletions(-) diff --git a/sup3r/bias/bias_calc.py b/sup3r/bias/bias_calc.py index 2ac31677f..cb73a5691 100644 --- a/sup3r/bias/bias_calc.py +++ b/sup3r/bias/bias_calc.py @@ -36,10 +36,12 @@ def __init__(self, bias_fps, base_dset, bias_feature, + distance_upper_bound, target=None, shape=None, base_handler='Resource', bias_handler='DataHandlerNCforCC', + base_handler_kwargs=None, bias_handler_kwargs=None, decimals=None): """ @@ -60,6 +62,10 @@ def __init__(self, bias_feature : str This is the biased feature from bias_fps to retrieve. This should be a single feature name corresponding to base_dset + distance_upper_bound : float + Upper bound on the nearest neighbor distance in decimal degrees. + This should be the approximate resolution of the low-resolution + bias data. target : tuple (lat, lon) lower left corner of raster to retrieve from bias_fps. If None then the lower left corner of the full domain will be used. @@ -67,11 +73,14 @@ def __init__(self, (rows, cols) grid size to retrieve from bias_fps. If None then the full domain shape will be used. base_handler : str - Name of rex resource handler class to be retrieved from the rex - library. + Name of rex resource handler or sup3r.preprocessing.data_handling + class to be retrieved from the rex/sup3r library. bias_handler : str Name of the bias data handler class to be retrieved from the sup3r.preprocessing.data_handling library. + base_handler_kwargs : dict | None + Optional kwargs to send to the initialization of the base_handler + class bias_handler_kwargs : dict | None Optional kwargs to send to the initialization of the bias_handler class @@ -92,36 +101,49 @@ def __init__(self, self.target = target self.shape = shape self.decimals = decimals - bias_handler_kwargs = bias_handler_kwargs or {} + self.base_handler_kwargs = base_handler_kwargs or {} + self.bias_handler_kwargs = bias_handler_kwargs or {} + self.bad_bias_gids = [] if isinstance(self.base_fps, str): self.base_fps = sorted(glob(self.base_fps)) if isinstance(self.bias_fps, str): self.bias_fps = sorted(glob(self.bias_fps)) - self.base_handler = getattr(rex, base_handler) + base_sup3r_h = getattr(sup3r.preprocessing.data_handling, + base_handler, None) + base_rex_h = getattr(rex, base_handler, None) + msg = f'Could not retrieve "{base_handler}" from sup3r or rex!' 
+ assert base_sup3r_h is not None or base_rex_h is not None, msg + self.base_handler = base_rex_h or base_sup3r_h + self.bias_handler = getattr(sup3r.preprocessing.data_handling, bias_handler) - with self.base_handler(self.base_fps[0]) as res: - self.base_meta = res.meta - self.base_tree = KDTree(self.base_meta[['latitude', 'longitude']]) + self.base_dh = self.base_handler(self.base_fps[0], + **self.base_handler_kwargs) + self.base_meta = self.base_dh.meta self.bias_dh = self.bias_handler(self.bias_fps, [self.bias_feature], target=self.target, shape=self.shape, val_split=0.0, - **bias_handler_kwargs) + **self.bias_handler_kwargs) lats = self.bias_dh.lat_lon[..., 0].flatten() - lons = self.bias_dh.lat_lon[..., 1].flatten() - self.bias_meta = pd.DataFrame({'latitude': lats, 'longitude': lons}) + self.bias_meta = self.bias_dh.meta self.bias_ti = self.bias_dh.time_index raster_shape = self.bias_dh.lat_lon[..., 0].shape - self.bias_tree = KDTree(self.bias_meta[['latitude', 'longitude']]) + bias_lat_lon = self.bias_meta[['latitude', 'longitude']].values + self.bias_tree = KDTree(bias_lat_lon) self.bias_gid_raster = np.arange(lats.size) self.bias_gid_raster = self.bias_gid_raster.reshape(raster_shape) + out = self.bias_tree.query(self.base_meta[['latitude', 'longitude']], + k=1, + distance_upper_bound=distance_upper_bound) + self.nn_dist, self.nn_ind = out + self.out = None self._init_out() logger.info('Finished initializing DataRetrievalBase.') @@ -238,7 +260,7 @@ def get_bias_gid(self, coord): bias_gid = self.bias_gid_raster.flatten()[i] return bias_gid, d - def get_base_gid(self, bias_gid, knn): + def get_base_gid(self, bias_gid): """Get one or more base gid(s) corresponding to a bias gid. Parameters @@ -247,32 +269,29 @@ def get_base_gid(self, bias_gid, knn): gid of the data to retrieve in the bias data source raster data. The gids for this data source are the enumerated indices of the flattened coordinate array. - knn : int - Number of nearest neighbors to aggregate from the base data when - comparing to a single site from the bias data. Returns ------- dist : np.ndarray - Array of nearest neighbor distances with length == knn + Array of nearest neighbor distances with length equal to the number + of high-resolution baseline gids that map to the low resolution + bias gid pixel. base_gid : np.ndarray Array of base gids that are the nearest neighbors of bias_gid with - length == knn + length equal to the number of high-resolution baseline gids that + map to the low resolution bias gid pixel. """ - coord = self.bias_meta.loc[bias_gid, ['latitude', 'longitude']] - dist, base_gid = self.base_tree.query(coord, k=knn) + base_gid = np.where(self.nn_ind == bias_gid)[0] + dist = self.nn_dist[base_gid] return dist, base_gid - def get_data_pair(self, coord, knn, daily_reduction='avg'): + def get_data_pair(self, coord, daily_reduction='avg'): """Get base and bias data observations based on a single bias gid. Parameters ---------- coord : tuple (lat, lon) to get data for. - knn : int - Number of nearest neighbors to aggregate from the base data when - comparing to a single site from the bias data. daily_reduction : None | str Option to do a reduction of the hourly+ source base data to daily data. Can be None (no reduction, keep source time frequency), "avg" @@ -287,12 +306,13 @@ def get_data_pair(self, coord, knn, daily_reduction='avg'): 1D array of temporal data at the requested gid. 
base_dist : np.ndarray Array of nearest neighbor distances from coord to the base data - sites with length == knn + sites with length equal to the number of high-resolution baseline + gids that map to the low resolution bias gid pixel. bias_dist : Float Nearest neighbor distance from coord to the bias data site """ bias_gid, bias_dist = self.get_bias_gid(coord) - base_dist, base_gid = self.get_base_gid(bias_gid, knn) + base_dist, base_gid = self.get_base_gid(bias_gid) bias_data = self.get_bias_data(bias_gid) base_data = self.get_base_data(self.base_fps, self.base_dset, @@ -651,6 +671,10 @@ def fill_and_smooth(self, like: bias_data * scalar + adder. Each value is of shape (lat, lon, time). """ + if len(self.bad_bias_gids) > 0: + logger.info('Found {} bias gids that are out of bounds: {}' + .format(len(self.bad_bias_gids), self.bad_bias_gids)) + for key, arr in out.items(): nan_mask = np.isnan(arr[..., 0]) for idt in range(arr.shape[-1]): @@ -661,6 +685,9 @@ def fill_and_smooth(self, and fill_extend) or smooth_interior > 0 if needs_fill: + logger.info('Filling NaN values outside of valid spatial ' + 'extent for dataset "{}" for timestep {}' + .format(key, idt)) arr_smooth = nn_fill_array(arr_smooth) arr_smooth_int = arr_smooth_ext = arr_smooth @@ -714,8 +741,6 @@ def write_outputs(self, fp_out, out): 'Wrote scalar adder factors to file: {}'.format(fp_out)) def run(self, - knn, - threshold=0.6, fp_out=None, max_workers=None, daily_reduction='avg', @@ -727,14 +752,6 @@ def run(self, Parameters ---------- - knn : int - Number of nearest neighbors to aggregate from the base data when - comparing to a single site from the bias data. - threshold : float - If the bias data coordinate is on average further from the base - data coordinates than this threshold, no bias correction factors - will be calculated directly and will just be filled from nearest - neighbor (if fill_extend=True, else it will be nan). fp_out : str | None Optional .h5 output file to write scalar and adder arrays. max_workers : int @@ -745,13 +762,13 @@ def run(self, data. Can be None (no reduction, keep source time frequency), "avg" (daily average), "max" (daily max), or "min" (daily min) fill_extend : bool - Flag to fill data past threshold using spatial nearest neighbor. If - False, the extended domain will be left as NaN. + Flag to fill data past distance_upper_bound using spatial nearest + neighbor. If False, the extended domain will be left as NaN. smooth_extend : float Option to smooth the scalar/adder data outside of the spatial - domain set by the threshold input. This alleviates the weird seams - far from the domain of interest. This value is the standard - deviation for the gaussian_filter kernel + domain set by the distance_upper_bound input. This alleviates the + weird seams far from the domain of interest. This value is the + standard deviation for the gaussian_filter kernel smooth_interior : float Option to smooth the scalar/adder data within the valid spatial domain. 
This can reduce the affect of extreme values within @@ -770,14 +787,17 @@ def run(self, logger.info('Initialized scalar / adder with shape: {}' .format(self.bias_gid_raster.shape)) + self.bad_bias_gids = [] + if max_workers == 1: logger.debug('Running serial calculation.') - for i, (bias_gid, row) in enumerate(self.bias_meta.iterrows()): + for i, bias_gid in enumerate(self.bias_meta.index): raster_loc = np.where(self.bias_gid_raster == bias_gid) - coord = row[['latitude', 'longitude']] - dist, base_gid = self.base_tree.query(coord, k=knn) + dist, base_gid = self.get_base_gid(bias_gid) - if np.mean(dist) < threshold: + if not base_gid.any(): + self.bad_bias_gids.append(bias_gid) + else: bias_data = self.get_bias_data(bias_gid) single_out = self._run_single( bias_data, @@ -804,12 +824,12 @@ def run(self, futures = {} for bias_gid, bias_row in self.bias_meta.iterrows(): raster_loc = np.where(self.bias_gid_raster == bias_gid) - coord = bias_row[['latitude', 'longitude']] - dist, base_gid = self.base_tree.query(coord, k=knn) + dist, base_gid = self.get_base_gid(bias_gid) - if np.mean(dist) < threshold: + if not base_gid.any(): + self.bad_bias_gids.append(bias_gid) + else: bias_data = self.get_bias_data(bias_gid) - future = exe.submit( self._run_single, bias_data, diff --git a/tests/bias/test_bias_correction.py b/tests/bias/test_bias_correction.py index 0f6c022ad..e5dbd9cf6 100644 --- a/tests/bias/test_bias_correction.py +++ b/tests/bias/test_bias_correction.py @@ -32,17 +32,20 @@ def test_smooth_interior_bc(): """Test linear bias correction with interior smoothing""" calc = LinearCorrection(FP_NSRDB, FP_CC, 'ghi', 'rsds', - TARGET, SHAPE, bias_handler='DataHandlerNCforCC') - out = calc.run(knn=1, threshold=0.6, fill_extend=False, max_workers=1) + target=TARGET, shape=SHAPE, + distance_upper_bound=0.7, + bias_handler='DataHandlerNCforCC') + out = calc.run(fill_extend=False, max_workers=1) og_scalar = out['rsds_scalar'] og_adder = out['rsds_adder'] nan_mask = np.isnan(og_scalar) assert np.isnan(og_adder[nan_mask]).all() calc = LinearCorrection(FP_NSRDB, FP_CC, 'ghi', 'rsds', - TARGET, SHAPE, bias_handler='DataHandlerNCforCC') - out = calc.run(knn=1, threshold=0.6, fill_extend=True, smooth_interior=0, - max_workers=1) + target=TARGET, shape=SHAPE, + distance_upper_bound=0.7, + bias_handler='DataHandlerNCforCC') + out = calc.run(fill_extend=True, smooth_interior=0, max_workers=1) scalar = out['rsds_scalar'] adder = out['rsds_adder'] # Make sure smooth_interior=0 does not change interior pixels @@ -53,9 +56,10 @@ def test_smooth_interior_bc(): # make sure smoothing affects the interior pixels but not the exterior calc = LinearCorrection(FP_NSRDB, FP_CC, 'ghi', 'rsds', - TARGET, SHAPE, bias_handler='DataHandlerNCforCC') - out = calc.run(knn=1, threshold=0.6, fill_extend=True, smooth_interior=1, - max_workers=1) + target=TARGET, shape=SHAPE, + distance_upper_bound=0.7, + bias_handler='DataHandlerNCforCC') + out = calc.run(fill_extend=True, smooth_interior=1, max_workers=1) smooth_scalar = out['rsds_scalar'] smooth_adder = out['rsds_adder'] @@ -69,25 +73,27 @@ def test_linear_bc(): """Test linear bias correction""" calc = LinearCorrection(FP_NSRDB, FP_CC, 'ghi', 'rsds', - TARGET, SHAPE, bias_handler='DataHandlerNCforCC') + target=TARGET, shape=SHAPE, + distance_upper_bound=0.7, + bias_handler='DataHandlerNCforCC') # test a known in-bounds gid bias_gid = 5 - dist, base_gid = calc.get_base_gid(bias_gid, 1) + dist, base_gid = calc.get_base_gid(bias_gid) bias_data = calc.get_bias_data(bias_gid) base_data, 
_ = calc.get_base_data(calc.base_fps, calc.base_dset, base_gid, calc.base_handler, daily_reduction='avg') - bias_coord = calc.bias_meta.loc[bias_gid, ['latitude', 'longitude']] + bias_coord = calc.bias_meta.loc[[bias_gid], ['latitude', 'longitude']] base_coord = calc.base_meta.loc[base_gid, ['latitude', 'longitude']] true_dist = bias_coord.values - base_coord.values - true_dist = np.hypot(true_dist[0], true_dist[1]) + true_dist = np.hypot(true_dist[:, 0], true_dist[:, 1]) assert np.allclose(true_dist, dist) - assert true_dist < 0.1 + assert (true_dist < 0.5).all() # horiz res of bias data is ~0.7 deg true_scalar = base_data.std() / bias_data.std() true_adder = base_data.mean() - bias_data.mean() * true_scalar - out = calc.run(knn=1, threshold=0.6, fill_extend=False, max_workers=1) + out = calc.run(fill_extend=False, max_workers=1) scalar = out['rsds_scalar'] adder = out['rsds_adder'] @@ -106,11 +112,14 @@ def test_linear_bc(): assert np.isnan(adder[corner]) nan_mask = np.isnan(scalar) assert np.isnan(adder[nan_mask]).all() + assert len(calc.bad_bias_gids) > 0 # make sure the NN fill works for out-of-bounds pixels calc = LinearCorrection(FP_NSRDB, FP_CC, 'ghi', 'rsds', - TARGET, SHAPE, bias_handler='DataHandlerNCforCC') - out = calc.run(knn=1, threshold=0.6, fill_extend=True, max_workers=1) + target=TARGET, shape=SHAPE, + distance_upper_bound=0.7, + bias_handler='DataHandlerNCforCC') + out = calc.run(fill_extend=True, max_workers=1) scalar = out['rsds_scalar'] adder = out['rsds_adder'] @@ -118,14 +127,16 @@ def test_linear_bc(): assert np.allclose(true_scalar, scalar[iloc]) assert np.allclose(true_adder, adder[iloc]) + assert len(calc.bad_bias_gids) > 0 assert not np.isnan(scalar[nan_mask]).any() assert not np.isnan(adder[nan_mask]).any() # make sure smoothing affects the out-of-bounds pixels but not the in-bound calc = LinearCorrection(FP_NSRDB, FP_CC, 'ghi', 'rsds', - TARGET, SHAPE, bias_handler='DataHandlerNCforCC') - out = calc.run(knn=1, threshold=0.6, fill_extend=True, smooth_extend=2, - max_workers=1) + target=TARGET, shape=SHAPE, + distance_upper_bound=0.7, + bias_handler='DataHandlerNCforCC') + out = calc.run(fill_extend=True, smooth_extend=2, max_workers=1) smooth_scalar = out['rsds_scalar'] smooth_adder = out['rsds_adder'] assert np.allclose(smooth_scalar[~nan_mask], scalar[~nan_mask]) @@ -135,9 +146,10 @@ def test_linear_bc(): # parallel test calc = LinearCorrection(FP_NSRDB, FP_CC, 'ghi', 'rsds', - TARGET, SHAPE, bias_handler='DataHandlerNCforCC') - out = calc.run(knn=1, threshold=0.6, fill_extend=True, smooth_extend=2, - max_workers=2) + target=TARGET, shape=SHAPE, + distance_upper_bound=0.7, + bias_handler='DataHandlerNCforCC') + out = calc.run(fill_extend=True, smooth_extend=2, max_workers=2) par_scalar = out['rsds_scalar'] par_adder = out['rsds_adder'] assert np.allclose(smooth_scalar, par_scalar) @@ -148,28 +160,29 @@ def test_monthly_linear_bc(): """Test linear bias correction on a month-by-month basis""" calc = MonthlyLinearCorrection(FP_NSRDB, FP_CC, 'ghi', 'rsds', - TARGET, SHAPE, + target=TARGET, shape=SHAPE, + distance_upper_bound=0.7, bias_handler='DataHandlerNCforCC') # test a known in-bounds gid bias_gid = 5 - dist, base_gid = calc.get_base_gid(bias_gid, 1) + dist, base_gid = calc.get_base_gid(bias_gid) bias_data = calc.get_bias_data(bias_gid) base_data, base_ti = calc.get_base_data(calc.base_fps, calc.base_dset, base_gid, calc.base_handler, daily_reduction='avg') - bias_coord = calc.bias_meta.loc[bias_gid, ['latitude', 'longitude']] + bias_coord = 
calc.bias_meta.loc[[bias_gid], ['latitude', 'longitude']] base_coord = calc.base_meta.loc[base_gid, ['latitude', 'longitude']] true_dist = bias_coord.values - base_coord.values - true_dist = np.hypot(true_dist[0], true_dist[1]) + true_dist = np.hypot(true_dist[:, 0], true_dist[:, 1]) assert np.allclose(true_dist, dist) - assert true_dist < 0.1 + assert (true_dist < 0.5).all() # horiz res of bias data is ~0.7 deg base_data = base_data[:31] # just take Jan for testing bias_data = bias_data[:31] # just take Jan for testing true_scalar = base_data.std() / bias_data.std() true_adder = base_data.mean() - bias_data.mean() * true_scalar - out = calc.run(knn=1, threshold=0.6, fill_extend=True, max_workers=1) + out = calc.run(fill_extend=True, max_workers=1) scalar = out['rsds_scalar'] adder = out['rsds_adder'] @@ -191,12 +204,13 @@ def test_monthly_linear_bc(): def test_linear_transform(): """Test the linear bc transform method""" calc = LinearCorrection(FP_NSRDB, FP_CC, 'ghi', 'rsds', - TARGET, SHAPE, bias_handler='DataHandlerNCforCC') + target=TARGET, shape=SHAPE, + distance_upper_bound=0.7, + bias_handler='DataHandlerNCforCC') lat_lon = calc.bias_dh.lat_lon with tempfile.TemporaryDirectory() as td: fp_out = os.path.join(td, 'bc.h5') - out = calc.run(knn=1, threshold=0.6, fill_extend=False, max_workers=1, - fp_out=fp_out) + out = calc.run(fill_extend=False, max_workers=1, fp_out=fp_out) scalar = out['rsds_scalar'] adder = out['rsds_adder'] test_data = np.ones_like(scalar) @@ -204,8 +218,7 @@ def test_linear_transform(): out = local_linear_bc(test_data, lat_lon, 'rsds', fp_out, lr_padded_slice=None, out_range=None) - out = calc.run(knn=1, threshold=0.6, fill_extend=True, max_workers=1, - fp_out=fp_out) + out = calc.run(fill_extend=True, max_workers=1, fp_out=fp_out) scalar = out['rsds_scalar'] adder = out['rsds_adder'] test_data = np.ones_like(scalar) @@ -235,7 +248,8 @@ def test_linear_transform(): def test_montly_linear_transform(): """Test the montly linear bc transform method""" calc = MonthlyLinearCorrection(FP_NSRDB, FP_CC, 'ghi', 'rsds', - TARGET, SHAPE, + target=TARGET, shape=SHAPE, + distance_upper_bound=0.7, bias_handler='DataHandlerNCforCC') lat_lon = calc.bias_dh.lat_lon _, base_ti = calc.get_base_data(calc.base_fps, calc.base_dset, @@ -243,8 +257,7 @@ def test_montly_linear_transform(): daily_reduction='avg') with tempfile.TemporaryDirectory() as td: fp_out = os.path.join(td, 'bc.h5') - out = calc.run(knn=1, threshold=0.6, fill_extend=True, max_workers=1, - fp_out=fp_out) + out = calc.run(fill_extend=True, max_workers=1, fp_out=fp_out) scalar = out['rsds_scalar'] adder = out['rsds_adder'] test_data = np.ones((scalar.shape[0], scalar.shape[1], len(base_ti))) @@ -278,10 +291,11 @@ def test_clearsky_ratio(): 'temporal_slice': [0, 30, 1]} calc = LinearCorrection(FP_NSRDB, FP_CC, 'clearsky_ratio', 'clearsky_ratio', - TARGET, SHAPE, + target=TARGET, shape=SHAPE, + distance_upper_bound=0.7, bias_handler_kwargs=bias_handler_kwargs, bias_handler='DataHandlerNCforCC') - out = calc.run(knn=1, threshold=100, fill_extend=False, max_workers=1) + out = calc.run(fill_extend=True, max_workers=1) assert not np.isnan(out['clearsky_ratio_scalar']).any() assert not np.isnan(out['clearsky_ratio_adder']).any() @@ -446,26 +460,28 @@ def test_qa_integration(): def test_skill_assessment(): """Test the skill assessment of a climate model vs. 
historical data""" - calc = SkillAssessment(FP_NSRDB, FP_CC, 'ghi', 'rsds', TARGET, SHAPE, + calc = SkillAssessment(FP_NSRDB, FP_CC, 'ghi', 'rsds', + target=TARGET, shape=SHAPE, + distance_upper_bound=0.7, bias_handler='DataHandlerNCforCC') # test a known in-bounds gid bias_gid = 5 - dist, base_gid = calc.get_base_gid(bias_gid, 1) + dist, base_gid = calc.get_base_gid(bias_gid) bias_data = calc.get_bias_data(bias_gid) base_data, _ = calc.get_base_data(calc.base_fps, calc.base_dset, base_gid, calc.base_handler, daily_reduction='avg') - bias_coord = calc.bias_meta.loc[bias_gid, ['latitude', 'longitude']] + bias_coord = calc.bias_meta.loc[[bias_gid], ['latitude', 'longitude']] base_coord = calc.base_meta.loc[base_gid, ['latitude', 'longitude']] true_dist = bias_coord.values - base_coord.values - true_dist = np.hypot(true_dist[0], true_dist[1]) + true_dist = np.hypot(true_dist[:, 0], true_dist[:, 1]) assert np.allclose(true_dist, dist) - assert true_dist < 0.1 + assert (true_dist < 0.5).all() # horiz res of bias data is ~0.7 deg iloc = np.where(calc.bias_gid_raster == bias_gid) iloc += (0, ) - out = calc.run(knn=1, threshold=0.6, fill_extend=True, max_workers=1) + out = calc.run(fill_extend=True, max_workers=1) base_mean = base_data.mean() bias_mean = bias_data.mean() From 284a7d42104e25facb7262bd1fb7a0b230919806 Mon Sep 17 00:00:00 2001 From: grantbuster Date: Thu, 7 Dec 2023 09:15:08 -0700 Subject: [PATCH 02/15] new feature: bias calc can open a .nc base dataset using the sup3r DataHandler objects. The data is loaded in init and then full object is passed in serial to the run single calculation --- sup3r/bias/bias_calc.py | 188 +++++++++++++++++++++-------- tests/bias/test_bias_correction.py | 46 ++++++- 2 files changed, 180 insertions(+), 54 deletions(-) diff --git a/sup3r/bias/bias_calc.py b/sup3r/bias/bias_calc.py index cb73a5691..d39c62cd7 100644 --- a/sup3r/bias/bias_calc.py +++ b/sup3r/bias/bias_calc.py @@ -19,6 +19,7 @@ from scipy.spatial import KDTree import sup3r.preprocessing.data_handling +from sup3r.preprocessing.data_handling.base import DataHandler from sup3r.utilities import VERSION_RECORD, ModuleName from sup3r.utilities.cli import BaseCLI from sup3r.utilities.utilities import nn_fill_array @@ -74,7 +75,10 @@ def __init__(self, full domain shape will be used. base_handler : str Name of rex resource handler or sup3r.preprocessing.data_handling - class to be retrieved from the rex/sup3r library. + class to be retrieved from the rex/sup3r library. If a + sup3r.preprocessing.data_handling class is used, all data will be + loaded in this class' initialization and the subsequent bias + calculation will be done in serial bias_handler : str Name of the bias data handler class to be retrieved from the sup3r.preprocessing.data_handling library. @@ -110,20 +114,30 @@ class to be retrieved from the rex/sup3r library. if isinstance(self.bias_fps, str): self.bias_fps = sorted(glob(self.bias_fps)) - base_sup3r_h = getattr(sup3r.preprocessing.data_handling, - base_handler, None) - base_rex_h = getattr(rex, base_handler, None) - msg = f'Could not retrieve "{base_handler}" from sup3r or rex!' 
- assert base_sup3r_h is not None or base_rex_h is not None, msg - self.base_handler = base_rex_h or base_sup3r_h + base_sup3r_handler = getattr(sup3r.preprocessing.data_handling, + base_handler, None) + base_rex_handler = getattr(rex, base_handler, None) + + if base_rex_handler is not None: + self.base_handler = base_rex_handler + self.base_dh = self.base_handler(self.base_fps[0], + **self.base_handler_kwargs) + elif base_sup3r_handler is not None: + self.base_handler = base_sup3r_handler + self.base_handler_kwargs['features'] = [self.base_dset] + self.base_dh = self.base_handler(self.base_fps, + **self.base_handler_kwargs) + msg = ('Base data handler opened with a sup3r DataHandler class ' + 'must load cached data!') + assert self.base_dh.data is not None, msg + else: + msg = f'Could not retrieve "{base_handler}" from sup3r or rex!' + logger.error(msg) + raise RuntimeError(msg) self.bias_handler = getattr(sup3r.preprocessing.data_handling, bias_handler) - - self.base_dh = self.base_handler(self.base_fps[0], - **self.base_handler_kwargs) self.base_meta = self.base_dh.meta - self.bias_dh = self.bias_handler(self.bias_fps, [self.bias_feature], target=self.target, shape=self.shape, @@ -363,8 +377,10 @@ def get_base_data(cls, base_dset, base_gid, base_handler, + base_handler_kwargs=None, daily_reduction='avg', - decimals=None): + decimals=None, + base_dh_inst=None): """Get data from the baseline data source, possibly for many high-res base gids corresponding to a single coarse low-res bias gid. @@ -380,7 +396,11 @@ def get_base_data(cls, One or more spatial gids to retrieve from base_fps. The data will be spatially averaged across all of these sites. base_handler : rex.Resource - A rex data handler similar to rex.Resource + A rex data handler similar to rex.Resource or sup3r.DataHandler + classes (if using the latter, must also input base_dh_inst) + base_handler_kwargs : dict | None + Optional kwargs to send to the initialization of the base_handler + class daily_reduction : None | str Option to do a reduction of the hourly+ source base data to daily data. Can be None (no reduction, keep source time frequency), "avg" @@ -390,10 +410,14 @@ def get_base_data(cls, decimals, this gets passed to np.around(). If decimals is negative, it specifies the number of positions to the left of the decimal point. + base_dh_inst : sup3r.DataHandler + Instantiated DataHandler class that has already loaded the base + data (required if base files are .nc and are not being opened by a + rex Resource handler). Returns ------- - out : np.ndarray + out_data : np.ndarray 1D array of base data spatially averaged across the base_gid input and possibly daily-averaged or min/max'd as well. out_ti : pd.DatetimeIndex @@ -401,37 +425,82 @@ def get_base_data(cls, output data. 
""" - out = [] + out_data = [] out_ti = [] - for fp in base_fps: - with base_handler(fp) as res: - base_ti = res.time_index - - base_data, base_cs_ghi = cls._read_base_data( - res, base_dset, base_gid) - if daily_reduction is not None: - base_data = cls._reduce_base_data( - base_ti, - base_data, - base_cs_ghi, - base_dset, - daily_reduction, - ) - base_ti = np.array(sorted(set(base_ti.date))) + all_cs_ghi = [] + base_handler_kwargs = base_handler_kwargs or {} - out.append(base_data) - out_ti.append(base_ti) + if issubclass(base_handler, DataHandler) and base_dh_inst is None: + msg = ('The method `get_base_data()` is only to be used with ' + '`base_handler` as a `sup3r.DataHandler` subclass if ' + '`base_dh_inst` is also provided!') + logger.error(msg) + raise RuntimeError(msg) - out = np.hstack(out) + if issubclass(base_handler, DataHandler) and base_dh_inst is not None: + all_cs_ghi = [None] + out_ti = base_dh_inst.time_index + out_data = cls._read_base_sup3r_data(base_dh_inst, base_dset, + base_gid) + else: + for fp in base_fps: + with base_handler(fp, **base_handler_kwargs) as res: + base_ti = res.time_index + temp_out = cls._read_base_rex_data(res, base_dset, + base_gid) + base_data, base_cs_ghi = temp_out + + out_data.append(base_data) + out_ti.append(base_ti) + all_cs_ghi.append(base_cs_ghi) + + out_data = np.hstack(out_data) + out_ti = pd.DatetimeIndex(np.hstack(out_ti)) + all_cs_ghi = np.hstack(all_cs_ghi) + + if daily_reduction is not None: + out_data, out_ti = cls._reduce_base_data(out_ti, + out_data, + all_cs_ghi, + base_dset, + daily_reduction) if decimals is not None: - out = np.around(out, decimals=decimals) + out_data = np.around(out_data, decimals=decimals) + + return out_data, out_ti + + @staticmethod + def _read_base_sup3r_data(dh, base_dset, base_gid): + """Read baseline data from a sup3r DataHandler - return out, pd.DatetimeIndex(np.hstack(out_ti)) + Parameters + ---------- + dh : sup3r.DataHandler + sup3r DataHandler that is an open file handler of the base file(s) + base_dset : str + A single dataset from the base_fps to retrieve. + base_gid : int | np.ndarray + One or more spatial gids to retrieve from base_fps. The data will + be spatially averaged across all of these sites. + + Returns + ------- + base_data : np.ndarray + 1D array of base data spatially averaged across the base_gid input + """ + idf = dh.features.index(base_dset) + gid_raster = np.arange(len(dh.meta)) + gid_raster = gid_raster.reshape(dh.shape[:2]) + idy, idx = np.where(np.isin(gid_raster, base_gid)) + base_data = dh.data[idy, idx, :, idf] + assert base_data.shape[0] == len(base_gid) + assert base_data.shape[1] == len(dh.time_index) + return base_data.mean(axis=0) @staticmethod - def _read_base_data(res, base_dset, base_gid): - """Read baseline data from the resource handler with extra logic for + def _read_base_rex_data(res, base_dset, base_gid): + """Read baseline data from a rex resource handler with extra logic for special datasets (e.g. u/v wind components or clearsky_ratio) Parameters @@ -454,6 +523,9 @@ def _read_base_data(res, base_dset, base_gid): this base_cs_ghi is clearsky GHI. Otherwise this is None """ + msg = '`res` input must not be a `DataHandler` subclass!' 
+ assert not issubclass(res.__class__, DataHandler), msg + base_cs_ghi = None if base_dset.startswith(('U_', 'V_')): @@ -509,19 +581,19 @@ def _reduce_base_data(base_ti, base_data, base_cs_ghi, base_dset, base_data : np.ndarray 1D array of base data spatially averaged across the base_gid input and possibly daily-averaged or min/max'd as well. + daily_ti : pd.DatetimeIndex + Daily DatetimeIndex corresponding to the daily base_data """ if daily_reduction is None: return base_data - slices = [ - np.where(base_ti.date == date) - for date in sorted(set(base_ti.date)) - ] + daily_ti = pd.DatetimeIndex(sorted(set(base_ti.date))) + slices = [np.where(base_ti.date == date.date()) for date in daily_ti] if base_dset == 'clearsky_ratio' and daily_reduction.lower() == 'avg': - base_data = np.array( - [base_data[s0].sum() / base_cs_ghi[s0].sum() for s0 in slices]) + base_data = np.array([base_data[s0].sum() / base_cs_ghi[s0].sum() + for s0 in slices]) elif daily_reduction.lower() == 'avg': base_data = np.array([base_data[s0].mean() for s0 in slices]) @@ -532,7 +604,12 @@ def _reduce_base_data(base_ti, base_data, base_cs_ghi, base_dset, elif daily_reduction.lower() == 'min': base_data = np.array([base_data[s0].min() for s0 in slices]) - return base_data + msg = (f'Daily reduced base data shape {base_data.shape} does not ' + f'match daily time index shape {daily_ti.shape}, ' + 'something went wrong!') + assert base_data.shape == daily_ti.shape, msg + + return base_data, daily_ti class LinearCorrection(DataRetrievalBase): @@ -613,7 +690,8 @@ def _run_single(cls, base_handler, daily_reduction, bias_ti, - decimals): + decimals, + base_dh_inst=None): """Find the nominal scalar + adder combination to bias correct data at a single site""" @@ -622,7 +700,8 @@ def _run_single(cls, base_gid, base_handler, daily_reduction=daily_reduction, - decimals=decimals) + decimals=decimals, + base_dh_inst=base_dh_inst) out = cls.get_linear_correction(bias_data, base_data, bias_feature, base_dset) @@ -789,6 +868,11 @@ def run(self, self.bad_bias_gids = [] + # sup3r DataHandler opening base files will load all data in parallel + # during the init and should not be passed in parallel to workers + if isinstance(self.base_dh, DataHandler): + max_workers = 1 + if max_workers == 1: logger.debug('Running serial calculation.') for i, bias_gid in enumerate(self.bias_meta.index): @@ -809,6 +893,7 @@ def run(self, daily_reduction, self.bias_ti, self.decimals, + base_dh_inst=self.base_dh, ) for key, arr in single_out.items(): self.out[key][raster_loc] = arr @@ -883,7 +968,8 @@ def _run_single(cls, base_handler, daily_reduction, bias_ti, - decimals): + decimals, + base_dh_inst=None): """Find the nominal scalar + adder combination to bias correct data at a single site""" @@ -892,7 +978,8 @@ def _run_single(cls, base_gid, base_handler, daily_reduction=daily_reduction, - decimals=decimals) + decimals=decimals, + base_dh_inst=base_dh_inst) base_arr = np.full(cls.NT, np.nan, dtype=np.float32) out = {} @@ -1051,13 +1138,14 @@ def _run_skill_eval(cls, bias_data, base_data, bias_feature, base_dset): @classmethod def _run_single(cls, bias_data, base_fps, bias_feature, base_dset, base_gid, base_handler, daily_reduction, bias_ti, - decimals): + decimals, base_dh_inst=None): """Do a skill assessment at a single site""" base_data, base_ti = cls.get_base_data(base_fps, base_dset, base_gid, base_handler, daily_reduction=daily_reduction, - decimals=decimals) + decimals=decimals, + base_dh_inst=base_dh_inst) arr = np.full(cls.NT, np.nan, dtype=np.float32) 
out = {f'bias_{bias_feature}_mean_monthly': arr.copy(), diff --git a/tests/bias/test_bias_correction.py b/tests/bias/test_bias_correction.py index e5dbd9cf6..720d25194 100644 --- a/tests/bias/test_bias_correction.py +++ b/tests/bias/test_bias_correction.py @@ -300,11 +300,14 @@ def test_clearsky_ratio(): assert not np.isnan(out['clearsky_ratio_scalar']).any() assert not np.isnan(out['clearsky_ratio_adder']).any() - assert (out['base_clearsky_ratio_mean'] > 0.3).all() - assert (out['base_clearsky_ratio_mean'] < 1.0).all() + base_cs = out['base_clearsky_ratio_mean'] + bias_cs = out['bias_clearsky_ratio_mean'] - assert (out['bias_clearsky_ratio_mean'] > 0.3).all() - assert (out['bias_clearsky_ratio_mean'] < 1.0).all() + assert (base_cs > 0.3).all() + assert (base_cs < 1.0).all() + + assert (base_cs > 0.3).all() + assert (bias_cs < 1.0).all() def test_fwp_integration(): @@ -492,3 +495,38 @@ def test_skill_assessment(): ks = stats.ks_2samp(base_data - base_mean, bias_data - bias_mean) assert np.allclose(out['rsds_ks_stat'][iloc], ks.statistic) assert np.allclose(out['rsds_ks_p'][iloc], ks.pvalue) + + +def test_nc_base_file(): + """Test a base file being a .nc like ERA5""" + calc = SkillAssessment(FP_CC, FP_CC, 'rsds', 'rsds', + target=TARGET, shape=SHAPE, + distance_upper_bound=0.7, + base_handler='DataHandlerNCforCC', + bias_handler='DataHandlerNCforCC') + + # test a known in-bounds gid + bias_gid = 5 + dist, base_gid = calc.get_base_gid(bias_gid) + assert dist == 0 + assert (calc.nn_dist == 0).all() + + with pytest.raises(RuntimeError) as exc: + base_data, _ = calc.get_base_data(calc.base_fps, calc.base_dset, + base_gid, calc.base_handler, + daily_reduction='avg') + + good_err = 'only to be used with `base_handler` as a `sup3r.DataHandler` ' + assert good_err in str(exc.value) + + base_data, _ = calc.get_base_data(calc.base_fps, calc.base_dset, + base_gid, calc.base_handler, + daily_reduction='avg', + base_dh_inst=calc.base_dh) + + out = calc.run(fill_extend=True, max_workers=1) + + assert (out['rsds_scalar'] == 1).all() + assert (out['rsds_adder'] == 0).all() + assert np.allclose(out['base_rsds_mean'], out['bias_rsds_mean']) + assert np.allclose(out['base_rsds_std'], out['bias_rsds_std']) From 948637d570fb1ce25d1578cf0973d13f8b56781f Mon Sep 17 00:00:00 2001 From: grantbuster Date: Thu, 7 Dec 2023 12:18:07 -0700 Subject: [PATCH 03/15] refactor: topo extration class will now sub-grid aggregate high-res exo source data to low-res target grid instead of doing a k nearest neighbor --- sup3r/bias/bias_calc.py | 4 +- .../data_handling/exo_extraction.py | 148 ++++++++------- tests/bias/test_bias_correction.py | 5 +- tests/data_handling/test_utils_topo.py | 170 ++++++++++-------- 4 files changed, 174 insertions(+), 153 deletions(-) diff --git a/sup3r/bias/bias_calc.py b/sup3r/bias/bias_calc.py index d39c62cd7..e5647dee4 100644 --- a/sup3r/bias/bias_calc.py +++ b/sup3r/bias/bias_calc.py @@ -877,7 +877,7 @@ def run(self, logger.debug('Running serial calculation.') for i, bias_gid in enumerate(self.bias_meta.index): raster_loc = np.where(self.bias_gid_raster == bias_gid) - dist, base_gid = self.get_base_gid(bias_gid) + _, base_gid = self.get_base_gid(bias_gid) if not base_gid.any(): self.bad_bias_gids.append(bias_gid) @@ -907,7 +907,7 @@ def run(self, max_workers)) with ProcessPoolExecutor(max_workers=max_workers) as exe: futures = {} - for bias_gid, bias_row in self.bias_meta.iterrows(): + for bias_gid in self.bias_meta.index: raster_loc = np.where(self.bias_gid_raster == bias_gid) dist, base_gid = 
self.get_base_gid(bias_gid) diff --git a/sup3r/preprocessing/data_handling/exo_extraction.py b/sup3r/preprocessing/data_handling/exo_extraction.py index 4896b3384..52e30c785 100644 --- a/sup3r/preprocessing/data_handling/exo_extraction.py +++ b/sup3r/preprocessing/data_handling/exo_extraction.py @@ -5,7 +5,9 @@ import pickle import shutil from abc import ABC, abstractmethod +from warnings import warn +import pandas as pd import numpy as np from rex import Resource from rex.utilities.solar_position import SolarPosition @@ -15,7 +17,8 @@ from sup3r.postprocessing.file_handling import OutputHandler from sup3r.preprocessing.data_handling.h5_data_handling import DataHandlerH5 from sup3r.preprocessing.data_handling.nc_data_handling import DataHandlerNC -from sup3r.utilities.utilities import generate_random_string, get_source_type +from sup3r.utilities.utilities import (generate_random_string, get_source_type, + nn_fill_array) logger = logging.getLogger(__name__) @@ -32,7 +35,6 @@ def __init__(self, exo_source, s_enhance, t_enhance, - s_agg_factor, t_agg_factor, target=None, shape=None, @@ -43,6 +45,7 @@ def __init__(self, cache_data=True, cache_dir='./exo_cache/', ti_workers=1, + distance_upper_bound=None, res_kwargs=None): """Parameters ---------- @@ -68,13 +71,6 @@ def __init__(self, example, if getting sza data, file_paths has hourly data, and t_enhance is 4, this class will output a sza raster corresponding to the file_paths temporally enhanced 4x to 15 min - s_agg_factor : int - Factor by which to aggregate the exo_source data to the resolution - of the file_paths input enhanced by s_enhance. For example, if - getting topography data, file_paths have 100km data, and s_enhance - is 4 resulting in a desired resolution of ~25km and topo_source_h5 - has a resolution of 4km, the s_agg_factor should be 36 so that 6x6 - 4km cells are averaged to the ~25km enhanced grid. t_agg_factor : int Factor by which to aggregate the exo_source data to the resolution of the file_paths input enhanced by t_enhance. For example, if @@ -118,6 +114,10 @@ def __init__(self, parallel and then concatenated to get the full time index. If input files do not all have time indices or if there are few input files this should be set to one. + distance_upper_bound : float | None + Maximum distance to map high-resolution data from exo_source to the + low-resolution file_paths input. None (default) will calculate this + based on the median distance between points in exo_source res_kwargs : dict | None Dictionary of kwargs passed to lowest level resource handler. e.g. 
xr.open_dataset(file_paths, **res_kwargs) @@ -128,13 +128,13 @@ def __init__(self, self._exo_source = exo_source self._s_enhance = s_enhance self._t_enhance = t_enhance - self._s_agg_factor = s_agg_factor self._t_agg_factor = t_agg_factor self._tree = None self._hr_lat_lon = None self._source_lat_lon = None self._hr_time_index = None self._src_time_index = None + self._distance_upper_bound = None self.cache_data = cache_data self.cache_dir = cache_dir self.temporal_slice = temporal_slice @@ -179,8 +179,7 @@ def __init__(self, def source_data(self): """Get the 1D array of source data from the exo_source_h5""" - def get_cache_file(self, feature, s_enhance, t_enhance, s_agg_factor, - t_agg_factor): + def get_cache_file(self, feature, s_enhance, t_enhance, t_agg_factor): """Get cache file name Parameters @@ -193,9 +192,6 @@ def get_cache_file(self, feature, s_enhance, t_enhance, s_agg_factor, t_enhance : int Temporal enhancement for this exogeneous data step (cumulative for all model steps up to the current step). - s_agg_factor : int - Factor by which to aggregate the exo_source data to the spatial - resolution of the file_paths input enhanced by s_enhance. t_agg_factor : int Factor by which to aggregate the exo_source data to the temporal resolution of the file_paths input enhanced by t_enhance. @@ -210,7 +206,7 @@ def get_cache_file(self, feature, s_enhance, t_enhance, s_agg_factor, or self.temporal_slice.stop is None else self.temporal_slice.stop - self.temporal_slice.start) fn = f'exo_{feature}_{self.target}_{self.shape},{tsteps}' - fn += f'_sagg{s_agg_factor}_tagg{t_agg_factor}_{s_enhance}x_' + fn += f'_tagg{t_agg_factor}_{s_enhance}x_' fn += f'{t_enhance}x.pkl' fn = fn.replace('(', '').replace(')', '') fn = fn.replace('[', '').replace(']', '') @@ -233,14 +229,10 @@ def source_temporal_slice(self): @property def source_lat_lon(self): - """Get the 2D array (n, 2) of lat, lon data for the exo source""" - if self._source_lat_lon is None: - src_enhance = int(np.sqrt(self._s_agg_factor)) - src_shape = (self.hr_shape[0] * src_enhance, - self.hr_shape[1] * src_enhance) - self._source_lat_lon = OutputHandler.get_lat_lon( - self.lr_lat_lon, src_shape).reshape((-1, 2)) - return self._source_lat_lon + """Get the 2D array (n, 2) of lat, lon data from the exo_source_h5""" + with Resource(self._exo_source) as res: + source_lat_lon = res.lat_lon + return source_lat_lon @property def lr_shape(self): @@ -259,7 +251,7 @@ def hr_shape(self): def lr_lat_lon(self): """Lat lon grid for data in format (spatial_1, spatial_2, 2) Lat/Lon array with same ordering in last dimension. This corresponds to the raw - low-resolution meta data from the file_paths input. + meta data from the file_paths input. Returns ------- @@ -271,7 +263,7 @@ def lr_lat_lon(self): def hr_lat_lon(self): """Lat lon grid for data in format (spatial_1, spatial_2, 2) Lat/Lon array with same ordering in last dimension. This corresponds to the - enhanced high-res meta data from the file_paths input * s_enhance. + enhanced meta data from the file_paths input * s_enhance. 
Returns ------- @@ -308,22 +300,32 @@ def hr_time_index(self): self._hr_time_index = self.input_handler.time_index return self._hr_time_index + @property + def distance_upper_bound(self): + """Maximum distance (float) to map high-resolution data from exo_source + to the low-resolution file_paths input.""" + if self._distance_upper_bound is None: + diff = np.diff(self.source_lat_lon, axis=0) + diff = np.max(np.median(diff, axis=0)) + self._distance_upper_bound = diff + return self._distance_upper_bound + @property def tree(self): - """Get the KDTree built on the source lat lon data""" + """Get the KDTree built on the target lat lon data from the file_paths + input with s_enhance""" if self._tree is None: - self._tree = KDTree(self.source_lat_lon) + lat = self.hr_lat_lon[..., 0].flatten() + lon = self.hr_lat_lon[..., 1].flatten() + hr_meta = np.vstack((lat, lon)).T + self._tree = KDTree(hr_meta) return self._tree @property def nn(self): """Get the nearest neighbor indices""" - ll2 = np.vstack( - (self.hr_lat_lon[:, :, 0].flatten(), - self.hr_lat_lon[:, :, 1].flatten())).T - _, nn = self.tree.query(ll2, k=self._s_agg_factor) - if len(nn.shape) == 1: - nn = np.expand_dims(nn, 1) + _, nn = self.tree.query(self.source_lat_lon, k=1, + distance_upper_bound=self.distance_upper_bound) return nn @property @@ -335,7 +337,6 @@ def data(self): cache_fp = self.get_cache_file(feature=self.__class__.__name__, s_enhance=self._s_enhance, t_enhance=self._t_enhance, - s_agg_factor=self._s_agg_factor, t_agg_factor=self._t_agg_factor) tmp_fp = cache_fp + f'.{generate_random_string(10)}.tmp' if os.path.exists(cache_fp): @@ -355,27 +356,18 @@ def data(self): return data[..., np.newaxis] + @abstractmethod def get_data(self): """Get a raster of source values corresponding to the high-resolution grid (the file_paths input grid * s_enhance * - t_enhance). The shape is (lats, lons, temporal, 1) + t_enhance). The shape is (lats, lons, temporal) """ - nn = self.nn - hr_data = [] - for j in range(self._s_agg_factor): - out = self.source_data[nn[:, j], self.source_temporal_slice] - out = out.reshape(self.hr_shape) - hr_data.append(out[..., np.newaxis]) - hr_data = np.concatenate(hr_data, axis=-1).mean(axis=-1) - logger.info('Finished mapping raster from {}'.format(self._exo_source)) - return hr_data @classmethod def get_exo_raster(cls, file_paths, s_enhance, t_enhance, - s_agg_factor, t_agg_factor, exo_source=None, target=None, @@ -407,13 +399,6 @@ class will output a topography raster corresponding to the example, if getting sza data, file_paths has hourly data, and t_enhance is 4, this class will output a sza raster corresponding to the file_paths temporally enhanced 4x to 15 min - s_agg_factor : int - Factor by which to aggregate the exo_source data to the resolution - of the file_paths input enhanced by s_enhance. For example, if - getting topography data, file_paths have 100km data, and s_enhance - is 4 resulting in a desired resolution of ~25km and topo_source_h5 - has a resolution of 4km, the s_agg_factor should be 36 so that 6x6 - 4km cells are averaged to the ~25km enhanced grid. t_agg_factor : int Factor by which to aggregate the exo_source data to the resolution of the file_paths input enhanced by t_enhance. 
For example, if @@ -467,7 +452,6 @@ class will output a topography raster corresponding to the exo = cls(file_paths, s_enhance, t_enhance, - s_agg_factor, t_agg_factor, exo_source=exo_source, target=target, @@ -491,13 +475,6 @@ def source_data(self): elev = res.get_meta_arr('elevation') return elev[:, np.newaxis] - @property - def source_lat_lon(self): - """Get the 2D array (n, 2) of lat, lon data from the exo_source_h5""" - with Resource(self._exo_source) as res: - source_lat_lon = res.lat_lon - return source_lat_lon - @property def source_time_index(self): """Time index of the source exo data""" @@ -509,20 +486,42 @@ def source_time_index(self): def get_data(self): """Get a raster of source values corresponding to the high-resolution grid (the file_paths input grid * s_enhance * - t_enhance). The shape is (lats, lons, temporal, 1) + t_enhance). The shape is (lats, lons, 1) """ - nn = self.nn - hr_data = [] - for j in range(self._s_agg_factor): - out = self.source_data[nn[:, j]] - out = out.reshape((*self.hr_shape[:-1], -1)) - hr_data.append(out[..., np.newaxis]) - hr_data = np.concatenate(hr_data, axis=-1).mean(axis=-1) + + assert len(self.source_data.shape) == 2 + assert self.source_data.shape[1] == 1 + + df = pd.DataFrame({'topo': self.source_data.flatten(), + 'gid_target': self.nn}) + n_target = np.product(self.hr_shape[:-1]) + df = df[df['gid_target'] != n_target] + df = df.sort_values('gid_target') + df = df.groupby('gid_target').mean() + + missing = set(np.arange(n_target)) - set(df.index) + if any(missing): + msg = (f'{len(missing)} target pixels did not have unique ' + 'high-resolution source data to map from. If there are a ' + 'lot of target pixels missing source data this probably ' + 'means the source data is not high enough resolution. ' + 'Filling raster with NN.') + logger.warning(msg) + warn(msg) + temp_df = pd.DataFrame({'topo': np.nan}, index=sorted(missing)) + df = pd.concat((df, temp_df)).sort_index() + + hr_data = df['topo'].values.reshape(self.hr_shape[:-1]) + if np.isnan(hr_data).any(): + hr_data = nn_fill_array(hr_data) + + hr_data = np.expand_dims(hr_data, axis=-1) + logger.info('Finished mapping raster from {}'.format(self._exo_source)) + return hr_data - def get_cache_file(self, feature, s_enhance, t_enhance, s_agg_factor, - t_agg_factor): + def get_cache_file(self, feature, s_enhance, t_enhance, t_agg_factor): """Get cache file name. This uses a time independent naming convention. Parameters @@ -535,9 +534,6 @@ def get_cache_file(self, feature, s_enhance, t_enhance, s_agg_factor, t_enhance : int Temporal enhancement for this exogeneous data step (cumulative for all model steps up to the current step). - s_agg_factor : int - Factor by which to aggregate the exo_source data to the spatial - resolution of the file_paths input enhanced by s_enhance. t_agg_factor : int Factor by which to aggregate the exo_source data to the temporal resolution of the file_paths input enhanced by t_enhance. @@ -548,7 +544,7 @@ def get_cache_file(self, feature, s_enhance, t_enhance, s_agg_factor, Name of cache file """ fn = f'exo_{feature}_{self.target}_{self.shape}' - fn += f'_sagg{s_agg_factor}_tagg{t_agg_factor}_{s_enhance}x_' + fn += f'_tagg{t_agg_factor}_{s_enhance}x_' fn += f'{t_enhance}x.pkl' fn = fn.replace('(', '').replace(')', '') fn = fn.replace('[', '').replace(']', '') @@ -605,7 +601,7 @@ def source_data(self): def get_data(self): """Get a raster of source values corresponding to the high-resolution grid (the file_paths input grid * s_enhance * - t_enhance). 
The shape is (lats, lons, temporal, 1) + t_enhance). The shape is (lats, lons, temporal) """ hr_data = self.source_data.reshape(self.hr_shape) logger.info('Finished computing SZA data') diff --git a/tests/bias/test_bias_correction.py b/tests/bias/test_bias_correction.py index 720d25194..0b1c39a38 100644 --- a/tests/bias/test_bias_correction.py +++ b/tests/bias/test_bias_correction.py @@ -512,9 +512,8 @@ def test_nc_base_file(): assert (calc.nn_dist == 0).all() with pytest.raises(RuntimeError) as exc: - base_data, _ = calc.get_base_data(calc.base_fps, calc.base_dset, - base_gid, calc.base_handler, - daily_reduction='avg') + calc.get_base_data(calc.base_fps, calc.base_dset, base_gid, + calc.base_handler, daily_reduction='avg') good_err = 'only to be used with `base_handler` as a `sup3r.DataHandler` ' assert good_err in str(exc.value) diff --git a/tests/data_handling/test_utils_topo.py b/tests/data_handling/test_utils_topo.py index 294bede19..ed4a037c4 100644 --- a/tests/data_handling/test_utils_topo.py +++ b/tests/data_handling/test_utils_topo.py @@ -1,11 +1,16 @@ # -*- coding: utf-8 -*- """pytests for topography utilities""" import os +import shutil +import tempfile +import pandas as pd import matplotlib.pyplot as plt import numpy as np import pytest from scipy.spatial import KDTree +from rex import Resource +from rex import Outputs from sup3r import TEST_DATA_DIR from sup3r.preprocessing.data_handling.exo_extraction import ( @@ -21,83 +26,104 @@ WRF_SHAPE = (8, 8) -@pytest.mark.parametrize('agg_factor', [1, 4, 8]) -def test_topo_extraction_h5(agg_factor, plot=False): - """Test the spatial enhancement of a test grid and then the lookup of the - elevation data to a reference WTK file (also the same file for the test)""" - te = TopoExtractH5(FP_WTK, FP_WTK, s_enhance=2, t_enhance=1, - t_agg_factor=1, s_agg_factor=agg_factor, - target=TARGET, shape=SHAPE) - hr_elev = te.data - - tree = KDTree(te.source_lat_lon) - - # bottom left - _, i = tree.query(TARGET, k=agg_factor) - elev = te.source_data[i].mean() - assert np.allclose(elev, hr_elev[-1, 0]) +def make_dummy_h5_topo(td, N=100, offset=0.1): + """Make a dummy h5 file with high-res topo for testing""" + wtk = Resource(FP_WTK) - # top right - _, i = tree.query((39.35, -105.2), k=agg_factor) - elev = te.source_data[i].mean() - assert np.allclose(elev, hr_elev[0, 0]) + lat = np.linspace(wtk.meta['latitude'].min() - offset, + wtk.meta['latitude'].max() + offset, + N) + lon = np.linspace(wtk.meta['longitude'].min() - offset, + wtk.meta['longitude'].max() + offset, + N) + idy, idx = np.meshgrid(np.arange(len(lon)), np.arange(len(lat))) + lon, lat = np.meshgrid(lon, lat) + lon, lat = lon.flatten(), lat.flatten() + idy, idx = idy.flatten(), idx.flatten() + scale = 30 + elevation = np.sin(scale * np.deg2rad(idy) + scale * np.deg2rad(idx)) + meta = pd.DataFrame({'latitude': lat, 'longitude': lon, + 'elevation': elevation}) - for idy in range(10, 20): - for idx in range(10, 20): - lat, lon = te.hr_lat_lon[idy, idx, :] - _, i = tree.query((lat, lon), k=agg_factor) - elev = te.source_data[i].mean() - assert np.allclose(elev, hr_elev[idy, idx]) + fp_temp = os.path.join(td, 'elevation.h5') + with Outputs(fp_temp, mode='w') as out: + out.meta = meta - if plot: - a = plt.scatter(te.source_lat_lon[:, 1], te.source_lat_lon[:, 0], - c=te.source_data, marker='s', s=150) - plt.colorbar(a) - plt.savefig('./source_elevation.png') - plt.close() + return fp_temp - a = plt.imshow(hr_elev) - plt.colorbar(a) - plt.savefig('./hr_elev.png') - plt.close() - 
-@pytest.mark.parametrize('agg_factor', [1, 4, 8]) -def test_topo_extraction_nc(agg_factor, plot=False): +@pytest.mark.parametrize('s_enhance', [1, 2]) +def test_topo_extraction_h5(s_enhance, plot=False): + """Test the spatial enhancement of a test grid and then the lookup of the + elevation data to a reference WTK file (also the same file for the test)""" + with tempfile.TemporaryDirectory() as td: + fp_exo_topo = make_dummy_h5_topo(td) + + te = TopoExtractH5(FP_WTK, fp_exo_topo, s_enhance=s_enhance, + t_enhance=1, t_agg_factor=1, + target=TARGET, shape=SHAPE) + + hr_elev = te.data + + lat = te.hr_lat_lon[..., 0].flatten() + lon = te.hr_lat_lon[..., 1].flatten() + hr_wtk_meta = np.vstack((lat, lon)).T + hr_wtk_ind = np.arange(len(lat)).reshape(te.hr_shape[:-1]) + assert te.nn.max() == len(hr_wtk_meta) + + for gid in np.random.choice(len(hr_wtk_meta), 50, replace=False): + idy, idx = np.where(hr_wtk_ind == gid) + iloc = np.where(te.nn == gid)[0] + exo_coords = te.source_lat_lon[iloc] + + # make sure all mapped high-res exo coordinates are closest to gid + for i in range(len(exo_coords)): + dist = hr_wtk_meta - exo_coords[i] + dist = np.hypot(dist[:, 0], dist[:, 1]) + assert np.argmin(dist) == gid + + # make sure the mean elevation makes sense + test_out = hr_elev[idy, idx, 0, 0] + true_out = te.source_data[iloc].mean() + assert np.allclose(test_out, true_out) + + shutil.rmtree('./exo_cache/') + + if plot: + a = plt.scatter(te.source_lat_lon[:, 1], te.source_lat_lon[:, 0], + c=te.source_data, marker='s', s=5) + plt.colorbar(a) + plt.savefig(f'./source_elevation_{s_enhance}.png') + plt.close() + + a = plt.imshow(hr_elev[:, :, 0, 0]) + plt.colorbar(a) + plt.savefig(f'./hr_elev_{s_enhance}.png') + plt.close() + + +def test_bad_s_enhance(s_enhance=10): + """Test a large s_enhance factor that results in a bad mapping with + enhanced grid pixels not having source exo data points""" + with tempfile.TemporaryDirectory() as td: + fp_exo_topo = make_dummy_h5_topo(td) + + with pytest.warns(UserWarning) as warnings: + te = TopoExtractH5(FP_WTK, fp_exo_topo, s_enhance=s_enhance, + t_enhance=1, t_agg_factor=1, + target=TARGET, shape=SHAPE, + cache_data=False) + _ = te.data + + good = ['target pixels did not have unique' in str(w.message) + for w in warnings.list] + assert any(good) + + +def test_topo_extraction_nc(): """Test the spatial enhancement of a test grid and then the lookup of the elevation data to a reference WRF file (also the same file for the test)""" - te = TopoExtractNC(FP_WRF, FP_WRF, s_enhance=2, t_enhance=1, - s_agg_factor=agg_factor, t_agg_factor=1, - target=WRF_TARGET, shape=WRF_SHAPE) + te = TopoExtractNC(FP_WRF, FP_WRF, s_enhance=1, t_enhance=1, + t_agg_factor=1, target=None, shape=None) hr_elev = te.data - - tree = KDTree(te.source_lat_lon) - - # bottom left - _, i = tree.query(WRF_TARGET, k=agg_factor) - elev = te.source_data[i].mean() - assert np.allclose(elev, hr_elev[-1, 0]) - - # top right - _, i = tree.query((19.4, -123.6), k=agg_factor) - elev = te.source_data[i].mean() - assert np.allclose(elev, hr_elev[0, 0]) - - for idy in range(4, 8): - for idx in range(4, 8): - lat, lon = te.hr_lat_lon[idy, idx, :] - _, i = tree.query((lat, lon), k=agg_factor) - elev = te.source_data[i].mean() - assert np.allclose(elev, hr_elev[idy, idx]) - - if plot: - a = plt.scatter(te.source_lat_lon[:, 1], te.source_lat_lon[:, 0], - c=te.source_data, marker='s', s=150) - plt.colorbar(a) - plt.savefig('./source_elevation.png') - plt.close() - - a = plt.imshow(hr_elev) - plt.colorbar(a) - 
plt.savefig('./hr_elev.png') - plt.close() + assert np.allclose(te.source_data.flatten(), hr_elev.flatten()) From b3b8009235d487bd7f4c1b70732516ad97deb358 Mon Sep 17 00:00:00 2001 From: grantbuster Date: Thu, 7 Dec 2023 14:51:59 -0700 Subject: [PATCH 04/15] parse kwargs to the exo data handler to keep possible s_agg_factor feature for new exo handlers --- .../data_handling/exogenous_data_handling.py | 34 +++++++++++-------- 1 file changed, 19 insertions(+), 15 deletions(-) diff --git a/sup3r/preprocessing/data_handling/exogenous_data_handling.py b/sup3r/preprocessing/data_handling/exogenous_data_handling.py index 3149887e4..50cf0ecdd 100644 --- a/sup3r/preprocessing/data_handling/exogenous_data_handling.py +++ b/sup3r/preprocessing/data_handling/exogenous_data_handling.py @@ -1,4 +1,5 @@ """Sup3r exogenous data handling""" +from inspect import signature import logging import re from typing import ClassVar @@ -573,21 +574,24 @@ def get_exo_data(self, feature, s_enhance, t_enhance, s_agg_factor, exo_handler = self.get_exo_handler(feature, self.source_file, self.exo_handler) - data = exo_handler(self.file_paths, - self.source_file, - s_enhance=s_enhance, - t_enhance=t_enhance, - s_agg_factor=s_agg_factor, - t_agg_factor=t_agg_factor, - target=self.target, - shape=self.shape, - temporal_slice=self.temporal_slice, - raster_file=self.raster_file, - max_delta=self.max_delta, - input_handler=self.input_handler, - cache_data=self.cache_data, - cache_dir=self.cache_dir, - res_kwargs=self.res_kwargs).data + kwargs = dict(file_paths=self.file_paths, + exo_source=self.source_file, + s_enhance=s_enhance, + t_enhance=t_enhance, + s_agg_factor=s_agg_factor, + t_agg_factor=t_agg_factor, + target=self.target, + shape=self.shape, + temporal_slice=self.temporal_slice, + raster_file=self.raster_file, + max_delta=self.max_delta, + input_handler=self.input_handler, + cache_data=self.cache_data, + cache_dir=self.cache_dir, + res_kwargs=self.res_kwargs) + sig = signature(exo_handler) + kwargs = {k: v for k, v in kwargs.items() if k in sig.parameters} + data = exo_handler(**kwargs).data return data @classmethod From 0106b8f79c765d807a77d384247f9f74bdf096e8 Mon Sep 17 00:00:00 2001 From: grantbuster Date: Thu, 7 Dec 2023 14:55:32 -0700 Subject: [PATCH 05/15] fix distance upper bound setting --- sup3r/bias/bias_calc.py | 2 +- sup3r/preprocessing/data_handling/exo_extraction.py | 2 +- tests/bias/test_bias_correction.py | 9 +++++---- tests/data_handling/test_utils_topo.py | 2 +- 4 files changed, 8 insertions(+), 7 deletions(-) diff --git a/sup3r/bias/bias_calc.py b/sup3r/bias/bias_calc.py index e5647dee4..2780aacf4 100644 --- a/sup3r/bias/bias_calc.py +++ b/sup3r/bias/bias_calc.py @@ -909,7 +909,7 @@ def run(self, futures = {} for bias_gid in self.bias_meta.index: raster_loc = np.where(self.bias_gid_raster == bias_gid) - dist, base_gid = self.get_base_gid(bias_gid) + _, base_gid = self.get_base_gid(bias_gid) if not base_gid.any(): self.bad_bias_gids.append(bias_gid) diff --git a/sup3r/preprocessing/data_handling/exo_extraction.py b/sup3r/preprocessing/data_handling/exo_extraction.py index 52e30c785..1e5970f4d 100644 --- a/sup3r/preprocessing/data_handling/exo_extraction.py +++ b/sup3r/preprocessing/data_handling/exo_extraction.py @@ -134,7 +134,7 @@ def __init__(self, self._source_lat_lon = None self._hr_time_index = None self._src_time_index = None - self._distance_upper_bound = None + self._distance_upper_bound = distance_upper_bound self.cache_data = cache_data self.cache_dir = cache_dir self.temporal_slice = 
temporal_slice diff --git a/tests/bias/test_bias_correction.py b/tests/bias/test_bias_correction.py index 0b1c39a38..e0487ee9a 100644 --- a/tests/bias/test_bias_correction.py +++ b/tests/bias/test_bias_correction.py @@ -518,10 +518,11 @@ def test_nc_base_file(): good_err = 'only to be used with `base_handler` as a `sup3r.DataHandler` ' assert good_err in str(exc.value) - base_data, _ = calc.get_base_data(calc.base_fps, calc.base_dset, - base_gid, calc.base_handler, - daily_reduction='avg', - base_dh_inst=calc.base_dh) + # make sure this doesnt raise error now that calc.base_dh is provided + calc.get_base_data(calc.base_fps, calc.base_dset, + base_gid, calc.base_handler, + daily_reduction='avg', + base_dh_inst=calc.base_dh) out = calc.run(fill_extend=True, max_workers=1) diff --git a/tests/data_handling/test_utils_topo.py b/tests/data_handling/test_utils_topo.py index ed4a037c4..f7b2e8158 100644 --- a/tests/data_handling/test_utils_topo.py +++ b/tests/data_handling/test_utils_topo.py @@ -8,7 +8,6 @@ import matplotlib.pyplot as plt import numpy as np import pytest -from scipy.spatial import KDTree from rex import Resource from rex import Outputs @@ -77,6 +76,7 @@ def test_topo_extraction_h5(s_enhance, plot=False): exo_coords = te.source_lat_lon[iloc] # make sure all mapped high-res exo coordinates are closest to gid + # pylint: disable=consider-using-enumerate for i in range(len(exo_coords)): dist = hr_wtk_meta - exo_coords[i] dist = np.hypot(dist[:, 0], dist[:, 1]) From fe030c2b303b611853af447f96400fb2fe654993 Mon Sep 17 00:00:00 2001 From: grantbuster Date: Thu, 7 Dec 2023 16:51:55 -0700 Subject: [PATCH 06/15] fixed exo data handling test --- tests/data_handling/test_exo_data_handling.py | 5 ++- tests/data_handling/test_utils_topo.py | 45 ++++++++++++++----- 2 files changed, 37 insertions(+), 13 deletions(-) diff --git a/tests/data_handling/test_exo_data_handling.py b/tests/data_handling/test_exo_data_handling.py index 601d547b4..4caefd16d 100644 --- a/tests/data_handling/test_exo_data_handling.py +++ b/tests/data_handling/test_exo_data_handling.py @@ -9,6 +9,8 @@ from sup3r import TEST_DATA_DIR from sup3r.preprocessing.data_handling import ExogenousDataHandler +from test_utils_topo import make_topo_file + FP_WTK = os.path.join(TEST_DATA_DIR, 'test_wtk_co_2012.h5') FILE_PATHS = [os.path.join(TEST_DATA_DIR, 'ua_test.nc'), @@ -37,8 +39,9 @@ def test_exo_cache(feature): 'combine_type': 'input', 'model': 0}) with TemporaryDirectory() as td: + fp_topo = make_topo_file(FILE_PATHS[0], td) base = ExogenousDataHandler(FILE_PATHS, feature, - source_file=FP_WTK, + source_file=fp_topo, steps=steps, target=TARGET, shape=SHAPE, input_handler='DataHandlerNCforCC', diff --git a/tests/data_handling/test_utils_topo.py b/tests/data_handling/test_utils_topo.py index f7b2e8158..eee1b54a9 100644 --- a/tests/data_handling/test_utils_topo.py +++ b/tests/data_handling/test_utils_topo.py @@ -25,16 +25,33 @@ WRF_SHAPE = (8, 8) -def make_dummy_h5_topo(td, N=100, offset=0.1): +def get_lat_lon_range_h5(fp): + """Get the min/max lat/lon from an h5 file""" + with Resource(fp) as wtk: + lat_range = (wtk.meta['latitude'].min(), wtk.meta['latitude'].max()) + lon_range = (wtk.meta['longitude'].min(), wtk.meta['longitude'].max()) + return lat_range, lon_range + + +def get_lat_lon_range_nc(fp): + """Get the min/max lat/lon from a netcdf file""" + import xarray as xr + dset = xr.open_dataset(fp) + lat_range = (dset['lat'].values.min(), dset['lat'].values.max()) + lon_range = (dset['lon'].values.min(), dset['lon'].values.max()) 
+ return lat_range, lon_range + + +def make_topo_file(fp, td, N=100, offset=0.1): """Make a dummy h5 file with high-res topo for testing""" - wtk = Resource(FP_WTK) - - lat = np.linspace(wtk.meta['latitude'].min() - offset, - wtk.meta['latitude'].max() + offset, - N) - lon = np.linspace(wtk.meta['longitude'].min() - offset, - wtk.meta['longitude'].max() + offset, - N) + + if fp.endswith('.h5'): + lat_range, lon_range = get_lat_lon_range_h5(fp) + else: + lat_range, lon_range = get_lat_lon_range_nc(fp) + + lat = np.linspace(lat_range[0] - offset, lat_range[1] + offset, N) + lon = np.linspace(lon_range[0] - offset, lon_range[1] + offset, N) idy, idx = np.meshgrid(np.arange(len(lon)), np.arange(len(lat))) lon, lat = np.meshgrid(lon, lat) lon, lat = lon.flatten(), lat.flatten() @@ -56,7 +73,7 @@ def test_topo_extraction_h5(s_enhance, plot=False): """Test the spatial enhancement of a test grid and then the lookup of the elevation data to a reference WTK file (also the same file for the test)""" with tempfile.TemporaryDirectory() as td: - fp_exo_topo = make_dummy_h5_topo(td) + fp_exo_topo = make_topo_file(FP_WTK, td) te = TopoExtractH5(FP_WTK, fp_exo_topo, s_enhance=s_enhance, t_enhance=1, t_agg_factor=1, @@ -106,7 +123,7 @@ def test_bad_s_enhance(s_enhance=10): """Test a large s_enhance factor that results in a bad mapping with enhanced grid pixels not having source exo data points""" with tempfile.TemporaryDirectory() as td: - fp_exo_topo = make_dummy_h5_topo(td) + fp_exo_topo = make_topo_file(FP_WTK, td) with pytest.warns(UserWarning) as warnings: te = TopoExtractH5(FP_WTK, fp_exo_topo, s_enhance=s_enhance, @@ -122,7 +139,11 @@ def test_bad_s_enhance(s_enhance=10): def test_topo_extraction_nc(): """Test the spatial enhancement of a test grid and then the lookup of the - elevation data to a reference WRF file (also the same file for the test)""" + elevation data to a reference WRF file (also the same file for the test) + + We already test proper topo mapping and aggregation in the h5 test so this + just makes sure that the data can be extracted from a WRF file. + """ te = TopoExtractNC(FP_WRF, FP_WRF, s_enhance=1, t_enhance=1, t_agg_factor=1, target=None, shape=None) hr_elev = te.data From 73912b4155ead858f8be4131a49f365e91be6027 Mon Sep 17 00:00:00 2001 From: grantbuster Date: Fri, 8 Dec 2023 09:18:32 -0700 Subject: [PATCH 07/15] bias calc to calculate distance threshold by default --- sup3r/bias/bias_calc.py | 26 ++++++++++++++----- .../data_handling/exo_extraction.py | 26 ++++++++++++------- .../data_handling/exogenous_data_handling.py | 10 ++++--- tests/data_handling/test_utils_topo.py | 2 +- tests/forward_pass/test_forward_pass_exo.py | 5 ++++ 5 files changed, 50 insertions(+), 19 deletions(-) diff --git a/sup3r/bias/bias_calc.py b/sup3r/bias/bias_calc.py index 2780aacf4..8498afe43 100644 --- a/sup3r/bias/bias_calc.py +++ b/sup3r/bias/bias_calc.py @@ -37,7 +37,7 @@ def __init__(self, bias_fps, base_dset, bias_feature, - distance_upper_bound, + distance_upper_bound=None, target=None, shape=None, base_handler='Resource', @@ -66,7 +66,8 @@ def __init__(self, distance_upper_bound : float Upper bound on the nearest neighbor distance in decimal degrees. This should be the approximate resolution of the low-resolution - bias data. + bias data. None (default) will calculate this based on the median + distance between points in bias_fps target : tuple (lat, lon) lower left corner of raster to retrieve from bias_fps. If None then the lower left corner of the full domain will be used. 
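
The distance_upper_bound passed to the KDTree query here is what lets base pixels with no
sufficiently close bias pixel be flagged (and later collected in bad_bias_gids) rather than
silently mapped to a far-away neighbor. A minimal, self-contained sketch of that scipy
behavior, using synthetic coordinates that are not part of this patch:

    import numpy as np
    from scipy.spatial import KDTree

    # three low-res "bias" points and four high-res "base" points, one of
    # which is far away from every bias point
    bias_coords = np.array([[40.0, -105.0], [40.0, -105.5], [40.0, -106.0]])
    base_coords = np.array([[40.01, -105.02], [39.99, -105.51],
                            [40.02, -105.98], [45.00, -100.00]])

    tree = KDTree(bias_coords)
    dist, ind = tree.query(base_coords, k=1, distance_upper_bound=0.5)

    # points farther than the bound come back with dist == inf and an index
    # equal to len(bias_coords), so they are easy to exclude downstream
    assert np.isinf(dist[-1]) and ind[-1] == len(bias_coords)
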
@@ -108,6 +109,7 @@ class to be retrieved from the rex/sup3r library. If a self.base_handler_kwargs = base_handler_kwargs or {} self.bias_handler_kwargs = bias_handler_kwargs or {} self.bad_bias_gids = [] + self._distance_upper_bound = distance_upper_bound if isinstance(self.base_fps, str): self.base_fps = sorted(glob(self.base_fps)) @@ -153,10 +155,9 @@ class to be retrieved from the rex/sup3r library. If a self.bias_gid_raster = np.arange(lats.size) self.bias_gid_raster = self.bias_gid_raster.reshape(raster_shape) - out = self.bias_tree.query(self.base_meta[['latitude', 'longitude']], - k=1, - distance_upper_bound=distance_upper_bound) - self.nn_dist, self.nn_ind = out + self.nn_dist, self.nn_ind = self.bias_tree.query( + self.base_meta[['latitude', 'longitude']], k=1, + distance_upper_bound=self.distance_upper_bound) self.out = None self._init_out() @@ -180,6 +181,19 @@ def meta(self): 'version_record': VERSION_RECORD} return meta + @property + def distance_upper_bound(self): + """Maximum distance (float) to map high-resolution data from exo_source + to the low-resolution file_paths input.""" + if self._distance_upper_bound is None: + diff = np.diff(self.bias_meta[['latitude', 'longitude']].values, + axis=0) + diff = np.max(np.median(diff, axis=0)) + self._distance_upper_bound = diff + logger.info('Set distance upper bound to {:.4f}' + .format(self._distance_upper_bound)) + return self._distance_upper_bound + @staticmethod def compare_dists(base_data, bias_data, adder=0, scalar=1): """Compare two distributions using the two-sample Kolmogorov-Smirnov. diff --git a/sup3r/preprocessing/data_handling/exo_extraction.py b/sup3r/preprocessing/data_handling/exo_extraction.py index 1e5970f4d..b0700c017 100644 --- a/sup3r/preprocessing/data_handling/exo_extraction.py +++ b/sup3r/preprocessing/data_handling/exo_extraction.py @@ -56,9 +56,14 @@ def __init__(self, typically low-res WRF output or GCM netcdf data files that is source low-resolution data intended to be sup3r resolved. exo_source : str - Filepath to source wtk or nsrdb file to get hi-res (2km or 4km) - elevation data from which will be mapped to the enhanced grid of - the file_paths input + Filepath to source data file to get hi-res elevation data from + which will be mapped to the enhanced grid of the file_paths input. + Pixels from this exo_source will be mapped to their nearest low-res + pixel in the file_paths input. Accordingly, exo_source should be a + significantly higher resolution than file_paths. Warnings will be + raised if the low-resolution pixels in file_paths do not have + unique nearest pixels from exo_source. File format can be .h5 for + TopoExtractH5 or .nc for TopoExtractNC s_enhance : int Factor by which the Sup3rGan model will enhance the spatial dimensions of low resolution data from file_paths input. For @@ -72,12 +77,13 @@ def __init__(self, t_enhance is 4, this class will output a sza raster corresponding to the file_paths temporally enhanced 4x to 15 min t_agg_factor : int - Factor by which to aggregate the exo_source data to the resolution - of the file_paths input enhanced by t_enhance. For example, if - getting sza data, file_paths have hourly data, and t_enhance - is 4 resulting in a desired resolution of 5 min and exo_source - has a resolution of 5 min, the t_agg_factor should be 4 so that - every fourth timestep in the exo_source data is skipped. + Factor by which to aggregate / subsample the exo_source data to the + resolution of the file_paths input enhanced by t_enhance. 
For + example, if getting sza data, file_paths have hourly data, and + t_enhance is 4 resulting in a target resolution of 15 min and + exo_source has a resolution of 5 min, the t_agg_factor should be 3 + so that only timesteps that are a multiple of 15min are selected + e.g., [0, 5, 10, 15, 20, 25, 30][slice(0, None, 3)] = [0, 15, 30] target : tuple (lat, lon) lower left corner of raster. Either need target+shape or raster_file. @@ -308,6 +314,8 @@ def distance_upper_bound(self): diff = np.diff(self.source_lat_lon, axis=0) diff = np.max(np.median(diff, axis=0)) self._distance_upper_bound = diff + logger.info('Set distance upper bound to {:.4f}' + .format(self._distance_upper_bound)) return self._distance_upper_bound @property diff --git a/sup3r/preprocessing/data_handling/exogenous_data_handling.py b/sup3r/preprocessing/data_handling/exogenous_data_handling.py index 50cf0ecdd..0c5519dd7 100644 --- a/sup3r/preprocessing/data_handling/exogenous_data_handling.py +++ b/sup3r/preprocessing/data_handling/exogenous_data_handling.py @@ -245,9 +245,13 @@ def __init__(self, source. e.g. {'spatial': '4km', 'temporal': '60min'}. This is used only if agg factors are not provided in the steps list. source_file : str - Filepath to source wtk, nsrdb, or netcdf file to get hi-res (2km or - 4km) data from which will be mapped to the enhanced grid of the - file_paths input + Filepath to source wtk, nsrdb, or netcdf file to get hi-res data + from which will be mapped to the enhanced grid of the file_paths + input. Pixels from this file will be mapped to their nearest + low-res pixel in the file_paths input. Accordingly, the input + should be a significantly higher resolution than file_paths. + Warnings will be raised if the low-resolution pixels in file_paths + do not have unique nearest pixels from this exo source data. target : tuple (lat, lon) lower left corner of raster. Either need target+shape or raster_file. 
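
This nearest-pixel mapping followed by averaging is the sub-pixel aggregation the patch
series moves to. A rough, self-contained sketch of the idea on synthetic grids (all names
here are illustrative and not part of the sup3r API):

    import numpy as np
    from scipy.spatial import KDTree

    # synthetic 3x3 low-res grid and 12x12 high-res "exo" grid over the same box
    lr_lat, lr_lon = np.meshgrid(np.linspace(40, 39, 3),
                                 np.linspace(-106, -105, 3), indexing='ij')
    hr_lat, hr_lon = np.meshgrid(np.linspace(40, 39, 12),
                                 np.linspace(-106, -105, 12), indexing='ij')
    hr_elev = np.random.uniform(1000, 3000, hr_lat.size)  # fake high-res elevation

    lr_coords = np.column_stack((lr_lat.ravel(), lr_lon.ravel()))
    hr_coords = np.column_stack((hr_lat.ravel(), hr_lon.ravel()))

    # assign every high-res source pixel to its nearest low-res pixel
    _, nn = KDTree(lr_coords).query(hr_coords, k=1)

    # aggregate: mean of all high-res pixels mapped to each low-res pixel
    agg_elev = np.array([hr_elev[nn == gid].mean()
                         for gid in range(len(lr_coords))])
    assert agg_elev.shape == (len(lr_coords),)
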
diff --git a/tests/data_handling/test_utils_topo.py b/tests/data_handling/test_utils_topo.py index eee1b54a9..01826a754 100644 --- a/tests/data_handling/test_utils_topo.py +++ b/tests/data_handling/test_utils_topo.py @@ -104,7 +104,7 @@ def test_topo_extraction_h5(s_enhance, plot=False): true_out = te.source_data[iloc].mean() assert np.allclose(test_out, true_out) - shutil.rmtree('./exo_cache/') + shutil.rmtree('./exo_cache/', ignore_errors=True) if plot: a = plt.scatter(te.source_lat_lon[:, 1], te.source_lat_lon[:, 0], diff --git a/tests/forward_pass/test_forward_pass_exo.py b/tests/forward_pass/test_forward_pass_exo.py index c6bfd4d32..747eeae36 100644 --- a/tests/forward_pass/test_forward_pass_exo.py +++ b/tests/forward_pass/test_forward_pass_exo.py @@ -2,6 +2,7 @@ """pytests for data handling""" import json import os +import shutil import tempfile import matplotlib.pyplot as plt @@ -963,6 +964,8 @@ def test_fwp_multi_step_model_multi_exo(): 'U_100m', 'V_100m', 'topography' ] + shutil.rmtree('./exo_cache', ignore_errors=True) + def test_fwp_multi_step_exo_hi_res_topo_and_sza(): """Test the forward pass with multiple ExoGan models requiring @@ -1197,3 +1200,5 @@ def test_fwp_multi_step_exo_hi_res_topo_and_sza(): for fp in handler.out_files: assert os.path.exists(fp) + + shutil.rmtree('./exo_cache', ignore_errors=True) From 9888f05a1e3f7090fd0e2c52e52e64016bb47ca0 Mon Sep 17 00:00:00 2001 From: grantbuster Date: Mon, 11 Dec 2023 11:48:41 -0700 Subject: [PATCH 08/15] moved special imports into methods --- sup3r/bias/__init__.py | 3 ++- sup3r/bias/bias_correct_means.py | 6 ++++- sup3r/utilities/era_downloader.py | 37 ++++++++++++++++++------------- sup3r/utilities/plotting.py | 7 +++++- sup3r/utilities/utilities.py | 4 ++-- 5 files changed, 37 insertions(+), 20 deletions(-) diff --git a/sup3r/bias/__init__.py b/sup3r/bias/__init__.py index e1f477055..944c9f864 100644 --- a/sup3r/bias/__init__.py +++ b/sup3r/bias/__init__.py @@ -2,4 +2,5 @@ """Bias calculation and correction modules.""" from .bias_transforms import (global_linear_bc, local_linear_bc, monthly_local_linear_bc) -from .bias_calc import LinearCorrection, MonthlyLinearCorrection +from .bias_calc import (LinearCorrection, MonthlyLinearCorrection, + SkillAssessment) diff --git a/sup3r/bias/bias_correct_means.py b/sup3r/bias/bias_correct_means.py index 7e63858cf..bd9428aff 100644 --- a/sup3r/bias/bias_correct_means.py +++ b/sup3r/bias/bias_correct_means.py @@ -13,7 +13,6 @@ import numpy as np import pandas as pd -import rioxarray import xarray as xr from rex import Resource from scipy.interpolate import interp1d @@ -116,6 +115,11 @@ def convert_month_height_tif(self, month, height): os.remove(outfile) if not os.path.exists(outfile) or self.overwrite: + try: + import rioxarray + except ImportError as e: + msg = 'Need special installation of "rioxarray" to run this!' + raise ImportError(msg) from e tmp = rioxarray.open_rasterio(infile) ds = tmp.to_dataset("band") ds = ds.rename( diff --git a/sup3r/utilities/era_downloader.py b/sup3r/utilities/era_downloader.py index cc5ef654d..d4822ef59 100644 --- a/sup3r/utilities/era_downloader.py +++ b/sup3r/utilities/era_downloader.py @@ -26,26 +26,12 @@ logger = logging.getLogger(__name__) -try: - import cdsapi - - CDS_API_CLIENT = cdsapi.Client() -except ImportError as e: - msg = f'Could not import cdsapi package. {e}' - logger.error(msg) - class EraDownloader: """Class to handle ERA5 downloading, variable renaming, file combination, and interpolation. 
""" - msg = ('To download ERA5 data you need to have a ~/.cdsapirc file ' - 'with a valid url and api key. Follow the instructions here: ' - 'https://cds.climate.copernicus.eu/api-how-to') - req_file = os.path.join(os.path.expanduser('~'), '.cdsapirc') - assert os.path.exists(req_file), msg - # variables available on a single level (e.g. surface) SFC_VARS: ClassVar[list] = [ '10m_u_component_of_wind', '10m_v_component_of_wind', @@ -275,6 +261,26 @@ def prep_var_lists(self, variables): logger.warning(msg) warn(msg) + @staticmethod + def get_cds_client(): + """Get the copernicus climate data store (CDS) API object for ERA + downloads.""" + try: + import cdsapi + cds_api_client = cdsapi.Client() + except ImportError as e: + msg = f'Could not import cdsapi package. {e}' + logger.error(msg) + raise ImportError(msg) from e + + msg = ('To download ERA5 data you need to have a ~/.cdsapirc file ' + 'with a valid url and api key. Follow the instructions here: ' + 'https://cds.climate.copernicus.eu/api-how-to') + req_file = os.path.join(os.path.expanduser('~'), '.cdsapirc') + assert os.path.exists(req_file), msg + + return cds_api_client + def download_process_combine(self): """Run the download routine.""" sfc_check = len(self.sfc_file_variables) > 0 @@ -338,7 +344,8 @@ def download_file(cls, variables, time_dict, area, out_file, level_type, if level_type == 'pressure': entry['pressure_level'] = levels logger.info(f'Calling CDS-API with {entry}.') - CDS_API_CLIENT.retrieve( + cds_api_client = cls.get_cds_client() + cds_api_client.retrieve( f'reanalysis-era5-{level_type}-levels', entry, out_file) else: diff --git a/sup3r/utilities/plotting.py b/sup3r/utilities/plotting.py index a0d6f1c5e..61eac0270 100644 --- a/sup3r/utilities/plotting.py +++ b/sup3r/utilities/plotting.py @@ -2,7 +2,6 @@ """Utilities module for plotting data """ -import imageio import matplotlib from matplotlib import cm import matplotlib.pyplot as plt @@ -170,6 +169,12 @@ def make_movie(ntime, movieDir, movieName, fps=24): number of frame per second for the movie, by default 24 """ + try: + import imageio + except ImportError as e: + msg = f'Need extra installation to make movie "imageio": {e}' + raise ImportError(msg) from e + # initiate an empty list of "plotted" images myimages = [] # loops through available pngs diff --git a/sup3r/utilities/utilities.py b/sup3r/utilities/utilities.py index 17a85e4fa..0e155ba45 100644 --- a/sup3r/utilities/utilities.py +++ b/sup3r/utilities/utilities.py @@ -1460,12 +1460,12 @@ def get_input_handler_class(file_paths, input_handler_name): def np_to_pd_times(times): - """Convert np.bytes_ times to DatetimeIndex + """Convert `np.bytes_` times to DatetimeIndex Parameters ---------- times : ndarray | list - List of np.bytes_ objects for time indices + List of `np.bytes_` objects for time indices Returns ------- From ef4c1e7d2916083b60a536ae6512f7874a47ddeb Mon Sep 17 00:00:00 2001 From: grantbuster Date: Mon, 11 Dec 2023 11:49:34 -0700 Subject: [PATCH 09/15] doc updates trying to fix api page --- docs/source/conf.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/docs/source/conf.py b/docs/source/conf.py index 9cb6b53c3..f53862d13 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -57,6 +57,8 @@ "sphinx.ext.napoleon", "sphinx_rtd_theme", 'sphinx_click.ext', + "sphinx_tabs.tabs", + "sphinx_copybutton", ] intersphinx_mapping = { @@ -81,7 +83,7 @@ # # This is also used if you do content translation via gettext catalogs. 
# Usually you set "language" from the command line for these cases. -language = None +language = 'en' # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. @@ -140,7 +142,7 @@ # -- Options for HTMLHelp output --------------------------------------------- # Output file base name for HTML help builder. -htmlhelp_basename = 'rexdoc' +htmlhelp_basename = 'sup3rdoc' # -- Options for LaTeX output ------------------------------------------------ From c9ed54abb7507dcefb56a19c40844a1a4f36ebf6 Mon Sep 17 00:00:00 2001 From: grantbuster Date: Mon, 11 Dec 2023 15:13:18 -0700 Subject: [PATCH 10/15] defensive coding, better error messages --- .../data_handling/h5_data_handling.py | 4 +- .../data_handling/nc_data_handling.py | 4 +- sup3r/preprocessing/feature_handling.py | 52 +++++++++++-------- 3 files changed, 35 insertions(+), 25 deletions(-) diff --git a/sup3r/preprocessing/data_handling/h5_data_handling.py b/sup3r/preprocessing/data_handling/h5_data_handling.py index fca438c0c..f506d4754 100644 --- a/sup3r/preprocessing/data_handling/h5_data_handling.py +++ b/sup3r/preprocessing/data_handling/h5_data_handling.py @@ -137,7 +137,9 @@ def extract_feature(cls, try: fdata = handle[(feature, time_slice, *(raster_index.flatten(),))] except ValueError as e: - msg = f'{feature} cannot be extracted from source data' + msg = (f'Requested feature "{feature}" cannot be extracted from ' + f'source data that has available variables: ' + f'{handle.dsets}.') logger.exception(msg) raise ValueError(msg) from e diff --git a/sup3r/preprocessing/data_handling/nc_data_handling.py b/sup3r/preprocessing/data_handling/nc_data_handling.py index 8a2ade1ba..f96c03f7f 100644 --- a/sup3r/preprocessing/data_handling/nc_data_handling.py +++ b/sup3r/preprocessing/data_handling/nc_data_handling.py @@ -281,7 +281,9 @@ def extract_feature(cls, time_slice) else: - msg = f'{feature} cannot be extracted from source data.' + available = [str(var) for var in handle.variables] + msg = (f'Requested feature "{feature}" cannot be extracted from ' + f'source data that has available variables: {available}.') logger.exception(msg) raise ValueError(msg) diff --git a/sup3r/preprocessing/feature_handling.py b/sup3r/preprocessing/feature_handling.py index 609cec0aa..f83a8d74f 100644 --- a/sup3r/preprocessing/feature_handling.py +++ b/sup3r/preprocessing/feature_handling.py @@ -1223,11 +1223,13 @@ def get_height(feature): height to use for interpolation in meters """ - height = re.search(r'\d+m', feature) - if height: - height = height.group(0).strip('m') - if not height.isdigit(): - height = None + height = None + if isinstance(feature, str): + height = re.search(r'\d+m', feature) + if height: + height = height.group(0).strip('m') + if not height.isdigit(): + height = None return height @staticmethod @@ -1244,11 +1246,13 @@ def get_pressure(feature): float | None pressure to use for interpolation in pascals """ - pressure = re.search(r'\d+pa', feature) - if pressure: - pressure = pressure.group(0).strip('pa') - if not pressure.isdigit(): - pressure = None + pressure = None + if isinstance(feature, str): + pressure = re.search(r'\d+pa', feature) + if pressure: + pressure = pressure.group(0).strip('pa') + if not pressure.isdigit(): + pressure = None return pressure @@ -1752,10 +1756,11 @@ def _exact_lookup(cls, feature): Matching feature registry entry. 
""" out = None - for k, v in cls.FEATURE_REGISTRY.items(): - if k.lower() == feature.lower(): - out = v - break + if isinstance(feature, str): + for k, v in cls.FEATURE_REGISTRY.items(): + if k.lower() == feature.lower(): + out = v + break return out @classmethod @@ -1774,10 +1779,11 @@ def _pattern_lookup(cls, feature): Matching feature registry entry. """ out = None - for k, v in cls.FEATURE_REGISTRY.items(): - if re.match(k.lower(), feature.lower()): - out = v - break + if isinstance(feature, str): + for k, v in cls.FEATURE_REGISTRY.items(): + if re.match(k.lower(), feature.lower()): + out = v + break return out @classmethod @@ -1816,7 +1822,7 @@ def _lookup(cls, out, feature, handle_features=None): if pressure is not None: out = out.split('(.*)')[0] + f'{pressure}pa' - return lambda x: [out] + return lambda x: [out] if isinstance(out, str) else out @classmethod def lookup(cls, feature, attr_name, handle_features=None): @@ -1851,7 +1857,6 @@ def lookup(cls, feature, attr_name, handle_features=None): return getattr(out, attr_name, None) elif attr_name == 'inputs': - return cls._lookup(out, feature, handle_features) @classmethod @@ -1873,11 +1878,11 @@ def get_inputs_recursive(cls, feature, handle_features): """ raw_features = [] method = cls.lookup(feature, 'inputs', handle_features=handle_features) - lower_handle_features = [f.lower() for f in handle_features] + low_handle_features = [f.lower() for f in handle_features] + vhf = cls.valid_handle_features([feature.lower()], low_handle_features) check1 = feature not in raw_features - check2 = (cls.valid_handle_features( - [feature.lower()], lower_handle_features) or method is None) + check2 = (vhf or method is None) if check1 and check2: raw_features.append(feature) @@ -1924,6 +1929,7 @@ def get_raw_feature_list(cls, features, handle_features): f'Requested features: {req}') logger.error(msg) raise ValueError(msg) + return raw_features @classmethod From 5d99790eac6280edc6671c46ebaf5590fda3da4d Mon Sep 17 00:00:00 2001 From: ppinchuk Date: Mon, 11 Dec 2023 20:06:12 -0700 Subject: [PATCH 11/15] Remove import breaking sphinx --- sup3r/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sup3r/__init__.py b/sup3r/__init__.py index f2ba4e292..1a4994d48 100644 --- a/sup3r/__init__.py +++ b/sup3r/__init__.py @@ -2,7 +2,7 @@ """Super Resolving Renewable Energy Resource Data (SUP3R)""" import os from sup3r.version import __version__ -import sup3r.cli # import sets up CLI commands + __author__ = """Brandon Benton""" __email__ = "brandon.benton@nrel.gov" From 1fcd177bdca46eae74a3ecbd9052d92a02ee54a7 Mon Sep 17 00:00:00 2001 From: ppinchuk Date: Mon, 11 Dec 2023 20:46:07 -0700 Subject: [PATCH 12/15] Use relative import to fix tests and keep sphinx functional --- sup3r/__init__.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/sup3r/__init__.py b/sup3r/__init__.py index 1a4994d48..8e37520df 100644 --- a/sup3r/__init__.py +++ b/sup3r/__init__.py @@ -2,7 +2,9 @@ """Super Resolving Renewable Energy Resource Data (SUP3R)""" import os from sup3r.version import __version__ - +# Next import sets up CLI commands +# This line could be "import sup3r.cli" but that breaks sphinx as of 12/11/2023 +from sup3r.cli import main __author__ = """Brandon Benton""" __email__ = "brandon.benton@nrel.gov" From 5e7c75975cac833fdef67bde68c8bd1868505fd3 Mon Sep 17 00:00:00 2001 From: grantbuster Date: Tue, 12 Dec 2023 14:59:41 -0700 Subject: [PATCH 13/15] do daily reduction with pandas groupby for better performance --- 
sup3r/bias/bias_calc.py | 61 ++++++++++++++++++++++++++++------------- 1 file changed, 42 insertions(+), 19 deletions(-) diff --git a/sup3r/bias/bias_calc.py b/sup3r/bias/bias_calc.py index 8498afe43..c20f25b23 100644 --- a/sup3r/bias/bias_calc.py +++ b/sup3r/bias/bias_calc.py @@ -323,7 +323,8 @@ def get_data_pair(self, coord, daily_reduction='avg'): daily_reduction : None | str Option to do a reduction of the hourly+ source base data to daily data. Can be None (no reduction, keep source time frequency), "avg" - (daily average), "max" (daily max), or "min" (daily min) + (daily average), "max" (daily max), "min" (daily min), + "sum" (daily sum/total) Returns ------- @@ -418,7 +419,8 @@ def get_base_data(cls, daily_reduction : None | str Option to do a reduction of the hourly+ source base data to daily data. Can be None (no reduction, keep source time frequency), "avg" - (daily average), "max" (daily max), or "min" (daily min) + (daily average), "max" (daily max), "min" (daily min), + "sum" (daily sum/total) decimals : int | None Option to round bias and base data to this number of decimals, this gets passed to np.around(). If decimals @@ -452,10 +454,10 @@ def get_base_data(cls, raise RuntimeError(msg) if issubclass(base_handler, DataHandler) and base_dh_inst is not None: - all_cs_ghi = [None] out_ti = base_dh_inst.time_index out_data = cls._read_base_sup3r_data(base_dh_inst, base_dset, base_gid) + all_cs_ghi = np.ones(len(out_data), dtype=np.float32) * np.nan else: for fp in base_fps: with base_handler(fp, **base_handler_kwargs) as res: @@ -532,9 +534,10 @@ def _read_base_rex_data(res, base_dset, base_gid): ------- base_data : np.ndarray 1D array of base data spatially averaged across the base_gid input - base_cs_ghi : np.ndarray | None + base_cs_ghi : np.ndarray If base_dset == "clearsky_ratio", the base_data array is GHI and - this base_cs_ghi is clearsky GHI. Otherwise this is None + this base_cs_ghi is clearsky GHI. Otherwise this is an array with + same length as base_data but full of np.nan """ msg = '`res` input must not be a `DataHandler` subclass!' @@ -566,6 +569,9 @@ def _read_base_rex_data(res, base_dset, base_gid): if base_cs_ghi is not None: base_cs_ghi = np.nanmean(base_cs_ghi, axis=1) + if base_cs_ghi is None: + base_cs_ghi = np.ones(len(base_data), dtype=np.float32) * np.nan + return base_data, base_cs_ghi @staticmethod @@ -580,15 +586,17 @@ def _reduce_base_data(base_ti, base_data, base_cs_ghi, base_dset, Time index associated with base_data base_data : np.ndarray 1D array of base data spatially averaged across the base_gid input - base_cs_ghi : np.ndarray | None + base_cs_ghi : np.ndarray If base_dset == "clearsky_ratio", the base_data array is GHI and - this base_cs_ghi is clearsky GHI. Otherwise this is None + this base_cs_ghi is clearsky GHI. Otherwise this is an array with + same length as base_data but full of np.nan base_dset : str A single dataset from the base_fps to retrieve. daily_reduction : str Option to do a reduction of the hourly+ source base data to daily data. 
Can be None (no reduction, keep source time frequency), "avg" - (daily average), "max" (daily max), or "min" (daily min) + (daily average), "max" (daily max), "min" (daily min), + "sum" (daily sum/total) Returns ------- @@ -603,24 +611,38 @@ def _reduce_base_data(base_ti, base_data, base_cs_ghi, base_dset, return base_data daily_ti = pd.DatetimeIndex(sorted(set(base_ti.date))) - slices = [np.where(base_ti.date == date.date()) for date in daily_ti] + df = pd.DataFrame({'date': base_ti.date, + 'base_data': base_data, + 'base_cs_ghi': base_cs_ghi}) + + cs_ratio = (daily_reduction.lower() in ('avg', 'average', 'mean') + and base_dset == 'clearsky_ratio') + + if cs_ratio: + daily_ghi = df.groupby('date').sum()['base_data'].values + daily_cs_ghi = df.groupby('date').sum()['base_cs_ghi'].values + base_data = daily_ghi / daily_cs_ghi + msg = ('Could not calculate daily average "clearsky_ratio" with ' + 'base_data and base_cs_ghi inputs: \n{}, \n{}' + .format(base_data, base_cs_ghi)) + assert not np.isnan(base_data).any(), msg - if base_dset == 'clearsky_ratio' and daily_reduction.lower() == 'avg': - base_data = np.array([base_data[s0].sum() / base_cs_ghi[s0].sum() - for s0 in slices]) + elif daily_reduction.lower() in ('avg', 'average', 'mean'): + base_data = df.groupby('date').mean()['base_data'].values - elif daily_reduction.lower() == 'avg': - base_data = np.array([base_data[s0].mean() for s0 in slices]) + elif daily_reduction.lower() in ('max', 'maximum'): + base_data = df.groupby('date').max()['base_data'].values - elif daily_reduction.lower() == 'max': - base_data = np.array([base_data[s0].max() for s0 in slices]) + elif daily_reduction.lower() in ('min', 'minimum'): + base_data = df.groupby('date').min()['base_data'].values - elif daily_reduction.lower() == 'min': - base_data = np.array([base_data[s0].min() for s0 in slices]) + elif daily_reduction.lower() in ('sum', 'total'): + base_data = df.groupby('date').sum()['base_data'].values msg = (f'Daily reduced base data shape {base_data.shape} does not ' f'match daily time index shape {daily_ti.shape}, ' 'something went wrong!') + assert len(base_data.shape) == 1, msg assert base_data.shape == daily_ti.shape, msg return base_data, daily_ti @@ -853,7 +875,8 @@ def run(self, daily_reduction : None | str Option to do a reduction of the hourly+ source base data to daily data. Can be None (no reduction, keep source time frequency), "avg" - (daily average), "max" (daily max), or "min" (daily min) + (daily average), "max" (daily max), "min" (daily min), + "sum" (daily sum/total) fill_extend : bool Flag to fill data past distance_upper_bound using spatial nearest neighbor. If False, the extended domain will be left as NaN. 
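
The groupby-based reduction above replaces per-day boolean masking with a single pandas
aggregation. A small self-contained sketch of the same pattern on synthetic hourly data,
including the special clearsky_ratio case that sums GHI and clearsky GHI per day before
taking the ratio:

    import numpy as np
    import pandas as pd

    ti = pd.date_range('2023-06-01', periods=72, freq='1h')   # three days, hourly
    ghi = np.random.uniform(0, 1000, len(ti))
    cs_ghi = ghi + np.random.uniform(1, 100, len(ti))          # clearsky >= actual

    df = pd.DataFrame({'date': ti.date, 'base_data': ghi, 'base_cs_ghi': cs_ghi})

    daily_avg = df.groupby('date').mean()['base_data'].values   # daily_reduction='avg'
    daily_min = df.groupby('date').min()['base_data'].values    # daily_reduction='min'

    # clearsky_ratio: ratio of daily totals rather than a mean of hourly ratios
    daily_cs_ratio = (df.groupby('date').sum()['base_data'].values
                      / df.groupby('date').sum()['base_cs_ghi'].values)

    daily_ti = pd.DatetimeIndex(sorted(set(ti.date)))
    assert daily_avg.shape == daily_min.shape == daily_cs_ratio.shape == daily_ti.shape
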
From 3ba3d14126a7397cc17317338ebc08e513fa04c5 Mon Sep 17 00:00:00 2001 From: grantbuster Date: Wed, 13 Dec 2023 09:25:32 -0700 Subject: [PATCH 14/15] bump version for new bias and topo aggregation algorithm, data handler arg updates, and other PRs --- sup3r/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sup3r/version.py b/sup3r/version.py index 7e7d07482..d50352360 100644 --- a/sup3r/version.py +++ b/sup3r/version.py @@ -1,3 +1,3 @@ """SUP3R Version""" -__version__ = '0.1.1' +__version__ = '0.1.2' From 90e44b6f404b94b455b3a39c5f2076840319570c Mon Sep 17 00:00:00 2001 From: grantbuster Date: Wed, 13 Dec 2023 11:24:45 -0700 Subject: [PATCH 15/15] use handle features method in extract error --- sup3r/preprocessing/data_handling/h5_data_handling.py | 4 ++-- sup3r/preprocessing/data_handling/nc_data_handling.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/sup3r/preprocessing/data_handling/h5_data_handling.py b/sup3r/preprocessing/data_handling/h5_data_handling.py index f506d4754..8b4e945a2 100644 --- a/sup3r/preprocessing/data_handling/h5_data_handling.py +++ b/sup3r/preprocessing/data_handling/h5_data_handling.py @@ -137,9 +137,9 @@ def extract_feature(cls, try: fdata = handle[(feature, time_slice, *(raster_index.flatten(),))] except ValueError as e: + hfeatures = cls.get_handle_features(file_paths) msg = (f'Requested feature "{feature}" cannot be extracted from ' - f'source data that has available variables: ' - f'{handle.dsets}.') + f'source data that has handle features: {hfeatures}.') logger.exception(msg) raise ValueError(msg) from e diff --git a/sup3r/preprocessing/data_handling/nc_data_handling.py b/sup3r/preprocessing/data_handling/nc_data_handling.py index f96c03f7f..cc2105b15 100644 --- a/sup3r/preprocessing/data_handling/nc_data_handling.py +++ b/sup3r/preprocessing/data_handling/nc_data_handling.py @@ -281,9 +281,9 @@ def extract_feature(cls, time_slice) else: - available = [str(var) for var in handle.variables] + hfeatures = cls.get_handle_features(file_paths) msg = (f'Requested feature "{feature}" cannot be extracted from ' - f'source data that has available variables: {available}.') + f'source data that has handle features: {hfeatures}.') logger.exception(msg) raise ValueError(msg)
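
The pattern in this last patch, echoing the available handle features back in the error
message, generalizes beyond these two handlers. A minimal sketch of the same idea with a
plain dict standing in for the data handle (function and variable names are hypothetical):

    import logging

    logger = logging.getLogger(__name__)


    def get_feature(handle, feature):
        """Return handle[feature] or raise a ValueError listing what is available."""
        try:
            return handle[feature]
        except KeyError as e:
            hfeatures = sorted(handle)
            msg = (f'Requested feature "{feature}" cannot be extracted from '
                   f'source data that has handle features: {hfeatures}.')
            logger.exception(msg)
            raise ValueError(msg) from e


    data = {'windspeed_100m': [5.0, 6.2], 'topography': [1650.0, 1700.0]}
    get_feature(data, 'windspeed_100m')    # works
    # get_feature(data, 'temperature_2m')  # would raise, listing both features
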