diff --git a/sup3r/bias/bias_calc.py b/sup3r/bias/bias_calc.py
index 2ea899ac4..5ff8ad8c3 100644
--- a/sup3r/bias/bias_calc.py
+++ b/sup3r/bias/bias_calc.py
@@ -44,7 +44,8 @@ def __init__(self,
                  bias_handler='DataHandlerNCforCC',
                  base_handler_kwargs=None,
                  bias_handler_kwargs=None,
-                 decimals=None):
+                 decimals=None,
+                 match_zero_rate=False):
         """
         Parameters
         ----------
@@ -94,6 +95,13 @@ class to be retrieved from the rex/sup3r library. If a
             decimals, this gets passed to np.around(). If decimals
             is negative, it specifies the number of positions to
             the left of the decimal point.
+        match_zero_rate : bool
+            Option to fix the frequency of zero values in the biased data. The
+            lowest percentile of values in the biased data will be set to zero
+            to match the percentile of zeros in the base data. This helps
+            resolve the issue where global climate models produce too many days
+            with small precipitation totals, e.g., the "drizzle problem".
+            Ref: Polade et al., 2014 https://doi.org/10.1038/srep04364
         """

         logger.info('Initializing DataRetrievalBase for base dset "{}" '
@@ -110,6 +118,7 @@ class to be retrieved from the rex/sup3r library. If a
         self.bias_handler_kwargs = bias_handler_kwargs or {}
         self.bad_bias_gids = []
         self._distance_upper_bound = distance_upper_bound
+        self.match_zero_rate = match_zero_rate

         if isinstance(self.base_fps, str):
             self.base_fps = sorted(glob(self.base_fps))
@@ -367,6 +376,7 @@ def get_bias_data(self, bias_gid):
         bias_data : np.ndarray
             1D array of temporal data at the requested gid.
         """
+        idx = np.where(self.bias_gid_raster == bias_gid)

         if self.bias_dh.data is None:
             self.bias_dh.load_cached_data()
@@ -486,6 +496,46 @@ def get_base_data(cls,

         return out_data, out_ti

+    @staticmethod
+    def _match_zero_rate(bias_data, base_data):
+        """The lowest percentile of values in the biased data will be set to
+        zero to match the percentile of zeros in the base data. This helps
+        resolve the issue where global climate models produce too many days
+        with small precipitation totals, e.g., the "drizzle problem".
+        Ref: Polade et al., 2014 https://doi.org/10.1038/srep04364
+
+        Parameters
+        ----------
+        bias_data : np.ndarray
+            1D array of biased data observations.
+        base_data : np.ndarray
+            1D array of base data observations.
+
+        Returns
+        -------
+        bias_data : np.ndarray
+            1D array of biased data observations. Values below the quantile
+            associated with zeros in base_data will be set to zero.
+        """
+
+        q_zero_base_in = (base_data == 0).mean()
+        q_zero_bias_in = (bias_data == 0).mean()
+
+        q_bias = np.linspace(0, 1, len(bias_data))
+        min_value_bias = np.interp(q_zero_base_in, q_bias, sorted(bias_data))
+
+        bias_data[bias_data < min_value_bias] = 0
+
+        q_zero_base_out = (base_data == 0).mean()
+        q_zero_bias_out = (bias_data == 0).mean()
+
+        logger.debug('Input bias/base zero rate is {:.3e}/{:.3e}, '
+                     'output is {:.3e}/{:.3e}'
+                     .format(q_zero_bias_in, q_zero_base_in,
+                             q_zero_bias_out, q_zero_base_out))
+
+        return bias_data
+
     @staticmethod
     def _read_base_sup3r_data(dh, base_dset, base_gid):
         """Read baseline data from a sup3r DataHandler
@@ -727,7 +777,8 @@ def _run_single(cls,
                     daily_reduction,
                     bias_ti,
                     decimals,
-                    base_dh_inst=None):
+                    base_dh_inst=None,
+                    match_zero_rate=False):
         """Find the nominal scalar + adder combination to bias correct data
         at a single site"""

@@ -739,6 +790,9 @@ def _run_single(cls,
                                               decimals=decimals,
                                               base_dh_inst=base_dh_inst)

+        if match_zero_rate:
+            bias_data = cls._match_zero_rate(bias_data, base_data)
+
         out = cls.get_linear_correction(bias_data, base_data, bias_feature,
                                         base_dset)
         return out
@@ -931,6 +985,7 @@ def run(self,
                         self.bias_ti,
                         self.decimals,
                         base_dh_inst=self.base_dh,
+                        match_zero_rate=self.match_zero_rate,
                     )
                     for key, arr in single_out.items():
                         self.out[key][raster_loc] = arr
@@ -963,6 +1018,7 @@ def run(self,
                         daily_reduction,
                         self.bias_ti,
                         self.decimals,
+                        match_zero_rate=self.match_zero_rate,
                     )
                     futures[future] = raster_loc

@@ -1006,7 +1062,8 @@ def _run_single(cls,
                     daily_reduction,
                     bias_ti,
                     decimals,
-                    base_dh_inst=None):
+                    base_dh_inst=None,
+                    match_zero_rate=False):
         """Find the nominal scalar + adder combination to bias correct data
         at a single site"""

@@ -1018,6 +1075,9 @@ def _run_single(cls,
                                               decimals=decimals,
                                               base_dh_inst=base_dh_inst)

+        if match_zero_rate:
+            bias_data = cls._match_zero_rate(bias_data, base_data)
+
         base_arr = np.full(cls.NT, np.nan, dtype=np.float32)
         out = {}

@@ -1117,10 +1177,12 @@ def _init_out(self):
                  f'bias_{self.bias_feature}_std',
                  f'bias_{self.bias_feature}_skew',
                  f'bias_{self.bias_feature}_kurtosis',
+                 f'bias_{self.bias_feature}_zero_rate',
                  f'base_{self.base_dset}_mean',
                  f'base_{self.base_dset}_std',
                  f'base_{self.base_dset}_skew',
                  f'base_{self.base_dset}_kurtosis',
+                 f'base_{self.base_dset}_zero_rate',
                  ]

         self.out = {k: np.full((*self.bias_gid_raster.shape, self.NT),
@@ -1154,11 +1216,13 @@ def _run_skill_eval(cls, bias_data, base_data, bias_feature, base_dset):
         out[f'bias_{bias_feature}_std'] = np.nanstd(bias_data)
         out[f'bias_{bias_feature}_skew'] = stats.skew(bias_data)
         out[f'bias_{bias_feature}_kurtosis'] = stats.kurtosis(bias_data)
+        out[f'bias_{bias_feature}_zero_rate'] = (bias_data == 0).mean()

         out[f'base_{base_dset}_mean'] = base_mean
         out[f'base_{base_dset}_std'] = np.nanstd(base_data)
         out[f'base_{base_dset}_skew'] = stats.skew(base_data)
         out[f'base_{base_dset}_kurtosis'] = stats.kurtosis(base_data)
+        out[f'base_{base_dset}_zero_rate'] = (base_data == 0).mean()

         ks_out = stats.ks_2samp(base_data - base_mean, bias_data - bias_mean)
         out[f'{bias_feature}_ks_stat'] = ks_out.statistic
@@ -1175,7 +1239,7 @@ def _run_skill_eval(cls, bias_data, base_data, bias_feature, base_dset):
     @classmethod
     def _run_single(cls, bias_data, base_fps, bias_feature, base_dset,
                     base_gid, base_handler, daily_reduction, bias_ti,
-                    decimals, base_dh_inst=None):
+                    decimals, base_dh_inst=None, match_zero_rate=False):
         """Do a skill assessment at a single site"""

         base_data, base_ti = cls.get_base_data(base_fps, base_dset,
@@ -1184,13 +1248,14 @@ def _run_single(cls, bias_data, base_fps, bias_feature, base_dset,
                                               decimals=decimals,
                                               base_dh_inst=base_dh_inst)

+        if match_zero_rate:
+            bias_data = cls._match_zero_rate(bias_data, base_data)
+
         arr = np.full(cls.NT, np.nan, dtype=np.float32)
         out = {f'bias_{bias_feature}_mean_monthly': arr.copy(),
                f'bias_{bias_feature}_std_monthly': arr.copy(),
                f'base_{base_dset}_mean_monthly': arr.copy(),
                f'base_{base_dset}_std_monthly': arr.copy(),
-               f'{bias_feature}_scalar': arr.copy(),
-               f'{bias_feature}_adder': arr.copy(),
                }

         out.update(cls._run_skill_eval(bias_data, base_data,
@@ -1208,6 +1273,6 @@ def _run_single(cls, bias_data, base_fps, bias_feature, base_dset,
         for k, v in mout.items():
             if not k.endswith(('_scalar', '_adder')):
                 k += '_monthly'
-            out[k][month - 1] = v
+                out[k][month - 1] = v

         return out
diff --git a/tests/bias/test_bias_correction.py b/tests/bias/test_bias_correction.py
index 997f955e9..bb4f46b8a 100644
--- a/tests/bias/test_bias_correction.py
+++ b/tests/bias/test_bias_correction.py
@@ -526,7 +526,25 @@ def test_nc_base_file():

     out = calc.run(fill_extend=True, max_workers=1)

-    assert (out['rsds_scalar'] == 1).all()
-    assert (out['rsds_adder'] == 0).all()
+    assert np.allclose(out['base_rsds_mean_monthly'],
+                       out['bias_rsds_mean_monthly'])
     assert np.allclose(out['base_rsds_mean'], out['bias_rsds_mean'])
     assert np.allclose(out['base_rsds_std'], out['bias_rsds_std'])
+
+
+def test_match_zero_rate():
+    """Test feature to match the rate of zeros in the bias data based on the
+    zero rate in the base data. Useful for precip where GCMs have a low-precip
+    "drizzle" problem."""
+    bias_data = np.random.uniform(0, 1, 1000)
+    base_data = np.random.uniform(0, 1, 1000)
+    base_data[base_data < 0.1] = 0
+
+    skill = SkillAssessment._run_skill_eval(bias_data, base_data, 'f1', 'f1')
+    assert skill['bias_f1_zero_rate'] != skill['base_f1_zero_rate']
+    assert (bias_data == 0).mean() != (base_data == 0).mean()
+
+    bias_data = SkillAssessment._match_zero_rate(bias_data, base_data)
+    skill = SkillAssessment._run_skill_eval(bias_data, base_data, 'f1', 'f1')
+    assert (bias_data == 0).mean() == (base_data == 0).mean()
+    assert skill['bias_f1_zero_rate'] == skill['base_f1_zero_rate']
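
Note: the _match_zero_rate helper added above is plain empirical-quantile
thresholding. For illustration, a minimal standalone sketch of the same idea
follows; the variable names, fixed seed, and 10% zero rate are hypothetical
and not part of the diff:

    import numpy as np

    rng = np.random.default_rng(0)
    base = rng.uniform(0, 1, 1000)  # "observed" data: force ~10% exact zeros
    base[base < 0.1] = 0.0
    bias = rng.uniform(0, 1, 1000)  # model output: no exact zeros ("drizzle")

    # Target quantile = fraction of exact zeros in the base data
    q_zero = (base == 0).mean()

    # Value of the biased data at that empirical quantile; everything
    # strictly below it is treated as spurious drizzle and zeroed out
    threshold = np.interp(q_zero, np.linspace(0, 1, bias.size), np.sort(bias))
    bias[bias < threshold] = 0.0

    # Zero rates now match (almost surely, since the draws are continuous)
    assert (bias == 0).mean() == (base == 0).mean()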