Merge pull request #195 from NREL/gb/precip_skill

Gb/precip skill
NREL · Mar 28, 2024 · d710fe3 · d710fe3
2 parents 986a1fe + 226b67f
commit d710fe3
Show file tree

Hide file tree

Showing 2 changed files with 126 additions and 12 deletions.
diff --git a/sup3r/bias/bias_calc.py b/sup3r/bias/bias_calc.py
@@ -44,7 +44,8 @@ def __init__(self,
  bias_handler='DataHandlerNCforCC',
  base_handler_kwargs=None,
  bias_handler_kwargs=None,
- decimals=None):
+ decimals=None,
+ match_zero_rate=False):
  """
  Parameters
  ----------
@@ -94,6 +95,15 @@ class to be retrieved from the rex/sup3r library. If a
  decimals, this gets passed to np.around(). If decimals
  is negative, it specifies the number of positions to
  the left of the decimal point.
+ match_zero_rate : bool
+ Option to fix the frequency of zero values in the biased data. The
+ lowest percentile of values in the biased data will be set to zero
+ to match the percentile of zeros in the base data. If
+ SkillAssessment is being run and this is True, the distributions
+ will not be mean-centered. This helps resolve the issue where
+ global climate models produce too many days with small
+ precipitation totals e.g., the "drizzle problem".
+ Ref: Polade et al., 2014 https://doi.org/10.1038/srep04364
  """
 
  logger.info('Initializing DataRetrievalBase for base dset "{}" '
@@ -110,6 +120,7 @@ class to be retrieved from the rex/sup3r library. If a
  self.bias_handler_kwargs = bias_handler_kwargs or {}
  self.bad_bias_gids = []
  self._distance_upper_bound = distance_upper_bound
+ self.match_zero_rate = match_zero_rate
 
  if isinstance(self.base_fps, str):
  self.base_fps = sorted(glob(self.base_fps))
@@ -367,6 +378,7 @@ def get_bias_data(self, bias_gid):
  bias_data : np.ndarray
  1D array of temporal data at the requested gid.
  """
+
  idx = np.where(self.bias_gid_raster == bias_gid)
  if self.bias_dh.data is None:
  self.bias_dh.load_cached_data()
@@ -486,6 +498,46 @@ def get_base_data(cls,
 
  return out_data, out_ti
 
+ @staticmethod
+ def _match_zero_rate(bias_data, base_data):
+ """The lowest percentile of values in the biased data will be set to
+ zero to match the percentile of zeros in the base data. This helps
+ resolve the issue where global climate models produce too many days
+ with small precipitation totals e.g., the "drizzle problem".
+ Ref: Polade et al., 2014 https://doi.org/10.1038/srep04364
+
+ Parameters
+ ----------
+ bias_data : np.ndarray
+ 1D array of biased data observations.
+ base_data : np.ndarray
+ 1D array of base data observations.
+
+ Returns
+ -------
+ bias_data : np.ndarray
+ 1D array of biased data observations. Values below the quantile
+ associated with zeros in base_data will be set to zero
+ """
+
+ q_zero_base_in = np.nanmean(base_data == 0)
+ q_zero_bias_in = np.nanmean(bias_data == 0)
+
+ q_bias = np.linspace(0, 1, len(bias_data))
+ min_value_bias = np.interp(q_zero_base_in, q_bias, sorted(bias_data))
+
+ bias_data[bias_data < min_value_bias] = 0
+
+ q_zero_base_out = np.nanmean(base_data == 0)
+ q_zero_bias_out = np.nanmean(bias_data == 0)
+
+ logger.debug('Input bias/base zero rate is {:.3e}/{:.3e}, '
+ 'output is {:.3e}/{:.3e}'
+ .format(q_zero_bias_in, q_zero_base_in,
+ q_zero_bias_out, q_zero_base_out))
+
+ return bias_data
+
  @staticmethod
  def _read_base_sup3r_data(dh, base_dset, base_gid):
  """Read baseline data from a sup3r DataHandler
@@ -727,7 +779,8 @@ def _run_single(cls,
  daily_reduction,
  bias_ti,
  decimals,
- base_dh_inst=None):
+ base_dh_inst=None,
+ match_zero_rate=False):
  """Find the nominal scalar + adder combination to bias correct data
  at a single site"""
 
@@ -739,6 +792,9 @@ def _run_single(cls,
  decimals=decimals,
  base_dh_inst=base_dh_inst)
 
+ if match_zero_rate:
+ bias_data = cls._match_zero_rate(bias_data, base_data)
+
  out = cls.get_linear_correction(bias_data, base_data, bias_feature,
  base_dset)
  return out
@@ -931,6 +987,7 @@ def run(self,
  self.bias_ti,
  self.decimals,
  base_dh_inst=self.base_dh,
+ match_zero_rate=self.match_zero_rate,
  )
  for key, arr in single_out.items():
  self.out[key][raster_loc] = arr
@@ -963,6 +1020,7 @@ def run(self,
  daily_reduction,
  self.bias_ti,
  self.decimals,
+ match_zero_rate=self.match_zero_rate,
  )
  futures[future] = raster_loc
 
@@ -1006,7 +1064,8 @@ def _run_single(cls,
  daily_reduction,
  bias_ti,
  decimals,
- base_dh_inst=None):
+ base_dh_inst=None,
+ match_zero_rate=False):
  """Find the nominal scalar + adder combination to bias correct data
  at a single site"""
 
@@ -1018,6 +1077,9 @@ def _run_single(cls,
  decimals=decimals,
  base_dh_inst=base_dh_inst)
 
+ if match_zero_rate:
+ bias_data = cls._match_zero_rate(bias_data, base_data)
+
  base_arr = np.full(cls.NT, np.nan, dtype=np.float32)
  out = {}
 
@@ -1117,10 +1179,12 @@ def _init_out(self):
  f'bias_{self.bias_feature}_std',
  f'bias_{self.bias_feature}_skew',
  f'bias_{self.bias_feature}_kurtosis',
+ f'bias_{self.bias_feature}_zero_rate',
  f'base_{self.base_dset}_mean',
  f'base_{self.base_dset}_std',
  f'base_{self.base_dset}_skew',
  f'base_{self.base_dset}_kurtosis',
+ f'base_{self.base_dset}_zero_rate',
  ]
 
  self.out = {k: np.full((*self.bias_gid_raster.shape, self.NT),
@@ -1138,13 +1202,17 @@ def _init_out(self):
  self.out[bias_k] = arr.copy()
 
  @classmethod
- def _run_skill_eval(cls, bias_data, base_data, bias_feature, base_dset):
+ def _run_skill_eval(cls, bias_data, base_data, bias_feature, base_dset,
+ match_zero_rate=False):
  """Run skill assessment metrics on 1D datasets at a single site.
 
  Note we run the KS test on the mean=0 distributions as per:
  S. Brands et al. 2013 https://doi.org/10.1007/s00382-013-1742-8
  """
 
+ if match_zero_rate:
+ bias_data = cls._match_zero_rate(bias_data, base_data)
+
  out = {}
  bias_mean = np.nanmean(bias_data)
  base_mean = np.nanmean(base_data)
@@ -1154,13 +1222,20 @@ def _run_skill_eval(cls, bias_data, base_data, bias_feature, base_dset):
  out[f'bias_{bias_feature}_std'] = np.nanstd(bias_data)
  out[f'bias_{bias_feature}_skew'] = stats.skew(bias_data)
  out[f'bias_{bias_feature}_kurtosis'] = stats.kurtosis(bias_data)
+ out[f'bias_{bias_feature}_zero_rate'] = np.nanmean(bias_data == 0)
 
  out[f'base_{base_dset}_mean'] = base_mean
  out[f'base_{base_dset}_std'] = np.nanstd(base_data)
  out[f'base_{base_dset}_skew'] = stats.skew(base_data)
  out[f'base_{base_dset}_kurtosis'] = stats.kurtosis(base_data)
+ out[f'base_{base_dset}_zero_rate'] = np.nanmean(base_data == 0)
+
+ if match_zero_rate:
+ ks_out = stats.ks_2samp(base_data, bias_data)
+ else:
+ ks_out = stats.ks_2samp(base_data - base_mean,
+ bias_data - bias_mean)
 
- ks_out = stats.ks_2samp(base_data - base_mean, bias_data - bias_mean)
  out[f'{bias_feature}_ks_stat'] = ks_out.statistic
  out[f'{bias_feature}_ks_p'] = ks_out.pvalue
 
@@ -1175,7 +1250,7 @@ def _run_skill_eval(cls, bias_data, base_data, bias_feature, base_dset):
  @classmethod
  def _run_single(cls, bias_data, base_fps, bias_feature, base_dset,
  base_gid, base_handler, daily_reduction, bias_ti,
- decimals, base_dh_inst=None):
+ decimals, base_dh_inst=None, match_zero_rate=False):
  """Do a skill assessment at a single site"""
 
  base_data, base_ti = cls.get_base_data(base_fps, base_dset,
@@ -1189,12 +1264,11 @@ def _run_single(cls, bias_data, base_fps, bias_feature, base_dset,
  f'bias_{bias_feature}_std_monthly': arr.copy(),
  f'base_{base_dset}_mean_monthly': arr.copy(),
  f'base_{base_dset}_std_monthly': arr.copy(),
- f'{bias_feature}_scalar': arr.copy(),
- f'{bias_feature}_adder': arr.copy(),
  }
 
  out.update(cls._run_skill_eval(bias_data, base_data,
- bias_feature, base_dset))
+ bias_feature, base_dset,
+ match_zero_rate=match_zero_rate))
 
  for month in range(1, 13):
  bias_mask = bias_ti.month == month
@@ -1208,6 +1282,6 @@ def _run_single(cls, bias_data, base_fps, bias_feature, base_dset,
  for k, v in mout.items():
  if not k.endswith(('_scalar', '_adder')):
  k += '_monthly'
- out[k][month - 1] = v
+  out[k][month - 1] = v
 
  return out
diff --git a/tests/bias/test_bias_correction.py b/tests/bias/test_bias_correction.py
@@ -1,6 +1,7 @@
 # -*- coding: utf-8 -*-
 """pytests bias correction calculations"""
 import os
+import shutil
 import tempfile
 
 import h5py
@@ -526,7 +527,46 @@ def test_nc_base_file():
 
  out = calc.run(fill_extend=True, max_workers=1)
 
- assert (out['rsds_scalar'] == 1).all()
- assert (out['rsds_adder'] == 0).all()
+ assert np.allclose(out['base_rsds_mean_monthly'],
+  out['bias_rsds_mean_monthly'])
  assert np.allclose(out['base_rsds_mean'], out['bias_rsds_mean'])
  assert np.allclose(out['base_rsds_std'], out['bias_rsds_std'])
+
+
+def test_match_zero_rate():
+ """Test feature to match the rate of zeros in the bias data based on the
+ zero rate in the base data. Useful for precip where GCMs have a low-precip
+ "drizzle" problem."""
+ bias_data = np.random.uniform(0, 1, 1000)
+ base_data = np.random.uniform(0, 1, 1000)
+ base_data[base_data < 0.1] = 0
+
+ skill = SkillAssessment._run_skill_eval(bias_data, base_data, 'f1', 'f1')
+ assert skill['bias_f1_zero_rate'] != skill['base_f1_zero_rate']
+ assert (bias_data == 0).mean() != (base_data == 0).mean()
+
+ skill = SkillAssessment._run_skill_eval(bias_data, base_data, 'f1', 'f1',
+ match_zero_rate=True)
+ assert (bias_data == 0).mean() == (base_data == 0).mean()
+ assert skill['bias_f1_zero_rate'] == skill['base_f1_zero_rate']
+ for p in (1, 5, 25, 50, 75, 95, 99):
+ assert np.allclose(skill[f'base_f1_percentile_{p}'],
+ np.percentile(base_data, p))
+
+ with tempfile.TemporaryDirectory() as td:
+ fp_nsrdb_temp = os.path.join(td, os.path.basename(FP_NSRDB))
+ shutil.copy(FP_NSRDB, fp_nsrdb_temp)
+ with h5py.File(fp_nsrdb_temp, 'a') as nsrdb_temp:
+ ghi = nsrdb_temp['ghi'][...]
+ ghi[:1000, :] = 0
+ nsrdb_temp['ghi'][...] = ghi
+ calc = SkillAssessment(fp_nsrdb_temp, FP_CC, 'ghi', 'rsds',
+ target=TARGET, shape=SHAPE,
+ distance_upper_bound=0.7,
+ bias_handler='DataHandlerNCforCC',
+ match_zero_rate=True)
+ out = calc.run(fill_extend=True, max_workers=1)
+
+ bias_rate = out['bias_rsds_zero_rate']
+ base_rate = out['base_ghi_zero_rate']
+ assert np.allclose(bias_rate, base_rate, rtol=0.005)