
Merge branch 'main' into dask-datetime-to-numeric
* main: (24 commits)
  Fix overflow issue in decode_cf_datetime for dtypes <= np.uint32 (pydata#6598)
  Enable flox in GroupBy and resample (pydata#5734)
  Add setuptools as dependency in ASV benchmark CI (pydata#6609)
  change polyval dim ordering (pydata#6601)
  re-add timedelta support for polyval (pydata#6599)
  Minor Dataset.map docstr clarification (pydata#6595)
  New inline_array kwarg for open_dataset (pydata#6566)
  Fix polyval overloads (pydata#6593)
  Restore old MultiIndex dropping behaviour (pydata#6592)
  [docs] add Dataset.assign_coords example (pydata#6336) (pydata#6558)
  Fix zarr append dtype checks (pydata#6476)
  Add missing space in exception message (pydata#6590)
  Doc Link to accessors list in extending-xarray.rst (pydata#6587)
  Fix Dataset/DataArray.isel with drop=True and scalar DataArray indexes (pydata#6579)
  Add some warnings about rechunking to the docs (pydata#6569)
  [pre-commit.ci] pre-commit autoupdate (pydata#6584)
  terminology.rst: fix link to Unidata's "netcdf_dataset_components" (pydata#6583)
  Allow string formatting of scalar DataArrays (pydata#5981)
  Fix mypy issues & reenable in tests (pydata#6581)
  polyval: Use Horner's algorithm + support chunked inputs (pydata#6548)
  ...
dcherian committed May 16, 2022
2 parents 5cff4f1 + 8de7061 commit 0783df3
Showing 48 changed files with 1,912 additions and 665 deletions.
7 changes: 3 additions & 4 deletions .github/workflows/ci-additional.yaml
@@ -72,8 +72,7 @@ jobs:
runs-on: "ubuntu-latest"
needs: detect-ci-trigger
# temporarily skipping due to https://github.com/pydata/xarray/issues/6551
# if: needs.detect-ci-trigger.outputs.triggered == 'false'
if: false
if: needs.detect-ci-trigger.outputs.triggered == 'false'
defaults:
run:
shell: bash -l {0}
@@ -105,10 +104,10 @@ jobs:
- name: Install mypy
run: |
python -m pip install mypy
python -m mypy --install-types --non-interactive
- name: Run mypy
run: python -m mypy
run: |
python -m mypy --install-types --non-interactive
min-version-policy:
name: Minimum Version Policy
4 changes: 2 additions & 2 deletions .pre-commit-config.yaml
@@ -19,7 +19,7 @@ repos:
hooks:
- id: isort
- repo: https://github.com/asottile/pyupgrade
rev: v2.32.0
rev: v2.32.1
hooks:
- id: pyupgrade
args:
@@ -46,7 +46,7 @@ repos:
# - id: velin
# args: ["--write", "--compact"]
- repo: https://github.com/pre-commit/mirrors-mypy
rev: v0.942
rev: v0.950
hooks:
- id: mypy
# Copied from setup.cfg
4 changes: 4 additions & 0 deletions asv_bench/asv.conf.json
@@ -58,13 +58,17 @@
// "pip+emcee": [""], // emcee is only available for install with pip.
// },
"matrix": {
"setuptools_scm[toml]": [""], // GH6609
"setuptools_scm_git_archive": [""], // GH6609
"numpy": [""],
"pandas": [""],
"netcdf4": [""],
"scipy": [""],
"bottleneck": [""],
"dask": [""],
"distributed": [""],
"flox": [""],
"numpy_groupies": [""],
"sparse": [""]
},

10 changes: 6 additions & 4 deletions asv_bench/benchmarks/groupby.py
@@ -13,6 +13,7 @@ def setup(self, *args, **kwargs):
{
"a": xr.DataArray(np.r_[np.repeat(1, self.n), np.repeat(2, self.n)]),
"b": xr.DataArray(np.arange(2 * self.n)),
"c": xr.DataArray(np.arange(2 * self.n)),
}
)
self.ds2d = self.ds1d.expand_dims(z=10)
@@ -50,10 +51,11 @@ class GroupByDask(GroupBy):
def setup(self, *args, **kwargs):
requires_dask()
super().setup(**kwargs)
self.ds1d = self.ds1d.sel(dim_0=slice(None, None, 2)).chunk({"dim_0": 50})
self.ds2d = self.ds2d.sel(dim_0=slice(None, None, 2)).chunk(
{"dim_0": 50, "z": 5}
)

self.ds1d = self.ds1d.sel(dim_0=slice(None, None, 2))
self.ds1d["c"] = self.ds1d["c"].chunk({"dim_0": 50})
self.ds2d = self.ds2d.sel(dim_0=slice(None, None, 2))
self.ds2d["c"] = self.ds2d["c"].chunk({"dim_0": 50, "z": 5})
self.ds1d_mean = self.ds1d.groupby("b").mean()
self.ds2d_mean = self.ds2d.groupby("b").mean()
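The setup above feeds the flox-backed GroupBy path enabled by "Enable flox in GroupBy and resample (pydata#5734)" in the commit list. Below is a minimal, illustrative sketch (not part of this commit's diff) of toggling that path; it assumes the ``use_flox`` option introduced by that PR and that flox is installed (xarray otherwise falls back to its default groupby implementation).

# Sketch only: exercising the flox-backed groupby reduction.
import numpy as np
import xarray as xr

ds = xr.Dataset(
    {
        "a": ("dim_0", np.repeat([1, 2], 500)),          # grouping variable
        "b": ("dim_0", np.arange(1000, dtype=float)),     # data to reduce
    }
)

with xr.set_options(use_flox=True):
    means = ds.groupby("a").mean()   # uses flox when it is available

print(means)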

38 changes: 38 additions & 0 deletions asv_bench/benchmarks/polyfit.py
@@ -0,0 +1,38 @@
import numpy as np

import xarray as xr

from . import parameterized, randn, requires_dask

NDEGS = (2, 5, 20)
NX = (10**2, 10**6)


class Polyval:
def setup(self, *args, **kwargs):
self.xs = {nx: xr.DataArray(randn((nx,)), dims="x", name="x") for nx in NX}
self.coeffs = {
ndeg: xr.DataArray(
randn((ndeg,)), dims="degree", coords={"degree": np.arange(ndeg)}
)
for ndeg in NDEGS
}

@parameterized(["nx", "ndeg"], [NX, NDEGS])
def time_polyval(self, nx, ndeg):
x = self.xs[nx]
c = self.coeffs[ndeg]
xr.polyval(x, c).compute()

@parameterized(["nx", "ndeg"], [NX, NDEGS])
def peakmem_polyval(self, nx, ndeg):
x = self.xs[nx]
c = self.coeffs[ndeg]
xr.polyval(x, c).compute()


class PolyvalDask(Polyval):
def setup(self, *args, **kwargs):
requires_dask()
super().setup(*args, **kwargs)
self.xs = {k: v.chunk({"x": 10000}) for k, v in self.xs.items()}
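For context on what the new benchmark measures, here is a minimal sketch (not part of this commit's diff) of calling ``xr.polyval`` on a chunked input, reflecting the coord-first argument order (pydata#6601) and chunked-input support (pydata#6548); it assumes dask is installed, and the sizes and coefficients are arbitrary.

# Sketch only: xr.polyval on a dask-chunked coordinate. Coefficients live
# along a "degree" dimension whose coordinate gives the power of x.
import numpy as np
import xarray as xr

x = xr.DataArray(np.linspace(0.0, 1.0, 100_000), dims="x").chunk({"x": 10_000})
coeffs = xr.DataArray(
    [1.0, -2.0, 0.5],                 # 1 - 2*x + 0.5*x**2
    dims="degree",
    coords={"degree": [0, 1, 2]},
)

result = xr.polyval(x, coeffs)        # stays lazy (dask-backed)
print(result.compute())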
2 changes: 2 additions & 0 deletions ci/install-upstream-wheels.sh
@@ -15,6 +15,7 @@ conda uninstall -y --force \
pint \
bottleneck \
sparse \
flox \
h5netcdf \
xarray
# to limit the runtime of Upstream CI
@@ -47,4 +48,5 @@ python -m pip install \
git+https://github.com/pydata/sparse \
git+https://github.com/intake/filesystem_spec \
git+https://github.com/SciTools/nc-time-axis \
git+https://github.com/dcherian/flox \
git+https://github.com/h5netcdf/h5netcdf
1 change: 1 addition & 0 deletions ci/requirements/all-but-dask.yml
@@ -13,6 +13,7 @@ dependencies:
- cfgrib
- cftime
- coveralls
- flox
- h5netcdf
- h5py
- hdf5
1 change: 1 addition & 0 deletions ci/requirements/environment-windows.yml
@@ -10,6 +10,7 @@ dependencies:
- cftime
- dask-core
- distributed
- flox
- fsspec!=2021.7.0
- h5netcdf
- h5py
1 change: 1 addition & 0 deletions ci/requirements/environment.yml
@@ -12,6 +12,7 @@ dependencies:
- cftime
- dask-core
- distributed
- flox
- fsspec!=2021.7.0
- h5netcdf
- h5py
36 changes: 18 additions & 18 deletions ci/requirements/min-all-deps.yml
@@ -10,46 +10,46 @@ dependencies:
- python=3.8
- boto3=1.13
- bottleneck=1.3
# cartopy 0.18 conflicts with pynio
- cartopy=0.17
- cartopy=0.19
- cdms2=3.1
- cfgrib=0.9
- cftime=1.2
- cftime=1.4
- coveralls
- dask-core=2.30
- distributed=2.30
- h5netcdf=0.8
- h5py=2.10
# hdf5 1.12 conflicts with h5py=2.10
- dask-core=2021.04
- distributed=2021.04
- flox=0.5
- h5netcdf=0.11
- h5py=3.1
# hdf5 1.12 conflicts with h5py=3.1
- hdf5=1.10
- hypothesis
- iris=2.4
- lxml=4.6 # Optional dep of pydap
- matplotlib-base=3.3
- matplotlib-base=3.4
- nc-time-axis=1.2
# netcdf follows a 1.major.minor[.patch] convention
# (see https://github.com/Unidata/netcdf4-python/issues/1090)
# bumping the netCDF4 version is currently blocked by #4491
- netcdf4=1.5.3
- numba=0.51
- numpy=1.18
- numba=0.53
- numpy=1.19
- packaging=20.0
- pandas=1.1
- pint=0.16
- pandas=1.2
- pint=0.17
- pip
- pseudonetcdf=3.1
- pydap=3.2
- pynio=1.5
# - pynio=1.5.5
- pytest
- pytest-cov
- pytest-env
- pytest-xdist
- rasterio=1.1
- scipy=1.5
- rasterio=1.2
- scipy=1.6
- seaborn=0.11
- sparse=0.11
- sparse=0.12
- toolz=0.11
- typing_extensions=3.7
- zarr=2.5
- zarr=2.8
- pip:
- numbagg==0.1
4 changes: 2 additions & 2 deletions doc/getting-started-guide/installing.rst
@@ -7,9 +7,9 @@ Required dependencies
---------------------

- Python (3.8 or later)
- `numpy <https://www.numpy.org/>`__ (1.18 or later)
- `numpy <https://www.numpy.org/>`__ (1.19 or later)
- `packaging <https://packaging.pypa.io/en/latest/#>`__ (20.0 or later)
- `pandas <https://pandas.pydata.org/>`__ (1.1 or later)
- `pandas <https://pandas.pydata.org/>`__ (1.2 or later)

.. _optional-dependencies:

4 changes: 2 additions & 2 deletions doc/internals/extending-xarray.rst
@@ -92,8 +92,8 @@ on ways to write new accessors and the philosophy behind the approach, see

To help users keep things straight, please `let us know
<https://github.com/pydata/xarray/issues>`_ if you plan to write a new accessor
for an open source library. In the future, we will maintain a list of accessors
and the libraries that implement them on this page.
for an open source library. Existing open source accessors and the libraries
that implement them are available in the list on the :ref:`ecosystem` page.

To make documenting accessors with ``sphinx`` and ``sphinx.ext.autosummary``
easier, you can use `sphinx-autosummary-accessors`_.
43 changes: 29 additions & 14 deletions doc/user-guide/dask.rst
@@ -84,7 +84,7 @@ argument to :py:func:`~xarray.open_dataset` or using the
In this example ``latitude`` and ``longitude`` do not appear in the ``chunks``
dict, so only one chunk will be used along those dimensions. It is also
entirely equivalent to opening a dataset using :py:meth:`~xarray.open_dataset`
entirely equivalent to opening a dataset using :py:func:`~xarray.open_dataset`
and then chunking the data using the ``chunk`` method, e.g.,
``xr.open_dataset('example-data.nc').chunk({'time': 10})``.
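A minimal sketch of the equivalence described above (illustrative only, not part of this commit's diff; ``example-data.nc`` is the placeholder file used throughout this doc page, and both variants assume dask is installed):

import xarray as xr

# Chunk while opening ...
ds_a = xr.open_dataset("example-data.nc", chunks={"time": 10})
# ... or open first and chunk afterwards; for a single file these are equivalent.
ds_b = xr.open_dataset("example-data.nc").chunk({"time": 10})

assert ds_a.chunks == ds_b.chunks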

@@ -95,13 +95,21 @@ use :py:func:`~xarray.open_mfdataset`::

This function will automatically concatenate and merge datasets into one in
the simple cases that it understands (see :py:func:`~xarray.combine_by_coords`
for the full disclaimer). By default, :py:meth:`~xarray.open_mfdataset` will chunk each
for the full disclaimer). By default, :py:func:`~xarray.open_mfdataset` will chunk each
netCDF file into a single Dask array; again, supply the ``chunks`` argument to
control the size of the resulting Dask arrays. In more complex cases, you can
open each file individually using :py:meth:`~xarray.open_dataset` and merge the result, as
described in :ref:`combining data`. Passing the keyword argument ``parallel=True`` to :py:meth:`~xarray.open_mfdataset` will speed up the reading of large multi-file datasets by
open each file individually using :py:func:`~xarray.open_dataset` and merge the result, as
described in :ref:`combining data`. Passing the keyword argument ``parallel=True`` to
:py:func:`~xarray.open_mfdataset` will speed up the reading of large multi-file datasets by
executing those read tasks in parallel using ``dask.delayed``.

.. warning::

:py:func:`~xarray.open_mfdataset` called without ``chunks`` argument will return
dask arrays with chunk sizes equal to the individual files. Re-chunking
the dataset after creation with ``ds.chunk()`` will lead to an ineffective use of
memory and is not recommended.
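As an illustrative sketch of what the warning recommends instead (not part of this commit's diff; the glob pattern is hypothetical), pass ``chunks`` up front rather than rechunking the result:

import xarray as xr

ds = xr.open_mfdataset(
    "my/files/*.nc",
    chunks={"time": 10},  # chunk at open time instead of calling ds.chunk() afterwards
    parallel=True,        # open the individual files in parallel via dask.delayed
)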

You'll notice that printing a dataset still shows a preview of array values,
even if they are actually Dask arrays. We can do this quickly with Dask because
we only need to compute the first few values (typically from the first block).
@@ -224,6 +232,7 @@ disk.
available memory.

.. note::

For more on the differences between :py:meth:`~xarray.Dataset.persist` and
:py:meth:`~xarray.Dataset.compute` see this `Stack Overflow answer <https://stackoverflow.com/questions/41806850/dask-difference-between-client-persist-and-client-compute>`_ and the `Dask documentation <https://distributed.dask.org/en/latest/manage-computation.html#dask-collections-to-futures>`_.

@@ -236,6 +245,11 @@ sizes of Dask arrays is done with the :py:meth:`~xarray.Dataset.chunk` method:
rechunked = ds.chunk({"latitude": 100, "longitude": 100})
.. warning::

Rechunking an existing dask array created with :py:func:`~xarray.open_mfdataset`
is not recommended (see above).

You can view the size of existing chunks on an array by viewing the
:py:attr:`~xarray.Dataset.chunks` attribute:

@@ -295,8 +309,7 @@ each block of your xarray object, you have three options:
``apply_ufunc``
~~~~~~~~~~~~~~~

Another option is to use xarray's :py:func:`~xarray.apply_ufunc`, which can
automate `embarrassingly parallel
:py:func:`~xarray.apply_ufunc` automates `embarrassingly parallel
<https://en.wikipedia.org/wiki/Embarrassingly_parallel>`__ "map" type operations
where a function written for processing NumPy arrays should be repeatedly
applied to xarray objects containing Dask arrays. It works similarly to
@@ -542,18 +555,20 @@ larger chunksizes.
Optimization Tips
-----------------

With analysis pipelines involving both spatial subsetting and temporal resampling, Dask performance can become very slow in certain cases. Here are some optimization tips we have found through experience:
With analysis pipelines involving both spatial subsetting and temporal resampling, Dask performance
can become very slow or memory hungry in certain cases. Here are some optimization tips we have found
through experience:

1. Do your spatial and temporal indexing (e.g. ``.sel()`` or ``.isel()``) early in the pipeline, especially before calling ``resample()`` or ``groupby()``. Grouping and resampling triggers some computation on all the blocks, which in theory should commute with indexing, but this optimization hasn't been implemented in Dask yet. (See `Dask issue #746 <https://github.com/dask/dask/issues/746>`_).
1. Do your spatial and temporal indexing (e.g. ``.sel()`` or ``.isel()``) early in the pipeline, especially before calling ``resample()`` or ``groupby()``. Grouping and resampling triggers some computation on all the blocks, which in theory should commute with indexing, but this optimization hasn't been implemented in Dask yet. (See `Dask issue #746 <https://github.com/dask/dask/issues/746>`_). More generally, ``groupby()`` is a costly operation and does not (yet) perform well on datasets split across multiple files (see :pull:`5734` and linked discussions there).

2. Save intermediate results to disk as netCDF files (using ``to_netcdf()``) and then load them again with ``open_dataset()`` for further computations (see the sketch after this list). For example, if subtracting the temporal mean from a dataset, save the temporal mean to disk before subtracting. Again, in theory, Dask should be able to do the computation in a streaming fashion, but in practice this is a fail case for the Dask scheduler, because it tries to keep every chunk of an array that it computes in memory. (See `Dask issue #874 <https://github.com/dask/dask/issues/874>`_)

3. Specify smaller chunks across space when using :py:meth:`~xarray.open_mfdataset` (e.g., ``chunks={'latitude': 10, 'longitude': 10}``). This makes spatial subsetting easier, because there's no risk you will load chunks of data referring to different chunks (probably not necessary if you follow suggestion 1).
3. Specify smaller chunks across space when using :py:meth:`~xarray.open_mfdataset` (e.g., ``chunks={'latitude': 10, 'longitude': 10}``). This makes spatial subsetting easier, because there's no risk you will load subsets of data which span multiple chunks. On individual files, prefer to subset before chunking (suggestion 1).

4. Chunk as early as possible, and avoid rechunking as much as possible. Always pass the ``chunks={}`` argument to :py:func:`~xarray.open_mfdataset` to avoid redundant file reads.

4. Using the h5netcdf package by passing ``engine='h5netcdf'`` to :py:meth:`~xarray.open_mfdataset`
can be quicker than the default ``engine='netcdf4'`` that uses the netCDF4 package.
5. Using the h5netcdf package by passing ``engine='h5netcdf'`` to :py:meth:`~xarray.open_mfdataset` can be quicker than the default ``engine='netcdf4'`` that uses the netCDF4 package.

5. Some dask-specific tips may be found `here <https://docs.dask.org/en/latest/array-best-practices.html>`_.
6. Some dask-specific tips may be found `here <https://docs.dask.org/en/latest/array-best-practices.html>`_.

6. The dask `diagnostics <https://docs.dask.org/en/latest/understanding-performance.html>`_ can be
useful in identifying performance bottlenecks.
7. The dask `diagnostics <https://docs.dask.org/en/latest/understanding-performance.html>`_ can be useful in identifying performance bottlenecks.
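A short illustrative sketch of tip 2 above (not part of this commit's diff; filenames are hypothetical): write the expensive intermediate to netCDF, then reload it for the follow-up computation.

import xarray as xr

ds = xr.open_mfdataset("my/files/*.nc", chunks={"time": 10}, parallel=True)

# Materialize the temporal mean once ...
ds.mean("time").to_netcdf("temporal_mean.nc")
# ... then reload it instead of carrying the full task graph through the subtraction.
mean = xr.open_dataset("temporal_mean.nc")

anomaly = ds - mean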
2 changes: 1 addition & 1 deletion doc/user-guide/terminology.rst
@@ -27,7 +27,7 @@ complete examples, please consult the relevant documentation.*

Variable
A `NetCDF-like variable
<https://www.unidata.ucar.edu/software/netcdf/docs/netcdf_data_set_components.html#variables>`_
<https://docs.unidata.ucar.edu/nug/current/netcdf_data_set_components.html#variables>`_
consisting of dimensions, data, and attributes which describe a single
array. The main functional difference between variables and numpy arrays
is that numerical operations on variables implement array broadcasting