diff --git a/docs/ts2img.rst b/docs/ts2img.rst deleted file mode 100644 index 43449b4..0000000 --- a/docs/ts2img.rst +++ /dev/null @@ -1,254 +0,0 @@ -.. _ts2img: - -====== -ts2img -====== - -Introduction -============ - -Conversion of time series data to images (ts2img) is a general problem -that we have to deal with often for all kinds of datasets. Although most -of the external datasets we use come in image format most of them are -converted into a time series format for analysis. This is fairly -straightforward for image stacks but gets more complicated for orbit -data. - -Orbit data mostly comes as several data values accompanied by latitude, -longitude information and must often be resampled to fit on a grid that -lends itself for time series analysis. A more or less general solution -for this already exists in the img2ts module. - -Possible steps involved in the conversion -========================================= - -The steps that a ts2img program might have to perform are: - -1. Read time series in a geographic region - constrained by memory -2. Aggregate time series in time - - - methods for doing this aggregation can vary for each dataset so - the method is best specified by the user - - resolution in time has to be chosen and is probably also best - specified by the user - - after aggregation every point in time and in space must have a - value which can of course be NaN - - time series might have to be split into separate images during - conversion, e.g. ASCAT time series are routinely split into images - for ascending and descending satellite overpasses. **This means - that we can not assume that the output dataset has the same number - or names of variables as the input dataset.** - -3. Put the now uniform time series of equal length into a 2D array per - variable -4. A resampling step could be performed here but since only a part of - the dataset is available edge cases would not be resolved correctly. - A better solution would be to develop a good resampling tool which - might already exist in pyresample and pytesmo functions that use it. -5. write this data into a file - - - this can be a netCDF file with dimensions of the grid into which - the data is written - - this could be any other file format, the interface to this format - just has to make sure that in the end a consistent image dataset - is built out of the parts that are written. - -Solution -======== - -The chosen first solution uses netCDF as an output format. The output -will be a stack of images in netCDF format. This format can easily be -converted into substacks or single images if that is needed for a -certain user or project. - -The chosen solution will **not** do resampling since this is better and -easier done using the whole converted dataset. This also means that if -the input dataset is e.g. a dataset defined over land only then the -resulting "image" will also not contain land points. I think it is best -to let this be decided by the input dataset. - -The output of the resulting netCDF can have one of two possible -"shapes": - -- 2D variables with time on one axis and gpi on the other. This is kind - of how SWI time series are stored already. -- 3D variables with latitude, longitude and time as the three - dimensions. - -The decision of which it will be is dependent on the grid on which the -input data is stored. If the grid has a 2D shape then the 3D solution -will be chosen. If the input grid has only a 1D shape then only the 2D -solution is possible. - -Time Series aggregation ------------------------ - -The chosen solution will use a custom function for each dataset to -perform the aggregation if necessary. A simple example of a function -that gets a time time series and aggregates it to a monthly time series -could look like *agg\_tsmonthly* - -Simple example of a aggregation function - -.. code:: python - - def agg_tsmonthly(ts, **kwargs): - """ - Parameters - ---------- - ts : pandas.DataFrame - time series of a point - kwargs : dict - any additional keyword arguments that are given to the ts2img object - during initialization - - Returns - ------- - ts_agg : pandas.DataFrame - aggregated time series, they all must have the same length - otherwise it can not work - each column of this DataFrame will be a layer in the image - """ - # very simple example - # aggregate to monthly timestamp - # should also make sure that the output has a certain length - return ts.asfreq("M") - -Time series iteration ---------------------- - -The function ``agg_tsmonthly`` will be called for every time series in -the input dataset. The input dataset must have a ``iter_ts`` -iterator that iterates over the grid points in a sensible order. - -Interface to the netCDF writer ------------------------------- - -The netCDF writer will be initialized outside the *ts2img* class with a -filename and other attributes it needs. So the *ts2img* class only gets -a writer object. This writer object already knows about the start and -end date of the time series as well as the target grid and has -initialized the correct dimensions in the netCDF file. This object must -have a method ``write_ts`` which takes a array of gpi's and a 2D array -containing the time series for these gpis. This should be enough to -write the gpi's into the correct position of the netCDF file. - -This approach should also work if another output format is supposed to -be used. - -Implementation of the main ts2img class ---------------------------------------- - -The ts2img class will automatically use a the function given in -``agg_ts2img`` if no custom ``agg_ts2img`` function is provided. If -the tsreader implements a method called ``agg_ts2img`` this function -will be used instead. - -.. code:: python - - class Ts2Img(object): - - """ - Takes a time series dataset and converts it - into an image dataset. - A custom aggregate function should be given otherwise - a daily mean will be used - - Parameters - ---------- - tsreader: object - object that implements a iter_ts method which iterates over - pandas time series and has a grid attribute that is a pytesmo - BasicGrid or CellGrid - imgwriter: object - writer object that implements a write_ts method that takes - a list of grid point indices and a 2D array containing the time series data - agg_func: function - function that takes a pandas DataFrame and returns - an aggregated pandas DataFrame - ts_buffer: int - how many time series to read before writing to disk, - constrained by the working memory the process should use. - - """ - - def __init__(self, tsreader, imgwriter, - agg_func=None, - ts_buffer=1000): - - self.agg_func = agg_func - if self.agg_func is None: - try: - self.agg_func = tsreader.agg_ts2img - except AttributeError: - self.agg_func = agg_tsmonthly - self.tsreader = tsreader - self.imgwriter = imgwriter - self.ts_buffer = ts_buffer - - def calc(self, **tsaggkw): - """ - does the conversion from time series to images - """ - for gpis, ts in self.tsbulk(**tsaggkw): - self.imgwriter.write_ts(gpis, ts) - - def tsbulk(self, gpis=None, **tsaggkw): - """ - iterator over gpi and time series arrays of size self.ts_buffer - - Parameters - ---------- - gpis: iterable, optional - if given these gpis will be used, can be practical - if the gpis are managed by an external class e.g. for parallel - processing - tsaggkw: dict - Keywords to give to the time series aggregation function - - - Returns - ------- - gpi_array: numpy.array - numpy array of gpis in this batch - ts_bulk: dict of numpy arrays - for each variable one numpy array of shape - (len(gpi_array), len(ts_aggregated)) - """ - # have to use the grid iteration as long as iter_ts only returns - # data frame and no time series object including relevant metadata - # of the time series - i = 0 - gpi_bulk = [] - ts_bulk = {} - ts_index = None - if gpis is None: - gpis, _, _, _ = self.tsreader.grid.grid_points() - for gpi in gpis: - gpi_bulk.append(gpi) - ts = self.tsreader.read_ts(gpi) - ts_agg = self.agg_func(ts, **tsaggkw) - for column in ts_agg.columns: - try: - ts_bulk[column].append(ts_agg[column].values) - except KeyError: - ts_bulk[column] = [] - ts_bulk[column].append(ts_agg[column].values) - - if ts_index is None: - ts_index = ts_agg.index - - i += 1 - if i >= self.ts_buffer: - for key in ts_bulk: - ts_bulk[key] = np.vstack(ts_bulk[key]) - gpi_array = np.hstack(gpi_bulk) - yield gpi_array, ts_bulk - ts_bulk = {} - gpi_bulk = [] - i = 0 - if i > 0: - for key in ts_bulk: - ts_bulk[key] = np.vstack(ts_bulk[key]) - gpi_array = np.hstack(gpi_bulk) - yield gpi_array, ts_bulk diff --git a/src/repurpose/concatenate.py b/src/repurpose/concatenate.py new file mode 100644 index 0000000..fc98aea --- /dev/null +++ b/src/repurpose/concatenate.py @@ -0,0 +1,4 @@ +""" +Module to concatenate the data from multiple time series sources. +This is used when e.g a CDR and ICDR time series data set should be merged. +""" diff --git a/tests/test_docs/test_examples.py b/tests/test_docs/test_examples.py index 93c78dd..9e2000c 100644 --- a/tests/test_docs/test_examples.py +++ b/tests/test_docs/test_examples.py @@ -10,7 +10,6 @@ import pytest from repurpose.process import rootdir - examples_path = os.path.join(rootdir(), 'docs', 'examples') @pytest.mark.parametrize("notebook", [ @@ -32,5 +31,7 @@ def test_ipython_notebook(notebook): """ preprocessor = ExecutePreprocessor(timeout=600, kernel_name="python3") with open(os.path.join(examples_path, notebook)) as f: + # warning: from nbformat.validator import normalize nb = nbformat.read(f, as_version=4) preprocessor.preprocess(nb) + diff --git a/tests/test_img2ts.py b/tests/test_img2ts.py index 4b7add4..8940f55 100644 --- a/tests/test_img2ts.py +++ b/tests/test_img2ts.py @@ -33,10 +33,15 @@ from datetime import timedelta, datetime import os import numpy as np +import pandas as pd from pygeobase.io_base import ImageBase, MultiTemporalImageBase from pygeobase.object_base import Image from pygeogrids import BasicGrid -from pynetcf.time_series import OrthoMultiTs +from pygeogrids.netcdf import load_grid +from pynetcf.time_series import OrthoMultiTs, GriddedNcIndexedRaggedTs, GriddedNcOrthoMultiTs +from glob import glob +import xarray as xr +import pytest import tempfile import numpy.testing as nptest @@ -46,18 +51,19 @@ # make a simple mock Dataset that can be used for testing the conversion -class TestImageDataset(ImageBase): +class TestOrthogonalImageDataset(ImageBase): - def read(self, timestamp=None, additional_kw=None): + def read(self, timestamp=None): if timestamp == datetime(2016, 1, 1): raise IOError("no data for day") # 2x2 pixels around zero lat, lon - return Image(np.array([0.5, 0.5, - -0.5, -0.5]), - np.array([1, -1, - 1, -1]), - {'var1': np.ones(4) * timestamp.day}, {'kw': additional_kw}, timestamp) + return Image(np.array([0.5, 0.5, -0.5, -0.5]), + np.array([1., -1., 1., -1.]), + {'var1': np.ones(4) * timestamp.day}, + {'metavar': 'value'}, + timestamp=timestamp, + timekey=None) def write(self, data): raise NotImplementedError() @@ -69,19 +75,38 @@ def close(self): pass -class TestMultiTemporalImageDatasetDaily(MultiTemporalImageBase): +class TestNonOrthogonalImageDataset(ImageBase): - def __init__(self): - super(TestMultiTemporalImageDatasetDaily, - self).__init__("", TestImageDataset) + def read(self, timestamp=None): + + if timestamp == datetime(2016, 1, 1): + raise IOError("no data for day") + jd = pd.to_datetime(timestamp).to_julian_date() + # 2x2 pixels around zero lat, lon + return Image(np.array([0.5, 0.5, -0.5, -0.5]), + np.array([1., -1., 1., -1.]), + {'var1': np.ones(4) * timestamp.day, + 'jd': np.array([jd-0.1, jd+0, jd+0.1, jd+0.2])}, + {'metavar': 'value'}, + timestamp=timestamp, + timekey='jd' + ) + + def write(self, data): + raise NotImplementedError() + + def flush(self): + pass + + def close(self): + pass - # def _build_filename(self, timestamp, custom_templ=None, - # str_param=None): - # """ - # Test implemenation that does not actually look for filenames. - # """ - # return "" +class TestMultiTemporalImageDatasetDaily(MultiTemporalImageBase): + + def __init__(self, cls=TestOrthogonalImageDataset): + super(TestMultiTemporalImageDatasetDaily, + self).__init__("", cls) def tstamps_for_daterange(self, start_date, end_date): """ @@ -110,18 +135,68 @@ def tstamps_for_daterange(self, start_date, end_date): return timestamps -def test_img2ts_daily_no_resampling(): +def test_img2ts_nonortho_daily_no_resampling(): input_grid = BasicGrid(np.array([0.5, 0.5, -0.5, -0.5]), - np.array([1, -1, 1, -1]), ) + np.array([1., -1., 1., -1.]), ) with tempfile.TemporaryDirectory() as outputpath: start = datetime(2014, 2, 5) end = datetime(2014, 4, 21) - ds_in = TestMultiTemporalImageDatasetDaily() + ds_in = TestMultiTemporalImageDatasetDaily( + TestNonOrthogonalImageDataset) + img2ts = Img2Ts(ds_in, - outputpath, start, end, imgbuffer=15, - input_grid=input_grid) + outputpath, start, end, imgbuffer=10, + input_grid=input_grid, n_proc=2) + img2ts.calc() + + ts_should_base = pd.date_range(start, end, freq='D') + ts_should_jd_gpi0 = ts_should_base + timedelta(days=-0.1) + ts_should_jd_gpi3 = ts_should_base + timedelta(days=0.2) + ts_should_var1 = np.array([ds_in.read(d).data['var1'][0] + for d in pd.date_range(start, end)]) + grid = load_grid(os.path.join(outputpath, 'grid.nc')) + + with GriddedNcIndexedRaggedTs(outputpath, grid=grid) as ds: + lon, lat = ds.grid.gpi2lonlat(0) + assert lon == 0.5 + assert lat == 1 + lon, lat = ds.grid.gpi2lonlat(2) + assert lon == -0.5 + assert lat == 1 + + np.testing.assert_array_equal(ds.read(0).index.to_julian_date(), + ts_should_jd_gpi0.to_julian_date()) + np.testing.assert_array_equal(ds.read(0)['var1'].values, + ts_should_var1) + np.testing.assert_array_equal(ds.read(3).index.to_julian_date(), + ts_should_jd_gpi3.to_julian_date()) + + ds = xr.open_dataset(os.path.join(outputpath, '0000.nc')) + + np.testing.assert_array_equal(ds['location_id'].data, + np.array([0, 1, 2, 3])) + np.testing.assert_array_equal(ds['lat'].data, + np.array([1., -1., 1., -1.])) + np.testing.assert_array_equal(ds['lon'].data, + np.array([0.5, 0.5, -0.5, -0.5])) + + ds_in.close() + + +def test_img2ts_ortho_daily_no_resampling(): + input_grid = BasicGrid(np.array([0.5, 0.5, -0.5, -0.5]), + np.array([1, -1, 1, -1]), ) + + with tempfile.TemporaryDirectory() as outputpath: + start = datetime(2014, 2, 5) + end = datetime(2014, 4, 21) + + ds_in = TestMultiTemporalImageDatasetDaily( + cls=TestOrthogonalImageDataset) + img2ts = Img2Ts(ds_in, outputpath, start, end, imgbuffer=20, + input_grid=input_grid, n_proc=4) ts_should = np.concatenate([np.arange(5, 29, dtype=float), np.arange(1, 32, dtype=float), @@ -129,16 +204,66 @@ def test_img2ts_daily_no_resampling(): dates_should = ds_in.tstamps_for_daterange(start, end) img2ts.calc() ts_file = os.path.join(outputpath, '0000.nc') + + grid = load_grid(os.path.join(outputpath, 'grid.nc')) + ds = GriddedNcOrthoMultiTs(outputpath, grid) + ts = ds.read(0) + assert np.all(dates_should == ts.index) + with OrthoMultiTs(ts_file) as ds: ts = ds.read('var1', 0) nptest.assert_allclose(ts['var1'], ts_should) - assert dates_should == list(ts['time']) + for i, t in enumerate(ts['time'].data): + assert dates_should[i] == t nptest.assert_allclose(ds.dataset.variables['location_id'][:], np.array([0, 1, 2, 3])) ds_in.close() +def test_img2ts_ortho_daily_resampling(): + input_grid = BasicGrid(np.array([0.5, 0.5, -0.5, -0.5]), + np.array([1., -1., 1., -1.]), ) + + target_grid = BasicGrid(np.array([0.4, 0.6, -0.4, -0.6]), + np.array([0.9, -1.1, 1.1, -0.9])) + + with tempfile.TemporaryDirectory() as outputpath: + start = datetime(2014, 2, 5) + end = datetime(2014, 4, 21) + + ds_in = TestMultiTemporalImageDatasetDaily( + cls=TestOrthogonalImageDataset) + img2ts = Img2Ts(ds_in, outputpath, start, end, imgbuffer=20, + target_grid=target_grid, + input_grid=input_grid, r_neigh=4, + n_proc=2) + img2ts.calc() + + ts_should = np.concatenate([np.arange(5, 29, dtype=float), + np.arange(1, 32, dtype=float), + np.arange(1, 22, dtype=float)]) + dates_should = ds_in.tstamps_for_daterange(start, end) + + grid = load_grid(os.path.join(outputpath, 'grid.nc')) + ds = GriddedNcOrthoMultiTs(outputpath, grid=grid) + ts = ds.read(0) + nptest.assert_allclose(ts['var1'], ts_should) + assert np.all(dates_should == ts.index) + + ds_in.close() + + ds = xr.open_dataset(os.path.join(outputpath, '0000.nc')) + + np.testing.assert_array_equal(ds['location_id'].data, + np.array([0, 1, 2, 3])) + + np.testing.assert_array_almost_equal(ds['lat'].data, + np.array([0.9, -1.1, 1.1, -0.9])) + + nptest.assert_allclose(ds['location_id'].data, + np.array([0, 1, 2, 3])) + ds_in.close() -def test_img2ts_daily_no_resampling_missing_day(): +def test_img2ts_ortho_daily_no_resampling_missing_day(): """ Test resampling over missing day 2016-01-01 (see reader above) """ @@ -167,5 +292,3 @@ def test_img2ts_daily_no_resampling_missing_day(): nptest.assert_allclose(ds.dataset.variables['location_id'][:], np.array([0, 1, 2, 3])) -if __name__ == '__main__': - test_img2ts_daily_no_resampling()