diff --git a/lib/iris/fileformats/netcdf/__init__.py b/lib/iris/fileformats/netcdf/__init__.py index 505e173b0b..ec51c31602 100644 --- a/lib/iris/fileformats/netcdf/__init__.py +++ b/lib/iris/fileformats/netcdf/__init__.py @@ -18,6 +18,7 @@ # Note: *must* be done before importing from submodules, as they also use this ! logger = iris.config.get_logger(__name__) +from ._parse_cell_methods import UnknownCellMethodWarning, parse_cell_methods from .loader import DEBUG, NetCDFDataProxy, load_cubes from .saver import ( CF_CONVENTIONS_VERSION, @@ -25,8 +26,6 @@ SPATIO_TEMPORAL_AXES, CFNameCoordMap, Saver, - UnknownCellMethodWarning, - parse_cell_methods, save, ) diff --git a/lib/iris/fileformats/netcdf/_parse_cell_methods.py b/lib/iris/fileformats/netcdf/_parse_cell_methods.py new file mode 100644 index 0000000000..bda60af026 --- /dev/null +++ b/lib/iris/fileformats/netcdf/_parse_cell_methods.py @@ -0,0 +1,219 @@ +# Copyright Iris contributors +# +# This file is part of Iris and is released under the LGPL license. +# See COPYING and COPYING.LESSER in the root of the repository for full +# licensing details. +""" +Helper routines specific to cell method parsing for netcdf-CF loading. + +""" +import re +from typing import List +import warnings + +from iris.coords import CellMethod + +# Cell methods. +_CM_KNOWN_METHODS = [ + "point", + "sum", + "mean", + "maximum", + "minimum", + "mid_range", + "standard_deviation", + "variance", + "mode", + "median", +] + +_CM_COMMENT = "comment" +_CM_EXTRA = "extra" +_CM_INTERVAL = "interval" +_CM_METHOD = "method" +_CM_NAME = "name" +_CM_PARSE_NAME = re.compile(r"([\w_]+\s*?:\s+)+") +_CM_PARSE = re.compile( + r""" + (?P([\w_]+\s*?:\s+)+) + (?P[\w_\s]+(?![\w_]*\s*?:))\s* + (?: + \(\s* + (?P.+) + \)\s* + )? + """, + re.VERBOSE, +) + + +class UnknownCellMethodWarning(Warning): + pass + + +def _split_cell_methods(nc_cell_methods: str) -> List[re.Match]: + """ + Split a CF cell_methods attribute string into a list of zero or more cell + methods, each of which is then parsed with a regex to return a list of match + objects. + + Args: + + * nc_cell_methods: The value of the cell methods attribute to be split. + + Returns: + + * nc_cell_methods_matches: A list of the re.Match objects associated with + each parsed cell method + + Splitting is done based on words followed by colons outside of any brackets. + Validation of anything other than being laid out in the expected format is + left to the calling function. + """ + + # Find name candidates + name_start_inds = [] + for m in _CM_PARSE_NAME.finditer(nc_cell_methods): + name_start_inds.append(m.start()) + + # Remove those that fall inside brackets + bracket_depth = 0 + for ind, cha in enumerate(nc_cell_methods): + if cha == "(": + bracket_depth += 1 + elif cha == ")": + bracket_depth -= 1 + if bracket_depth < 0: + msg = ( + "Cell methods may be incorrectly parsed due to mismatched " + "brackets" + ) + warnings.warn(msg, UserWarning, stacklevel=2) + if bracket_depth > 0 and ind in name_start_inds: + name_start_inds.remove(ind) + + # List tuples of indices of starts and ends of the cell methods in the string + name_start_inds.append(len(nc_cell_methods)) + method_indices = list(zip(name_start_inds[:-1], name_start_inds[1:])) + + # Index the string and match against each substring + nc_cell_methods_matches = [] + for start_ind, end_ind in method_indices: + nc_cell_method_str = nc_cell_methods[start_ind:end_ind] + nc_cell_method_match = _CM_PARSE.match(nc_cell_method_str.strip()) + if not nc_cell_method_match: + msg = ( + f"Failed to fully parse cell method string: {nc_cell_methods}" + ) + warnings.warn(msg, UserWarning, stacklevel=2) + continue + nc_cell_methods_matches.append(nc_cell_method_match) + + return nc_cell_methods_matches + + +def parse_cell_methods(nc_cell_methods): + """ + Parse a CF cell_methods attribute string into a tuple of zero or + more CellMethod instances. + + Args: + + * nc_cell_methods (str): + The value of the cell methods attribute to be parsed. + + Returns: + + * cell_methods + An iterable of :class:`iris.coords.CellMethod`. + + Multiple coordinates, intervals and comments are supported. + If a method has a non-standard name a warning will be issued, but the + results are not affected. + + """ + + cell_methods = [] + if nc_cell_methods is not None: + splits = _split_cell_methods(nc_cell_methods) + if not splits: + msg = ( + f"NetCDF variable cell_methods of {nc_cell_methods!r} " + "contains no valid cell methods." + ) + warnings.warn(msg, UserWarning) + for m in splits: + d = m.groupdict() + method = d[_CM_METHOD] + method = method.strip() + # Check validity of method, allowing for multi-part methods + # e.g. mean over years. + method_words = method.split() + if method_words[0].lower() not in _CM_KNOWN_METHODS: + msg = "NetCDF variable contains unknown cell method {!r}" + warnings.warn( + msg.format("{}".format(method_words[0])), + UnknownCellMethodWarning, + ) + d[_CM_METHOD] = method + name = d[_CM_NAME] + name = name.replace(" ", "") + name = name.rstrip(":") + d[_CM_NAME] = tuple([n for n in name.split(":")]) + interval = [] + comment = [] + if d[_CM_EXTRA] is not None: + # + # tokenise the key words and field colon marker + # + d[_CM_EXTRA] = d[_CM_EXTRA].replace( + "comment:", "<><<:>>" + ) + d[_CM_EXTRA] = d[_CM_EXTRA].replace( + "interval:", "<><<:>>" + ) + d[_CM_EXTRA] = d[_CM_EXTRA].split("<<:>>") + if len(d[_CM_EXTRA]) == 1: + comment.extend(d[_CM_EXTRA]) + else: + next_field_type = comment + for field in d[_CM_EXTRA]: + field_type = next_field_type + index = field.rfind("<>") + if index == 0: + next_field_type = interval + continue + elif index > 0: + next_field_type = interval + else: + index = field.rfind("<>") + if index == 0: + next_field_type = comment + continue + elif index > 0: + next_field_type = comment + if index != -1: + field = field[:index] + field_type.append(field.strip()) + # + # cater for a shared interval over multiple axes + # + if len(interval): + if len(d[_CM_NAME]) != len(interval) and len(interval) == 1: + interval = interval * len(d[_CM_NAME]) + # + # cater for a shared comment over multiple axes + # + if len(comment): + if len(d[_CM_NAME]) != len(comment) and len(comment) == 1: + comment = comment * len(d[_CM_NAME]) + d[_CM_INTERVAL] = tuple(interval) + d[_CM_COMMENT] = tuple(comment) + cell_method = CellMethod( + d[_CM_METHOD], + coords=d[_CM_NAME], + intervals=d[_CM_INTERVAL], + comments=d[_CM_COMMENT], + ) + cell_methods.append(cell_method) + return tuple(cell_methods) diff --git a/lib/iris/fileformats/netcdf/saver.py b/lib/iris/fileformats/netcdf/saver.py index 650c5e3338..7251c32116 100644 --- a/lib/iris/fileformats/netcdf/saver.py +++ b/lib/iris/fileformats/netcdf/saver.py @@ -19,7 +19,6 @@ import os.path import re import string -from typing import List import warnings import cf_units @@ -156,207 +155,6 @@ } -# Cell methods. -_CM_KNOWN_METHODS = [ - "point", - "sum", - "mean", - "maximum", - "minimum", - "mid_range", - "standard_deviation", - "variance", - "mode", - "median", -] - -_CM_COMMENT = "comment" -_CM_EXTRA = "extra" -_CM_INTERVAL = "interval" -_CM_METHOD = "method" -_CM_NAME = "name" -_CM_PARSE_NAME = re.compile(r"([\w_]+\s*?:\s+)+") -_CM_PARSE = re.compile( - r""" - (?P([\w_]+\s*?:\s+)+) - (?P[\w_\s]+(?![\w_]*\s*?:))\s* - (?: - \(\s* - (?P.+) - \)\s* - )? - """, - re.VERBOSE, -) - - -class UnknownCellMethodWarning(Warning): - pass - - -def _split_cell_methods(nc_cell_methods: str) -> List[re.Match]: - """ - Split a CF cell_methods attribute string into a list of zero or more cell - methods, each of which is then parsed with a regex to return a list of match - objects. - - Args: - - * nc_cell_methods: The value of the cell methods attribute to be split. - - Returns: - - * nc_cell_methods_matches: A list of the re.Match objects associated with - each parsed cell method - - Splitting is done based on words followed by colons outside of any brackets. - Validation of anything other than being laid out in the expected format is - left to the calling function. - """ - - # Find name candidates - name_start_inds = [] - for m in _CM_PARSE_NAME.finditer(nc_cell_methods): - name_start_inds.append(m.start()) - - # Remove those that fall inside brackets - bracket_depth = 0 - for ind, cha in enumerate(nc_cell_methods): - if cha == "(": - bracket_depth += 1 - elif cha == ")": - bracket_depth -= 1 - if bracket_depth < 0: - msg = ( - "Cell methods may be incorrectly parsed due to mismatched " - "brackets" - ) - warnings.warn(msg, UserWarning, stacklevel=2) - if bracket_depth > 0 and ind in name_start_inds: - name_start_inds.remove(ind) - - # List tuples of indices of starts and ends of the cell methods in the string - method_indices = [] - for ii in range(len(name_start_inds) - 1): - method_indices.append((name_start_inds[ii], name_start_inds[ii + 1])) - method_indices.append((name_start_inds[-1], len(nc_cell_methods))) - - # Index the string and match against each substring - nc_cell_methods_matches = [] - for start_ind, end_ind in method_indices: - nc_cell_method_str = nc_cell_methods[start_ind:end_ind] - nc_cell_method_match = _CM_PARSE.match(nc_cell_method_str.strip()) - if not nc_cell_method_match: - msg = ( - f"Failed to fully parse cell method string: {nc_cell_methods}" - ) - warnings.warn(msg, UserWarning, stacklevel=2) - continue - nc_cell_methods_matches.append(nc_cell_method_match) - - return nc_cell_methods_matches - - -def parse_cell_methods(nc_cell_methods): - """ - Parse a CF cell_methods attribute string into a tuple of zero or - more CellMethod instances. - - Args: - - * nc_cell_methods (str): - The value of the cell methods attribute to be parsed. - - Returns: - - * cell_methods - An iterable of :class:`iris.coords.CellMethod`. - - Multiple coordinates, intervals and comments are supported. - If a method has a non-standard name a warning will be issued, but the - results are not affected. - - """ - - cell_methods = [] - if nc_cell_methods is not None: - for m in _split_cell_methods(nc_cell_methods): - d = m.groupdict() - method = d[_CM_METHOD] - method = method.strip() - # Check validity of method, allowing for multi-part methods - # e.g. mean over years. - method_words = method.split() - if method_words[0].lower() not in _CM_KNOWN_METHODS: - msg = "NetCDF variable contains unknown cell method {!r}" - warnings.warn( - msg.format("{}".format(method_words[0])), - UnknownCellMethodWarning, - ) - d[_CM_METHOD] = method - name = d[_CM_NAME] - name = name.replace(" ", "") - name = name.rstrip(":") - d[_CM_NAME] = tuple([n for n in name.split(":")]) - interval = [] - comment = [] - if d[_CM_EXTRA] is not None: - # - # tokenise the key words and field colon marker - # - d[_CM_EXTRA] = d[_CM_EXTRA].replace( - "comment:", "<><<:>>" - ) - d[_CM_EXTRA] = d[_CM_EXTRA].replace( - "interval:", "<><<:>>" - ) - d[_CM_EXTRA] = d[_CM_EXTRA].split("<<:>>") - if len(d[_CM_EXTRA]) == 1: - comment.extend(d[_CM_EXTRA]) - else: - next_field_type = comment - for field in d[_CM_EXTRA]: - field_type = next_field_type - index = field.rfind("<>") - if index == 0: - next_field_type = interval - continue - elif index > 0: - next_field_type = interval - else: - index = field.rfind("<>") - if index == 0: - next_field_type = comment - continue - elif index > 0: - next_field_type = comment - if index != -1: - field = field[:index] - field_type.append(field.strip()) - # - # cater for a shared interval over multiple axes - # - if len(interval): - if len(d[_CM_NAME]) != len(interval) and len(interval) == 1: - interval = interval * len(d[_CM_NAME]) - # - # cater for a shared comment over multiple axes - # - if len(comment): - if len(d[_CM_NAME]) != len(comment) and len(comment) == 1: - comment = comment * len(d[_CM_NAME]) - d[_CM_INTERVAL] = tuple(interval) - d[_CM_COMMENT] = tuple(comment) - cell_method = iris.coords.CellMethod( - d[_CM_METHOD], - coords=d[_CM_NAME], - intervals=d[_CM_INTERVAL], - comments=d[_CM_COMMENT], - ) - cell_methods.append(cell_method) - return tuple(cell_methods) - - class CFNameCoordMap: """Provide a simple CF name to CF coordinate mapping.""" diff --git a/lib/iris/tests/unit/fileformats/netcdf/test_parse_cell_methods.py b/lib/iris/tests/unit/fileformats/netcdf/test_parse_cell_methods.py index bbde2d0a2d..4fb4b90f74 100644 --- a/lib/iris/tests/unit/fileformats/netcdf/test_parse_cell_methods.py +++ b/lib/iris/tests/unit/fileformats/netcdf/test_parse_cell_methods.py @@ -7,27 +7,44 @@ Unit tests for :func:`iris.fileformats.netcdf.parse_cell_methods`. """ +import warnings -# import iris tests first so that some things can be initialised before -# importing anything else -import iris.tests as tests # isort:skip - -from unittest import mock +import pytest from iris.coords import CellMethod -from iris.fileformats.netcdf import parse_cell_methods +from iris.fileformats.netcdf import ( + UnknownCellMethodWarning, + parse_cell_methods, +) + + +class TestParseCellMethods: + def _check_answers(self, test_string_or_strings, result): + """ + Compare a list of test strings against a single expected result. + Done this way so that any failures produce intelligible Pytest messages. + """ + if isinstance(test_string_or_strings, str): + test_string_or_strings = [test_string_or_strings] + expected_tests_and_results = [ + (cell_method_str, result) + for cell_method_str in test_string_or_strings + ] + actual_tests_and_results = [ + (cell_method_str, parse_cell_methods(cell_method_str)) + for cell_method_str in test_string_or_strings + ] + assert actual_tests_and_results == expected_tests_and_results -class Test(tests.IrisTest): def test_simple(self): + # Some simple testcases which should all have the same result cell_method_strings = [ "time: mean", "time : mean", ] expected = (CellMethod(method="mean", coords="time"),) - for cell_method_str in cell_method_strings: - res = parse_cell_methods(cell_method_str) - self.assertEqual(res, expected) + self._check_answers(cell_method_strings, expected) def test_with_interval(self): cell_method_strings = [ @@ -37,9 +54,7 @@ def test_with_interval(self): expected = ( CellMethod(method="variance", coords="time", intervals="1 hr"), ) - for cell_method_str in cell_method_strings: - res = parse_cell_methods(cell_method_str) - self.assertEqual(res, expected) + self._check_answers(cell_method_strings, expected) def test_multiple_axes(self): cell_method_strings = [ @@ -51,9 +66,7 @@ def test_multiple_axes(self): expected = ( CellMethod(method="standard_deviation", coords=["lat", "lon"]), ) - for cell_method_str in cell_method_strings: - res = parse_cell_methods(cell_method_str) - self.assertEqual(res, expected) + self._check_answers(cell_method_strings, expected) def test_multiple(self): cell_method_strings = [ @@ -66,20 +79,26 @@ def test_multiple(self): CellMethod(method="maximum", coords="time", intervals="1 hr"), CellMethod(method="mean", coords="time", intervals="1 day"), ) - for cell_method_str in cell_method_strings: - res = parse_cell_methods(cell_method_str) - self.assertEqual(res, expected) + self._check_answers(cell_method_strings, expected) def test_comment(self): cell_method_strings = [ - "time: maximum (interval: 1 hr comment: first bit) " - "time: mean (interval: 1 day comment: second bit)", - "time : maximum (interval: 1 hr comment: first bit) " - "time: mean (interval: 1 day comment: second bit)", - "time: maximum (interval: 1 hr comment: first bit) " - "time : mean (interval: 1 day comment: second bit)", - "time : maximum (interval: 1 hr comment: first bit) " - "time : mean (interval: 1 day comment: second bit)", + ( + "time: maximum (interval: 1 hr comment: first bit) " + "time: mean (interval: 1 day comment: second bit)" + ), + ( + "time : maximum (interval: 1 hr comment: first bit) " + "time: mean (interval: 1 day comment: second bit)" + ), + ( + "time: maximum (interval: 1 hr comment: first bit) " + "time : mean (interval: 1 day comment: second bit)" + ), + ( + "time : maximum (interval: 1 hr comment: first bit) " + "time : mean (interval: 1 day comment: second bit)" + ), ] expected = ( CellMethod( @@ -95,9 +114,7 @@ def test_comment(self): comments="second bit", ), ) - for cell_method_str in cell_method_strings: - res = parse_cell_methods(cell_method_str) - self.assertEqual(res, expected) + self._check_answers(cell_method_strings, expected) def test_comment_brackets(self): cell_method_strings = [ @@ -112,35 +129,43 @@ def test_comment_brackets(self): comments="18h(day-1)-18h", ), ) - for cell_method_str in cell_method_strings: - res = parse_cell_methods(cell_method_str) - self.assertEqual(res, expected) + self._check_answers(cell_method_strings, expected) def test_comment_bracket_mismatch_warning(self): cell_method_strings = [ "time: minimum within days (comment: 18h day-1)-18h)", "time : minimum within days (comment: 18h day-1)-18h)", ] + expected = ( + CellMethod( + method="minimum within days", + coords="time", + intervals=None, + comments="18h day-1)-18h", + ), + ) + msg = ( + "Cell methods may be incorrectly parsed due to mismatched brackets" + ) for cell_method_str in cell_method_strings: - with self.assertWarns( - UserWarning, - msg="Cell methods may be incorrectly parsed due to mismatched brackets", - ): - _ = parse_cell_methods(cell_method_str) + with pytest.warns(UserWarning, match=msg): + self._check_answers(cell_method_strings, expected) - def test_badly_formatted_warning(self): + def test_badly_formatted__warns(self): cell_method_strings = [ - # "time: maximum (interval: 1 hr comment: first bit " - # "time: mean (interval: 1 day comment: second bit)", - "time: (interval: 1 hr comment: first bit) " - "time: mean (interval: 1 day comment: second bit)", - "time: maximum (interval: 1 hr comment: first bit) " - "time: (interval: 1 day comment: second bit)", + ( + "time: (interval: 1 hr comment: first bit) " + "time: mean (interval: 1 day comment: second bit)" + ), + ( + "time: maximum (interval: 1 hr comment: first bit) " + "time: (interval: 1 day comment: second bit)" + ), ] - for cell_method_str in cell_method_strings: - with self.assertWarns( + for cell_method_str in cell_method_strings[1:]: + with pytest.warns( UserWarning, - msg=f"Failed to fully parse cell method string: {cell_method_str}", + match="Failed to fully parse cell method string: time: ", ): _ = parse_cell_methods(cell_method_str) @@ -152,9 +177,7 @@ def test_portions_of_cells(self): expected = ( CellMethod(method="mean where sea_ice over sea", coords="area"), ) - for cell_method_str in cell_method_strings: - res = parse_cell_methods(cell_method_str) - self.assertEqual(res, expected) + self._check_answers(cell_method_strings, expected) def test_climatology(self): cell_method_strings = [ @@ -167,11 +190,9 @@ def test_climatology(self): CellMethod(method="minimum within days", coords="time"), CellMethod(method="mean over days", coords="time"), ) - for cell_method_str in cell_method_strings: - res = parse_cell_methods(cell_method_str) - self.assertEqual(res, expected) + self._check_answers(cell_method_strings, expected) - def test_climatology_with_unknown_method(self): + def test_climatology_with_unknown_method__warns(self): cell_method_strings = [ "time: min within days time: mean over days", "time : min within days time: mean over days", @@ -182,15 +203,68 @@ def test_climatology_with_unknown_method(self): CellMethod(method="min within days", coords="time"), CellMethod(method="mean over days", coords="time"), ) + msg = "NetCDF variable contains unknown cell method 'min'" for cell_method_str in cell_method_strings: - with mock.patch("warnings.warn") as warn: + with pytest.warns(UnknownCellMethodWarning, match=msg): res = parse_cell_methods(cell_method_str) - self.assertIn( - "NetCDF variable contains unknown cell method 'min'", - warn.call_args[0][0], - ) - self.assertEqual(res, expected) + assert res == expected + + def test_empty__warns(self): + cm_str = "" + msg = "contains no valid cell methods" + with pytest.warns(UserWarning, match=msg): + result = parse_cell_methods(cm_str) + assert result == () + + def test_whitespace__warns(self): + cm_str = " \t " + msg = "contains no valid cell methods" + with pytest.warns(UserWarning, match=msg): + result = parse_cell_methods(cm_str) + assert result == () + + def test_barename__warns(self): + cm_str = "time" + msg = "contains no valid cell methods" + with pytest.warns(UserWarning, match=msg): + result = parse_cell_methods(cm_str) + assert result == () + + def test_missedspace__warns(self): + cm_str = "time:mean" + msg = "contains no valid cell methods" + with pytest.warns(UserWarning, match=msg): + result = parse_cell_methods(cm_str) + assert result == () + + def test_random_junk__warns(self): + cm_str = "y:12+4#?x:this" + msg = "contains no valid cell methods" + with pytest.warns(UserWarning, match=msg): + result = parse_cell_methods(cm_str) + assert result == () + + def test_junk_after__silentlyignores(self): + cm_str = "time: mean -?-" + with warnings.catch_warnings(): + warnings.simplefilter("error") + result = parse_cell_methods(cm_str) + expected = (CellMethod("mean", ("time",)),) + assert result == expected + def test_junk_before__silentlyignores(self): + cm_str = "-?- time: mean" + with warnings.catch_warnings(): + warnings.simplefilter("error") + result = parse_cell_methods(cm_str) + expected = (CellMethod("mean", ("time",)),) + assert result == expected -if __name__ == "__main__": - tests.main() + def test_embeddedcolon__silentlyignores(self): + cm_str = "time:any: mean" + with warnings.catch_warnings(): + warnings.simplefilter("error") + result = parse_cell_methods(cm_str) + # N.B. treats the initial "time:" as plain junk + discards it + expected = (CellMethod("mean", ("any",)),) + assert result == expected