diff --git a/Changelog.rst b/Changelog.rst
index fc6ad440e2..b81bef9971 100644
--- a/Changelog.rst
+++ b/Changelog.rst
@@ -1,7 +1,7 @@
 version NEXTVERSION
 -------------------
 
-**2024-??-??**
+**2024-12-??**
 
 * Allow ``'nearest_dtos'`` 2-d regridding to work with discrete
   sampling geometry source grids
@@ -23,6 +23,8 @@ version NEXTVERSION
 * New class `cf.NetCDF4Array`
 * New class `cf.CFAH5netcdfArray`
 * New class `cf.CFANetCDF4Array`
+* Replace core `dask` functionality with that imported from `cfdm`
+  (https://github.com/NCAS-CMS/cf-python/issues/839)
 * Fix bug that sometimes puts an incorrect ``radian-1`` or
   ``radian-2`` in the returned units of the differential operator
   methods and functions
@@ -41,9 +43,11 @@ version NEXTVERSION
   (https://github.com/NCAS-CMS/cf-python/issues/828)
 * New dependency: ``h5netcdf>=1.3.0``
 * New dependency: ``h5py>=3.10.0``
-* New dependency: ``s3fs>=2024.2.0``
+* New dependency: ``s3fs>=2024.6.0``
+* Changed dependency: ``numpy>=1.15,<2.0``
 * Changed dependency: ``1.11.2.0<=cfdm<1.11.3.0``
 * Changed dependency: ``cfunits>=3.3.7``
+* Changed dependency: ``dask>=2024.6.0,<=2024.7.1``
 
 ----

diff --git a/cf/__init__.py b/cf/__init__.py
index 9e630d86ea..0c6f8f0464 100644
--- a/cf/__init__.py
+++ b/cf/__init__.py
@@ -123,7 +123,7 @@
     raise ImportError(_error0 + str(error1))
 
 try:
-    import numpy
+    import numpy as np
 except ImportError as error1:
     raise ImportError(_error0 + str(error1))
 
@@ -191,10 +191,11 @@
 # Check the version of numpy
 _minimum_vn = "1.22"
-if Version(numpy.__version__) < Version(_minimum_vn):
-    raise RuntimeError(
-        f"Bad numpy version: cf requires numpy>={_minimum_vn}. "
-        f"Got {numpy.__version__} at {numpy.__file__}"
+_maximum_vn = "2.0"
+if not Version(_minimum_vn) <= Version(np.__version__) < Version(_maximum_vn):
+    raise ValueError(
+        f"Bad numpy version: cf requires {_minimum_vn}<=numpy<{_maximum_vn}. "
+        f"Got {np.__version__} at {np.__file__}"
     )
 
 # Check the version of cfunits
@@ -208,15 +209,30 @@
 # Check the version of cfdm
 _minimum_vn = "1.11.2.0"
 _maximum_vn = "1.11.3.0"
-_cfdm_version = Version(cfdm.__version__)
-if not Version(_minimum_vn) <= _cfdm_version < Version(_maximum_vn):
+if (
+    not Version(_minimum_vn)
+    <= Version(cfdm.__version__)
+    < Version(_maximum_vn)
+):
     raise RuntimeError(
         f"Bad cfdm version: cf requires {_minimum_vn}<=cfdm<{_maximum_vn}. "
-        f"Got {_cfdm_version} at {cfdm.__file__}"
+        f"Got {cfdm.__version__} at {cfdm.__file__}"
     )
 
 # Check the version of dask
+_minimum_vn = "2024.6.1"
+_maximum_vn = "2024.7.1"
+if (
+    not Version(_minimum_vn)
+    <= Version(dask.__version__)
+    <= Version(_maximum_vn)
+):
+    raise ValueError(
+        f"Bad dask version: cf requires {_minimum_vn}<=dask<={_maximum_vn}. "
+        f"Got {dask.__version__} at {dask.__file__}"
+    )
+
 # Check the version of Python
 _minimum_vn = "3.8.0"
 if Version(platform.python_version()) < Version(_minimum_vn):
@@ -233,6 +249,8 @@
         f"Got {scipy.__version__} at {scipy.__file__}"
     )
 
+del _minimum_vn, _maximum_vn
+
 from .constructs import Constructs
 from .mixin import Coordinate

diff --git a/cf/constants.py b/cf/constants.py
index 1472bd83d2..aa2bfd0fcd 100644
--- a/cf/constants.py
+++ b/cf/constants.py
@@ -37,7 +37,7 @@
     Find the total amount of physical memory (in bytes).
 
     CHUNKSIZE: `int`
-        The chunk size (in bytes) for data storage and processing.
+        The Dask chunk size (in bytes). See `cf.chunksize`.
 
     TEMPDIR: `str`
         The location to store temporary files.
        By default it is the

diff --git a/cf/data/array/fullarray.py b/cf/data/array/fullarray.py
index 81278c3407..559a9cb410 100644
--- a/cf/data/array/fullarray.py
+++ b/cf/data/array/fullarray.py
@@ -1,8 +1,8 @@
 import numpy as np
+from cfdm.data.mixin import IndexMixin
 
 from ...functions import indices_shape, parse_indices
 from .abstract import Array
-from .mixin import IndexMixin
 
 _FULLARRAY_HANDLED_FUNCTIONS = {}

diff --git a/cf/data/array/h5netcdfarray.py b/cf/data/array/h5netcdfarray.py
index 02cd0f1cc5..b6c603b8d4 100644
--- a/cf/data/array/h5netcdfarray.py
+++ b/cf/data/array/h5netcdfarray.py
@@ -1,13 +1,11 @@
 import cfdm
 
 from ...mixin_container import Container
-from .locks import netcdf_lock
-from .mixin import ActiveStorageMixin, ArrayMixin, FileArrayMixin, IndexMixin
+from .mixin import ActiveStorageMixin, ArrayMixin, FileArrayMixin
 
 
 class H5netcdfArray(
     ActiveStorageMixin,
-    IndexMixin,
     FileArrayMixin,
     ArrayMixin,
     Container,
@@ -23,59 +21,3 @@ class H5netcdfArray(
     .. versionadded:: NEXTVERSION
 
     """
-
-    def __dask_tokenize__(self):
-        """Return a value fully representative of the object.
-
-        .. versionadded:: NEXTVERSION
-
-        """
-        return super().__dask_tokenize__() + (self.get_mask(),)
-
-    @property
-    def _lock(self):
-        """Set the lock for use in `dask.array.from_array`.
-
-        Returns a lock object because concurrent reads are not
-        currently supported by the HDF5 library. The lock object will
-        be the same for all `NetCDF4Array` and `H5netcdfArray`
-        instances, regardless of the dataset they access, which means
-        that access to all netCDF and HDF files coordinates around the
-        same lock.
-
-        .. versionadded:: NEXTVERSION
-
-        """
-        return netcdf_lock
-
-    def _get_array(self, index=None):
-        """Returns a subspace of the dataset variable.
-
-        .. versionadded:: NEXTVERSION
-
-        .. seealso:: `__array__`, `index`
-
-        :Parameters:
-
-            {{index: `tuple` or `None`, optional}}
-
-        :Returns:
-
-            `numpy.ndarray`
-                The subspace.
-
-        """
-        if index is None:
-            index = self.index()
-
-        # We need to lock because the netCDF file is about to be accessed.
-        self._lock.acquire()
-
-        # It's cfdm.H5netcdfArray.__getitem__ that we want to
-        # call here, but we use 'Container' in super because
-        # that comes immediately before cfdm.H5netcdfArray in
-        # the method resolution order.
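For context, the ``super(Container, self)`` idiom used by the removed ``_get_array`` methods starts attribute lookup at the class that follows ``Container`` in the method resolution order. A minimal, self-contained sketch (hypothetical class names)::

    class A:
        def get(self):
            return "A.get"

    class Container(A):
        def get(self):
            return "Container.get"

    class H5netcdfArray(Container):
        def get(self):
            # Lookup starts *after* Container in the MRO, so this
            # resolves to A.get, skipping Container.get
            return super(Container, self).get()

    assert H5netcdfArray().get() == "A.get"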
-        array = super(Container, self).__getitem__(index)
-
-        self._lock.release()
-        return array

diff --git a/cf/data/array/locks.py b/cf/data/array/locks.py
deleted file mode 100644
index 5a7b2bd333..0000000000
--- a/cf/data/array/locks.py
+++ /dev/null
@@ -1,4 +0,0 @@
-from dask.utils import SerializableLock
-
-# Global lock for netCDF file access
-netcdf_lock = SerializableLock()

diff --git a/cf/data/array/mixin/__init__.py b/cf/data/array/mixin/__init__.py
index 8e5dd7690d..af036620cf 100644
--- a/cf/data/array/mixin/__init__.py
+++ b/cf/data/array/mixin/__init__.py
@@ -3,4 +3,3 @@
 from .cfamixin import CFAMixin
 from .compressedarraymixin import CompressedArrayMixin
 from .filearraymixin import FileArrayMixin
-from .indexmixin import IndexMixin

diff --git a/cf/data/array/mixin/cfamixin.py b/cf/data/array/mixin/cfamixin.py
index 6bcb01a468..56682cd94f 100644
--- a/cf/data/array/mixin/cfamixin.py
+++ b/cf/data/array/mixin/cfamixin.py
@@ -3,8 +3,7 @@
 from itertools import accumulate, product
 
 import numpy as np
-
-from ...utils import chunk_locations, chunk_positions
+from cfdm.data.utils import chunk_locations, chunk_positions
 
 
 class CFAMixin:

diff --git a/cf/data/array/mixin/compressedarraymixin.py b/cf/data/array/mixin/compressedarraymixin.py
index 3e74f2ffaf..8a1d5dfbe1 100644
--- a/cf/data/array/mixin/compressedarraymixin.py
+++ b/cf/data/array/mixin/compressedarraymixin.py
@@ -76,12 +76,11 @@ def to_dask_array(self, chunks="auto"):
         from functools import partial
 
         import dask.array as da
+        from cfdm.data.utils import normalize_chunks
         from dask import config
         from dask.array.core import getter
         from dask.base import tokenize
 
-        from ...utils import normalize_chunks
-
         name = (f"{self.__class__.__name__}-{tokenize(self)}",)
         dtype = self.dtype

diff --git a/cf/data/array/mixin/indexmixin.py b/cf/data/array/mixin/indexmixin.py
deleted file mode 100644
index d105ba943a..0000000000
--- a/cf/data/array/mixin/indexmixin.py
+++ /dev/null
@@ -1,364 +0,0 @@
-from numbers import Integral
-
-import numpy as np
-from dask.array.slicing import normalize_index
-from dask.base import is_dask_collection
-
-from ....functions import indices_shape, parse_indices
-
-
-class IndexMixin:
-    """Mixin class for lazy indexing of a data array.
-
-    A data for a subspace is retrieved by casting the object as a
-    `numpy` array. See `__getitem__` for more details.
-
-    **Examples**
-
-    >>> a = cf.{{class}}(...)
-    >>> a.shape
-    (6, 5)
-    >>> print(np.asanyarray(a))
-    [[ 0  1  2  3  4]
-     [ 5  6  7  8  9]
-     [10 11 12 13 14]
-     [15 16 17 18 19]
-     [20 21 22 23 24]
-     [25 26 27 28 29]]
-    >>> a = a[::2, [1, 2, 4]]
-    >>> a = a[[True, False, True], :]
-    >>> a.shape
-    (2, 3)
-    >>> print(np.asanyarray(a))
-    [[ 1  2  4]
-     [21 22 24]]
-
-    .. versionadded:: NEXTVERSION
-
-    """
-
-    def __array__(self, *dtype):
-        """Convert the `{{class}}` into a `numpy` array.
-
-        .. versionadded:: NEXTVERSION
-
-        :Parameters:
-
-            dtype: optional
-                Typecode or data-type to which the array is cast.
-
-        :Returns:
-
-            `numpy.ndarray`
-                An independent `numpy` array of the subspace of the
-                data defined by the `indices` attribute.
-
-        """
-        array = self._get_array()
-        if dtype:
-            return array.astype(dtype[0], copy=False)
-
-        return array
-
-    def __getitem__(self, index):
-        """Returns a subspace of the data as a new `{{class}}`.
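The lazy scheme documented above combines successive subspaces axis by axis, so only the finally-selected elements are ever read from the original data. A small numpy illustration of that composition, mirroring the doctest values (hypothetical data)::

    import numpy as np

    original = np.arange(30).reshape(6, 5)
    ind0 = np.arange(6)[::2]              # first subspace: rows 0, 2, 4
    ind1 = np.array([True, False, True])  # second subspace, of the result
    combined = ind0[ind1]                 # rows 0 and 4 of the original
    expected = original[::2][[True, False, True]]
    assert (original[combined] == expected).all()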
-
-        x.__getitem__(indices) <==> x[indices]
-
-        Subspaces created by indexing are lazy and are not applied
-        until the `{{class}}` object is converted to a `numpy` array,
-        by which time all lazily-defined subspaces will have been
-        converted to a single combined index which defines only the
-        actual elements that need to be retrieved from the original
-        data.
-
-        The combined index is orthogonal, meaning that the index for
-        each dimension is to be applied independently, regardless of
-        how that index was defined. For instance, the indices ``[[0,
-        1], [1, 3], 0]`` and ``[:2, 1::2, 0]`` will give identical
-        results.
-
-        For example, if the original data has shape ``(12, 145, 192)``
-        and consecutive subspaces of ``[::2, [1, 3, 4], 96:]`` and
-        ``[[0, 5], [True, False, True], 0]`` are applied, then only
-        the elements defined by the combined index ``[[0, 10], [1, 4],
-        96]`` will be retrieved from the data when `__array__` is
-        called.
-
-        .. versionadded:: NEXTVERSION
-
-        .. seealso:: `index`, `original_shape`, `__array__`,
-                     `__getitem__`
-
-        :Returns:
-
-            `{{class}}`
-                The subspaced data.
-
-        """
-        shape0 = self.shape
-        index0 = self.index(conform=False)
-        original_shape = self.original_shape
-
-        index1 = parse_indices(shape0, index, keepdims=False)
-
-        new = self.copy()
-        new_indices = []
-        new_shape = []
-
-        i = 0
-        for ind0, original_size in zip(index0, original_shape):
-            if isinstance(ind0, Integral):
-                # The previous call to __getitem__ resulted in a
-                # dimension being removed (i.e. 'ind0' is
-                # integer-valued). Therefore 'index1' must have fewer
-                # elements than 'index0', so we need to "carry
-                # forward" the integer-valued index so that it is
-                # available at evaluation time.
-                new_indices.append(ind0)
-                continue
-
-            ind1 = index1[i]
-            size0 = shape0[i]
-            i += 1
-
-            # If this dimension is not subspaced by the new index then
-            # we don't need to update the old index.
-            if isinstance(ind1, slice) and ind1 == slice(None):
-                new_indices.append(ind0)
-                continue
-
-            # Still here? Then we have to work out the index of the
-            # full array that is equivalent to applying
-            # 'ind0' followed by 'ind1'.
-            if is_dask_collection(ind1):
-                # Note: This will never occur when this __getitem__ is
-                #       being called from within a Dask graph, because
-                #       any lazy indices will have already been
-                #       computed as part of the whole graph execution;
-                #       i.e. we don't have to worry about a
-                #       compute-within-a-compute situation. (If this
-                #       were not the case then we could add
-                #       `scheduler="synchronous"` to the compute
-                #       call.)
-                ind1 = ind1.compute()
-
-            if isinstance(ind0, slice):
-                if isinstance(ind1, slice):
-                    # ind0: slice
-                    # ind1: slice
-                    start, stop, step = ind0.indices(original_size)
-                    start1, stop1, step1 = ind1.indices(size0)
-                    size1, mod1 = divmod(stop1 - start1, step1)
-
-                    if mod1 != 0:
-                        size1 += 1
-
-                    start += start1 * step
-                    step *= step1
-                    stop = start + (size1 - 1) * step
-
-                    if step > 0:
-                        stop += 1
-                    else:
-                        stop -= 1
-
-                    if stop < 0:
-                        stop = None
-
-                    new_index = slice(start, stop, step)
-                else:
-                    # ind0: slice
-                    # ind1: int, or array of int/bool
-                    new_index = np.arange(*ind0.indices(original_size))[ind1]
-            else:
-                # ind0: array of int. If we made it to here then it
-                #       can't be anything else. This is
-                #       because we've dealt with ind0
-                #       being a slice or an int, the
-                #       very first ind0 is always
-                #       slice(None), and a previous ind1
-                #       that was an array of bool will
-                #       have resulted in this ind0 being
-                #       an array of int.
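The slice-on-slice arithmetic above can be checked directly against numpy. A worked example for the positive-step case, using the same formulas (hypothetical values)::

    import numpy as np

    a = np.arange(100)
    ind0 = slice(10, 60, 2)  # first subspace
    ind1 = slice(1, 20, 3)   # subspace of that subspace
    start, stop, step = ind0.indices(a.size)
    start1, stop1, step1 = ind1.indices(a[ind0].size)
    size1, mod1 = divmod(stop1 - start1, step1)
    if mod1 != 0:
        size1 += 1
    start += start1 * step
    step *= step1
    stop = start + (size1 - 1) * step + 1  # positive step, so add 1
    assert (a[ind0][ind1] == a[start:stop:step]).all()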
-                #
-                # ind1: anything
-                new_index = np.asanyarray(ind0)[ind1]
-
-            new_indices.append(new_index)
-
-        new._custom["index"] = tuple(new_indices)
-
-        # Find the shape defined by the new index
-        new_shape = indices_shape(new_indices, original_shape, keepdims=False)
-        new._set_component("shape", tuple(new_shape), copy=False)
-
-        return new
-
-    def __repr__(self):
-        """Called by the `repr` built-in function.
-
-        x.__repr__() <==> repr(x)
-
-        """
-        return (
-            f"<CF {self.__class__.__name__}{self.shape}: "
-            f"{self.original_shape}>"
-        )
-
-    @property
-    def __asanyarray__(self):
-        """Whether the array is accessed by conversion to a `numpy` array.
-
-        .. versionadded:: NEXTVERSION
-
-        :Returns:
-
-            `True`
-
-        """
-        return True
-
-    def _get_array(self, index=None):
-        """Returns a subspace of the data as a `numpy` array.
-
-        .. versionadded:: NEXTVERSION
-
-        .. seealso:: `__array__`, `index`
-
-        :Parameters:
-
-            index: `tuple` or `None`, optional
-                Provide the indices that define the subspace. If
-                `None` then the `index` attribute is used.
-
-        :Returns:
-
-            `numpy.ndarray`
-                The subspace.
-
-        """
-        return NotImplementedError(
-            f"Must implement {self.__class__.__name__}._get_array"
-        )
-
-    def index(self, conform=True):
-        """The index to be applied when converting to a `numpy` array.
-
-        The `shape` is defined by the `index` applied to the
-        `original_shape`.
-
-        .. versionadded:: NEXTVERSION
-
-        .. seealso:: `shape`, `original_shape`
-
-        :Parameters:
-
-            conform: `bool`, optional
-                If True, the default, then
-
-                * Convert a decreasing size 1 slice to an increasing
-                  one.
-
-                * Convert, where possible, a sequence of integers to a
-                  slice.
-
-                These transformations are to allow subspacing on data
-                objects that have restricted indexing functionality,
-                such as `h5py.Variable` objects.
-
-                If False then these transformations are not done.
-
-        :Returns:
-
-            `tuple`
-
-        **Examples**
-
-        >>> x.shape
-        (12, 145, 192)
-        >>> x.index()
-        (slice(None), slice(None), slice(None))
-        >>> x = x[8:7:-1, 10:19:3, [15, 1, 4, 12]]
-        >>> x = x[[0], [True, False, True], ::-2]
-        >>> x.shape
-        (1, 2, 2)
-        >>> x.index()
-        (slice(8, 9, None), slice(10, 17, 6), slice(12, -1, -11))
-        >>> x.index(conform=False)
-        (array([8]), array([10, 16]), array([12, 1]))
-
-        """
-        ind = self._custom.get("index")
-        if ind is None:
-            # No indices have been applied yet, so define indices that
-            # are equivalent to Ellipsis, and set the original shape.
-            ind = (slice(None),) * self.ndim
-            self._custom["index"] = ind
-            self._custom["original_shape"] = self.shape
-            return ind
-
-        if not conform:
-            return ind
-
-        # Still here? Then conform the indices by:
-        #
-        # 1) Converting decreasing size 1 slices to increasing
-        #    ones. This helps when the parent class can't cope with
-        #    decreasing slices.
-        #
-        # 2) Converting, where possible, sequences of integers to
-        #    slices. This helps when the parent class can't cope with
-        #    indices that are sequences of integers.
-        ind = list(ind)
-        for n, (i, size) in enumerate(zip(ind[:], self.original_shape)):
-            if isinstance(i, slice):
-                if size == 1:
-                    start, _, step = i.indices(size)
-                    if step and step < 0:
-                        # Decreasing slices are not universally
-                        # accepted (e.g. `h5py` doesn't like them),
-                        # but we can convert them to increasing ones.
-                        ind[n] = slice(start, start + 1)
-            elif np.iterable(i):
-                i = normalize_index((i,), (size,))[0]
-                if i.size == 1:
-                    # Convert a sequence of one integer into a slice
-                    start = i.item()
-                    ind[n] = slice(start, start + 1)
-                else:
-                    # Convert a sequence of two or more evenly spaced
-                    # integers into a slice.
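The first conforming transformation is easy to see in isolation. A sketch of turning a decreasing slice over a size-1 axis into the increasing equivalent that, for example, `h5py` will accept::

    i = slice(0, None, -1)           # decreasing slice over a size-1 axis
    start, _, step = i.indices(1)
    if step and step < 0:
        i = slice(start, start + 1)
    assert i == slice(0, 1)          # same single element, increasing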
-                    step = np.unique(np.diff(i))
-                    if step.size == 1:
-                        start, stop = i[[0, -1]]
-                        if stop >= start:
-                            stop += 1
-                        elif stop:
-                            stop -= 1
-                        else:
-                            stop = None
-
-                        ind[n] = slice(start, stop, step.item())
-
-        return tuple(ind)
-
-    @property
-    def original_shape(self):
-        """The original shape of the data, before any subspacing.
-
-        The `shape` is defined by the result of subspacing the data in
-        its original shape with the indices given by `index`.
-
-        .. versionadded:: NEXTVERSION
-
-        .. seealso:: `index`, `shape`
-
-        """
-        out = self._custom.get("original_shape")
-        if out is None:
-            # No subspace has been defined yet
-            out = self.shape
-            self._custom["original_shape"] = out
-
-        return out

diff --git a/cf/data/array/netcdf4array.py b/cf/data/array/netcdf4array.py
index 095bf2d3ad..5255109006 100644
--- a/cf/data/array/netcdf4array.py
+++ b/cf/data/array/netcdf4array.py
@@ -1,13 +1,11 @@
 import cfdm
 
 from ...mixin_container import Container
-from .locks import netcdf_lock
-from .mixin import ActiveStorageMixin, ArrayMixin, FileArrayMixin, IndexMixin
+from .mixin import ActiveStorageMixin, ArrayMixin, FileArrayMixin
 
 
 class NetCDF4Array(
     ActiveStorageMixin,
-    IndexMixin,
     FileArrayMixin,
     ArrayMixin,
     Container,
@@ -21,60 +19,3 @@ class NetCDF4Array(
        method. See `cf.data.collapse.Collapse` for details.
 
     """
-
-    def __dask_tokenize__(self):
-        """Return a value fully representative of the object.
-
-        .. versionadded:: 3.15.0
-
-        """
-        return super().__dask_tokenize__() + (self.get_mask(),)
-
-    @property
-    def _lock(self):
-        """Set the lock for use in `dask.array.from_array`.
-
-        Returns a lock object because concurrent reads are not
-        currently supported by the netCDF and HDF libraries. The lock
-        object will be the same for all `NetCDF4Array` and
-        `H5netcdfArray` instances, regardless of the dataset they
-        access, which means that access to all netCDF and HDF files
-        coordinates around the same lock.
-
-        .. versionadded:: 3.14.0
-
-        """
-        return netcdf_lock
-
-    def _get_array(self, index=None):
-        """Returns a subspace of the dataset variable.
-
-        .. versionadded:: NEXTVERSION
-
-        .. seealso:: `__array__`, `index`
-
-        :Parameters:
-
-            {{index: `tuple` or `None`, optional}}
-
-        :Returns:
-
-            `numpy.ndarray`
-                The subspace.
-
-        """
-        if index is None:
-            index = self.index()
-
-        # Note: We need to lock because the netCDF file is about to be
-        #       accessed.
-        self._lock.acquire()
-
-        # Note: It's cfdm.NetCDFArray.__getitem__ that we want to call
-        #       here, but we use 'Container' in super because that
-        #       comes immediately before cfdm.NetCDFArray in the
-        #       method resolution order.
-        array = super(Container, self).__getitem__(index)
-
-        self._lock.release()
-        return array

diff --git a/cf/data/array/umarray.py b/cf/data/array/umarray.py
index 510b9c97ee..a560365d9b 100644
--- a/cf/data/array/umarray.py
+++ b/cf/data/array/umarray.py
@@ -4,11 +4,14 @@
 from ...functions import _DEPRECATION_ERROR_ATTRIBUTE, load_stash2standard_name
 from ...umread_lib.umfile import File, Rec
 from .abstract import Array
-from .mixin import FileArrayMixin, IndexMixin
+from .mixin import FileArrayMixin
 
 
 class UMArray(
-    IndexMixin, FileArrayMixin, cfdm.data.mixin.FileArrayMixin, Array
+    FileArrayMixin,
+    cfdm.data.mixin.IndexMixin,
+    cfdm.data.mixin.FileArrayMixin,
+    Array,
 ):
     """A sub-array stored in a PP or UM fields file."""

diff --git a/cf/data/collapse/dask_collapse.py b/cf/data/collapse/dask_collapse.py
index 9476ffcb0b..9e4c75080c 100644
--- a/cf/data/collapse/dask_collapse.py
+++ b/cf/data/collapse/dask_collapse.py
@@ -10,13 +10,13 @@
 from operator import mul
 
 import numpy as np
+from cfdm.data.dask_utils import cfdm_to_memory
 from dask.array import chunk
 from dask.array.core import _concatenate2
 from dask.array.reductions import divide, numel
 from dask.core import flatten
 from dask.utils import deepmap
 
-from ..dask_utils import cf_asanyarray
 from .collapse_active import actify
 from .collapse_utils import double_precision_dtype
 
@@ -276,9 +276,9 @@ def cf_mean_chunk(
     if computing_meta:
         return x
 
-    x = cf_asanyarray(x)
+    x = cfdm_to_memory(x)
     if weights is not None:
-        weights = cf_asanyarray(weights)
+        weights = cfdm_to_memory(weights)
 
     # N, sum
     d = cf_sum_chunk(x, weights=weights, dtype=dtype, **kwargs)
@@ -401,7 +401,7 @@ def cf_max_chunk(x, dtype=None, computing_meta=False, **kwargs):
     if computing_meta:
         return x
 
-    x = cf_asanyarray(x)
+    x = cfdm_to_memory(x)
 
     return {
         "max": chunk.max(x, **kwargs),
@@ -555,7 +555,7 @@ def cf_min_chunk(x, dtype=None, computing_meta=False, **kwargs):
     if computing_meta:
         return x
 
-    x = cf_asanyarray(x)
+    x = cfdm_to_memory(x)
 
     return {
         "min": chunk.min(x, **kwargs),
@@ -662,7 +662,7 @@ def cf_range_chunk(x, dtype=None, computing_meta=False, **kwargs):
     if computing_meta:
         return x
 
-    x = cf_asanyarray(x)
+    x = cfdm_to_memory(x)
 
     # N, max
     d = cf_max_chunk(x, **kwargs)
@@ -779,7 +779,7 @@ def cf_rms_chunk(x, weights=None, dtype="f8", computing_meta=False, **kwargs):
     if computing_meta:
         return x
 
-    x = cf_asanyarray(x)
+    x = cfdm_to_memory(x)
 
     return cf_mean_chunk(
         np.multiply(x, x, dtype=dtype), weights=weights, dtype=dtype, **kwargs
@@ -857,7 +857,7 @@ def cf_sample_size_chunk(x, dtype="i8", computing_meta=False, **kwargs):
     if computing_meta:
         return x
 
-    x = cf_asanyarray(x)
+    x = cfdm_to_memory(x)
 
     if np.ma.isMA(x):
         N = chunk.sum(np.ones_like(x, dtype=dtype), **kwargs)
@@ -985,10 +985,10 @@ def cf_sum_chunk(
     if computing_meta:
         return x
 
-    x = cf_asanyarray(x)
+    x = cfdm_to_memory(x)
 
     if weights is not None:
-        weights = cf_asanyarray(weights)
+        weights = cfdm_to_memory(weights)
         if check_weights:
             w_min = weights.min()
             if w_min <= 0:
@@ -1107,9 +1107,9 @@ def cf_sum_of_weights_chunk(
     if computing_meta:
         return x
 
-    x = cf_asanyarray(x)
+    x = cfdm_to_memory(x)
     if weights is not None:
-        weights = cf_asanyarray(weights)
+        weights = cfdm_to_memory(weights)
 
     # N
     d = cf_sample_size_chunk(x, **kwargs)
@@ -1152,9 +1152,9 @@ def cf_sum_of_weights2_chunk(
     if computing_meta:
         return x
 
-    x = cf_asanyarray(x)
+    x = cfdm_to_memory(x)
     if weights is not None:
-        weights = cf_asanyarray(weights)
+        weights = cfdm_to_memory(weights)
 
     # N
     d = cf_sample_size_chunk(x, **kwargs)
@@ -1193,7 +1193,7 @@ def cf_unique_chunk(x, dtype=None, computing_meta=False, **kwargs):
     if computing_meta:
         return x
 
-    x = cf_asanyarray(x)
+    x = cfdm_to_memory(x)
 
     return {"unique": np.unique(x)}
 
@@ -1298,11 +1298,11 @@ def cf_var_chunk(
     if computing_meta:
         return x
 
-    x = cf_asanyarray(x)
+    x = cfdm_to_memory(x)
 
     weighted = weights is not None
     if weighted:
-        weights = cf_asanyarray(weights)
+        weights = cfdm_to_memory(weights)
 
     # N, V1, sum
     d = cf_mean_chunk(x, weights=weights, dtype=dtype, **kwargs)

diff --git a/cf/data/creation.py b/cf/data/creation.py
index f8ddcb9b97..e69de29bb2 100644
--- a/cf/data/creation.py
+++ b/cf/data/creation.py
@@ -1,123 +0,0 @@
-"""Functions used during the creation of `Data` objects."""
-
-from functools import lru_cache
-
-import dask.array as da
-import numpy as np
-from dask.base import is_dask_collection
-
-
-def to_dask(array, chunks, **from_array_options):
-    """Create a `dask` array.
-
-    .. versionadded:: 3.14.0
-
-    :Parameters:
-
-        array: array_like
-            The array to be converted to a `dask` array. Examples of
-            valid types include anything with a `to_dask_array`
-            method, `numpy` arrays, `dask` arrays, `xarray` arrays,
-            `cf.Array` subclasses, `list`, `tuple`, scalars.
-
-        chunks: `int`, `tuple`, `dict` or `str`, optional
-            Specify the chunking of the returned dask array. Any
-            value accepted by the *chunks* parameter of the
-            `dask.array.from_array` function is allowed.
-
-            Might be ignored if *array* is a `dask` array that already
-            defines its own chunks.
-
-            Might get automatically modified if *array* is a
-            compressed `Array` subclass.
-
-        from_array_options: `dict`, optional
-            Keyword arguments to be passed to `dask.array.from_array`.
-
-            If *from_array_options* has no ``'meta'`` key then the
-            `meta` keyword is set to the `_meta` attribute of *array*
-            or, if there is no such attribute, `None`.
-
-    :Returns:
-
-        `dask.array.Array`
-            The `dask` array representation of the array.
-
-    **Examples**
-
-    >>> cf.data.creation.to_dask([1, 2, 3], 'auto')
-    dask.array<array, shape=(3,), dtype=int64, chunksize=(3,), chunktype=numpy.ndarray>
-    >>> cf.data.creation.to_dask([1, 2, 3], chunks=2)
-    dask.array<array, shape=(3,), dtype=int64, chunksize=(2,), chunktype=numpy.ndarray>
-    >>> cf.data.creation.to_dask([1, 2, 3], chunks=2, {'asarray': True})
-    dask.array<array, shape=(3,), dtype=int64, chunksize=(2,), chunktype=numpy.ndarray>
-    >>> cf.data.creation.to_dask(cf.dt(2000, 1, 1), 'auto')
-    dask.array<array, shape=(), dtype=object, chunksize=(), chunktype=numpy.ndarray>
-    >>> cf.data.creation.to_dask([cf.dt(2000, 1, 1)], 'auto')
-    dask.array<array, shape=(1,), dtype=object, chunksize=(1,), chunktype=numpy.ndarray>
-
-    """
-    if is_dask_collection(array):
-        return array
-
-    if hasattr(array, "to_dask_array"):
-        try:
-            return array.to_dask_array(chunks=chunks)
-        except TypeError:
-            try:
-                return array.to_dask_array(_asanyarray=False)
-            except TypeError:
-                return array.to_dask_array()
-
-    if type(array).__module__.split(".")[0] == "xarray":
-        data = getattr(array, "data", None)
-        if data is not None:
-            return da.asanyarray(data)
-
-    if not isinstance(
-        array, (np.ndarray, list, tuple, memoryview) + np.ScalarType
-    ) and not hasattr(array, "shape"):
-        # 'array' is not of a type that `da.from_array` can cope with,
-        # so convert it to a numpy array.
-        array = np.asanyarray(array)
-
-    kwargs = from_array_options
-    kwargs.setdefault("meta", getattr(array, "_meta", None))
-
-    try:
-        return da.from_array(array, chunks=chunks, **kwargs)
-    except NotImplementedError:
-        # Try again with 'chunks=-1', in case the failure was due to
-        # not being able to use auto rechunking with object dtype.
-        return da.from_array(array, chunks=-1, **kwargs)
-
-
-@lru_cache(maxsize=32)
-def generate_axis_identifiers(n):
-    """Return new axis identifiers for a given number of axes.
-
-    The names are arbitrary and have no semantic meaning.
-
-    .. versionadded:: 3.14.0
-
-    :Parameters:
-
-        n: `int`
-            Generate this number of axis identifiers.
-
-    :Returns:
-
-        `list`
-            The new axis identifiers.
-
-    **Examples**
-
-    >>> cf.data.creation.generate_axis_identifiers(0)
-    []
-    >>> cf.data.creation.generate_axis_identifiers(1)
-    ['dim0']
-    >>> cf.data.creation.generate_axis_identifiers(3)
-    ['dim0', 'dim1', 'dim2']
-
-    """
-    return [f"dim{i}" for i in range(n)]

diff --git a/cf/data/dask_regrid.py b/cf/data/dask_regrid.py
index 5a03a6f696..99b546400f 100644
--- a/cf/data/dask_regrid.py
+++ b/cf/data/dask_regrid.py
@@ -1,8 +1,7 @@
 """Regridding functions used within a dask graph."""
 
 import numpy as np
-
-from .dask_utils import cf_asanyarray
+from cfdm.data.dask_utils import cfdm_to_memory
 
 
 def regrid(
@@ -176,12 +175,12 @@ def regrid(
     """
     weights, dst_mask = weights_dst_mask
 
-    a = cf_asanyarray(a)
+    a = cfdm_to_memory(a)
    if dst_mask is not None:
-        dst_mask = cf_asanyarray(dst_mask)
+        dst_mask = cfdm_to_memory(dst_mask)
 
     if ref_src_mask is not None:
-        ref_src_mask = cf_asanyarray(ref_src_mask)
+        ref_src_mask = cfdm_to_memory(ref_src_mask)
 
     # ----------------------------------------------------------------
     # Reshape the array into a form suitable for the regridding dot

diff --git a/cf/data/dask_utils.py b/cf/data/dask_utils.py
index 8323bf696c..1fd3be11af 100644
--- a/cf/data/dask_utils.py
+++ b/cf/data/dask_utils.py
@@ -7,103 +7,14 @@
 
 from functools import partial
 
-import dask.array as da
 import numpy as np
-from dask.core import flatten
+from cfdm.data.dask_utils import cfdm_to_memory
 from scipy.ndimage import convolve1d
 
 from ..cfdatetime import dt, dt2rt, rt2dt
-from ..functions import atol as cf_atol
-from ..functions import rtol as cf_rtol
 from ..units import Units
 
 
-def _da_ma_allclose(x, y, masked_equal=True, rtol=None, atol=None):
-    """An effective dask.array.ma.allclose method.
-
-    True if two dask arrays are element-wise equal within a tolerance.
-
-    Equivalent to allclose except that masked values are treated as
-    equal (default) or unequal, depending on the masked_equal
-    argument.
-
-    Define an effective da.ma.allclose method here because one is
-    currently missing in the Dask codebase.
-
-    Note that all default arguments are the same as those provided to
-    the corresponding NumPy method (see the `numpy.ma.allclose` API
-    reference).
-
-    .. versionadded:: 3.14.0
-
-    :Parameters:
-
-        x: a dask array to compare with y
-
-        y: a dask array to compare with x
-
-        masked_equal: `bool`, optional
-            Whether masked values in a and b are considered equal
-            (True) or not (False). They are considered equal by
-            default.
-
-        {{rtol: number, optional}}
-
-        {{atol: number, optional}}
-
-    :Returns:
-
-        `bool`
-            A Boolean value indicating whether or not the two dask
-            arrays are element-wise equal to the given *rtol* and
-            *atol* tolerance.
-
-    """
-    # TODODASK: put in a PR to Dask to request to add as genuine method.
-
-    if rtol is None:
-        rtol = cf_rtol()
-    if atol is None:
-        atol = cf_atol()
-
-    # Must pass rtol=rtol, atol=atol in as kwargs to allclose, rather than it
-    # using those in local scope from the outer function arguments, because
-    # Dask's internal algorithms require these to be set as parameters.
-    def allclose(a_blocks, b_blocks, rtol=rtol, atol=atol):
-        """Run `ma.allclose` across multiple blocks over two arrays."""
-        result = True
-        # Handle scalars, including 0-d arrays, for which a_blocks and
-        # b_blocks will have the corresponding type and hence not be iterable.
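The masked-equality semantics that `_da_ma_allclose` provided per block come straight from `numpy.ma.allclose`, as its docstring notes; for reference::

    import numpy as np

    a = np.ma.array([1.0, 2.0], mask=[False, True])
    b = np.ma.array([1.0, 99.0], mask=[False, True])
    print(np.ma.allclose(a, b, masked_equal=True))   # True
    print(np.ma.allclose(a, b, masked_equal=False))  # False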
-        # With this approach, we avoid inspecting sizes or lengths, and for
-        # the 0-d array blocks the following iteration can be used unchanged
-        # and will only execute once with block sizes as desired of:
-        # (np.array(),)[0] = array(). Note
-        # can't check against more general case of collections.abc.Iterable
-        # because a 0-d array is also iterable, but in practice always a list.
-        if not isinstance(a_blocks, list):
-            a_blocks = (a_blocks,)
-        if not isinstance(b_blocks, list):
-            b_blocks = (b_blocks,)
-
-        # Note: If a_blocks or b_blocks has more than one chunk in
-        #       more than one dimension they will comprise a nested
-        #       sequence of sequences, that needs to be flattened so
-        #       that we can safely iterate through the actual numpy
-        #       array elements.
-
-        for a, b in zip(flatten(a_blocks), flatten(b_blocks)):
-            result &= np.ma.allclose(
-                a, b, masked_equal=masked_equal, rtol=rtol, atol=atol
-            )
-
-        return result
-
-    axes = tuple(range(x.ndim))
-    return da.blockwise(
-        allclose, "", x, axes, y, axes, dtype=bool, rtol=rtol, atol=atol
-    )
-
-
 def cf_contains(a, value):
     """Whether or not an array contains a value.
 
@@ -127,8 +38,8 @@ def cf_contains(a, value):
         value.
 
     """
-    a = cf_asanyarray(a)
-    value = cf_asanyarray(value)
+    a = cfdm_to_memory(a)
+    value = cfdm_to_memory(value)
     return np.array(value in a).reshape((1,) * a.ndim)
 
@@ -162,7 +73,7 @@ def cf_convolve1d(a, window=None, axis=-1, origin=0):
         Convolved float array with same shape as input.
 
     """
-    a = cf_asanyarray(a)
+    a = cfdm_to_memory(a)
 
     # Cast to float to ensure that NaNs can be stored
     if a.dtype != float:
@@ -185,38 +96,6 @@ def cf_convolve1d(a, window=None, axis=-1, origin=0):
     return c
 
 
-def cf_harden_mask(a):
-    """Harden the mask of a masked `numpy` array.
-
-    Has no effect if the array is not a masked array.
-
-    .. versionadded:: 3.14.0
-
-    .. seealso:: `cf.Data.harden_mask`
-
-    :Parameters:
-
-        a: `numpy.ndarray`
-            The array to have a hardened mask.
-
-    :Returns:
-
-        `numpy.ndarray`
-            The array with hardened mask.
-
-    """
-    a = cf_asanyarray(a)
-    if np.ma.isMA(a):
-        try:
-            a.harden_mask()
-        except AttributeError:
-            # Trap cases when the input array is not a numpy array
-            # (e.g. it might be numpy.ma.masked).
-            pass
-
-    return a
-
-
 def cf_percentile(a, q, axis, method, keepdims=False, mtol=1):
     """Compute percentiles of the data along the specified axes.
 
@@ -276,7 +155,7 @@ def cf_percentile(a, q, axis, method, keepdims=False, mtol=1):
     """
     from math import prod
 
-    a = cf_asanyarray(a)
+    a = cfdm_to_memory(a)
 
     if np.ma.isMA(a) and not np.ma.is_masked(a):
         # Masked array with no masked elements
@@ -351,142 +230,6 @@ def cf_percentile(a, q, axis, method, keepdims=False, mtol=1):
     return p
 
 
-def cf_soften_mask(a):
-    """Soften the mask of a masked `numpy` array.
-
-    Has no effect if the array is not a masked array.
-
-    .. versionadded:: 3.14.0
-
-    .. seealso:: `cf.Data.soften_mask`
-
-    :Parameters:
-
-        a: `numpy.ndarray`
-            The array to have a softened mask.
-
-    :Returns:
-
-        `numpy.ndarray`
-            The array with softened mask.
-
-    """
-    a = cf_asanyarray(a)
-
-    if np.ma.isMA(a):
-        try:
-            a.soften_mask()
-        except AttributeError:
-            # Trap cases when the input array is not a numpy array
-            # (e.g. it might be numpy.ma.masked).
-            pass
-
-    return a
-
-
-def cf_where(array, condition, x, y, hardmask):
-    """Set elements of *array* from *x* or *y* depending on *condition*.
-
-    The input *array* is not changed in-place.
-
-    See `where` for details on the expected functionality.
-
-    .. note:: This function correctly sets the mask hardness of the
-              output array.
-
-    .. versionadded:: 3.14.0
-
-    .. seealso:: `cf.Data.where`
-
-    :Parameters:
-
-        array: numpy.ndarray
-            The array to be assigned to.
-
-        condition: numpy.ndarray
-            Where False or masked, assign from *y*, otherwise assign
-            from *x*.
-
-        x: numpy.ndarray or `None`
-            *x* and *y* must not both be `None`.
-
-        y: numpy.ndarray or `None`
-            *x* and *y* must not both be `None`.
-
-        hardmask: `bool`
-            Set the mask hardness for a returned masked array. If True
-            then a returned masked array will have a hardened mask, and
-            the mask of the input *array* (if there is one) will be
-            applied to the returned array, in addition to any masked
-            elements arising from assignments from *x* or *y*.
-
-    :Returns:
-
-        `numpy.ndarray`
-            A copy of the input *array* with elements from *y* where
-            *condition* is False or masked, and elements from *x*
-            elsewhere.
-
-    """
-    array = cf_asanyarray(array)
-    condition = cf_asanyarray(condition)
-    if x is not None:
-        x = cf_asanyarray(x)
-
-    if y is not None:
-        y = cf_asanyarray(y)
-
-    mask = None
-
-    if np.ma.isMA(array):
-        # Do a masked where
-        where = np.ma.where
-        if hardmask:
-            mask = array.mask
-    elif np.ma.isMA(x) or np.ma.isMA(y):
-        # Do a masked where
-        where = np.ma.where
-    else:
-        # Do a non-masked where
-        where = np.where
-        hardmask = False
-
-    condition_is_masked = np.ma.isMA(condition)
-    if condition_is_masked:
-        condition = condition.astype(bool)
-
-    if x is not None:
-        # Assign values from x
-        if condition_is_masked:
-            # Replace masked elements of condition with False, so that
-            # masked locations are assigned from array
-            c = condition.filled(False)
-        else:
-            c = condition
-
-        array = where(c, x, array)
-
-    if y is not None:
-        # Assign values from y
-        if condition_is_masked:
-            # Replace masked elements of condition with True, so that
-            # masked locations are assigned from array
-            c = condition.filled(True)
-        else:
-            c = condition
-
-        array = where(c, array, y)
-
-    if hardmask:
-        if mask is not None and mask.any():
-            # Apply the mask from the input array to the result
-            array.mask |= mask
-
-        array.harden_mask()
-
-    return array
-
-
 def _getattr(x, attr):
     return getattr(x, attr, False)
 
@@ -531,7 +274,7 @@ def cf_YMDhms(a, attr):
     array([1, 2])
 
     """
-    a = cf_asanyarray(a)
+    a = cfdm_to_memory(a)
     return _array_getattr(a, attr=attr)
 
@@ -564,7 +307,8 @@ def cf_rt2dt(a, units):
     cftime.DatetimeGregorian(2000, 1, 2, 0, 0, 0, 0, has_year_zero=False)]
 
     """
-    a = cf_asanyarray(a)
+    a = cfdm_to_memory(a)
+
     if not units.iscalendartime:
         return rt2dt(a, units_in=units)
 
@@ -619,7 +363,7 @@ def cf_dt2rt(a, units):
     [365 366]
 
     """
-    a = cf_asanyarray(a)
+    a = cfdm_to_memory(a)
     return dt2rt(a, units_out=units, units_in=None)
 
@@ -660,7 +404,7 @@ def cf_units(a, from_units, to_units):
     [1000. 2000.]
 
     """
-    a = cf_asanyarray(a)
+    a = cfdm_to_memory(a)
     return Units.conform(
         a, from_units=from_units, to_units=to_units, inplace=False
     )
 
@@ -684,7 +428,7 @@ def cf_is_masked(a):
         values.
 
     """
-    a = cf_asanyarray(a)
+    a = cfdm_to_memory(a)
     out = np.ma.is_masked(a)
     return np.array(out).reshape((1,) * a.ndim)
 
@@ -717,30 +461,5 @@ def cf_filled(a, fill_value=None):
     [[-999    2    3]]
 
     """
-    a = cf_asanyarray(a)
+    a = cfdm_to_memory(a)
     return np.ma.filled(a, fill_value=fill_value)
-
-
-def cf_asanyarray(a):
-    """Convert to a `numpy` array.
-
-    Only do this if the input *a* has an `__asanyarray__` attribute
-    with value True.
-
-    .. versionadded:: NEXTVERSION
-
-    :Parameters:
-
-        a: array_like
-            The array.
-
-    :Returns:
-
-            The array converted to a `numpy` array, or the input array
-            unchanged if ``a.__asanyarray__`` False.
-
-    """
-    if getattr(a, "__asanyarray__", False):
-        return np.asanyarray(a)
-
-    return a

diff --git a/cf/data/data.py b/cf/data/data.py
index 09ed49d87f..7e1cac6cf9 100644
--- a/cf/data/data.py
+++ b/cf/data/data.py
@@ -1,9 +1,7 @@
 import logging
 import math
-import operator
 from functools import partial, reduce
 from itertools import product
-from numbers import Integral
 from operator import mul
 from os import sep
 
@@ -11,16 +9,15 @@
 import cftime
 import dask.array as da
 import numpy as np
-from cfdm import is_log_level_info
+from cfdm.data.dask_utils import cfdm_where
+from cfdm.data.utils import new_axis_identifier
 from dask import compute, delayed  # noqa: F401
 from dask.array.core import normalize_chunks
-from dask.base import collections_to_dsk, is_dask_collection, tokenize
+from dask.base import is_dask_collection, tokenize
 from dask.highlevelgraph import HighLevelGraph
-from dask.optimization import cull
-from scipy.sparse import issparse
 
 from ..cfdatetime import dt as cf_dt
-from ..constants import masked as cf_masked
+from ..constants import masked
 from ..decorators import (
     _deprecated_kwarg_check,
     _display_or_return,
@@ -30,45 +27,24 @@
 )
 from ..functions import (
     _DEPRECATION_ERROR_KWARGS,
-    _numpy_allclose,
     _section,
     abspath,
-    atol,
-    default_netCDF_fillvals,
     free_memory,
     parse_indices,
-    rtol,
 )
 from ..mixin2 import CFANetCDF, Container
 from ..units import Units
 from .collapse import Collapse
-from .creation import generate_axis_identifiers, to_dask
 from .dask_utils import (
-    _da_ma_allclose,
-    cf_asanyarray,
     cf_contains,
     cf_dt2rt,
-    cf_filled,
-    cf_harden_mask,
     cf_is_masked,
     cf_percentile,
     cf_rt2dt,
-    cf_soften_mask,
     cf_units,
-    cf_where,
 )
 from .mixin import DataClassDeprecationsMixin
-from .utils import (
-    YMDhms,
-    collapse,
-    conform_units,
-    convert_to_datetime,
-    convert_to_reftime,
-    first_non_missing_value,
-    is_numeric_dtype,
-    new_axis_identifier,
-    scalar_masked_array,
-)
+from .utils import YMDhms, collapse, conform_units, scalar_masked_array
 
 logger = logging.getLogger(__name__)
 
@@ -91,18 +67,6 @@
 _dtype_float = np.dtype(float)
 _dtype_bool = np.dtype(bool)
 
-_DEFAULT_CHUNKS = "auto"
-_DEFAULT_HARDMASK = True
-
-# Contstants used to specify which `Data` components should be cleared
-# when a new dask array is set. See `Data._clear_after_dask_update`
-# for details.
-_NONE = 0  # = 0b0000
-_ARRAY = 1  # = 0b0001
-_CACHE = 2  # = 0b0010
-_CFA = 4  # = 0b0100
-_ALL = 15  # = 0b1111
-
 
 class Data(DataClassDeprecationsMixin, CFANetCDF, Container, cfdm.Data):
     """An N-dimensional data array with units and masked values.
@@ -168,390 +132,25 @@ class Data(DataClassDeprecationsMixin, CFANetCDF, Container, cfdm.Data):
 
     """
 
-    def __init__(
-        self,
-        array=None,
-        units=None,
-        calendar=None,
-        fill_value=None,
-        hardmask=_DEFAULT_HARDMASK,
-        chunks=_DEFAULT_CHUNKS,
-        dt=False,
-        source=None,
-        copy=True,
-        dtype=None,
-        mask=None,
-        mask_value=None,
-        to_memory=False,
-        init_options=None,
-        _use_array=True,
-    ):
-        """**Initialisation**
-
-        :Parameters:
-
-            array: optional
-                The array of values. May be a scalar or array-like
-                object, including another `Data` instance, anything
-                with a `!to_dask_array` method, `numpy` array, `dask`
-                array, `xarray` array, `cf.Array` subclass, `list`,
-                `tuple`, scalar.
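For reference, the gate that the removed `cf_asanyarray` applied, and that `cfdm_to_memory` is imported to provide here, reduces to a few lines. A sketch with a hypothetical local name::

    import numpy as np

    def _to_memory(a):
        # Convert to numpy only when the object advertises, via its
        # `__asanyarray__` attribute, that conversion is needed
        if getattr(a, "__asanyarray__", False):
            return np.asanyarray(a)
        return a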
-
-                *Parameter example:*
-                  ``array=34.6``
-
-                *Parameter example:*
-                  ``array=[[1, 2], [3, 4]]``
-
-                *Parameter example:*
-                  ``array=numpy.ma.arange(10).reshape(2, 1, 5)``
-
-            units: `str` or `Units`, optional
-                The physical units of the data. if a `Units` object is
-                provided then this an also set the calendar.
-
-                The units (without the calendar) may also be set after
-                initialisation with the `set_units` method.
-
-                *Parameter example:*
-                  ``units='km hr-1'``
-
-                *Parameter example:*
-                  ``units='days since 2018-12-01'``
-
-            calendar: `str`, optional
-                The calendar for reference time units.
-
-                The calendar may also be set after initialisation with the
-                `set_calendar` method.
-
-                *Parameter example:*
-                  ``calendar='360_day'``
-
-            fill_value: optional
-                The fill value of the data. By default, or if set to
-                `None`, the `numpy` fill value appropriate to the array's
-                data-type will be used (see
-                `numpy.ma.default_fill_value`).
-
-                The fill value may also be set after initialisation with
-                the `set_fill_value` method.
-
-                *Parameter example:*
-                  ``fill_value=-999.``
-
-            dtype: data-type, optional
-                The desired data-type for the data. By default the
-                data-type will be inferred form the *array*
-                parameter.
-
-                The data-type may also be set after initialisation with
-                the `dtype` attribute.
-
-                *Parameter example:*
-                  ``dtype=float``
-
-                *Parameter example:*
-                  ``dtype='float32'``
-
-                *Parameter example:*
-                  ``dtype=numpy.dtype('i2')``
-
-                .. versionadded:: 3.0.4
-
-            mask: optional
-                Apply this mask to the data given by the *array*
-                parameter. By default, or if *mask* is `None`, no mask
-                is applied. May be any scalar or array-like object
-                (such as a `list`, `numpy` array or `Data` instance)
-                that is broadcastable to the shape of *array*. Masking
-                will be carried out where the mask elements evaluate
-                to `True`.
-
-                This mask will applied in addition to any mask already
-                defined by the *array* parameter.
-
-                .. versionadded:: 3.0.5
-
-            mask_value: scalar array_like
-                Mask *array* where it is equal to *mask_value*, using
-                numerically tolerant floating point equality.
-
-                .. versionadded:: 3.16.0
-
-            {{init source: optional}}
-
-            hardmask: `bool`, optional
-                If False then the mask is soft. By default the mask is
-                hard.
-
-            dt: `bool`, optional
-                If True then strings (such as ``'1990-12-01 12:00'``)
-                given by the *array* parameter are re-interpreted as
-                date-time objects. By default they are not.
-
-            {{init copy: `bool`, optional}}
-
-            {{chunks: `int`, `tuple`, `dict` or `str`, optional}}
-
-                .. versionadded:: 3.14.0
-
-            to_memory: `bool`, optional
-                If True then ensure that the original data are in
-                memory, rather than on disk.
-
-                If the original data are on disk, then reading data
-                into memory during initialisation will slow down the
-                initialisation process, but can considerably improve
-                downstream performance by avoiding the need for
-                independent reads for every dask chunk, each time the
-                data are computed.
-
-                In general, setting *to_memory* to True is not the same
-                as calling the `persist` of the newly created `Data`
-                object, which also decompresses data compressed by
-                convention and computes any data type, mask and
-                date-time modifications.
-
-                If the input *array* is a `dask.array.Array` object
-                then *to_memory* is ignored.
-
-                .. versionadded:: 3.14.0
-
-            init_options: `dict`, optional
-                Provide optional keyword arguments to methods and
-                functions called during the initialisation process. A
-                dictionary key identifies a method or function. The
-                corresponding value is another dictionary whose
-                key/value pairs are the keyword parameter names and
-                values to be applied.
-
-                Supported keys are:
-
-                * ``'from_array'``: Provide keyword arguments to
-                  the `dask.array.from_array` function. This is used
-                  when initialising data that is not already a dask
-                  array and is not compressed by convention.
-
-                * ``'first_non_missing_value'``: Provide keyword
-                  arguments to the
-                  `cf.data.utils.first_non_missing_value`
-                  function. This is used when the input array contains
-                  date-time strings or objects, and may affect
-                  performance.
-
-                *Parameter example:*
-                  ``{'from_array': {'inline_array': True}}``
-
-            chunk: deprecated at version 3.14.0
-                Use the *chunks* parameter instead.
-
-        **Examples**
-
-        >>> d = cf.Data(5)
-        >>> d = cf.Data([1,2,3], units='K')
-        >>> import numpy
-        >>> d = cf.Data(numpy.arange(10).reshape(2,5),
-        ...             units=Units('m/s'), fill_value=-999)
-        >>> d = cf.Data('fly')
-        >>> d = cf.Data(tuple('fly'))
-
-        """
-        if source is None and isinstance(array, self.__class__):
-            source = array
-
-        if init_options is None:
-            init_options = {}
-
-        if source is not None:
-            try:
-                array = source._get_Array(None)
-            except AttributeError:
-                array = None
-
-            super().__init__(
-                source=source, _use_array=_use_array and array is not None
-            )
-            if _use_array:
-                try:
-                    array = source.to_dask_array(_asanyarray=False)
-                except (AttributeError, TypeError):
-                    try:
-                        array = source.to_dask_array()
-                    except (AttributeError, TypeError):
-                        pass
-                    else:
-                        self._set_dask(array, copy=copy, clear=_NONE)
-                else:
-                    self._set_dask(
-                        array, copy=copy, clear=_NONE, asanyarray=None
-                    )
-            else:
-                self._del_dask(None, clear=_NONE)
-
-            # Set the mask hardness
-            self.hardmask = getattr(source, "hardmask", _DEFAULT_HARDMASK)
-
-            return
-
-        super().__init__(
-            array=array,
-            fill_value=fill_value,
-            _use_array=False,
-        )
-
-        # Set the units
-        units = Units(units, calendar=calendar)
-        self._Units = units
-
-        # Set the mask hardness
-        self.hardmask = hardmask
-
-        if array is None:
-            # No data has been set
-            return
-
-        sparse_array = issparse(array)
-
-        try:
-            ndim = array.ndim
-        except AttributeError:
-            ndim = np.ndim(array)
-
-        # Create the _cyclic attribute: identifies which axes are
-        # cyclic (and therefore allow cyclic slicing). It must be a
-        # subset of the axes given by the _axes attribute. If an axis
-        # is removed from _axes then it must also be removed from
-        # _cyclic.
-        #
-        # Never change the value of the _cyclic attribute in-place.
-        self._cyclic = _empty_set
-
-        # Create the _axes attribute: an ordered sequence of unique
-        # (within this `Data` instance) names for each array axis.
-        self._axes = generate_axis_identifiers(ndim)
-
-        if not _use_array:
-            return
-
-        # Still here? Then create a dask array and store it.
-        custom = self._custom
-
-        # Find out if the input data is compressed by convention
-        try:
-            compressed = array.get_compression_type()
-        except AttributeError:
-            compressed = ""
-
-        if compressed and init_options.get("from_array"):
-            raise ValueError(
-                "Can't define 'from_array' initialisation options "
-                "for compressed input arrays"
-            )
-
-        # Bring the compressed data into memory without
-        # decompressing it
-        if to_memory:
-            try:
-                array = array.to_memory()
-            except AttributeError:
-                pass
-
-        if self._is_abstract_Array_subclass(array):
-            # Save the input array in case it's useful later. For
-            # compressed input arrays this will contain extra
-            # information, such as a count or index variable.
-            self._set_Array(array)
-
-        # Cast the input data as a dask array
-        kwargs = init_options.get("from_array", {})
-        if "chunks" in kwargs:
-            raise TypeError(
-                "Can't define 'chunks' in the 'from_array' initialisation "
-                "options. Use the 'chunks' parameter instead."
-            )
-
-        # Set whether or not we're sure that the Data instance has a
-        # deterministic name
-        is_dask = is_dask_collection(array)
-        custom["deterministic"] = not is_dask
-
-        # Set whether or not to call `np.asanyarray` on chunks to
-        # convert them to numpy arrays.
-        if is_dask:
-            # We don't know what's in the dask array, so we should
-            # assume that it might need converting to a numpy array.
-            custom["__asanyarray__"] = True
-        else:
-            # Use the array's __asanyarray__ value, if it has one.
-            custom["__asanyarray__"] = bool(
-                getattr(array, "__asanyarray__", False)
-            )
-
-        dx = to_dask(array, chunks, **kwargs)
-
-        # Find out if we have an array of date-time objects
-        if units.isreftime:
-            dt = True
-
-        first_value = None
-        if not dt and dx.dtype.kind == "O":
-            kwargs = init_options.get("first_non_missing_value", {})
-            first_value = first_non_missing_value(dx, **kwargs)
-
-            if first_value is not None:
-                dt = hasattr(first_value, "timetuple")
-
-        # Convert string or object date-times to floating point
-        # reference times
-        if dt and dx.dtype.kind in "USO":
-            dx, units = convert_to_reftime(dx, units, first_value)
-            # Reset the units
-            self._Units = units
-
-        # Store the dask array
-        self._set_dask(dx, clear=_NONE, asanyarray=None)
-
-        # Override the data type
-        if dtype is not None:
-            self.dtype = dtype
-
-        # Apply a mask
-        if mask is not None:
-            if sparse_array:
-                raise ValueError("Can't mask sparse array")
-
-            self.where(mask, cf_masked, inplace=True)
-
-        # Apply masked values
-        if mask_value is not None:
-            if sparse_array:
-                raise ValueError("Can't mask sparse array")
-
-            self.masked_values(mask_value, inplace=True)
-
-    @property
-    def dask_compressed_array(self):
-        """Returns a dask array of the compressed data.
-
-        .. versionadded:: 3.14.0
-
-        :Returns:
-
-            `dask.array.Array`
-                The compressed data.
-
-        **Examples**
-
-        >>> a = d.dask_compressed_array
-
-        """
-        ca = self.source(None)
-
-        if ca is None or not ca.get_compression_type():
-            raise ValueError("not compressed: can't get compressed dask array")
-
-        return ca.to_dask_array()
+    # Constants used to specify which components should be cleared
+    # when a new dask array is set. See `_clear_after_dask_update` for
+    # details.
+    #
+    # These constants must have values 2**N (N>=0), except for
+    # `_NONE` which must be 0, and `_ALL` which must be the sum of
+    # other constants. It is therefore convenient to define these
+    # constants in binary.
+    _NONE = 0b000
+    _ARRAY = 0b001
+    _CACHE = 0b010
+    _CFA = 0b100
+    _ALL = 0b111
+
+    def __new__(cls, *args, **kwargs):
+        """Store component classes."""
+        instance = super().__new__(cls)
+        instance._Units_class = Units
+        return instance
 
     def __contains__(self, value):
         """Membership test operator ``in``
@@ -644,13 +243,13 @@ def __contains__(self, value):
                 # are incompatible
                 return False
 
-            # 'cf_contains' has its own calls to 'cf_asanyarray', so
-            # we can set '_asanyarray=False'.
-            value = value.to_dask_array(_asanyarray=False)
+            # 'cf_contains' has its own calls to 'cfdm_to_memory', so
+            # we can set '_force_to_memory=False'.
+            value = value.to_dask_array(_force_to_memory=False)
 
-        # 'cf_contains' has its own calls to 'cf_asanyarray', so we
-        # can set '_asanyarray=False'.
-        dx = self.to_dask_array(_asanyarray=False)
+        # 'cf_contains' has its own calls to 'cfdm_to_memory', so we
+        # can set '_force_to_memory=False'.
+        dx = self.to_dask_array(_force_to_memory=False)
 
         out_ind = tuple(range(dx.ndim))
         dx_ind = out_ind
@@ -668,156 +267,6 @@ def __contains__(self, value):
 
         return bool(dx.any())
 
-    @property
-    def _atol(self):
-        """Return the current value of the `cf.atol` function."""
-        return atol().value
-
-    @property
-    def _rtol(self):
-        """Return the current value of the `cf.rtol` function."""
-        return rtol().value
-
-    def __data__(self):
-        """Returns a new reference to self."""
-        return self
-
-    def __float__(self):
-        """Called to implement the built-in function `float`
-
-        x.__float__() <==> float(x)
-
-        **Performance**
-
-        `__float__` causes all delayed operations to be executed,
-        unless the dask array size is already known to be greater than
-        1.
-
-        """
-        return float(self.to_dask_array())
-
-    def __int__(self):
-        """Called to implement the built-in function `int`
-
-        x.__int__() <==> int(x)
-
-        **Performance**
-
-        `__int__` causes all delayed operations to be executed, unless
-        the dask array size is already known to be greater than 1.
-
-        """
-        return int(self.to_dask_array())
-
-    def __iter__(self):
-        """Called when an iterator is required.
-
-        x.__iter__() <==> iter(x)
-
-        **Performance**
-
-        If the shape of the data is unknown then it is calculated
-        immediately by executing all delayed operations.
-
-        **Examples**
-
-        >>> d = cf.Data([1, 2, 3], 'metres')
-        >>> for e in d:
-        ...     print(repr(e))
-        ...
-        <CF Data(): 1 metres>
-        <CF Data(): 2 metres>
-        <CF Data(): 3 metres>
-
-        >>> d = cf.Data([[1, 2], [3, 4]], 'metres')
-        >>> for e in d:
-        ...     print(repr(e))
-        ...
-        <CF Data(2): [1, 2] metres>
-        <CF Data(2): [3, 4] metres>
-
-        >>> d = cf.Data(99, 'metres')
-        >>> for e in d:
-        ...     print(repr(e))
-        ...
-        Traceback (most recent call last):
-            ...
-        TypeError: iteration over a 0-d Data
-
-        """
-        try:
-            n = len(self)
-        except TypeError:
-            raise TypeError(f"iteration over a 0-d {self.__class__.__name__}")
-
-        if self.__keepdims_indexing__:
-            for i in range(n):
-                out = self[i]
-                out.reshape(out.shape[1:], inplace=True)
-                yield out
-        else:
-            for i in range(n):
-                yield self[i]
-
-    def __len__(self):
-        """Called to implement the built-in function `len`.
-
-        x.__len__() <==> len(x)
-
-        **Performance**
-
-        If the shape of the data is unknown then it is calculated
-        immediately by executing all delayed operations.
-
-        **Examples**
-
-        >>> len(cf.Data([1, 2, 3]))
-        3
-        >>> len(cf.Data([[1, 2, 3]]))
-        1
-        >>> len(cf.Data([[1, 2, 3], [4, 5, 6]]))
-        2
-        >>> len(cf.Data(1))
-        Traceback (most recent call last):
-            ...
-        TypeError: len() of unsized object
-
-        """
-        # The dask graph is never going to be computed, so we can set
-        # '_asanyarray=False'.
-        dx = self.to_dask_array(_asanyarray=False)
-        if math.isnan(dx.size):
-            logger.debug("Computing data len: Performance may be degraded")
-            dx.compute_chunk_sizes()
-
-        return len(dx)
-
-    def __bool__(self):
-        """Truth value testing and the built-in operation `bool`
-
-        x.__bool__() <==> bool(x)
-
-        **Performance**
-
-        `__bool__` causes all delayed operations to be computed.
-
-        **Examples**
-
-        >>> bool(cf.Data(1.5))
-        True
-        >>> bool(cf.Data([[False]]))
-        False
-
-        """
-        size = self.size
-        if size != 1:
-            raise ValueError(
-                f"The truth value of a {self.__class__.__name__} with {size} "
-                "elements is ambiguous. Use d.any() or d.all()"
-            )
-
-        return bool(self.to_dask_array())
-
     def __getitem__(self, indices):
         """Return a subspace of the data defined by indices.
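The size guard in the removed `__len__` matters because a dask array can have an unknown (NaN) size after data-dependent operations. A sketch of the same pattern::

    import math

    import dask.array as da

    dx = da.arange(10, chunks=3)
    dx = dx[dx > 4]               # data-dependent: size becomes nan
    if math.isnan(dx.size):
        dx.compute_chunk_sizes()  # resolve the unknown chunk sizes
    assert len(dx) == 5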
@@ -879,19 +328,17 @@ def __getitem__(self, indices):
             indices = indices[2:]
 
         shape = self.shape
+        axes = self._axes
+        cyclic_axes = self._cyclic
         keepdims = self.__keepdims_indexing__
+
         indices, roll = parse_indices(
             shape, indices, cyclic=True, keepdims=keepdims
         )
-
-        axes = self._axes
-        cyclic_axes = self._cyclic
-
-        # ------------------------------------------------------------
-        # Roll axes with cyclic slices
-        # ------------------------------------------------------------
+        indices = tuple(indices)
         if roll:
+            # Roll axes with cyclic slices.
+            #
             # For example, if slice(-2, 3) has been requested on a
             # cyclic axis, then we roll that axis by two points and
             # apply the slice(0, 5) instead.
@@ -900,93 +347,21 @@ def __getitem__(self, indices):
                     "Can't take a cyclic slice of a non-cyclic axis"
                 )
 
-            new = self.roll(
-                axis=tuple(roll.keys()), shift=tuple(roll.values())
-            )
-            dx = new.to_dask_array(_asanyarray=False)
-        else:
-            new = self.copy()
-            dx = self.to_dask_array(_asanyarray=False)
-
-        # ------------------------------------------------------------
-        # Subspace the dask array
-        # ------------------------------------------------------------
-        if self.__orthogonal_indexing__:
-            # Apply 'orthogonal indexing': indices that are 1-d arrays
-            # or lists subspace along each dimension
-            # independently. This behaviour is similar to Fortran, but
-            # different to dask.
-            axes_with_list_indices = [
-                i
-                for i, x in enumerate(indices)
-                if isinstance(x, list) or getattr(x, "shape", False)
-            ]
-            n_axes_with_list_indices = len(axes_with_list_indices)
-
-            if n_axes_with_list_indices < 2:
-                # At most one axis has a list/1-d array index so do a
-                # normal dask subspace
-                dx = dx[tuple(indices)]
-            else:
-                # At least two axes have list/1-d array indices so we
-                # can't do a normal dask subspace
-
-                # Subspace axes which have list/1-d array indices
-                for axis in axes_with_list_indices:
-                    dx = da.take(dx, indices[axis], axis=axis)
-
-                if n_axes_with_list_indices < len(indices):
-                    # Subspace axes which don't have list/1-d array
-                    # indices. (Do this after subspacing axes which do
-                    # have list/1-d array indices, in case
-                    # __keepdims_indexing__ is False.)
-                    slice_indices = [
-                        slice(None) if i in axes_with_list_indices else x
-                        for i, x in enumerate(indices)
-                    ]
-                    dx = dx[tuple(slice_indices)]
+            d = self.roll(axis=tuple(roll.keys()), shift=tuple(roll.values()))
         else:
-            raise NotImplementedError(
-                "Non-orthogonal indexing has not yet been implemented"
-            )
-
-        # ------------------------------------------------------------
-        # Set the subspaced dask array
-        #
-        # * A subspaced chunk might not result in an array in memory,
-        #   so we set asanyarray=True to ensure that, if required,
-        #   they are converted at compute time.
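The removed orthogonal-indexing branch applies list indices one axis at a time with `dask.array.take`, because dask cannot fancy-index more than one axis at once. A minimal illustration (hypothetical values)::

    import dask.array as da

    dx = da.arange(12, chunks=4).reshape(3, 4)
    rows, cols = [0, 2], [1, 3]
    # Each list index subspaces its own axis independently
    out = da.take(da.take(dx, rows, axis=0), cols, axis=1)
    print(out.compute())  # [[ 1  3]
                          #  [ 9 11]]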
-        # ------------------------------------------------------------
-        new._set_dask(dx, asanyarray=True)
+            d = self
 
-        # ------------------------------------------------------------
-        # Get the axis identifiers for the subspace
-        # ------------------------------------------------------------
-        shape0 = shape
-        if keepdims:
-            new_axes = axes
-        else:
-            new_axes = [
-                axis
-                for axis, x in zip(axes, indices)
-                if not isinstance(x, Integral) and getattr(x, "shape", True)
-            ]
-            if new_axes != axes:
-                new._axes = new_axes
-                cyclic_axes = new._cyclic
-                if cyclic_axes:
-                    shape0 = [
-                        n for n, axis in zip(shape, axes) if axis in new_axes
-                    ]
+        new = super(Data, d).__getitem__(indices)
 
-        # ------------------------------------------------------------
-        # Cyclic axes that have been reduced in size are no longer
-        # considered to be cyclic
-        # ------------------------------------------------------------
         if cyclic_axes:
+            # Cyclic axes that have been reduced in size are no longer
+            # considered to be cyclic
+            shape0 = [
+                n for n, axis in zip(shape, self._axes) if axis in new._axes
+            ]
             x = [
                 axis
-                for axis, n0, n1 in zip(new_axes, shape0, new.shape)
+                for axis, n0, n1 in zip(new._axes, shape0, new.shape)
                 if axis in cyclic_axes and n0 != n1
             ]
             if x:
@@ -994,15 +369,10 @@ def __getitem__(self, indices):
                 # in-place
                 new._cyclic = cyclic_axes.difference(x)
 
-        # ------------------------------------------------------------
-        # Apply ancillary masks
-        # ------------------------------------------------------------
-        for mask in ancillary_mask:
-            new.where(mask, cf_masked, None, inplace=True)
-
-        if new.shape != self.shape:
-            # Delete hdf5 chunksizes when the shape has changed.
-            new.nc_clear_hdf5_chunksizes()
+        if ancillary_mask:
+            # Apply ancillary masks
+            for mask in ancillary_mask:
+                new.where(mask, masked, None, inplace=True)
 
         return new
 
@@ -1052,8 +422,6 @@ def __setitem__(self, indices, value):
         `hardmask`, `where`
 
         """
-        shape = self.shape
-
         ancillary_mask = ()
         try:
             arg = indices[0]
@@ -1068,64 +436,15 @@ def __setitem__(self, indices, value):
             indices = indices[2:]
 
         indices, roll = parse_indices(
-            shape,
+            self.shape,
             indices,
             cyclic=True,
             keepdims=self.__keepdims_indexing__,
         )
 
-        axes_with_list_indices = [
-            i
-            for i, x in enumerate(indices)
-            if isinstance(x, list) or getattr(x, "shape", False)
-        ]
-
-        # When there are two or more 1-d array indices of Booleans or
-        # integers, convert them to slices, if possible.
-        #
-        # Note: If any of these 1-d arrays is a dask collection, then
-        #       this will be computed.
-        if len(axes_with_list_indices) > 1:
-            for i, index in enumerate(indices):
-                if not (
-                    isinstance(index, list) or getattr(index, "shape", False)
-                ):
-                    # Not a 1-d array
-                    continue
-
-                index = np.array(index)
-
-                size = shape[i]
-                if index.dtype == bool:
-                    # Convert True values to integers
-                    index = np.arange(size)[index]
-                else:
-                    # Make sure all integer values are non-negative
-                    index = np.where(index < 0, index + size, index)
-
-                if size == 1:
-                    start = index[0]
-                    index = slice(start, start + 1)
-                else:
-                    steps = index[1:] - index[:-1]
-                    step = steps[0]
-                    if step and not (steps - step).any():
-                        # Array has a regular step, and so can be
-                        # converted to a slice.
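That regular-step test can be exercised on its own. A small check of the conversion for a positive step (hypothetical index values)::

    import numpy as np

    index = np.array([2, 5, 8, 11])
    steps = index[1:] - index[:-1]
    step = steps[0]
    if step and not (steps - step).any():
        # Positive regular step: the slice end is one past the last value
        index = slice(index[0], index[-1] + 1, step)
    a = np.arange(20)
    assert (a[index] == a[[2, 5, 8, 11]]).all()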
- if step > 0: - start, stop = index[0], index[-1] + 1 - elif step < 0: - start, stop = index[0], index[-1] - 1 - - if stop < 0: - stop = None - - index = slice(start, stop, step) - - indices[i] = index - - # Roll axes with cyclic slices if roll: + # Roll axes with cyclic slices + # # For example, if assigning to slice(-2, 3) has been # requested on a cyclic axis (and we're not using numpy # indexing), then we roll that axis by two points and @@ -1144,25 +463,20 @@ def __setitem__(self, indices, value): # Make sure that the units of value are the same as self value = conform_units(value, self.Units) - # Missing values could be affected, so make sure that the mask - # hardness has been applied. - dx = self.to_dask_array(apply_mask_hardness=True) - # Do the assignment - self._set_subspace(dx, indices, value) - self._set_dask(dx) + indices = tuple(indices) + super().__setitem__(indices, value) - # Unroll any axes that were rolled to enable a cyclic - # assignment if roll: + # Unroll any axes that were rolled to enable a cyclic + # assignment shifts = [-shift for shift in shifts] self.roll(shift=shifts, axis=roll_axes, inplace=True) - # Reset the original array values at locations that are - # excluded from the assignment by True values in any ancillary - # masks if ancillary_mask: - indices = tuple(indices) + # Reset the original array values at locations that are + # excluded from the assignment by True values in any + # ancillary masks original_self = original_self[indices] reset = self[indices] for mask in ancillary_mask: @@ -1172,132 +486,6 @@ def __setitem__(self, indices, value): return - @property - def __asanyarray__(self): - """Whether or not chunks need conversion to `numpy` arrays. - - .. versionadded:: NEXTVERSION - - ..seealso:: `to_dask_array`, `todict`, `_set_dask` - - :Returns: - - `bool` - - """ - return self._custom.get("__asanyarray__", True) - - @property - def __orthogonal_indexing__(self): - """Flag to indicate that orthogonal indexing is supported. - - Always True, indicating that 'orthogonal indexing' is - applied. This means that when indices are 1-d arrays or lists - then they subspace along each dimension independently. This - behaviour is similar to Fortran, but different to `numpy`. - - .. versionadded:: 3.14.0 - - .. seealso:: `__keepdims_indexing__`, `__getitem__`, - `__setitem__`, - `netCDF4.Variable.__orthogonal_indexing__` - - **Examples** - - >>> d = cf.Data([[1, 2, 3], - ... [4, 5, 6]]) - >>> e = d[[0], [0, 2]] - >>> e.shape - (1, 2) - >>> print(e.array) - [[1 3]] - >>> e = d[[0, 1], [0, 2]] - >>> e.shape - (2, 2) - >>> print(e.array) - [[1 3] - [4 6]] - - """ - return True - - @property - def __keepdims_indexing__(self): - """Flag to indicate whether dimensions indexed with integers are - kept. - - If set to True (the default) then providing a single integer - as a single-axis index does *not* reduce the number of array - dimensions by 1. This behaviour is different to `numpy`. - - If set to False then providing a single integer as a - single-axis index reduces the number of array dimensions by - 1. This behaviour is the same as `numpy`. - - .. versionadded:: 3.14.0 - - .. seealso:: `__orthogonal_indexing__`, `__getitem__`, - `__setitem__` - - **Examples** - - >>> d = cf.Data([[1, 2, 3], - ... 
[4, 5, 6]]) - >>> d.__keepdims_indexing__ - True - >>> e = d[0] - >>> e.shape - (1, 3) - >>> print(e.array) - [[1 2 3]] - - >>> d.__keepdims_indexing__ - True - >>> e = d[:, 1] - >>> e.shape - (2, 1) - >>> print(e.array) - [[2] - [5]] - - >>> d.__keepdims_indexing__ - True - >>> e = d[0, 1] - >>> e.shape - (1, 1) - >>> print(e.array) - [[2]] - - >>> d.__keepdims_indexing__ = False - >>> e = d[0] - >>> e.shape - (3,) - >>> print(e.array) - [1 2 3] - - >>> d.__keepdims_indexing__ - False - >>> e = d[:, 1] - >>> e.shape - (2,) - >>> print(e.array) - [2 5] - - >>> d.__keepdims_indexing__ - False - >>> e = d[0, 1] - >>> e.shape - () - >>> print(e.array) - 2 - - """ - return self._custom.get("__keepdims_indexing__", True) - - @__keepdims_indexing__.setter - def __keepdims_indexing__(self, value): - self._custom["__keepdims_indexing__"] = bool(value) - def _cfa_del_write(self): """Set the CFA write status of the data to `False`. @@ -1335,233 +523,6 @@ def _cfa_set_term(self, value): self._custom["cfa_term"] = bool(value) - def _clear_after_dask_update(self, clear=_ALL): - """Remove components invalidated by updating the `dask` array. - - Removes or modifies components that can't be guaranteed to be - consistent with an updated `dask` array. See the *clear* - parameter for details. - - .. versionadded:: 3.14.0 - - .. seealso:: `_del_Array`, `_del_cached_elements`, - `_cfa_del_write`, `_set_dask` - - :Parameters: - - clear: `int`, optional - Specify which components should be removed. Which - components are removed is determined by sequentially - combining *clear* with the ``_ARRAY``, ``_CACHE`` and - ``_CFA`` integer-valued contants, using the bitwise - AND operator: - - * If ``clear & _ARRAY`` is non-zero then a source - array is deleted. - - * If ``clear & _CACHE`` is non-zero then cached - element values are deleted. - - * If ``clear & _CFA`` is non-zero then the CFA write - status is set to `False`. - - By default *clear* is the ``_ALL`` integer-valued - constant, which results in all components being - removed. - - If *clear* is the ``_NONE`` integer-valued constant - then no components are removed. - - To retain a component and remove all others, use - ``_ALL`` with the bitwise OR operator. For instance, - if *clear* is ``_ALL ^ _CACHE`` then the cached - element values will be kept but all other components - will be removed. - - .. versionadded:: 3.15.0 - - :Returns: - - `None` - - """ - if not clear: - return - - if clear & _ARRAY: - # Delete a source array - self._del_Array(None) - - if clear & _CACHE: - # Delete cached element values - self._del_cached_elements() - - if clear & _CFA: - # Set the CFA write status to False - self._cfa_del_write() - - def _set_dask(self, dx, copy=False, clear=_ALL, asanyarray=False): - """Set the dask array. - - .. versionadded:: 3.14.0 - - .. seealso:: `to_dask_array`, `_clear_after_dask_update`, - `_del_dask` - - :Parameters: - - dx: `dask.array.Array` - The array to be inserted. - - copy: `bool`, optional - If True then copy *array* before setting it. By - default it is not copied. - - clear: `int`, optional - Specify which components should be removed. By default - *clear* is the ``_ALL`` integer-valued constant, which - results in all components being removed. See - `_clear_after_dask_update` for details. - - asanyarray: `None` or `bool`, optional - If `None` then do nothing. Otherwise set the - `__asanyarray__` attribute to *asanyarray*. - - .. 
versionadded:: NEXTVERSION - - :Returns: - - `None` - - """ - if dx is NotImplemented: - logger.warning( - "WARNING: NotImplemented has been set in the place of a " - "dask array." - "\n\n" - "This could occur if any sort of exception is raised " - "by a function that is run on chunks (via, for " - "instance, da.map_blocks or " - "dask.array.core.elemwise). Such a function could get " - "run at definition time in order to ascertain " - "suitability (such as data type casting, " - "broadcasting, etc.). Note that the exception may be " - "difficult to diagnose, as dask will have silently " - "trapped it and returned NotImplemented (see, for " - "instance, dask.array.core.elemwise). Print " - "statements in a local copy of dask are possibly the " - "way to go if the cause of the error is not obvious." - ) - - if copy: - dx = dx.copy() - - custom = self._custom - custom["dask"] = dx - if asanyarray is not None: - custom["__asanyarray__"] = bool(asanyarray) - - self._clear_after_dask_update(clear) - - def _del_dask(self, default=ValueError(), clear=_ALL): - """Remove the dask array. - - .. versionadded:: 3.14.0 - - .. seealso:: `to_dask_array`, `_clear_after_dask_update`, - `_set_dask` - - :Parameters: - - default: optional - Return the value of the *default* parameter if the - dask array axes has not been set. If set to an - `Exception` instance then it will be raised instead. - - clear: `int`, optional - Specify which components should be removed. By default - *clear* is the ``_ALL`` integer-valued constant, which - results in all components being removed. See - `_clear_after_dask_update` for details. If there is - no dask array then no components are removed, - regardless of the value of *clear*. - - :Returns: - - `dask.array.Array` - The removed dask array. - - **Examples** - - >>> d = cf.Data([1, 2, 3]) - >>> dx = d._del_dask() - >>> d._del_dask("No dask array") - 'No dask array' - >>> d._del_dask() - Traceback (most recent call last): - ... - ValueError: 'Data' has no dask array - >>> d._del_dask(RuntimeError('No dask array')) - Traceback (most recent call last): - ... - RuntimeError: No dask array - - """ - try: - out = self._custom.pop("dask") - except KeyError: - return self._default( - default, f"{self.__class__.__name__!r} has no dask array" - ) - - self._clear_after_dask_update(clear) - return out - - def _del_cached_elements(self): - """Delete any cached element values. - - Updates *data* in-place to remove the cached element values. - - .. versionadded:: 3.14.0 - - .. seealso:: `_get_cached_elements`, `_set_cached_elements` - - :Returns: - - `None` - - """ - self._custom.pop("cached_elements", None) - - def _get_cached_elements(self): - """Return the cache of selected element values. - - .. versionadded:: 3.14.1 - - .. seealso:: `_del_cached_elements`, `_set_cached_elements` - - :Returns: - - `dict` - The cached element values, where the keys are the element - positions within the dask array and the values are the cached - values for each position. - - **Examples** - - >>> d._get_cached_elements() - {} - - >>> d._get_cached_elements() - {0: 273.15, 1: 274.56, -1: 269.95} - - """ - cache = self._custom.get("cached_elements") - if not cache: - return {} - - return cache.copy() - def _is_abstract_Array_subclass(self, array): """Whether or not an array is a type of Array. @@ -1576,48 +537,6 @@ def _is_abstract_Array_subclass(self, array): """ return isinstance(array, cfdm.Array) - def _set_cached_elements(self, elements): - """Cache selected element values. 
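(Editorial aside: the removed `_get_cached_elements`/`_set_cached_elements` pair follows a copy-on-write dictionary pattern; this sketch reproduces it with a plain dict, where `custom` stands in for the instance's `_custom` dictionary.)

custom = {}

def set_cached_elements(elements):
    # Replace the stored cache rather than mutating it in-place
    cache = dict(custom.get("cached_elements", {}))
    cache.update(elements)
    custom["cached_elements"] = cache

set_cached_elements({0: 273.15, -1: 269.95})
assert custom["cached_elements"] == {0: 273.15, -1: 269.95}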
- - Updates *data* in-place to store the given element values - within its ``custom`` dictionary. - - .. warning:: Never change ``_custom['cached_elements']`` - in-place. - - .. versionadded:: 3.14.0 - - .. seealso:: `_del_cached_elements`, `_get_cached_elements` - - :Parameters: - - elements: `dict` - Zero or more element values to be cached, each keyed by - a unique identifier to allow unambiguous retrieval. - Existing cached elements not specified by *elements* - will not be removed. - - :Returns: - - `None` - - **Examples** - - >>> d._set_cached_elements({0: 273.15}) - - """ - if not elements: - return - - cache = self._custom.get("cached_elements") - if cache: - cache = cache.copy() - cache.update(elements) - else: - cache = elements.copy() - - self._custom["cached_elements"] = cache - def _cfa_set_write(self, status): """Set the CFA write status of the data. @@ -1642,42 +561,6 @@ def _cfa_set_write(self, status): """ self._custom["cfa_write"] = bool(status) - def _update_deterministic(self, other): - """Update the deterministic name status. - - .. versionadded:: 3.15.1 - - .. seealso:: `get_deterministic_name`, - `has_deterministic_name` - - :Parameters: - - other: `bool` or `Data` - If `False` then set the deterministic name status to - `False`. If `True` then do not change the - deterministic name status. If `Data` then set the - deterministic name status to `False` if and only if - *other* has a False deterministic name status. - - :Returns: - - `None` - - """ - if other is False: - self._custom["deterministic"] = False - return - - if other is True: - return - - custom = self._custom - deterministic = custom["deterministic"] - if deterministic: - custom["deterministic"] = ( - deterministic and other._custom["deterministic"] - ) - @_inplace_enabled(default=False) def diff(self, axis=-1, n=1, inplace=False): """Calculate the n-th discrete difference along the given axis. @@ -1770,7 +653,7 @@ def diff(self, axis=-1, n=1, inplace=False): # whenever that issue is resolved. units = self.Units if units.isreftime: - units = Units(units._units_since_reftime) + units = d._Units_class(units._units_since_reftime) d.override_units(units, inplace=True) return d @@ -2228,7 +1111,7 @@ def mean_of_upper_decile( # masked at those locations less_than_p90.filled(True, inplace=True) - d.where(less_than_p90, cf_masked, inplace=True) + d.where(less_than_p90, masked, inplace=True) # Find the mean of elements greater than (or equal to) the # 90th percentile @@ -2244,157 +1127,54 @@ def mean_of_upper_decile( return d @_inplace_enabled(default=False) - def pad_missing(self, axis, pad_width=None, to_size=None, inplace=False): - """Pad an axis with missing data. + def percentile( + self, + ranks, + axes=None, + method="linear", + squeeze=False, + mtol=1, + inplace=False, + interpolation=None, + interpolation2=None, + ): + """Compute percentiles of the data along the specified axes. - :Parameters: + The default is to compute the percentiles along a flattened + version of the data. - axis: `int` - Select the axis for which the padding is to be - applied. + If the input data are integers, or floats smaller than + float64, or the input data contains missing values, then + output data-type is float64. Otherwise, the output data-type + is the same as that of the input. - *Parameter example:* - Pad second axis: ``axis=1``. + If multiple percentile ranks are given then a new, leading + data dimension is created so that percentiles can be stored + for each percentile rank. 
- *Parameter example:* - Pad the last axis: ``axis=-1``. + **Accuracy** - {{pad_width: sequence of `int`, optional}} + The `percentile` method returns results that are consistent + with `numpy.percentile`, which may be different to those + created by `dask.percentile`. The dask method uses an + algorithm that calculates approximate percentiles which are + likely to be different from the correct values when there are + two or more dask chunks. - {{to_size: `int`, optional}} + >>> import numpy as np + >>> import dask.array as da + >>> import cf + >>> a = np.arange(101) + >>> dx = da.from_array(a, chunks=10) + >>> da.percentile(dx, 40).compute() + array([40.36]) + >>> np.percentile(a, 40) + 40.0 + >>> d = cf.Data(a, chunks=10) + >>> d.percentile(40).array + array([40.]) - {{inplace: `bool`, optional}} - - :Returns: - - `Data` or `None` - The padded data, or `None` if the operation was - in-place. - - **Examples** - - >>> d = cf.Data(np.arange(6).reshape(2, 3)) - >>> print(d.array) - [[0 1 2] - [3 4 5]] - >>> e = d.pad_missing(1, (1, 2)) - >>> print(e.array) - [[-- 0 1 2 -- --] - [-- 3 4 5 -- --]] - >>> f = e.pad_missing(0, (0, 1)) - >>> print(f.array) - [[-- 0 1 2 -- --] - [-- 3 4 5 -- --] - [-- -- -- -- -- --]] - - >>> g = d.pad_missing(1, to_size=5) - >>> print(g.array) - [[0 1 2 -- --] - [3 4 5 -- --]] - - """ - if not 0 <= axis < self.ndim: - raise ValueError( - f"'axis' must be a valid dimension position. Got {axis}" - ) - - if to_size is not None: - # Set pad_width from to_size - if pad_width is not None: - raise ValueError("Can't set both 'pad_width' and 'to_size'") - - pad_width = (0, to_size - self.shape[axis]) - elif pad_width is None: - raise ValueError("Must set either 'pad_width' or 'to_size'") - - pad_width = np.asarray(pad_width) - if pad_width.shape != (2,) or not pad_width.dtype.kind == "i": - raise ValueError( - "'pad_width' must be a sequence of two integers. " - f"Got: {pad_width}" - ) - - pad_width = tuple(pad_width) - if any(n < 0 for n in pad_width): - if to_size is not None: - raise ValueError( - f"'to_size' ({to_size}) must not be smaller than the " - f"original axis size ({self.shape[axis]})" - ) - - raise ValueError( - f"Can't set a negative number of pad values. Got: {pad_width}" - ) - - d = _inplace_enabled_define_and_cleanup(self) - - dx = d.to_dask_array() - mask0 = da.ma.getmaskarray(dx) - - pad = [(0, 0)] * dx.ndim - pad[axis] = pad_width - - # Pad the data with zero. This will lose the original mask. - dx = da.pad(dx, pad, mode="constant", constant_values=0) - - # Pad the mask with True - mask = da.pad(mask0, pad, mode="constant", constant_values=True) - - # Set the mask - dx = da.ma.masked_where(mask, dx) - - d._set_dask(dx) - return d - - @_inplace_enabled(default=False) - def percentile( - self, - ranks, - axes=None, - method="linear", - squeeze=False, - mtol=1, - inplace=False, - interpolation=None, - interpolation2=None, - ): - """Compute percentiles of the data along the specified axes. - - The default is to compute the percentiles along a flattened - version of the data. - - If the input data are integers, or floats smaller than float64, or - the input data contains missing values, then output data-type is - float64. Otherwise, the output data-type is the same as that of - the input. - - If multiple percentile ranks are given then a new, leading data - dimension is created so that percentiles can be stored for each - percentile rank. 
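(Editorial aside: the effect of the removed `pad_missing` method can be sketched with plain numpy masked arrays, padding the data with zeros and the mask with True before recombining; the values here are illustrative.)

import numpy as np

a = np.ma.arange(6).reshape(2, 3)
pad = [(0, 0), (1, 2)]  # axis 1: one point before, two after
data = np.pad(a.filled(0), pad, constant_values=0)
mask = np.pad(np.ma.getmaskarray(a), pad, constant_values=True)
padded = np.ma.masked_where(mask, data)  # [[-- 0 1 2 -- --], [-- 3 4 5 -- --]]
assert padded.shape == (2, 6)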
- - **Accuracy** - - The `percentile` method returns results that are consistent - with `numpy.percentile`, which may be different to those - created by `dask.percentile`. The dask method uses an - algorithm that calculates approximate percentiles which are - likely to be different from the correct values when there are - two or more dask chunks. - - >>> import numpy as np - >>> import dask.array as da - >>> import cf - >>> a = np.arange(101) - >>> dx = da.from_array(a, chunks=10) - >>> da.percentile(dx, 40).compute() - array([40.36]) - >>> np.percentile(a, 40) - 40.0 - >>> d = cf.Data(a, chunks=10) - >>> d.percentile(40).array - array([40.]) - - .. versionadded:: 3.0.4 + .. versionadded:: 3.0.4 .. seealso:: `digitize`, `median`, `mean_of_upper_decile`, `where` @@ -2407,8 +1187,9 @@ def percentile( axes: (sequence of) `int`, optional Select the axes. The *axes* argument may be one, or a - sequence, of integers that select the axis corresponding to - the given position in the list of axes of the data array. + sequence, of integers that select the axis + corresponding to the given position in the list of + axes of the data array. By default, of *axes* is `None`, all axes are selected. @@ -2418,11 +1199,11 @@ def percentile( squeeze: `bool`, optional If True then all axes over which percentiles are - calculated are removed from the returned data. By default - axes over which percentiles have been calculated are left - in the result as axes with size 1, meaning that the result - is guaranteed to broadcast correctly against the original - data. + calculated are removed from the returned data. By + default axes over which percentiles have been + calculated are left in the result as axes with size 1, + meaning that the result is guaranteed to broadcast + correctly against the original data. {{mtol: number, optional}} @@ -2533,9 +1314,9 @@ def percentile( else: axes = tuple(sorted(d._parse_axes(axes))) - # 'cf_percentile' has its own call to 'cf_asanyarray', so we - # can set '_asanyarray=False'. - dx = d.to_dask_array(_asanyarray=False) + # 'cf_percentile' has its own call to 'cfdm_to_memory', so we + # can set '_force_to_memory=False'. + dx = d.to_dask_array(_force_to_memory=False) dtype = dx.dtype shape = dx.shape @@ -2613,49 +1394,6 @@ def percentile( return d - @_inplace_enabled(default=False) - def persist(self, inplace=False): - """Persist the underlying dask array into memory. - - This turns an underlying lazy dask array into a equivalent - chunked dask array, but now with the results fully computed. - - `persist` is particularly useful when using distributed - systems, because the results will be kept in distributed - memory, rather than returned to the local process. - - Compare with `compute` and `array`. - - **Performance** - - `persist` causes all delayed operations to be computed. - - .. versionadded:: 3.14.0 - - .. seealso:: `compute`, `array`, `datetime_array`, - `dask.array.Array.persist` - - :Parameters: - - {{inplace: `bool`, optional}} - - :Returns: - - `Data` or `None` - The persisted data. If the operation was in-place then - `None` is returned. 
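(Editorial aside: the persist/compute distinction described above can be demonstrated with plain dask: `persist` returns an equivalent chunked dask array whose results are now held in memory, whereas `compute` returns a concrete array.)

import dask.array as da

dx = da.arange(10, chunks=5) ** 2
px = dx.persist()                  # still a chunked dask array
assert isinstance(px, da.Array)
assert (px.compute() == dx.compute()).all()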
- - **Examples** - - >>> e = d.persist() - - """ - d = _inplace_enabled_define_and_cleanup(self) - dx = self.to_dask_array() - dx = dx.persist() - d._set_dask(dx, clear=_ALL ^ _ARRAY ^ _CACHE) - return d - @_deprecated_kwarg_check("i", version="3.0.0", removed_at="4.0.0") @_inplace_enabled(default=False) def ceil(self, inplace=False, i=False): @@ -2799,63 +1537,6 @@ def cfa_set_write(self, status): self._cfa_del_write() - def compute(self): # noqa: F811 - """A view of the computed data. - - In-place changes to the returned array *might* affect the - underlying dask array, depending on how the dask array has - been defined, including any delayed operations. - - The returned array has the same mask hardness and fill values - as the data. - - Compare with `array`. - - **Performance** - - `compute` causes all delayed operations to be computed. - - .. versionadded:: 3.14.0 - - .. seealso:: `persist`, `array`, `datetime_array`, - `sparse_array` - - :Returns: - - An in-memory view of the data - - **Examples** - - >>> d = cf.Data([1, 2, 3.0], 'km') - >>> d.compute() - array([1., 2., 3.]) - - >>> from scipy.sparse import csr_array - >>> d = cf.Data(csr_array((2, 3))) - >>> d.compute() - <2x3 sparse array of type '' - with 0 stored elements in Compressed Sparse Row format> - >>>: d.array - array([[0., 0., 0.], - [0., 0., 0.]]) - >>> d.compute().toarray() - array([[0., 0., 0.], - [0., 0., 0.]]) - - """ - dx = self.to_dask_array() - a = dx.compute() - - if np.ma.isMA(a): - if self.hardmask: - a.harden_mask() - else: - a.soften_mask() - - a.set_fill_value(self.fill_value) - - return a - @_inplace_enabled(default=False) def convolution_filter( self, @@ -3148,87 +1829,6 @@ def cumsum( return d - @_inplace_enabled(default=False) - def rechunk( - self, - chunks=_DEFAULT_CHUNKS, - threshold=None, - block_size_limit=None, - balance=False, - inplace=False, - ): - """Change the chunk structure of the data. - - **Performance** - - Rechunking can sometimes be expensive and incur a lot of - communication overheads. - - .. versionadded:: 3.14.0 - - .. seealso:: `chunks`, `dask.array.rechunk` - - :Parameters: - - {{chunks: `int`, `tuple`, `dict` or `str`, optional}} - - {{threshold: `int`, optional}} - - {{block_size_limit: `int`, optional}} - - {{balance: `bool`, optional}} - - :Returns: - - `Data` or `None` - The rechunked data, or `None` if the operation was - in-place. - - **Examples** - - >>> x = cf.Data.ones((1000, 1000), chunks=(100, 100)) - - Specify uniform chunk sizes with a tuple - - >>> y = x.rechunk((1000, 10)) - - Or chunk only specific dimensions with a dictionary - - >>> y = x.rechunk({0: 1000}) - - Use the value ``-1`` to specify that you want a single chunk - along a dimension or the value ``"auto"`` to specify that dask - can freely rechunk a dimension to attain blocks of a uniform - block size. - - >>> y = x.rechunk({0: -1, 1: 'auto'}, block_size_limit=1e8) - - If a chunk size does not divide the dimension then rechunk - will leave any unevenness to the last chunk. - - >>> x.rechunk(chunks=(400, -1)).chunks - ((400, 400, 200), (1000,)) - - However if you want more balanced chunks, and don't mind - `dask` choosing a different chunksize for you then you can use - the ``balance=True`` option. 
-
- >>> x.rechunk(chunks=(400, -1), balance=True).chunks
- ((500, 500), (1000,))
-
- """
- d = _inplace_enabled_define_and_cleanup(self)
-
- dx = d.to_dask_array(_asanyarray=False)
- dx = dx.rechunk(chunks, threshold, block_size_limit, balance)
- # Dask rechunking is essentially a wrapper for __getitem__
- # calls on the chunks, which means that we can use the same
- # 'asanyarray' and 'clear' keyword values to `_set_dask` as
- # are used in `__gettem__`.
- d._set_dask(dx, clear=_ALL ^ _ARRAY ^ _CACHE, asanyarray=True)
-
- return d
-
 @_inplace_enabled(default=False)
 def _asdatetime(self, inplace=False):
 """Change the internal representation of data array elements
@@ -3276,9 +1876,9 @@ def _asdatetime(self, inplace=False):
 )

 if not d._isdatetime():
- # 'cf_rt2dt' has its own call to 'cf_asanyarray', so we
- # can set '_asanyarray=False'.
- dx = d.to_dask_array(_asanyarray=False)
+ # 'cf_rt2dt' has its own call to 'cfdm_to_memory', so we
+ # can set '_force_to_memory=False'.
+ dx = d.to_dask_array(_force_to_memory=False)
 dx = dx.map_blocks(cf_rt2dt, units=units, dtype=object)
 d._set_dask(dx)

@@ -3333,14 +1933,55 @@ def _asreftime(self, inplace=False):
 )

 if d._isdatetime():
- # 'cf_dt2rt' has its own call to 'cf_asanyarray', so we
- # can set '_asanyarray=False'.
- dx = d.to_dask_array(_asanyarray=False)
+ # 'cf_dt2rt' has its own call to 'cfdm_to_memory', so we
+ # can set '_force_to_memory=False'.
+ dx = d.to_dask_array(_force_to_memory=False)
 dx = dx.map_blocks(cf_dt2rt, units=units, dtype=float)
 d._set_dask(dx)

 return d

+ def _clear_after_dask_update(self, clear=None):
+ """Remove components invalidated by updating the `dask` array.
+
+ Removes or modifies components that can't be guaranteed to be
+ consistent with an updated `dask` array. See the *clear*
+ parameter for details.
+
+ .. versionadded:: NEXTVERSION
+
+ .. seealso:: `_del_Array`, `_del_cached_elements`,
+ `_set_dask`, `_cfa_del_write`
+
+ :Parameters:
+
+ clear: `int` or `None`, optional
+ Specify which components to remove, determined by
+ sequentially combining an integer value of *clear*
+ with the relevant class-level constants (such as
+ ``Data._ARRAY``), using the bitwise AND (&)
+ operator. If ``clear & <constant>`` is
+ True then the corresponding component is cleared. The
+ default value of `None` is equivalent to *clear* being
+ set to ``Data._ALL``.
+
+ The bitwise XOR (^) operator can be used to retain a
+ component (or components) but remove all others. For
+ instance, if *clear* is ``Data._ALL ^
+ Data._CACHE`` then all components except the
+ cached array values will be removed.
+
+ :Returns:
+
+ `None`
+
+ """
+ clear = super()._clear_after_dask_update(clear)
+
+ if clear & self._CFA:
+ # Set the CFA write status to False
+ self._cfa_del_write()
+
 def _combined_units(self, data1, method, inplace):
 """Combines by given method the data's units with other units.
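(Editorial aside: a minimal sketch of the bitmask convention used by `_clear_after_dask_update`, assuming illustrative flag values rather than the actual class-level constants: each component is a distinct power of two, membership is tested with bitwise AND, and XOR against the all-components value retains a single component.)

_ARRAY, _CACHE, _CFA = 1, 2, 4  # hypothetical values for the constants
_ALL = _ARRAY | _CACHE | _CFA

clear = _ALL ^ _CACHE                   # clear everything except the cache
assert clear & _ARRAY and clear & _CFA  # these components are cleared
assert not clear & _CACHE               # the cached values survive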
@@ -3455,7 +2096,11 @@ def _combined_units(self, data1, method, inplace): if not units0.equals(units1): data1 = data1.copy() data1.Units = units0 - return data0, data1, Units(_ut_unit=units0._ut_unit) + return ( + data0, + data1, + self._Units_class(_ut_unit=units0._ut_unit), + ) else: # Non-equivalent reference_times: raise an # exception @@ -3463,7 +2108,7 @@ def _combined_units(self, data1, method, inplace): elif units1.istime: # reference_time minus time: the output units are # reference_time - time0 = Units(_ut_unit=units0._ut_unit) + time0 = self._Units_class(_ut_unit=units0._ut_unit) if not units1.equals(time0): data1 = data1.copy() data1.Units = time0 @@ -3507,7 +2152,7 @@ def _combined_units(self, data1, method, inplace): if units0.istime: # Time plus reference_time: the output units are # reference_time - time1 = Units(_ut_unit=units1._ut_unit) + time1 = self._Units_class(_ut_unit=units1._ut_unit) if not units0.equals(time1): if not inplace: data0 = data0.copy() @@ -3728,7 +2373,8 @@ def _combined_units(self, data1, method, inplace): ) ) - def _binary_operation(self, other, method): + @classmethod + def _binary_operation(cls, data, other, method): """Implement binary arithmetic and comparison operations with the numpy broadcasting rules. @@ -3779,101 +2425,40 @@ def _binary_operation(self, other, method): # Ensure other is an independent Data object, for example # so that combination with cf.Query objects works. # ------------------------------------------------------------ - if not isinstance(other, self.__class__): + if not isinstance(other, cls): if ( isinstance(other, cftime.datetime) and other.calendar == "" - and self.Units.isreftime + and data.Units.isreftime ): other = cf_dt( - other, calendar=getattr(self.Units, "calendar", "standard") + other, calendar=getattr(data.Units, "calendar", "standard") ) elif other is None: # Can't sensibly initialise a Data object from a bare # `None` (issue #281) other = np.array(None, dtype=object) - other = type(self).asdata(other) + other = cls.asdata(other) # ------------------------------------------------------------ # Prepare data0 (i.e. self copied) and data1 (i.e. other) # ------------------------------------------------------------ - data0 = self.copy() + data0 = data.copy() # Parse units data0, other, new_Units = data0._combined_units(other, method, True) - # Cast as dask arrays - dx0 = data0.to_dask_array() - dx1 = other.to_dask_array() - - # Set if applicable the tolerance levels for the result - if method in ("__eq__", "__ne__"): - rtol = self._rtol - atol = self._atol - - # ------------------------------------------------------------ - # Perform the binary operation with data0 (self) and data1 - # (other) - # ------------------------------------------------------------ - if method == "__eq__": - if dx0.dtype.kind in "US" or dx1.dtype.kind in "US": - result = getattr(dx0, method)(dx1) - else: - result = da.isclose(dx0, dx1, rtol=rtol, atol=atol) - elif method == "__ne__": - if dx0.dtype.kind in "US" or dx1.dtype.kind in "US": - result = getattr(dx0, method)(dx1) - else: - result = ~da.isclose(dx0, dx1, rtol=rtol, atol=atol) - elif inplace: - # Find non-in-place equivalent operator (remove 'i') - equiv_method = method[:2] + method[3:] - # Need to add check in here to ensure that the operation is not - # trying to cast in a way which is invalid. 
For example, doing - # [an int array] ** float value = [a float array] is fine, but - # doing this in-place would try to chance an int array into a - # float one, which isn't valid casting. Therefore we need to - # catch cases where __i__ isn't possible even if ____ - # is due to datatype consistency rules. - result = getattr(dx0, equiv_method)(dx1) - else: - result = getattr(dx0, method)(dx1) + d = super()._binary_operation(data0, other, method) - if result is NotImplemented: - raise TypeError( - f"Unsupported operands for {method}: {self!r} and {other!r}" - ) + d.override_units(new_Units, inplace=True) - # Set axes when other has more dimensions than self - axes = None - ndim0 = dx0.ndim - if not ndim0: - axes = other._axes + if inplace: + data.__dict__ = d.__dict__ else: - diff = dx1.ndim - ndim0 - if diff > 0: - axes = list(self._axes) - for _ in range(diff): - axes.insert(0, new_axis_identifier(tuple(axes))) - - if inplace: # in-place so concerns original self - self._set_dask(result) - self.override_units(new_Units, inplace=True) - if axes is not None: - self._axes = axes + data = d - self._update_deterministic(other) - return self - - else: # not, so concerns a new Data object copied from self, data0 - data0._set_dask(result) - data0.override_units(new_Units, inplace=True) - if axes is not None: - data0._axes = axes - - data0._update_deterministic(other) - return data0 + return data def _parse_indices(self, *args, **kwargs): """'cf.Data._parse_indices' is not available. @@ -3945,9 +2530,9 @@ def _regrid( f"the shape of the regrid operator: {operator.src_shape}" ) - # 'regrid' has its own calls to 'cf_asanyarray', so we can set - # '_asanyarray=False'. - dx = self.to_dask_array(_asanyarray=False) + # 'regrid' has its own calls to 'cfdm_to_memory', so we can set + # '_force_to_memory=False'. + dx = self.to_dask_array(_force_to_memory=False) # Rechunk so that each chunk contains data in the form # expected by the regrid operator, i.e. the regrid axes all @@ -4190,25 +2775,25 @@ def concatenate( copied = not copy # to avoid making two copies in a given case # Get data as dask arrays and apply concatenation - # operation. We can set '_asanyarray=False' because at compute + # operation. We can set '_force_to_memory=False' because at compute # time the concatenation operation does not need to access the # actual data. - dxs = [d.to_dask_array(_asanyarray=False) for d in processed_data] + dxs = [d.to_dask_array(_force_to_memory=False) for d in processed_data] dx = da.concatenate(dxs, axis=axis) # Set the CFA write status # # Assume at first that all input data instances have True # status, but ... - cfa = _CFA + cfa = cls._CFA for d in processed_data: if not d.cfa_get_write(): # ... the CFA write status is False when any input # data instance has False status ... - cfa = _NONE + cfa = cls._NONE break - if cfa != _NONE: + if cfa != cls._NONE: non_concat_axis_chunks0 = list(processed_data[0].chunks) non_concat_axis_chunks0.pop(axis) for d in processed_data[1:]: @@ -4218,21 +2803,21 @@ def concatenate( # ... the CFA write status is False when any two # input data instances have different chunk # patterns for the non-concatenated axes. 
- cfa = _NONE + cfa = cls._NONE break - # Define the __asanyarray__ status - asanyarray = processed_data[0].__asanyarray__ + # Define the __in_memory__ status + in_memory = processed_data[0].__in_memory__ for d in processed_data[1:]: - if d.__asanyarray__ != asanyarray: + if d.__in_memory__ != in_memory: # If and only if any two input Data objects have - # different __asanyarray__ values, then set - # asanyarray=True on the concatenation. - asanyarray = True + # different __in_memory__ values, then set + # in_memory=False on the concatenation. + in_memory = False break # Set the new dask array - data0._set_dask(dx, clear=_ALL ^ cfa, asanyarray=asanyarray) + data0._set_dask(dx, clear=cls._ALL ^ cfa, in_memory=in_memory) # Set appropriate cached elements cached_elements = {} @@ -4289,57 +2874,13 @@ def concatenate( return data0 - def _unary_operation(self, operation): - """Implement unary arithmetic operations. - - It is called by the unary arithmetic methods, such as - __abs__(). - - .. seealso:: `_binary_operation` - - :Parameters: - - operation: `str` - The unary arithmetic method name (such as "__invert__"). - - :Returns: - - `Data` - A new Data array. - - **Examples** - - >>> d = cf.Data([[1, 2, -3, -4, -5]]) - - >>> e = d._unary_operation('__abs__') - >>> print(e.array) - [[1 2 3 4 5]] - - >>> e = d.__abs__() - >>> print(e.array) - [[1 2 3 4 5]] - - >>> e = abs(d) - >>> print(e.array) - [[1 2 3 4 5]] - - """ - out = self.copy(array=False) - - dx = self.to_dask_array() - dx = getattr(operator, operation)(dx) - - out._set_dask(dx) - - return out - def __add__(self, other): """The binary arithmetic operation ``+`` x.__add__(y) <==> x+y """ - return self._binary_operation(other, "__add__") + return self._binary_operation(self, other, "__add__") def __iadd__(self, other): """The augmented arithmetic assignment ``+=`` @@ -4347,7 +2888,7 @@ def __iadd__(self, other): x.__iadd__(y) <==> x+=y """ - return self._binary_operation(other, "__iadd__") + return self._binary_operation(self, other, "__iadd__") def __radd__(self, other): """The binary arithmetic operation ``+`` with reflected @@ -4356,7 +2897,7 @@ def __radd__(self, other): x.__radd__(y) <==> y+x """ - return self._binary_operation(other, "__radd__") + return self._binary_operation(self, other, "__radd__") def __sub__(self, other): """The binary arithmetic operation ``-`` @@ -4364,7 +2905,7 @@ def __sub__(self, other): x.__sub__(y) <==> x-y """ - return self._binary_operation(other, "__sub__") + return self._binary_operation(self, other, "__sub__") def __isub__(self, other): """The augmented arithmetic assignment ``-=`` @@ -4372,7 +2913,7 @@ def __isub__(self, other): x.__isub__(y) <==> x-=y """ - return self._binary_operation(other, "__isub__") + return self._binary_operation(self, other, "__isub__") def __rsub__(self, other): """The binary arithmetic operation ``-`` with reflected @@ -4381,7 +2922,7 @@ def __rsub__(self, other): x.__rsub__(y) <==> y-x """ - return self._binary_operation(other, "__rsub__") + return self._binary_operation(self, other, "__rsub__") def __mul__(self, other): """The binary arithmetic operation ``*`` @@ -4389,7 +2930,7 @@ def __mul__(self, other): x.__mul__(y) <==> x*y """ - return self._binary_operation(other, "__mul__") + return self._binary_operation(self, other, "__mul__") def __imul__(self, other): """The augmented arithmetic assignment ``*=`` @@ -4397,7 +2938,7 @@ def __imul__(self, other): x.__imul__(y) <==> x*=y """ - return self._binary_operation(other, "__imul__") + return 
self._binary_operation(self, other, "__imul__") def __rmul__(self, other): """The binary arithmetic operation ``*`` with reflected @@ -4406,7 +2947,7 @@ def __rmul__(self, other): x.__rmul__(y) <==> y*x """ - return self._binary_operation(other, "__rmul__") + return self._binary_operation(self, other, "__rmul__") def __div__(self, other): """The binary arithmetic operation ``/`` @@ -4414,7 +2955,7 @@ def __div__(self, other): x.__div__(y) <==> x/y """ - return self._binary_operation(other, "__div__") + return self._binary_operation(self, other, "__div__") def __idiv__(self, other): """The augmented arithmetic assignment ``/=`` @@ -4422,7 +2963,7 @@ def __idiv__(self, other): x.__idiv__(y) <==> x/=y """ - return self._binary_operation(other, "__idiv__") + return self._binary_operation(self, other, "__idiv__") def __rdiv__(self, other): """The binary arithmetic operation ``/`` with reflected @@ -4431,7 +2972,7 @@ def __rdiv__(self, other): x.__rdiv__(y) <==> y/x """ - return self._binary_operation(other, "__rdiv__") + return self._binary_operation(self, other, "__rdiv__") def __floordiv__(self, other): """The binary arithmetic operation ``//`` @@ -4439,7 +2980,7 @@ def __floordiv__(self, other): x.__floordiv__(y) <==> x//y """ - return self._binary_operation(other, "__floordiv__") + return self._binary_operation(self, other, "__floordiv__") def __ifloordiv__(self, other): """The augmented arithmetic assignment ``//=`` @@ -4447,7 +2988,7 @@ def __ifloordiv__(self, other): x.__ifloordiv__(y) <==> x//=y """ - return self._binary_operation(other, "__ifloordiv__") + return self._binary_operation(self, other, "__ifloordiv__") def __rfloordiv__(self, other): """The binary arithmetic operation ``//`` with reflected @@ -4456,7 +2997,7 @@ def __rfloordiv__(self, other): x.__rfloordiv__(y) <==> y//x """ - return self._binary_operation(other, "__rfloordiv__") + return self._binary_operation(self, other, "__rfloordiv__") def __truediv__(self, other): """The binary arithmetic operation ``/`` (true division) @@ -4464,7 +3005,7 @@ def __truediv__(self, other): x.__truediv__(y) <==> x/y """ - return self._binary_operation(other, "__truediv__") + return self._binary_operation(self, other, "__truediv__") def __itruediv__(self, other): """The augmented arithmetic assignment ``/=`` (true division) @@ -4472,7 +3013,7 @@ def __itruediv__(self, other): x.__itruediv__(y) <==> x/=y """ - return self._binary_operation(other, "__itruediv__") + return self._binary_operation(self, other, "__itruediv__") def __rtruediv__(self, other): """The binary arithmetic operation ``/`` (true division) with @@ -4481,7 +3022,7 @@ def __rtruediv__(self, other): x.__rtruediv__(y) <==> y/x """ - return self._binary_operation(other, "__rtruediv__") + return self._binary_operation(self, other, "__rtruediv__") def __pow__(self, other, modulo=None): """The binary arithmetic operations ``**`` and ``pow`` @@ -4496,7 +3037,7 @@ def __pow__(self, other, modulo=None): ) ) - return self._binary_operation(other, "__pow__") + return self._binary_operation(self, other, "__pow__") def __ipow__(self, other, modulo=None): """The augmented arithmetic assignment ``**=`` @@ -4511,7 +3052,7 @@ def __ipow__(self, other, modulo=None): ) ) - return self._binary_operation(other, "__ipow__") + return self._binary_operation(self, other, "__ipow__") def __rpow__(self, other, modulo=None): """The binary arithmetic operations ``**`` and ``pow`` with @@ -4527,7 +3068,7 @@ def __rpow__(self, other, modulo=None): ) ) - return self._binary_operation(other, "__rpow__") + 
return self._binary_operation(self, other, "__rpow__")

 def __mod__(self, other):
 """The binary arithmetic operation ``%``

@@ -4535,7 +3076,7 @@ def __mod__(self, other):
 x.__mod__(y) <==> x % y

 """
- return self._binary_operation(other, "__mod__")
+ return self._binary_operation(self, other, "__mod__")

 def __imod__(self, other):
 """The binary arithmetic operation ``%=``

@@ -4543,7 +3084,7 @@ def __imod__(self, other):
 x.__imod__(y) <==> x %= y

 """
- return self._binary_operation(other, "__imod__")
+ return self._binary_operation(self, other, "__imod__")

 def __rmod__(self, other):
 """The binary arithmetic operation ``%`` with reflected

@@ -4552,950 +3093,184 @@ def __rmod__(self, other):
 x.__rmod__(y) <==> y % x

 """
- return self._binary_operation(other, "__rmod__")
+ return self._binary_operation(self, other, "__rmod__")

- def __eq__(self, other):
- """The rich comparison operator ``==``
+ def __query_isclose__(self, value, rtol, atol):
+ """Query interface method for an "is close" condition.

- x.__eq__(y) <==> x==y
+ :Parameters:

- """
- return self._binary_operation(other, "__eq__")
+ value:
+ The object to test against.

- def __ne__(self, other):
- """The rich comparison operator ``!=``
+ rtol: number
+ The tolerance on relative numerical differences.

- x.__ne__(y) <==> x!=y
-
- """
- return self._binary_operation(other, "__ne__")
-
- def __ge__(self, other):
- """The rich comparison operator ``>=``
-
- x.__ge__(y) <==> x>=y
-
- """
- return self._binary_operation(other, "__ge__")
-
- def __gt__(self, other):
- """The rich comparison operator ``>``
-
- x.__gt__(y) <==> x>y
-
- """
- return self._binary_operation(other, "__gt__")
-
- def __le__(self, other):
- """The rich comparison operator ``<=``
-
- x.__le__(y) <==> x<=y
-
- """
- return self._binary_operation(other, "__le__")
-
- def __lt__(self, other):
- """The rich comparison operator ``<``
-
- x.__lt__(y) <==> x<y
-
- """
- return self._binary_operation(other, "__lt__")
-
- def __and__(self, other):
- """The binary bitwise operation ``&``
-
- x.__and__(y) <==> x&y
-
- """
- return self._binary_operation(other, "__and__")
-
- def __iand__(self, other):
- """The augmented bitwise assignment ``&=``
-
- x.__iand__(y) <==> x&=y
-
- """
- return self._binary_operation(other, "__iand__")
-
- def __rand__(self, other):
- """The binary bitwise operation ``&`` with reflected operands.
-
- x.__rand__(y) <==> y&x
-
- """
- return self._binary_operation(other, "__rand__")
-
- def __or__(self, other):
- """The binary bitwise operation ``|``
-
- x.__or__(y) <==> x|y
-
- """
- return self._binary_operation(other, "__or__")
-
- def __ior__(self, other):
- """The augmented bitwise assignment ``|=``
-
- x.__ior__(y) <==> x|=y
-
- """
- return self._binary_operation(other, "__ior__")
-
- def __ror__(self, other):
- """The binary bitwise operation ``|`` with reflected operands.
-
- x.__ror__(y) <==> y|x
-
- """
- return self._binary_operation(other, "__ror__")
-
- def __xor__(self, other):
- """The binary bitwise operation ``^``
-
- x.__xor__(y) <==> x^y
-
- """
- return self._binary_operation(other, "__xor__")
-
- def __ixor__(self, other):
- """The augmented bitwise assignment ``^=``
-
- x.__ixor__(y) <==> x^=y
-
- """
- return self._binary_operation(other, "__ixor__")
-
- def __rxor__(self, other):
- """The binary bitwise operation ``^`` with reflected operands.
-
- x.__rxor__(y) <==> y^x
-
- """
- return self._binary_operation(other, "__rxor__")
-
- def __lshift__(self, y):
- """The binary bitwise operation ``<<``
-
- x.__lshift__(y) <==> x<<y
-
- """
- return self._binary_operation(y, "__lshift__")
-
- def __ilshift__(self, y):
- """The augmented bitwise assignment ``<<=``
-
- x.__ilshift__(y) <==> x<<=y
-
- """
- return self._binary_operation(y, "__ilshift__")
-
- def __rlshift__(self, y):
- """The binary bitwise operation ``<<`` with reflected operands.
-
- x.__rlshift__(y) <==> y<<x
-
- """
- return self._binary_operation(y, "__rlshift__")
-
- def __rshift__(self, y):
- """The binary bitwise operation ``>>``
-
- x.__rshift__(y) <==> x>>y
-
- """
- return self._binary_operation(y, "__rshift__")
-
- def __irshift__(self, y):
- """The augmented bitwise assignment ``>>=``
-
- x.__irshift__(y) <==> x>>=y
-
- """
- return self._binary_operation(y, "__irshift__")
-
- def __rrshift__(self, y):
- """The binary bitwise operation ``>>`` with reflected operands.
-
- x.__rrshift__(y) <==> y>>x
-
- """
- return self._binary_operation(y, "__rrshift__")
-
- def __abs__(self):
- """The unary arithmetic operation ``abs``
-
- x.__abs__() <==> abs(x)
-
- """
- return self._unary_operation("__abs__")
-
- def __neg__(self):
- """The unary arithmetic operation ``-``
-
- x.__neg__() <==> -x
-
- """
- return self._unary_operation("__neg__")
-
- def __invert__(self):
- """The unary bitwise operation ``~``
-
- x.__invert__() <==> ~x
-
- """
- return self._unary_operation("__invert__")
-
- def __pos__(self):
- """The unary arithmetic operation ``+``
-
- x.__pos__() <==> +x
-
- """
- return self._unary_operation("__pos__")
-
- def __query_isclose__(self, value, rtol, atol):
- """Query interface method for an "is close" condition.
-
- :Parameters:
-
- value:
- The object to test against.
-
- rtol: number
- The tolerance on relative numerical differences.
-
- atol: number
- The tolerance on absolute numerical differences.

 .. versionadded:: 3.15.2

 """
 return self.isclose(value, rtol=rtol, atol=atol)

- @property
- def _Units(self):
- """Storage for the units.
-
- The units are stored in a `Units` object, and reflect the
- units of the (yet to be computed) elements of the underlying
- data.
-
- .. warning:: Assigning to `_Units` does *not* trigger a units
- conversion of the underlying data
- values. Therefore assigning to `_Units` should
- only be done in cases when it is known that the
- intrinsic units represented by the data values
- are inconsistent with the existing value of
- `_Units`. Before assigning to `_Units`, first
- consider if assigning to `Units`, or calling the
- `override_units` or `override_calendar` method is
- a more appropriate course of action, and use one
- of those if possible.
-
- """
- return self._custom["_Units"]
-
- @_Units.setter
- def _Units(self, value):
- self._custom["_Units"] = value
-
- @_Units.deleter
- def _Units(self):
- self._custom["_Units"] = _units_None

 @property
 def _cyclic(self):
 """Storage for axis cyclicity.

 Contains a `set` that identifies which axes are cyclic (and
- therefore allow cyclic slicing). The set contains a subset of
- the axis identifiers defined by the `_axes` attribute.
-
- .. warning:: Never change the value of the `_cyclic` attribute
- in-place.
-
- .. note:: When an axis identifier is removed from the `_axes`
- attribute then it is automatically also removed from
- the `_cyclic` attribute.
-
- """
- return self._custom["_cyclic"]
-
- @_cyclic.setter
- def _cyclic(self, value):
- self._custom["_cyclic"] = value
-
- @_cyclic.deleter
- def _cyclic(self):
- self._custom["_cyclic"] = _empty_set
-
- @property
- def _axes(self):
- """Storage for the axis identifiers.
-
- Contains a `tuple` of identifiers, one for each array axis.
- - .. note:: When the axis identifiers are reset, then any axis - identifier named by the `_cyclic` attribute which is - not in the new `_axes` set is automatically removed - from the `_cyclic` attribute. - - """ - return self._custom["_axes"] - - @_axes.setter - def _axes(self, value): - self._custom["_axes"] = tuple(value) - - # Remove cyclic axes that are not in the new axes - cyclic = self._cyclic - if cyclic: - # Never change the value of the _cyclic attribute in-place - self._cyclic = cyclic.intersection(value) - - # ---------------------------------------------------------------- - # Dask attributes - # ---------------------------------------------------------------- - @property - def chunks(self): - """The `dask` chunk sizes for each dimension. - - .. versionadded:: 3.14.0 - - .. seealso:: `npartitions`, `numblocks`, `rechunk` - - **Examples** - - >>> d = cf.Data.ones((6, 5), chunks=(2, 4)) - >>> d.chunks - ((2, 2, 2), (4, 1)) - >>> d.numblocks - (3, 2) - >>> d.npartitions - 6 - - """ - # The dask graph is never going to be computed, so we can set - # '_asanyarray=False'. - return self.to_dask_array(_asanyarray=False).chunks - - # ---------------------------------------------------------------- - # Attributes - # ---------------------------------------------------------------- - @property - def Units(self): - """The `cf.Units` object containing the units of the data array. - - Can be set to any units equivalent to the existing units. - - .. seealso `override_units`, `override_calendar` - - **Examples** - - >>> d = cf.Data([1, 2, 3], units='m') - >>> d.Units - - >>> d.Units = cf.Units('kilometres') - >>> d.Units - - >>> d.Units = cf.Units('km') - >>> d.Units - - - """ - return self._Units - - @Units.setter - def Units(self, value): - try: - old_units = self._Units - except KeyError: - pass - else: - if not old_units or self.Units.equals(value): - self._Units = value - return - - if old_units and not old_units.equivalent(value): - raise ValueError( - f"Can't set Units to {value!r} that are not " - f"equivalent to the current units {old_units!r}. " - "Consider using the override_units method instead." - ) - - dtype = self.dtype - if dtype.kind in "iu": - if dtype.char in "iI": - dtype = _dtype_float32 - else: - dtype = _dtype_float - - cf_func = partial(cf_units, from_units=old_units, to_units=value) - - # 'cf_units' has its own call to 'cf_asanyarray', so we can - # set '_asanyarray=False'. - dx = self.to_dask_array(_asanyarray=False) - dx = dx.map_blocks(cf_func, dtype=dtype) - - # Setting equivalent units doesn't affect the CFA write - # status. Nor does it invalidate any cached values, but only - # because we'll adjust those, too. - self._set_dask(dx, clear=_ALL ^ _CACHE ^ _CFA) - - # Adjust cached values for the new units - cache = self._get_cached_elements() - if cache: - self._set_cached_elements( - {index: cf_func(value) for index, value in cache.items()} - ) - - self._Units = value - - @Units.deleter - def Units(self): - raise ValueError( - "Can't delete the Units attribute. " - "Consider using the override_units method instead." - ) - - @property - def data(self): - """The data as an object identity. - - **Examples** - - >>> d = cf.Data([1, 2], 'm') - >>> d.data is d - True - - """ - return self - - @property - def dtype(self): - """The `numpy` data-type of the data. - - Always returned as a `numpy` data-type instance, but may be set - as any object that converts to a `numpy` data-type. 
- - **Examples** - - >>> d = cf.Data([1, 2.5, 3.9]) - >>> d.dtype - dtype('float64') - >>> print(d.array) - [1. 2.5 3.9] - >>> d.dtype = int - >>> d.dtype - dtype('int64') - >>> print(d.array) - [1 2 3] - >>> d.dtype = 'float32' - >>> print(d.array) - [1. 2. 3.] - >>> import numpy as np - >>> d.dtype = np.dtype('int32') - >>> d.dtype - dtype('int32') - >>> print(d.array) - [1 2 3] - - """ - # The dask graph is never going to be computed, so we can set - # '_asanyarray=False'. - dx = self.to_dask_array(_asanyarray=False) - return dx.dtype - - @dtype.setter - def dtype(self, value): - # Only change the datatype if it's different to that of the - # dask array - if self.dtype != value: - dx = self.to_dask_array() - dx = dx.astype(value) - self._set_dask(dx) - - @property - def fill_value(self): - """The data array missing data value. - - If set to `None` then the default `numpy` fill value appropriate to - the data array's data-type will be used. - - Deleting this attribute is equivalent to setting it to None, so - this attribute is guaranteed to always exist. - - **Examples** - - >>> d.fill_value = 9999.0 - >>> d.fill_value - 9999.0 - >>> del d.fill_value - >>> d.fill_value - None - - """ - return self.get_fill_value(None) - - @fill_value.setter - def fill_value(self, value): - self.set_fill_value(value) - - @fill_value.deleter - def fill_value(self): - self.del_fill_value(None) - - @property - def hardmask(self): - """Hardness of the mask. - - If the `hardmask` attribute is `True`, i.e. there is a hard - mask, then unmasking an entry will silently not occur. This is - the default, and prevents overwriting the mask. - - If the `hardmask` attribute is `False`, i.e. there is a soft - mask, then masked entries may be overwritten with non-missing - values. - - .. note:: Setting the `hardmask` attribute does not - immediately change the mask hardness, rather its - value indicates to other methods (such as `where`, - `transpose`, etc.) whether or not the mask needs - hardening or softening prior to an operation being - defined, and those methods will reset the mask - hardness if required. - - By contrast, the `harden_mask` and `soften_mask` - methods immediately reset the mask hardness of the - underlying `dask` array, and also set the value of - the `hardmask` attribute. - - .. seealso:: `harden_mask`, `soften_mask`, `to_dask_array`, - `where`, `__setitem__` - - **Examples** - - >>> d = cf.Data([1, 2, 3]) - >>> d.hardmask - True - >>> d[0] = cf.masked - >>> print(d.array) - [-- 2 3] - >>> d[...] = 999 - >>> print(d.array) - [-- 999 999] - >>> d.hardmask = False - >>> d.hardmask - False - >>> d[...] = -1 - >>> print(d.array) - [-1 -1 -1] - - """ - return self._custom.get("hardmask", _DEFAULT_HARDMASK) - - @hardmask.setter - def hardmask(self, value): - self._custom["hardmask"] = value - - @property - def is_masked(self): - """True if the data array has any masked values. - - **Performance** - - `is_masked` causes all delayed operations to be executed. - - **Examples** - - >>> d = cf.Data([[1, 2, 3], [4, 5, 6]]) - >>> print(d.is_masked) - False - >>> d[0, ...] = cf.masked - >>> d.is_masked - True - - """ - # 'cf_is_masked' has its own call to 'cf_asanyarray', so we - # can set '_asanyarray=False'. 
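(Editorial aside: hard versus soft masks, as described in the `hardmask` documentation above, behave the same way in plain numpy masked arrays.)

import numpy as np

a = np.ma.array([1, 2, 3], mask=[True, False, False])
a.harden_mask()
a[...] = 999                 # the masked element is silently not unmasked
assert a[0] is np.ma.masked

a.soften_mask()
a[...] = -1                  # a soft mask may be overwritten
assert (a == -1).all()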
- dx = self.to_dask_array(_asanyarray=False) - - out_ind = tuple(range(dx.ndim)) - dx_ind = out_ind - - dx = da.blockwise( - cf_is_masked, - out_ind, - dx, - dx_ind, - adjust_chunks={i: 1 for i in out_ind}, - dtype=bool, - ) - - return bool(dx.any()) - - @property - def nbytes(self): - """Total number of bytes consumed by the elements of the array. - - Does not include bytes consumed by the array mask - - **Performance** - - If the number of bytes is unknown then it is calculated - immediately by executing all delayed operations. - - **Examples** - - >>> d = cf.Data([[1, 1.5, 2]]) - >>> d.dtype - dtype('float64') - >>> d.size, d.dtype.itemsize - (3, 8) - >>> d.nbytes - 24 - >>> d[0] = cf.masked - >>> print(d.array) - [[-- 1.5 2.0]] - >>> d.nbytes - 24 - - """ - # The dask graph is never going to be computed, so we can set - # '_asanyarray=False'. - dx = self.to_dask_array(_asanyarray=False) - if math.isnan(dx.size): - logger.debug("Computing data nbytes: Performance may be degraded") - dx.compute_chunk_sizes() - - return dx.nbytes - - @property - def ndim(self): - """Number of dimensions in the data array. - - **Examples** - - >>> d = cf.Data([[1, 2, 3], [4, 5, 6]]) - >>> d.ndim - 2 - - >>> d = cf.Data([[1, 2, 3]]) - >>> d.ndim - 2 - - >>> d = cf.Data([[3]]) - >>> d.ndim - 2 - - >>> d = cf.Data([3]) - >>> d.ndim - 1 - - >>> d = cf.Data(3) - >>> d.ndim - 0 - - """ - # The dask graph is never going to be computed, so we can set - # '_asanyarray=False'. - dx = self.to_dask_array(_asanyarray=False) - return dx.ndim - - @property - def npartitions(self): - """The total number of chunks. - - .. versionadded:: 3.14.0 - - .. seealso:: `chunks`, `numblocks`, `rechunk` - - **Examples** - - >>> d = cf.Data.ones((6, 5), chunks=(2, 4)) - >>> d.chunks - ((2, 2, 2), (4, 1)) - >>> d.numblocks - (3, 2) - >>> d.npartitions - 6 - - """ - # The dask graph is never going to be computed, so we can set - # '_asanyarray=False'. - return self.to_dask_array(_asanyarray=False).npartitions - - @property - def numblocks(self): - """The number of chunks along each dimension. - - .. versionadded:: 3.14.0 - - .. seealso:: `chunks`, `npartitions`, `rechunk` - - **Examples** - - >>> d = cf.Data.ones((6, 5), chunks=(2, 4)) - >>> d.chunks - ((2, 2, 2), (4, 1)) - >>> d.numblocks - (3, 2) - >>> d.npartitions - 6 - - """ - # The dask graph is never going to be computed, so we can set - # '_asanyarray=False'. - return self.to_dask_array(_asanyarray=False).numblocks - - @property - def shape(self): - """Tuple of the data array's dimension sizes. - - **Performance** - - If the shape of the data is unknown then it is calculated - immediately by executing all delayed operations. - - **Examples** - - >>> d = cf.Data([[1, 2, 3], [4, 5, 6]]) - >>> d.shape - (2, 3) - - >>> d = cf.Data([[1, 2, 3]]) - >>> d.shape - (1, 3) - - >>> d = cf.Data([[3]]) - >>> d.shape - (1, 1) - - >>> d = cf.Data(3) - >>> d.shape - () - - """ - # The dask graph is never going to be computed, so we can set - # '_asanyarray=False'. - dx = self.to_dask_array(_asanyarray=False) - if math.isnan(dx.size): - logger.debug("Computing data shape: Performance may be degraded") - dx.compute_chunk_sizes() - - return dx.shape - - @property - def size(self): - """Number of elements in the data array. - - **Performance** - - If the size of the data is unknown then it is calculated - immediately by executing all delayed operations. 
- - **Examples** - - >>> d = cf.Data([[1, 2, 3], [4, 5, 6]]) - >>> d.size - 6 - - >>> d = cf.Data([[1, 2, 3]]) - >>> d.size - 3 - - >>> d = cf.Data([[3]]) - >>> d.size - 1 - - >>> d = cf.Data([3]) - >>> d.size - 1 - - >>> d = cf.Data(3) - >>> d.size - 1 - - """ - # The dask graph is never going to be computed, so we can set - # '_asanyarray=False'. - dx = self.to_dask_array(_asanyarray=False) - size = dx.size - if math.isnan(size): - logger.debug("Computing data size: Performance may be degraded") - dx.compute_chunk_sizes() - size = dx.size - - return size - - @property - def array(self): - """A numpy array copy of the data. - - In-place changes to the returned numpy array do not affect the - underlying dask array. - - The returned numpy array has the same mask hardness and fill - values as the data. - - Compare with `compute`. - - **Performance** - - `array` causes all delayed operations to be computed. The - returned `numpy` array is a deep copy of that returned by - created `compute`. - - .. seealso:: `datetime_array`, `compute`, `persist` - - **Examples** - - >>> d = cf.Data([1, 2, 3.0], 'km') - >>> a = d.array - >>> isinstance(a, numpy.ndarray) - True - >>> print(a) - [ 1. 2. 3.] - >>> d[0] = -99 - >>> print(a[0]) - 1.0 - >>> a[0] = 88 - >>> print(d[0]) - -99.0 km - - >>> d = cf.Data('2000-12-1', units='days since 1999-12-1') - >>> print(d.array) - 366 - >>> print(d.datetime_array) - 2000-12-01 00:00:00 + therefore allows cyclic slicing). The set contains a subset of + the axis identifiers defined by the `_axes` attribute. - """ - a = self.compute().copy() - if issparse(a): - a = a.toarray() - elif not isinstance(a, np.ndarray): - a = np.asanyarray(a) + .. warning:: Never change the value of the `_cyclic` attribute + in-place. - if not a.size: - return a + .. note:: When an axis identifier is removed from the `_axes` + attribute then it is automatically also removed from + the `_cyclic` attribute. - # Set cached elements - items = [0, -1] - if a.ndim == 2 and a.shape[-1] == 2: - items.extend((1, -2)) - elif a.size == 3: - items.append(1) + """ + return self._custom.get("_cyclic", _empty_set) - self._set_cached_elements({i: a.item(i) for i in items}) + @_cyclic.setter + def _cyclic(self, value): + self._custom["_cyclic"] = value - return a + @_cyclic.deleter + def _cyclic(self): + self._custom["_cyclic"] = _empty_set @property - def datetime_array(self): - """An independent numpy array of date-time objects. - - Only applicable to data arrays with reference time units. + def _axes(self): + """Storage for the axis identifiers. - If the calendar has not been set then the CF default calendar will - be used and the units will be updated accordingly. + Contains a `tuple` of identifiers, one for each array axis. - The data-type of the data array is unchanged. + """ + return super()._axes - .. seealso:: `array`, `compute`, `persist` + @_axes.setter + def _axes(self, value): + self._set_component("axes", tuple(value), copy=False) - **Performance** + # Remove cyclic axes that are not in the new axes + cyclic = self._cyclic + if cyclic: + # Never change the value of the _cyclic attribute in-place + self._cyclic = cyclic.intersection(value) - `datetime_array` causes all delayed operations to be computed. + @property + def Units(self): + """The `Units` object containing the units of the data array. 
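+
+ Assigning to `Units` converts the data values in-place when
+ the new units are equivalent to the current ones (for
+ example, metres to kilometres), and raises a `ValueError`
+ when they are not equivalent.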
**Examples** + >>> d = cf.Data([1, 2, 3], units='m') + >>> d.Units + + >>> d.Units = cf.Units('kilometres') + >>> d.Units + + >>> d.Units = cf.Units('km') + >>> d.Units + + """ - units = self.Units + return self._Units - if not units.isreftime: - raise ValueError( - f"Can't create date-time array from units {self.Units!r}" - ) + @Units.setter + def Units(self, value): + try: + old_units = self._Units + except ValueError: + pass + else: + if not old_units or self.Units.equals(value): + self._Units = value + return - if getattr(units, "calendar", None) == "none": - raise ValueError( - f"Can't create date-time array from units {self.Units!r} " - "because calendar is 'none'" - ) + if old_units and not old_units.equivalent(value): + raise ValueError( + f"Can't set Units to {value!r} that are not " + f"equivalent to the current units {old_units!r}. " + "Consider using the override_units method instead." + ) - units1, reftime = units.units.split(" since ") - - # Convert months and years to days, because cftime won't work - # otherwise. - if units1 in ("months", "month"): - d = self * _month_length - d.override_units( - Units( - f"days since {reftime}", - calendar=getattr(units, "calendar", None), - ), - inplace=True, - ) - elif units1 in ("years", "year", "yr"): - d = self * _year_length - d.override_units( - Units( - f"days since {reftime}", - calendar=getattr(units, "calendar", None), - ), - inplace=True, - ) - else: - d = self + try: + dtype = self.dtype + except ValueError: + dtype = None - dx = d.to_dask_array() - dx = convert_to_datetime(dx, d.Units) + if dtype is not None: + if dtype.kind in "iu": + if dtype.char in "iI": + dtype = _dtype_float32 + else: + dtype = _dtype_float - a = dx.compute() + cf_func = partial(cf_units, from_units=old_units, to_units=value) - if np.ma.isMA(a): - if self.hardmask: - a.harden_mask() - else: - a.soften_mask() + # 'cf_units' has its own call to 'cfdm_to_memory', so we + # can set '_force_to_memory=False'. + dx = self.to_dask_array(_force_to_memory=False) + dx = dx.map_blocks(cf_func, dtype=dtype) - a.set_fill_value(self.fill_value) + # Setting equivalent units doesn't affect the CFA write + # status. Nor does it invalidate any cached values, but + # only because we'll adjust those, too. + self._set_dask(dx, clear=self._ALL ^ self._CACHE ^ self._CFA) - return a + # Adjust cached values for the new units + cache = self._get_cached_elements() + if cache: + self._set_cached_elements( + {index: cf_func(value) for index, value in cache.items()} + ) - @property - def mask(self): - """The Boolean missing data mask of the data array. + self._Units = value + + @Units.deleter + def Units(self): + raise ValueError( + "Can't delete the Units attribute. " + "Consider using the override_units method instead." + ) - The Boolean mask has True where the data array has missing data - and False otherwise. + @property + def is_masked(self): + """True if the data array has any masked values. - :Returns: + **Performance** - `Data` + `is_masked` causes all delayed operations to be executed. **Examples** - >>> d.shape - (12, 73, 96) - >>> m = d.mask - >>> m.dtype - dtype('bool') - >>> m.shape - (12, 73, 96) + >>> d = cf.Data([[1, 2, 3], [4, 5, 6]]) + >>> print(d.is_masked) + False + >>> d[0, ...] = cf.masked + >>> d.is_masked + True """ - mask_data_obj = self.copy(array=False) + # 'cf_is_masked' has its own call to 'cfdm_to_memory', so we + # can set '_force_to_memory=False'. 
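+ # (A minimal standalone sketch of the per-chunk pattern used
+ # below, with illustrative names that are not part of the cf
+ # API: each chunk is reduced to a single boolean element, and
+ # 'any' then combines the per-chunk flags:
+ #
+ #     import dask.array as da
+ #     import numpy as np
+ #
+ #     def chunk_has_mask(a):
+ #         flag = np.ma.is_masked(a)
+ #         return np.full((1,) * a.ndim, flag, dtype=bool)
+ #
+ #     x = da.ma.masked_array(
+ #         [1.0, 2.0, 3.0], mask=[True, False, False]
+ #     ).rechunk(2)
+ #     flags = da.blockwise(chunk_has_mask, 'i', x, 'i',
+ #                          adjust_chunks={'i': 1}, dtype=bool)
+ #     bool(flags.any())  # True
+ # )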
+ dx = self.to_dask_array(_force_to_memory=False) - dx = self.to_dask_array() - mask = da.ma.getmaskarray(dx) + out_ind = tuple(range(dx.ndim)) + dx_ind = out_ind - mask_data_obj._set_dask(mask) - mask_data_obj.override_units(_units_None, inplace=True) - mask_data_obj.hardmask = _DEFAULT_HARDMASK + dx = da.blockwise( + cf_is_masked, + out_ind, + dx, + dx_ind, + adjust_chunks={i: 1 for i in out_ind}, + dtype=bool, + ) - return mask_data_obj + return bool(dx.any()) @_inplace_enabled(default=False) def arctan(self, inplace=False): @@ -5791,404 +3566,74 @@ def arccosh(self, inplace=False): [1.2 1.0 0.8 0.6 --] >>> d.arccosh(inplace=True) >>> print(d.array) - [0.6223625037147786 0.0 nan nan --] - >>> d.masked_invalid(inplace=True) - >>> print(d.array) - [0.6223625037147786 0.0 -- -- --] - - """ - d = _inplace_enabled_define_and_cleanup(self) - - # Data.func is used instead of the Dask built-in in this case because - # arccosh has a restricted domain therefore it is necessary to use our - # custom logic implemented via the `preserve_invalid` keyword to func. - d.func( - np.arccosh, - units=_units_radians, - inplace=True, - preserve_invalid=True, - ) - - return d - - def all(self, axis=None, keepdims=True, split_every=None): - """Test whether all data array elements evaluate to True. - - .. seealso:: `allclose`, `any`, `isclose` - - :Parameters: - - axis: (sequence of) `int`, optional - Axis or axes along which a logical AND reduction is - performed. The default (`None`) is to perform a - logical AND over all the dimensions of the input - array. *axis* may be negative, in which case it counts - from the last to the first axis. - - {{collapse keepdims: `bool`, optional}} - - {{split_every: `int` or `dict`, optional}} - - :Returns: - - `Data` - Whether or not all data array elements evaluate to True. - - **Examples** - - >>> d = cf.Data([[1, 2], [3, 4]]) - >>> d.all() - - >>> d.all(keepdims=False) - - >>> d.all(axis=0) - - >>> d.all(axis=1) - - >>> d.all(axis=()) - - - >>> d[0] = cf.masked - >>> d[1, 0] = 0 - >>> print(d.array) - [[-- --] - [0 4]] - >>> d.all(axis=0) - - >>> d.all(axis=1) - - - >>> d[...] = cf.masked - >>> d.all() - - >>> bool(d.all()) - True - >>> bool(d.all(keepdims=False)) - False - - """ - d = self.copy(array=False) - dx = self.to_dask_array() - dx = da.all(dx, axis=axis, keepdims=keepdims, split_every=split_every) - d._set_dask(dx) - d.hardmask = _DEFAULT_HARDMASK - d.override_units(_units_None, inplace=True) - return d - - def allclose(self, y, rtol=None, atol=None): - """Whether an array is element-wise equal within a tolerance. - - Return True if the data is broadcastable to array *y* and - element-wise equal within a tolerance. - - {{equals tolerance}} - - .. seealso:: `all`, `any`, `isclose` - - :Parameters: - - y: data_like - The data to compare. - - {{rtol: number, optional}} - - {{atol: number, optional}} - - :Returns: - - `Data` - A scalar boolean array that is `True` if the two arrays - are equal within the given tolerance, or `False` - otherwise. 
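-
- ``d.allclose(e)`` is equivalent to ``d.isclose(e).all()``.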
- - **Examples** - - >>> d = cf.Data([1000, 2500], 'metre') - >>> e = cf.Data([1, 2.5], 'km') - >>> bool(d.allclose(e)) - True - - >>> d = cf.Data(['ab', 'cdef']) - >>> bool(d.allclose([[['ab', 'cdef']]])) - True - - >>> d = cf.Data([[1000, 2500], [1000, 2500]], 'metre') - >>> e = cf.Data([1, 2.5], 'km') - >>> bool(d.allclose(e)) - True - - >>> d = cf.Data([1, 1, 1], 's') - >>> bool(d.allclose(1)) - True - - """ - return self.isclose(y, rtol=rtol, atol=atol).all() - - def any(self, axis=None, keepdims=True, split_every=None): - """Test whether any data array elements evaluate to True. - - .. seealso:: `all`, `allclose`, `isclose` - - :Parameters: - - axis: (sequence of) `int`, optional - Axis or axes along which a logical OR reduction is - performed. The default (`None`) is to perform a - logical OR over all the dimensions of the input - array. *axis* may be negative, in which case it counts - from the last to the first axis. - - {{collapse keepdims: `bool`, optional}} - - {{split_every: `int` or `dict`, optional}} - - :Returns: - - `Data` - Whether or any data array elements evaluate to True. - - **Examples** - - >>> d = cf.Data([[0, 2], [0, 4]]) - >>> d.any() - - >>> d.any(keepdims=False) - - >>> d.any(axis=0) - - >>> d.any(axis=1) - - >>> d.any(axis=()) - - - >>> d[0] = cf.masked - >>> print(d.array) - [[-- --] - [0 4]] - >>> d.any(axis=0) - - >>> d.any(axis=1) - - - >>> d[...] = cf.masked - >>> d.any() - - >>> bool(d.any()) - False - >>> bool(d.any(keepdims=False)) - False - - """ - d = self.copy(array=False) - dx = self.to_dask_array() - dx = da.any(dx, axis=axis, keepdims=keepdims, split_every=split_every) - d._set_dask(dx) - d.hardmask = _DEFAULT_HARDMASK - d.override_units(_units_None, inplace=True) - return d - - @_inplace_enabled(default=False) - def apply_masking( - self, - fill_values=None, - valid_min=None, - valid_max=None, - valid_range=None, - inplace=False, - ): - """Apply masking. - - Masking is applied according to the values of the keyword - parameters. - - Elements that are already masked remain so. - - .. versionadded:: 3.4.0 - - .. seealso:: `get_fill_value`, `hardmask`, `mask`, `where` - - :Parameters: - - fill_values: `bool` or sequence of scalars, optional - Specify values that will be set to missing data. Data - elements exactly equal to any of the values are set to - missing data. - - If True then the value returned by the - `get_fill_value` method, if such a value exists, is - used. - - Zero or more values may be provided in a sequence of - scalars. - - *Parameter example:* - Specify a fill value of 999: ``fill_values=[999]`` - - *Parameter example:* - Specify fill values of 999 and -1.0e30: - ``fill_values=[999, -1.0e30]`` - - *Parameter example:* - Use the fill value already set for the data: - ``fill_values=True`` - - *Parameter example:* - Use no fill values: ``fill_values=False`` or - ``fill_value=[]`` - - valid_min: number, optional - A scalar specifying the minimum valid value. Data - elements strictly less than this number will be set to - missing data. - - valid_max: number, optional - A scalar specifying the maximum valid value. Data - elements strictly greater than this number will be set - to missing data. - - valid_range: (number, number), optional - A vector of two numbers specifying the minimum and - maximum valid values, equivalent to specifying values - for both *valid_min* and *valid_max* parameters. The - *valid_range* parameter must not be set if either - *valid_min* or *valid_max* is defined. 
- - *Parameter example:* - ``valid_range=[-999, 10000]`` is equivalent to setting - ``valid_min=-999, valid_max=10000`` + [0.6223625037147786 0.0 nan nan --] + >>> d.masked_invalid(inplace=True) + >>> print(d.array) + [0.6223625037147786 0.0 -- -- --] - {{inplace: `bool`, optional}} + """ + d = _inplace_enabled_define_and_cleanup(self) - :Returns: + # Data.func is used instead of the Dask built-in in this case because + # arccosh has a restricted domain therefore it is necessary to use our + # custom logic implemented via the `preserve_invalid` keyword to func. + d.func( + np.arccosh, + units=_units_radians, + inplace=True, + preserve_invalid=True, + ) - `Data` or `None` - The data with masked values. If the operation was in-place - then `None` is returned. + return d - **Examples** + def allclose(self, y, rtol=None, atol=None): + """Whether an array is element-wise equal within a tolerance. - >>> import numpy - >>> d = cf.Data(numpy.arange(12).reshape(3, 4), 'm') - >>> d[1, 1] = cf.masked - >>> print(d.array) - [[0 1 2 3] - [4 -- 6 7] - [8 9 10 11]] - >>> print(d.apply_masking().array) - [[0 1 2 3] - [4 -- 6 7] - [8 9 10 11]] - >>> print(d.apply_masking(fill_values=[0]).array) - [[-- 1 2 3] - [4 -- 6 7] - [8 9 10 11]] - >>> print(d.apply_masking(fill_values=[0, 11]).array) - [[-- 1 2 3] - [4 -- 6 7] - [8 9 10 --]] - >>> print(d.apply_masking(valid_min=3).array) - [[-- -- -- 3] - [4 -- 6 7] - [8 9 10 11]] - >>> print(d.apply_masking(valid_max=6).array) - [[0 1 2 3] - [4 -- 6 --] - [-- -- -- --]] - >>> print(d.apply_masking(valid_range=[2, 8]).array) - [[-- -- 2 3] - [4 -- 6 7] - [8 -- -- --]] - >>> d.set_fill_value(7) - >>> print(d.apply_masking(fill_values=True).array) - [[0 1 2 3] - [4 -- 6 --] - [8 9 10 11]] - >>> print(d.apply_masking(fill_values=True, - ... valid_range=[2, 8]).array) - [[-- -- 2 3] - [4 -- 6 --] - [8 -- -- --]] + Return True if the data is broadcastable to array *y* and + element-wise equal within a tolerance. - """ - # Parse valid_range - if valid_range is not None: - if valid_min is not None or valid_max is not None: - raise ValueError( - "Can't set 'valid_range' parameter with either the " - "'valid_min' nor 'valid_max' parameters" - ) + {{equals tolerance}} - try: - if len(valid_range) != 2: - raise ValueError( - "'valid_range' parameter must be a vector of " - "two elements" - ) - except TypeError: - raise ValueError( - "'valid_range' parameter must be a vector of " - "two elements" - ) + .. seealso:: `all`, `any`, `isclose` - valid_min, valid_max = valid_range + :Parameters: - # Parse fill_values - if fill_values is None: - fill_values = False + y: data_like + The data to compare. - if isinstance(fill_values, bool): - if fill_values: - fill_value = self.get_fill_value(None) - if fill_value is not None: - fill_values = (fill_value,) - else: - fill_values = () - else: - fill_values = () - else: - try: - iter(fill_values) - except TypeError: - raise TypeError( - "'fill_values' parameter must be a sequence or " - f"of type bool. Got type {type(fill_values)}" - ) - else: - if isinstance(fill_values, str): - raise TypeError( - "'fill_values' parameter must be a sequence or " - f"of type bool. Got type {type(fill_values)}" - ) + {{rtol: number, optional}} - d = _inplace_enabled_define_and_cleanup(self) + {{atol: number, optional}} - dx = self.to_dask_array() + :Returns: - mask = None - if fill_values: - mask = dx == fill_values[0] + `Data` + A scalar boolean array that is `True` if the two arrays + are equal within the given tolerance, or `False` + otherwise. 
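- # (A standalone sketch of this accumulation with plain dask,
- # using illustrative values; each comparison is lazy, so
- # nothing is computed while the mask is assembled:
- #
- #     import dask.array as da
- #     dx = da.arange(6)
- #     mask = dx == 0
- #     for fv in (3, 5):
- #         mask |= dx == fv
- #     print(da.ma.masked_where(mask, dx).compute())
- #     # [-- 1 2 -- 4 --]
- # )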
- for fill_value in fill_values[1:]: - mask |= dx == fill_value + **Examples** - if valid_min is not None: - if mask is None: - mask = dx < valid_min - else: - mask |= dx < valid_min + >>> d = cf.Data([1000, 2500], 'metre') + >>> e = cf.Data([1, 2.5], 'km') + >>> bool(d.allclose(e)) + True - if valid_max is not None: - if mask is None: - mask = dx > valid_max - else: - mask |= dx > valid_max + >>> d = cf.Data(['ab', 'cdef']) + >>> bool(d.allclose([[['ab', 'cdef']]])) + True - if mask is not None: - dx = da.ma.masked_where(mask, dx) + >>> d = cf.Data([[1000, 2500], [1000, 2500]], 'metre') + >>> e = cf.Data([1, 2.5], 'km') + >>> bool(d.allclose(e)) + True - d._set_dask(dx) + >>> d = cf.Data([1, 1, 1], 's') + >>> bool(d.allclose(1)) + True - return d + """ + return self.isclose(y, rtol=rtol, atol=atol).all() def argmax(self, axis=None, unravel=False): """Return the indices of the maximum values along an axis. @@ -6441,7 +3886,7 @@ def convert_reference_time( :Returns: - `{{class}}` or `None` + `Data` or `None` The data with converted reference time values, or `None` if the operation was in-place. @@ -6457,318 +3902,79 @@ def convert_reference_time( cftime.DatetimeGregorian(2004, 3, 1, 7, 27, 11, 493670, has_year_zero=False)] >>> print(d.array) [0 1 2 3] - >>> e = d.convert_reference_time(calendar_months=True) - >>> e.Units - - >>> print(e.datetime_array) - [cftime.DatetimeGregorian(2003, 12, 1, 0, 0, 0, 0, has_year_zero=False) - cftime.DatetimeGregorian(2004, 1, 1, 0, 0, 0, 0, has_year_zero=False) - cftime.DatetimeGregorian(2004, 2, 1, 0, 0, 0, 0, has_year_zero=False) - cftime.DatetimeGregorian(2004, 3, 1, 0, 0, 0, 0, has_year_zero=False)] - >>> print(e.array) - [ 0 31 62 91] - - """ - units0 = self.Units - - if not units0.isreftime: - raise ValueError( - f"{self.__class__.__name__} must have reference time units. " - f"Got {units0!r}" - ) - - d = _inplace_enabled_define_and_cleanup(self) - - if units is None: - # By default, set the target units to "days since - # , calendar=" - units = Units( - "days since " + units0.units.split(" since ")[1], - calendar=units0._calendar, - ) - elif not getattr(units, "isreftime", False): - raise ValueError( - f"New units must be reference time units, not {units!r}" - ) - - units0_since_reftime = units0._units_since_reftime - if units0_since_reftime in _month_units: - if calendar_months: - units0 = Units( - "calendar_" + units0.units, calendar=units0._calendar - ) - else: - units0 = Units( - "days since " + units0.units.split(" since ")[1], - calendar=units0._calendar, - ) - d.Units = units0 - elif units0_since_reftime in _year_units: - if calendar_years: - units0 = Units( - "calendar_" + units0.units, calendar=units0._calendar - ) - else: - units0 = Units( - "days since " + units0.units.split(" since ")[1], - calendar=units0._calendar, - ) - d.Units = units0 - - # 'cf_rt2dt' its own call to 'cf_asanyarray', so we can set - # '_asanyarray=False'. - dx = d.to_dask_array(_asanyarray=False) - - # Convert to the correct date-time objects - dx = dx.map_blocks(cf_rt2dt, units=units0, dtype=object) - - # Convert the date-time objects to reference times - dx = dx.map_blocks(cf_dt2rt, units=units, dtype=float) - - d._set_dask(dx) - d.override_units(units, inplace=True) - - return d - - def get_data(self, default=ValueError(), _units=None, _fill_value=None): - """Returns the data. - - .. versionadded:: 3.0.0 - - :Returns: - - `Data` - - """ - return self - - def get_deterministic_name(self): - """Get the deterministic name for the data. 
- - If there is a deterministic name then the data array may be - assumed to be 'equal' to that of another `Data` object with - the same deterministic name. This measure of equality is - different to that applied by the `equals` method in that NaN - and inf values are, in effect, always considered equal. - - Note that the opposite is not always true. Two `Data` objects - that are considered equal by their `equals` methods might not - have the same deterministic name. - - An exception is raised if there is no deterministic name. - - .. versionadded:: 3.15.1 - - .. seealso:: `has_deterministic_name` - - :Returns: - - `str` - The deterministic name. - - **Examples** - - >>> d = cf.Data([1, 2, 3], 'm') - >>> d.has_deterministic_name() - True - >>> d.get_deterministic_name() - '6380dd3674fbf10d30561484b084e9b3' - >>> d1 = cf.Data([1, 2, 3], 'metre') - >>> d1.get_deterministic_name() - '6380dd3674fbf10d30561484b084e9b3' - >>> d1.get_deterministic_name() == d.get_deterministic_name() - True - >>> d1.equals(d) - True - - >>> e = d + 1 - 1 - >>> e.get_deterministic_name() - '0b83ada62d4b014bae83c3de1c1d3a80' - >>> e.get_deterministic_name() == d.get_deterministic_name() - False - >>> e.equals(d) - True - - """ - if not self._custom["deterministic"]: - raise ValueError() - - units = self._Units - - # The dask graph is never going to be computed, so we can set - # '_asanyarray=False'. - return tokenize( - self.to_dask_array(_asanyarray=False).name, - units.formatted(definition=True, names=True), - units._canonical_calendar, - ) - - def get_filenames(self): - """The names of files containing parts of the data array. - - Returns the names of any files that may be required to deliver - the computed data array. This set may contain fewer names than - the collection of file names that defined the data when it was - first instantiated, as could be the case after the data has - been subspaced. - - **Implementation** - - A `dask` chunk that contributes to the computed array is - assumed to reference data within a file if that chunk's array - object has a callable `get_filenames` method, the output of - which is added to the returned `set`. - - :Returns: - - `set` - The file names. If no files are required to compute - the data then an empty `set` is returned. - - **Examples** - - >>> d = cf.Data.full((5, 8), 1, chunks=4) - >>> d.get_filenames() - set() - - >>> f = cf.example_field(0) - >>> cf.write(f, "file_A.nc") - >>> cf.write(f, "file_B.nc") - - >>> a = cf.read("file_A.nc", chunks=4)[0].data - >>> a += 999 - >>> b = cf.read("file_B.nc", chunks=4)[0].data - >>> c = cf.Data(b.array, units=b.Units, chunks=4) - >>> print(a.shape, b.shape, c.shape) - (5, 8) (5, 8) (5, 8) - >>> d = cf.Data.concatenate([a, a.copy(), b, c], axis=1) - >>> print(d.shape) - (5, 32) - - >>> d.get_filenames() - {'file_A.nc', 'file_B.nc'} - >>> d[:, 2:7].get_filenames() - {'file_A.nc'} - >>> d[:, 2:14].get_filenames() - {'file_A.nc', 'file_B.nc'} - >>> d[:, 2:20].get_filenames() - {'file_A.nc', 'file_B.nc'} - >>> d[:, 2:30].get_filenames() - {'file_A.nc', 'file_B.nc'} - >>> d[:, 29:30].get_filenames() - set() - >>> d[2, 3] = -99 - >>> d[2, 3].get_filenames() - {'file_A.nc'} - - """ - out = set() - - # The dask graph is never going to be computed, so we can set - # '_asanyarray=False'. - for a in self.todict(_asanyarray=False).values(): - try: - out.update(a.get_filenames()) - except AttributeError: - pass - - return out - - def get_units(self, default=ValueError()): - """Return the units. - - .. 
seealso:: `del_units`, `set_units`
-
- :Parameters:
-
- default: optional
- Return the value of the *default* parameter if the units
- have not been set. If set to an `Exception` instance then
- it will be raised instead.
-
- :Returns:
-
- The units.
-
- **Examples**
-
- >>> d.set_units('metres')
- >>> d.get_units()
- 'metres'
- >>> d.del_units()
- >>> d.get_units()
- ValueError: Can't get non-existent units
- >>> print(d.get_units(None))
- None
-
- """
- try:
- return self.Units.units
- except AttributeError:
- return super().get_units(default=default)
-
- def get_calendar(self, default=ValueError()):
- """Return the calendar.
-
- .. seealso:: `del_calendar`, `set_calendar`
-
- :Parameters:
-
- default: optional
- Return the value of the *default* parameter if the
- calendar has not been set. If set to an `Exception`
- instance then it will be raised instead.
-
- :Returns:
-
- The calendar.
-
- **Examples**
-
- >>> d.set_calendar('julian')
- >>> d.get_calendar()
- 'julian'
- >>> d.del_calendar()
- >>> d.get_calendar()
- ValueError: Can't get non-existent calendar
- >>> print(d.get_calendar(None))
- None
+ >>> e = d.convert_reference_time(calendar_months=True)
+ >>> e.Units
+ <Units: days since 2003-12-01 standard>
+ >>> print(e.datetime_array)
+ [cftime.DatetimeGregorian(2003, 12, 1, 0, 0, 0, 0, has_year_zero=False)
+ cftime.DatetimeGregorian(2004, 1, 1, 0, 0, 0, 0, has_year_zero=False)
+ cftime.DatetimeGregorian(2004, 2, 1, 0, 0, 0, 0, has_year_zero=False)
+ cftime.DatetimeGregorian(2004, 3, 1, 0, 0, 0, 0, has_year_zero=False)]
+ >>> print(e.array)
+ [ 0 31 62 91]

 """
- try:
- return self.Units.calendar
- except (AttributeError, KeyError):
- return super().get_calendar(default=default)
+ units0 = self.Units

- def set_calendar(self, calendar):
- """Set the calendar.
+ if not units0.isreftime:
+ raise ValueError(
+ f"{self.__class__.__name__} must have reference time units. "
+ f"Got {units0!r}"
+ )

- .. seealso:: `override_calendar`, `override_units`,
- `del_calendar`, `get_calendar`
+ d = _inplace_enabled_define_and_cleanup(self)

- :Parameters:
+ if units is None:
+ # By default, set the target units to "days since
+ # <reftime>, calendar=<calendar>"
+ units = self._Units_class(
+ "days since " + units0.units.split(" since ")[1],
+ calendar=units0._calendar,
+ )
+ elif not getattr(units, "isreftime", False):
+ raise ValueError(
+ f"New units must be reference time units, not {units!r}"
+ )

- value: `str`
- The new calendar.
+ units0_since_reftime = units0._units_since_reftime
+ if units0_since_reftime in _month_units:
+ if calendar_months:
+ units0 = self._Units_class(
+ "calendar_" + units0.units, calendar=units0._calendar
+ )
+ else:
+ units0 = self._Units_class(
+ "days since " + units0.units.split(" since ")[1],
+ calendar=units0._calendar,
+ )
+ d.Units = units0
+ elif units0_since_reftime in _year_units:
+ if calendar_years:
+ units0 = self._Units_class(
+ "calendar_" + units0.units, calendar=units0._calendar
+ )
+ else:
+ units0 = self._Units_class(
+ "days since " + units0.units.split(" since ")[1],
+ calendar=units0._calendar,
+ )
+ d.Units = units0

- :Returns:
+ # 'cf_rt2dt' has its own call to 'cfdm_to_memory', so we can set
+ # '_force_to_memory=False'.
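+ # (A rough standalone sketch of the two-step conversion below,
+ # using cftime directly; 'cf_rt2dt' and 'cf_dt2rt' wrap the
+ # equivalent per-block logic:
+ #
+ #     import cftime
+ #     dts = cftime.num2date([0, 31], "days since 2003-12-01",
+ #                           calendar="standard")
+ #     cftime.date2num(dts, "hours since 2003-12-01",
+ #                     calendar="standard")  # -> [0, 744]
+ # )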
+ dx = d.to_dask_array(_force_to_memory=False)

- `None`
+ # Convert to the correct date-time objects
+ dx = dx.map_blocks(cf_rt2dt, units=units0, dtype=object)

- **Examples**
+ # Convert the date-time objects to reference times
+ dx = dx.map_blocks(cf_dt2rt, units=units, dtype=float)

- >>> d.set_calendar('none')
- >>> d.get_calendar()
- 'none'
- >>> d.del_calendar()
- >>> d.get_calendar()
- ValueError: Can't get non-existent calendar
- >>> print(d.get_calendar(None))
- None
+ d._set_dask(dx)
+ d.override_units(units, inplace=True)

- """
- self.Units = Units(self.get_units(default=None), calendar)
+ return d

 def add_file_location(self, location):
 """Add a new file location in-place.
@@ -6802,8 +4008,8 @@ def add_file_location(self, location):
 updated = False

 # The dask graph is never going to be computed, so we can set
- # '_asanyarray=False'.
- dsk = self.todict(_asanyarray=False)
+ # '_force_to_memory=False'.
+ dsk = self.todict(_force_to_memory=False)
 for key, a in dsk.items():
 try:
 dsk[key] = a.add_file_location(location)
@@ -6816,9 +4022,9 @@ def add_file_location(self, location):
 updated = True

 if updated:
- dx = self.to_dask_array(_asanyarray=False)
+ dx = self.to_dask_array(_force_to_memory=False)
 dx = da.Array(dsk, dx.name, dx.chunks, dx.dtype, dx._meta)
- self._set_dask(dx, clear=_NONE, asanyarray=None)
+ self._set_dask(dx, clear=self._NONE, in_memory=None)

 return location

@@ -6849,7 +4055,47 @@ def set_units(self, value):
 None

 """
- self.Units = Units(value, self.get_calendar(default=None))
+ self.Units = self._Units_class(value, self.get_calendar(default=None))
+
+ @_inplace_enabled(default=False)
+ def masked_where(self, condition, inplace=False):
+ """Mask the data where a condition is met.
+
+ ``d.masked_where(condition)`` is equivalent to
+ ``d.where(condition, cf.masked)``.
+
+ **Performance**
+
+ `masked_where` causes all delayed operations to be executed.
+
+ .. versionadded:: NEXTVERSION
+
+ .. seealso:: `mask`, `masked_values`, `where`
+
+ :Parameters:
+
+ condition: array_like
+ The masking condition. The data is masked where
+ *condition* is True. Any masked values already in the
+ data are also masked in the result.
+
+ {{inplace: `bool`, optional}}
+
+ :Returns:
+
+ `Data` or `None`
+ The result of masking the data, or `None` if the
+ operation was in-place.
+
+ **Examples**
+
+ >>> d = cf.Data([1, 2, 3, 4, 5])
+ >>> e = d.masked_where([0, 1, 0, 1, 0])
+ >>> print(e.array)
+ [1 -- 3 -- 5]
+
+ """
+ return self.where(condition, masked, None, inplace=inplace)

 @_inplace_enabled(default=False)
 @_deprecated_kwarg_check("i", version="3.0.0", removed_at="4.0.0")
@@ -7518,7 +4768,7 @@ def clip(self, a_min, a_max, units=None, inplace=False, i=False):
 """
 if units is not None:
 # Convert the limits to the same units as the data array
- units = Units(units)
+ units = self._Units_class(units)
 self_units = self.Units
 if self_units != units:
 a_min = Units.conform(np.asanyarray(a_min), units, self_units)
@@ -7530,73 +4780,6 @@ def clip(self, a_min, a_max, units=None, inplace=False, i=False):
 d._set_dask(dx)
 return d

- @classmethod
- def asdata(cls, d, dtype=None, copy=False):
- """Convert the input to a `Data` object.
-
- If the input *d* has the Data interface (i.e. it has a
- `__data__` method), then the output of this method is used as
- the returned `Data` object. Otherwise, `Data(d)` is returned.
-
- :Parameters:
-
- d: data-like
- Input data in any form that can be converted to a
- `Data` object.
This includes `Data` and `Field` - objects, and objects with the Data interface, numpy - arrays and any object which may be converted to a - numpy array. - - dtype: data-type, optional - By default, the data-type is inferred from the input data. - - copy: `bool`, optional - If True and *d* has the Data interface, then a copy of - `d.__data__()` is returned. - - :Returns: - - `Data` - `Data` interpretation of *d*. No copy is performed on the - input if it is already a `Data` object with matching dtype - and *copy* is False. - - **Examples** - - >>> d = cf.Data([1, 2]) - >>> cf.Data.asdata(d) is d - True - >>> d.asdata(d) is d - True - - >>> cf.Data.asdata([1, 2]) - - - >>> cf.Data.asdata(numpy.array([1, 2])) - - - """ - data = getattr(d, "__data__", None) - if data is None: - # d does not have a Data interface - data = cls(d) - if dtype is not None: - data.dtype = dtype - - return data - - # d does have a Data interface - data = data() - if copy: - data = data.copy() - if dtype is not None and np.dtype(dtype) != data.dtype: - data.dtype = dtype - elif dtype is not None and np.dtype(dtype) != data.dtype: - data = data.copy() - data.dtype = dtype - - return data - @classmethod def arctan2(cls, x1, x2): """Element-wise arc tangent of ``x1/x2`` with correct quadrant. @@ -7864,7 +5047,7 @@ def count(self, axis=None, keepdims=True, split_every=None): dx, axis=axis, keepdims=keepdims, split_every=split_every ) d._set_dask(dx) - d.hardmask = _DEFAULT_HARDMASK + d.hardmask = self._DEFAULT_HARDMASK d.override_units(_units_None, inplace=True) return d @@ -8115,92 +5298,6 @@ def second(self): """ return YMDhms(self, "second") - @property - def sparse_array(self): - """Return an independent `scipy` sparse array of the data. - - In-place changes to the returned sparse array do not affect - the underlying dask array. - - An `AttributeError` is raised if a sparse array representation - is not available. - - **Performance** - - `sparse_array` causes all delayed operations to be - computed. The returned sparse array is a deep copy of that - returned by created `compute`. - - .. versionadded:: 3.16.0 - - .. seealso:: `array` - - :Returns: - - An independent `scipy` sparse array of the data. - - **Examples** - - >>> from scipy.sparse import issparse - >>> issparse(d.sparse_array) - True - - """ - array = self.compute() - if issparse(array): - return array.copy() - - raise AttributeError( - "A sparse array representation of the data is not available" - ) - - @_inplace_enabled(default=False) - def uncompress(self, inplace=False): - """Uncompress the data. - - Only affects data that is compressed by convention, i.e. - - * Ragged arrays for discrete sampling geometries (DSG) and - simple geometry cell definitions. - - * Compression by gathering. - - * Compression by coordinate subsampling. - - Data that is already uncompressed is returned - unchanged. Whether the data is compressed or not does not - alter its functionality nor external appearance, but may - affect how the data are written to a dataset on disk. - - .. versionadded:: 3.0.6 - - .. seealso:: `array`, `compressed_array`, `source` - - :Parameters: - - {{inplace: `bool`, optional}} - - :Returns: - - `Data` or `None` - The uncompressed data, or `None` if the operation was - in-place. 
- - **Examples** - - >>> d.get_compression_type() - 'ragged contiguous' - >>> d.uncompress() - >>> d.get_compression_type() - '' - - """ - d = _inplace_enabled_define_and_cleanup(self) - if d.get_compression_type(): - d._del_Array(None) - - return d - def unique(self, split_every=None): """The unique elements of the data. @@ -8245,13 +5342,13 @@ def unique(self, split_every=None): d.soften_mask() # The applicable chunk function will have its own call to - # 'cf_asanyarray', so we can set '_asanyarray=False'. - dx = d.to_dask_array(_asanyarray=False) + # 'cfdm_to_memory', so we can set '_force_to_memory=False'. + dx = d.to_dask_array(_force_to_memory=False) dx = Collapse().unique(dx, split_every=split_every) d._set_dask(dx) - d.hardmask = _DEFAULT_HARDMASK + d.hardmask = self._DEFAULT_HARDMASK return d @@ -8327,248 +5424,20 @@ def ndindex(self): ... () 9 - """ - return product(*[range(0, r) for r in self.shape]) - - @_deprecated_kwarg_check("traceback", version="3.0.0", removed_at="4.0.0") - @_manage_log_level_via_verbosity - def equals( - self, - other, - rtol=None, - atol=None, - ignore_fill_value=False, - ignore_data_type=False, - ignore_type=False, - verbose=None, - traceback=False, - ignore_compression=False, - ): - """True if two data arrays are logically equal, False otherwise. - - {{equals tolerance}} - - :Parameters: - - other: - The object to compare for equality. - - {{rtol: number, optional}} - - {{atol: number, optional}} - - ignore_fill_value: `bool`, optional - If True then data arrays with different fill values are - considered equal. By default they are considered unequal. - - {{ignore_data_type: `bool`, optional}} - - {{ignore_type: `bool`, optional}} - - {{verbose: `int` or `str` or `None`, optional}} - - traceback: deprecated at version 3.0.0 - Use the *verbose* parameter instead. - - {{ignore_compression: `bool`, optional}} - - :Returns: - - `bool` - Whether or not the two instances are equal. - - **Examples** - - >>> d.equals(d) - True - >>> d.equals(d + 1) - False - - """ - # Set default tolerances - if rtol is None: - rtol = self._rtol - - if atol is None: - atol = self._atol - - if not super().equals( - other, - rtol=rtol, - atol=atol, - verbose=verbose, - ignore_data_type=ignore_data_type, - ignore_fill_value=ignore_fill_value, - ignore_type=ignore_type, - _check_values=False, - ): - # TODODASK: consistency with cfdm Data.equals needs to be verified - # possibly via a follow-up PR to cfdm to implement any changes. - return False - - # ------------------------------------------------------------ - # Check that each instance has equal array values - # ------------------------------------------------------------ - self_dx = self.to_dask_array() - other_dx = other.to_dask_array() - - # Check that each instance has the same units. Do this before - # any other possible short circuits. - self_Units = self.Units - other_Units = other.Units - if self_Units != other_Units: - if is_log_level_info(logger): - logger.info( - f"{self.__class__.__name__}: Different Units " - f"({self_Units!r}, {other_Units!r})" - ) - - return False - - rtol = float(rtol) - atol = float(atol) - - # Return False if there are different cached elements. This - # provides a possible short circuit for that case that two - # arrays are not equal (but not in the case that they are). 
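- # (An illustrative sketch of this short circuit, with made-up
- # cached values; keys 0 and -1 refer to the first and last
- # array elements:
- #
- #     cache0 = {0: 1.0, -1: 3.0}   # from self
- #     cache1 = {0: 9.0, -1: 3.0}   # from other
- #
- # Comparing [1.0, 3.0] against [9.0, 3.0] fails the tolerance
- # test, so False is returned without computing either dask
- # graph. A passing comparison proves nothing on its own, and
- # the full element-wise check below still runs.)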
- cache0 = self._get_cached_elements() - if cache0: - cache1 = other._get_cached_elements() - if cache1 and sorted(cache0) == sorted(cache1): - a = [] - b = [] - for key, value0 in cache0.items(): - value1 = cache1[key] - if value0 is np.ma.masked or value1 is np.ma.masked: - # Don't test on masked values - this logic is - # determined elsewhere. - continue - - # Make sure strings are unicode - try: - value0 = value0.decode() - value1 = value1.decode() - except AttributeError: - pass - - a.append(value0) - b.append(value1) - - if a and not _numpy_allclose(a, b, rtol=rtol, atol=atol): - if is_log_level_info(logger): - logger.info( - f"{self.__class__.__name__}: Different array " - f"values (atol={atol}, rtol={rtol})" - ) - - return False - - # Now check that corresponding elements are equal within a tolerance. - # We assume that all inputs are masked arrays. Note we compare the - # data first as this may return False due to different dtype without - # having to wait until the compute call. - self_is_numeric = is_numeric_dtype(self_dx) - other_is_numeric = is_numeric_dtype(other_dx) - if self_is_numeric and other_is_numeric: - data_comparison = _da_ma_allclose( - self_dx, - other_dx, - masked_equal=True, - rtol=rtol, - atol=atol, - ) - elif not self_is_numeric and not other_is_numeric: - # If the array (say d) is fully masked, then the output of - # np.all(d == d) and therefore da.all(d == d) will be a - # np.ma.masked object which has dtype('float64'), and not - # a Boolean, causing issues later. To ensure data_comparison - # is Boolean, we must do an early compute to check if it is - # a masked object and if so, force the desired result (True). - # - # This early compute won't degrade performance because it - # would be performed towards result.compute() below anyway. - data_comparison = da.all(self_dx == other_dx).compute() - if data_comparison is np.ma.masked: - data_comparison = True - - else: # one is numeric and other isn't => not equal (incompat. dtype) - if is_log_level_info(logger): - logger.info( - f"{self.__class__.__name__}: Different data types:" - f"{self_dx.dtype} != {other_dx.dtype}" - ) - - return False - - mask_comparison = da.all( - da.equal(da.ma.getmaskarray(self_dx), da.ma.getmaskarray(other_dx)) - ) - - # Apply a (dask) logical 'and' to confirm if both the mask and the - # data are equal for the pair of masked arrays: - result = da.logical_and(data_comparison, mask_comparison) - if not result.compute(): - if is_log_level_info(logger): - logger.info( - f"{self.__class__.__name__}: Different array values (" - f"atol={atol}, rtol={rtol})" - ) - - return False - else: - return True - - @_deprecated_kwarg_check("i", version="3.0.0", removed_at="4.0.0") - @_inplace_enabled(default=False) - def exp(self, inplace=False, i=False): - """Take the exponential of the data array. 
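-
- The data must be dimensionless, otherwise a `ValueError` is
- raised. Dimensionless units that are not equal to ``1`` are
- first converted to ``1``, rescaling the values accordingly.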
- - :Parameters: - - {{inplace: `bool`, optional}} - - {{i: deprecated at version 3.0.0}} - - :Returns: - - `Data` or `None` - - **Examples** - - """ - d = _inplace_enabled_define_and_cleanup(self) - - units = self.Units - if units and not units.isdimensionless: - raise ValueError( - "Can't take exponential of dimensional " - f"quantities: {units!r}" - ) - - if d.Units: - d.Units = _units_1 - - dx = d.to_dask_array() - dx = da.exp(dx) - d._set_dask(dx) - - return d + """ + return product(*[range(0, r) for r in self.shape]) + @_deprecated_kwarg_check("i", version="3.0.0", removed_at="4.0.0") @_inplace_enabled(default=False) - def insert_dimension(self, position=0, inplace=False): - """Expand the shape of the data array in place. - - .. seealso:: `flip`, `squeeze`, `swapaxes`, `transpose` + def exp(self, inplace=False, i=False): + """Take the exponential of the data array. :Parameters: - position: `int`, optional - Specify the position that the new axis will have in the data - array axes. By default the new axis has position 0, the - slowest varying position. - {{inplace: `bool`, optional}} + {{i: deprecated at version 3.0.0}} + :Returns: `Data` or `None` @@ -8576,45 +5445,21 @@ def insert_dimension(self, position=0, inplace=False): **Examples** """ - # TODODASKAPI bring back expand_dims alias (or rather alias this to - # that) - d = _inplace_enabled_define_and_cleanup(self) - # Parse position - if not isinstance(position, int): - raise ValueError("Position parameter must be an integer") - - ndim = d.ndim - if -ndim - 1 <= position < 0: - position += ndim + 1 - elif not 0 <= position <= ndim: + units = self.Units + if units and not units.isdimensionless: raise ValueError( - f"Can't insert dimension: Invalid position {position!r}" + "Can't take exponential of dimensional " + f"quantities: {units!r}" ) - shape = list(d.shape) - shape.insert(position, 1) + if d.Units: + d.Units = _units_1 dx = d.to_dask_array() - dx = dx.reshape(shape) - - # Inserting a dimension doesn't affect the cached elements nor - # the CFA write status - d._set_dask(dx, clear=_ALL ^ _CACHE ^ _CFA) - - # Expand _axes - axis = new_axis_identifier(d._axes) - data_axes = list(d._axes) - data_axes.insert(position, axis) - d._axes = data_axes - - # Update the HDF5 chunking strategy - chunksizes = d.nc_hdf5_chunksizes() - if isinstance(chunksizes, tuple): - chunksizes = list(chunksizes) - chunksizes.insert(position, 1) - d.nc_set_hdf5_chunksizes(chunksizes) + dx = da.exp(dx) + d._set_dask(dx) return d @@ -8964,163 +5809,6 @@ def halo( return d - def harden_mask(self): - """Force the mask to hard. - - Whether the mask of a masked array is hard or soft is - determined by its `hardmask` property. `harden_mask` sets - `hardmask` to `True`. - - .. versionadded:: 3.14.0 - - .. seealso:: `hardmask`, `soften_mask` - - **Examples** - - >>> d = cf.Data([1, 2, 3], hardmask=False) - >>> d.hardmask - False - >>> d.harden_mask() - >>> d.hardmask - True - - >>> d = cf.Data([1, 2, 3], mask=[False, True, False]) - >>> d.hardmask - True - >>> d[1] = 999 - >>> print(d.array) - [1 -- 3] - - """ - # 'cf_harden_mask' has its own call to 'cf_asanyarray', so we - # can set '_asanyarray=False'. - dx = self.to_dask_array(_asanyarray=False) - dx = dx.map_blocks(cf_harden_mask, dtype=self.dtype) - self._set_dask(dx, clear=_NONE) - self.hardmask = True - - def has_calendar(self): - """Whether a calendar has been set. - - .. 
seealso:: `del_calendar`, `get_calendar`, `set_calendar`, - `has_units`, `Units` - - :Returns: - - `bool` - True if the calendar has been set, otherwise False. - - **Examples** - - >>> d = cf.Data(1, "days since 2000-1-1", calendar="noleap") - >>> d.has_calendar() - True - - >>> d = cf.Data(1, calendar="noleap") - >>> d.has_calendar() - True - - >>> d = cf.Data(1, "days since 2000-1-1") - >>> d.has_calendar() - False - - >>> d = cf.Data(1, "m") - >>> d.has_calendar() - False - - """ - return hasattr(self.Units, "calendar") - - def has_deterministic_name(self): - """Whether there is a deterministic name for the data. - - See `get_deterministic_name` for details. - - .. versionadded:: 3.15.1 - - .. seealso:: `get_deterministic_name` - - :Returns: - - `bool` - Whether or not there is a deterministic name. - - **Examples** - - >>> d = cf.Data([1, 2, 3], 'm') - >>> d.has_deterministic_name() - True - - """ - return self._custom["deterministic"] - - def has_units(self): - """Whether units have been set. - - .. seealso:: `del_units`, `get_units`, `set_units`, - `has_calendar`, `Units` - - :Returns: - - `bool` - True if units have been set, otherwise False. - - **Examples** - - >>> d = cf.Data(1, "") - >>> d.has_units() - True - - >>> d = cf.Data(1, "m") - >>> d.has_units() - True - - >>> d = cf.Data(1) - >>> d.has_units() - False - - >>> d = cf.Data(1, calendar='noleap') - >>> d.has_units() - False - - """ - return hasattr(self.Units, "units") - - def soften_mask(self): - """Force the mask to soft. - - Whether the mask of a masked array is hard or soft is - determined by its `hardmask` property. `soften_mask` sets - `hardmask` to `False`. - - .. versionadded:: 3.14.0 - - .. seealso:: `hardmask`, `harden_mask` - - **Examples** - - >>> d = cf.Data([1, 2, 3]) - >>> d.hardmask - True - >>> d.soften_mask() - >>> d.hardmask - False - - >>> d = cf.Data([1, 2, 3], mask=[False, True, False], hardmask=False) - >>> d.hardmask - False - >>> d[1] = 999 - >>> print(d.array) - [ 1 999 3] - - """ - # 'cf_soften_mask' has its own call to 'cf_asanyarray', so we - # can set '_asanyarray=False'. - dx = self.to_dask_array(_asanyarray=False) - dx = dx.map_blocks(cf_soften_mask, dtype=self.dtype) - self._set_dask(dx, clear=_NONE) - self.hardmask = False - def file_locations(self): """The locations of files containing parts of the data. @@ -9146,8 +5834,8 @@ def file_locations(self): out = set() # The dask graph is never going to be computed, so we can set - # '_asanyarray=False'. - for key, a in self.todict(_asanyarray=False).items(): + # '_force_to_memory=False'. + for key, a in self.todict(_force_to_memory=False).items(): try: out.update(a.file_locations()) except AttributeError: @@ -9156,190 +5844,6 @@ def file_locations(self): return out - @_inplace_enabled(default=False) - def filled(self, fill_value=None, inplace=False): - """Replace masked elements with a fill value. - - .. versionadded:: 3.4.0 - - :Parameters: - - fill_value: scalar, optional - The fill value. By default the fill returned by - `get_fill_value` is used, or if this is not set then the - netCDF default fill value for the data type is used (as - defined by `netCDF.fillvals`). - - {{inplace: `bool`, optional}} - - :Returns: - - `Data` or `None` - The filled data, or `None` if the operation was in-place. 
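-
- For example, the netCDF default fill value for 64-bit
- integers (`numpy` dtype string ``'i8'``) is the value that
- appears in the examples below when no fill value has been
- set:
-
- >>> import netCDF4
- >>> netCDF4.default_fillvals['i8']
- -9223372036854775806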
- - **Examples** - - >>> d = cf.Data([[1, 2, 3]]) - >>> print(d.filled().array) - [[1 2 3]] - >>> d[0, 0] = cf.masked - >>> print(d.filled().array) - [-9223372036854775806 2 3] - >>> d.set_fill_value(-99) - >>> print(d.filled().array) - [[-99 2 3]] - - """ - d = _inplace_enabled_define_and_cleanup(self) - - if fill_value is None: - fill_value = d.get_fill_value(None) - if fill_value is None: # still... - fill_value = default_netCDF_fillvals().get(d.dtype.str[1:]) - if fill_value is None and d.dtype.kind in ("SU"): - fill_value = default_netCDF_fillvals().get("S1", None) - - if fill_value is None: - raise ValueError( - "Can't determine fill value for " - f"data type {d.dtype.str!r}" - ) - - # 'cf_filled' has its own call to 'cf_asanyarray', so we can - # set '_asanyarray=False'. - dx = d.to_dask_array(_asanyarray=False) - dx = dx.map_blocks(cf_filled, fill_value=fill_value, dtype=d.dtype) - d._set_dask(dx) - - return d - - def first_element(self): - """Return the first element of the data as a scalar. - - .. seealso:: `last_element`, `second_element` - - **Performance** - - If possible, a cached value is returned. Otherwise the delayed - operations needed to compute the element are executed, and - cached for subsequent calls. - - :Returns: - - The first element of the data. - - **Examples** - - >>> d = {{package}}.{{class}}(9.0) - >>> x = d.first_element() - >>> print(x, type(x)) - 9.0 - - >>> d = {{package}}.{{class}}([[1, 2], [3, 4]]) - >>> x = d.first_element() - >>> print(x, type(x)) - 1 - >>> d[0, 0] = {{package}}.masked - >>> y = d.first_element() - >>> print(y, type(y)) - -- - - >>> d = {{package}}.{{class}}(['foo', 'bar']) - >>> x = d.first_element() - >>> print(x, type(x)) - foo - - """ - try: - return self._custom["cached_elements"][0] - except KeyError: - item = super().first_element() - self._set_cached_elements({0: item}) - return item - - def second_element(self): - """Return the second element of the data as a scalar. - - .. seealso:: `first_element`, `last_element` - - **Performance** - - If possible, a cached value is returned. Otherwise the delayed - operations needed to compute the element are executed, and - cached for subsequent calls. - - :Returns: - - The second element of the data. - - **Examples** - - >>> d = {{package}}.{{class}}([[1, 2], [3, 4]]) - >>> x = d.second_element() - >>> print(x, type(x)) - 2 - >>> d[0, 1] = {{package}}.masked - >>> y = d.second_element() - >>> print(y, type(y)) - -- - - >>> d = {{package}}.{{class}}(['foo', 'bar']) - >>> x = d.second_element() - >>> print(x, type(x)) - bar - - """ - try: - return self._custom["cached_elements"][1] - except KeyError: - item = super().second_element() - self._set_cached_elements({1: item}) - return item - - def last_element(self): - """Return the last element of the data as a scalar. - - .. seealso:: `first_element`, `second_element` - - **Performance** - - If possible, a cached value is returned. Otherwise the delayed - operations needed to compute the element are executed, and - cached for subsequent calls. - - :Returns: - - The last element of the data. 
- - **Examples** - - >>> d = {{package}}.{{class}}(9.0) - >>> x = d.last_element() - >>> print(x, type(x)) - 9.0 - - >>> d = {{package}}.{{class}}([[1, 2], [3, 4]]) - >>> x = d.last_element() - >>> print(x, type(x)) - 4 - >>> d[-1, -1] = {{package}}.masked - >>> y = d.last_element() - >>> print(y, type(y)) - -- - - >>> d = {{package}}.{{class}}(['foo', 'bar']) - >>> x = d.last_element() - >>> print(x, type(x)) - bar - - """ - try: - return self._custom["cached_elements"][-1] - except KeyError: - item = super().last_element() - self._set_cached_elements({-1: item}) - return item - def flat(self, ignore_masked=True): """Return a flat iterator over elements of the data array. @@ -9359,166 +5863,35 @@ def flat(self, ignore_masked=True): returned. By default only unmasked elements are returned - :Returns: - - generator - An iterator over elements of the data array. - - **Examples** - - >>> d = cf.Data([[1, 2], [3,4]], mask=[[0, 1], [0, 0]]) - >>> print(d.array) - [[1 --] - [3 4]] - >>> list(d.flat()) - [1, 3, 4] - >>> list(d.flat(ignore_masked=False)) - [1, masked, 3, 4] - - """ - mask = self.mask - - if ignore_masked: - for index in self.ndindex(): - if not mask[index]: - yield self[index].array.item() - else: - for index in self.ndindex(): - if not mask[index]: - yield self[index].array.item() - else: - yield cf_masked - - @_inplace_enabled(default=False) - def flatten(self, axes=None, inplace=False): - """Flatten specified axes of the data. - - Any subset of the axes may be flattened. - - The shape of the data may change, but the size will not. - - The flattening is executed in row-major (C-style) order. For - example, the array ``[[1, 2], [3, 4]]`` would be flattened across - both dimensions to ``[1 2 3 4]``. - - .. versionadded:: 3.0.2 - - .. seealso:: `compressed`, `flat`, `insert_dimension`, `flip`, - `swapaxes`, `transpose` - - :Parameters: - - axes: (sequence of) `int` - Select the axes to be flattened. By default all axes - are flattened. Each axis is identified by its integer - position. No axes are flattened if *axes* is an empty - sequence. - - {{inplace: `bool`, optional}} - - :Returns: - - `Data` or `None` - The flattened data, or `None` if the operation was - in-place. - - **Examples** - - >>> import numpy as np - >>> d = cf.Data(np.arange(24).reshape(1, 2, 3, 4)) - >>> d - - >>> print(d.array) - [[[[ 0 1 2 3] - [ 4 5 6 7] - [ 8 9 10 11]] - [[12 13 14 15] - [16 17 18 19] - [20 21 22 23]]]] - - >>> e = d.flatten() - >>> e - - >>> print(e.array) - [ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23] - - >>> e = d.flatten([]) - >>> e - - - >>> e = d.flatten([1, 3]) - >>> e - - >>> print(e.array) - [[[ 0 4 8] - [ 1 5 9] - [ 2 6 10] - [ 3 7 11] - [12 16 20] - [13 17 21] - [14 18 22] - [15 19 23]]] - - >>> d.flatten([0, -1], inplace=True) - >>> d - - >>> print(d.array) - [[[ 0 4 8] - [12 16 20]] - [[ 1 5 9] - [13 17 21]] - [[ 2 6 10] - [14 18 22]] - [[ 3 7 11] - [15 19 23]]] - - """ - d = _inplace_enabled_define_and_cleanup(self) - - ndim = d.ndim - if not ndim: - if axes or axes == 0: - raise ValueError( - "Can't flatten: Can't remove axes from " - f"scalar {self.__class__.__name__}" - ) - - return d - - if axes is None: - axes = list(range(ndim)) - else: - axes = sorted(d._parse_axes(axes)) + :Returns: - n_axes = len(axes) - if n_axes <= 1: - return d + generator + An iterator over elements of the data array. - dx = d.to_dask_array() + **Examples** - # It is important that the first axis in the list is the - # left-most flattened axis. - # - # E.g. 
if the shape is (10, 20, 30, 40, 50, 60) and the axes - # to be flattened are [2, 4], then the data must be - # transposed with order [0, 1, 2, 4, 3, 5] - order = [i for i in range(ndim) if i not in axes] - order[axes[0] : axes[0]] = axes - dx = dx.transpose(order) - - # Find the flattened shape. - # - # E.g. if the *transposed* shape is (10, 20, 30, 50, 40, 60) - # and *transposed* axes [2, 3] are to be flattened then - # the new shape will be (10, 20, 1500, 40, 60) - shape = d.shape - new_shape = [n for i, n in enumerate(shape) if i not in axes] - new_shape.insert(axes[0], reduce(mul, [shape[i] for i in axes], 1)) + >>> d = cf.Data([[1, 2], [3,4]], mask=[[0, 1], [0, 0]]) + >>> print(d.array) + [[1 --] + [3 4]] + >>> list(d.flat()) + [1, 3, 4] + >>> list(d.flat(ignore_masked=False)) + [1, masked, 3, 4] - dx = dx.reshape(new_shape) - d._set_dask(dx) + """ + mask = self.mask - return d + if ignore_masked: + for index in self.ndindex(): + if not mask[index]: + yield self[index].array.item() + else: + for index in self.ndindex(): + if not mask[index]: + yield self[index].array.item() + else: + yield masked @_deprecated_kwarg_check("i", version="3.0.0", removed_at="4.0.0") @_inplace_enabled(default=False) @@ -9610,6 +5983,9 @@ def outerproduct(self, a, inplace=False, i=False): """ d = _inplace_enabled_define_and_cleanup(self) + shape = d.shape + chunksizes0 = d.nc_hdf5_chunksizes() + # Cast 'a' as a Data object so that it definitely has sensible # Units. We don't mind if the units of 'a' are incompatible # with those of 'self', but if they are then it's nice if the @@ -9639,6 +6015,20 @@ def outerproduct(self, a, inplace=False, i=False): for a_axis in a._cyclic: d.cyclic(ndim + a._axes.index(a_axis)) + # Update the HDF5 chunking strategy + chunksizes1 = a.nc_hdf5_chunksizes() + if chunksizes0 or chunksizes1: + if isinstance(chunksizes0, tuple): + if isinstance(chunksizes1, tuple): + chunksizes = chunksizes0 + chunksizes1 + else: + chunksizes = chunksizes0 + a.shape + + d.nc_set_hdf5_chunksizes(chunksizes) + elif isinstance(chunksizes1, tuple): + chunksizes = shape + chunksizes1 + d.nc_set_hdf5_chunksizes(chunksizes) + d._update_deterministic(a) return d @@ -9710,46 +6100,6 @@ def change_calendar(self, calendar, inplace=False, i=False): return d - def chunk_indices(self): - """Return indices that define each dask compute chunk. - - .. versionadded:: 3.15.0 - - .. seealso:: `chunks` - - :Returns: - - `itertools.product` - An iterator over tuples of indices of the data array. - - **Examples** - - >>> d = cf.Data(np.arange(405).reshape(3, 9, 15), - ... chunks=((1, 2), (9,), (4, 5, 6))) - >>> d.npartitions - 6 - >>> for index in d.chunk_indices(): - ... print(index) - ... 
- (slice(0, 1, None), slice(0, 9, None), slice(0, 4, None)) - (slice(0, 1, None), slice(0, 9, None), slice(4, 9, None)) - (slice(0, 1, None), slice(0, 9, None), slice(9, 15, None)) - (slice(1, 3, None), slice(0, 9, None), slice(0, 4, None)) - (slice(1, 3, None), slice(0, 9, None), slice(4, 9, None)) - (slice(1, 3, None), slice(0, 9, None), slice(9, 15, None)) - - """ - from dask.utils import cached_cumsum - - chunks = self.chunks - - cumdims = [cached_cumsum(bds, initial_zero=True) for bds in chunks] - indices = [ - [slice(s, s + dim) for s, dim in zip(starts, shapes)] - for starts, shapes in zip(cumdims, chunks) - ] - return product(*indices) - @_deprecated_kwarg_check("i", version="3.0.0", removed_at="4.0.0") @_inplace_enabled(default=False) def override_units(self, units, inplace=False, i=False): @@ -9791,7 +6141,7 @@ def override_units(self, units, inplace=False, i=False): """ d = _inplace_enabled_define_and_cleanup(self) - d._Units = Units(units) + d._Units = self._Units_class(units) return d @_deprecated_kwarg_check("i", version="3.0.0", removed_at="4.0.0") @@ -9833,91 +6183,9 @@ def override_calendar(self, calendar, inplace=False, i=False): """ d = _inplace_enabled_define_and_cleanup(self) - d._Units = Units(d.Units._units, calendar) + d._Units = d._Units_class(d.Units._units, calendar) return d - def to_dask_array(self, apply_mask_hardness=False, _asanyarray=True): - """Convert the data to a `dask` array. - - .. warning:: By default, the mask hardness of the returned - dask array might not be the same as that - specified by the `hardmask` attribute. - - This could cause problems if a subsequent - operation on the returned dask array involves the - un-masking of masked values (such as by indexed - assignment). - - To guarantee that the mask hardness of the - returned dask array is correct, set the - *apply_mask_hardness* parameter to True. - - .. versionadded:: 3.14.0 - - :Parameters: - - apply_mask_hardness: `bool`, optional - If True then force the mask hardness of the returned - array to be that given by the `hardmask` attribute. - - _asanyarray: `bool`, optional - If True (the default) and the `__asanyarray__` - attribute is also `True`, then a `cf_asanyarray` - operation is added to the graph of the returned Dask - array. If False then this operation is not added. - - In general, setting *_asanyarray* to False should only - be done if it is known that a) the returned Dask array - is never going to be computed; or b) it is not - necessary to add a `cf_asanyarray` operation in lieu of - its functionality being implemented by a new Dask graph - layer that is going to be created at a later stage. See - `cf.data.dask_utils.cf_asanyarray` for further details. - - .. versionadded:: NEXTVERSION - - :Returns: - - `dask.array.Array` - The dask array contained within the `Data` instance. 
- - **Examples** - - >>> d = cf.Data([1, 2, 3, 4], 'm') - >>> dx = d.to_dask_array() - >>> dx - >>> dask.array - >>> dask.array.asanyarray(d) is dx - True - - >>> d.to_dask_array(apply_mask_hardness=True) - dask.array - - >>> d = cf.Data([1, 2, 3, 4], 'm', hardmask=False) - >>> d.to_dask_array(apply_mask_hardness=True) - dask.array - - """ - dx = self._custom.get("dask") - if dx is None: - raise ValueError(f"{self.__class__.__name__} object has no data") - - if apply_mask_hardness: - if self.hardmask: - self.harden_mask() - else: - self.soften_mask() - - dx = self._custom["dask"] - # Note: The mask hardness functions have their own calls - # to 'cf_asanyarray', so we don't need to worry about - # setting another one. - elif _asanyarray and self.__asanyarray__: - # Add a new cf_asanyarray layer to the output graph - dx = dx.map_blocks(cf_asanyarray, dtype=dx.dtype) - - return dx - def datum(self, *index): """Return an element of the data array as a standard Python scalar. @@ -10060,7 +6328,7 @@ def datum(self, *index): if mask is np.ma.nomask or not mask.item(): return array.item() - return cf_masked + return masked @_inplace_enabled(default=False) def masked_invalid(self, inplace=False): @@ -10095,55 +6363,6 @@ def masked_invalid(self, inplace=False): d._set_dask(dx) return d - def del_calendar(self, default=ValueError()): - """Delete the calendar. - - .. seealso:: `get_calendar`, `has_calendar`, `set_calendar`, - `del_units`, `Units` - - :Parameters: - - default: optional - Return the value of the *default* parameter if the - calendar has not been set. If set to an `Exception` - instance then it will be raised instead. - - :Returns: - - `str` - The value of the deleted calendar. - - **Examples** - - >>> d = cf.Data(1, "days since 2000-1-1", calendar="noleap") - >>> d.del_calendar() - 'noleap' - >>> print(d.del_calendar()) - None - - >>> d = cf.Data(1, "days since 2000-1-1") - >>> print(d.del_calendar()) - None - - >>> d = cf.Data(1, "m") - Traceback (most recent call last): - ... - ValueError: Units have no calendar - - """ - units = self.Units - if not units.isreftime: - return self._default(default, f"Units {units!r} have no calendar") - - calendar = getattr(units, "calendar", None) - if calendar is None: - return self._default( - default, f"{self.__class__.__name__} has no calendar" - ) - - self.override_calendar(None, inplace=True) - return calendar - def del_file_location(self, location): """Remove a file location in-place. @@ -10176,8 +6395,8 @@ def del_file_location(self, location): updated = False # The dask graph is never going to be computed, so we can set - # '_asanyarray=False'. - dsk = self.todict(_asanyarray=False) + # '_force_to_memory=False'. + dsk = self.todict(_force_to_memory=False) for key, a in dsk.items(): try: dsk[key] = a.del_file_location(location) @@ -10190,69 +6409,15 @@ def del_file_location(self, location): updated = True if updated: - dx = self.to_dask_array(_asanyarray=False) + dx = self.to_dask_array(_force_to_memory=False) dx = da.Array(dsk, dx.name, dx.chunks, dx.dtype, dx._meta) - self._set_dask(dx, clear=_NONE, asanyarray=None) + self._set_dask(dx, clear=self._NONE, in_memory=None) return location - def del_units(self, default=ValueError()): - """Delete the units. - - .. seealso:: `get_units`, `has_units`, `set_units`, - `del_calendar`, `Units` - - :Parameters: - - default: optional - Return the value of the *default* parameter if the - units has not been set. If set to an `Exception` - instance then it will be raised instead. 
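The deleted `del_calendar` and `del_units` methods are inherited from cfdm rather than dropped; a short behavioural sketch, assuming the cfdm implementations preserve the documented semantics:

>>> import cf
>>> d = cf.Data(1, "days since 2000-1-1", calendar="noleap")
>>> d.del_calendar()
'noleap'
>>> print(d.del_calendar(None))  # already deleted, so the default is returned
None
>>> d.del_units()
'days since 2000-1-1'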
- - :Returns: - - `str` - The value of the deleted units. - - **Examples** - - >>> d = cf.Data(1, "m") - >>> d.del_units() - 'm' - >>> d.Units - - >>> d.del_units() - Traceback (most recent call last): - ... - ValueError: Data has no units - - >>> d = cf.Data(1, "days since 2000-1-1", calendar="noleap") - >>> d.del_units() - 'days since 2000-1-1' - >>> d.Units - - - """ - u = self.Units - units = getattr(u, "units", None) - calendar = getattr(u, "calendar", None) - self.override_units(Units(None, calendar), inplace=True) - - if units is not None: - return units - - return self._default( - default, f"{self.__class__.__name__} has no units" - ) - @classmethod def masked_all( - cls, - shape, - dtype=None, - units=None, - calendar=None, - chunks=_DEFAULT_CHUNKS, + cls, shape, dtype=None, units=None, calendar=None, chunks="auto" ): """Return an empty masked array with all elements masked. @@ -10309,60 +6474,6 @@ def masked_all( d._set_dask(dx) return d - @_inplace_enabled(default=False) - def masked_values(self, value, rtol=None, atol=None, inplace=False): - """Mask using floating point equality. - - Masks the data where elements are approximately equal to the - given value. For integer types, exact equality is used. - - .. versionadded:: 3.16.0 - - .. seealso:: `mask` - - :Parameters: - - value: number - Masking value. - - {{rtol: number, optional}} - - {{atol: number, optional}} - - {{inplace: `bool`, optional}} - - :Returns: - - `{{class}}` or `None` - The result of masking the data where approximately - equal to *value*, or `None` if the operation was - in-place. - - **Examples** - - >>> d = {{package}}.{{class}}([1, 1.1, 2, 1.1, 3]) - >>> e = d.masked_values(1.1) - >>> print(e.array) - [1.0 -- 2.0 -- 3.0] - - """ - d = _inplace_enabled_define_and_cleanup(self) - - if rtol is None: - rtol = self._rtol - else: - rtol = float(rtol) - - if atol is None: - atol = self._atol - else: - atol = float(atol) - - dx = d.to_dask_array() - dx = da.ma.masked_values(dx, value, rtol=rtol, atol=atol) - d._set_dask(dx) - return d - @_inplace_enabled(default=False) @_deprecated_kwarg_check("i", version="3.0.0", removed_at="4.0.0") def mid_range( @@ -10501,19 +6612,6 @@ def inspect(self): `None` - **Examples** - - >>> d = cf.Data([9], 'm') - >>> d.inspect() - - ------------------- - {'_components': {'custom': {'_Units': , - '_axes': ('dim0',), - '_cyclic': set(), - '_hardmask': True, - 'dask': dask.array}, - 'netcdf': {}}} - """ from ..functions import inspect @@ -10591,9 +6689,9 @@ def isclose(self, y, rtol=None, atol=None): d = self.copy() d._set_dask(dx) - d.hardmask = _DEFAULT_HARDMASK + d.hardmask = self._DEFAULT_HARDMASK d.override_units(_units_None, inplace=True) - d._update_deterministic(not is_dask_collection(y)) + d._update_deterministic(y) return d @@ -10659,25 +6757,13 @@ def reshape(self, *shape, merge_chunks=True, limit=None, inplace=False): """ d = _inplace_enabled_define_and_cleanup(self) - dx = d.to_dask_array() - dx = dx.reshape(*shape, merge_chunks=merge_chunks, limit=limit) - - # Set axes when the new array has more dimensions than self - axes = None - ndim0 = self.ndim - if not ndim0: - axes = generate_axis_identifiers(dx.ndim) - else: - diff = dx.ndim - ndim0 - if diff > 0: - axes = list(self._axes) - for _ in range(diff): - axes.insert(0, new_axis_identifier(tuple(axes))) - - if axes is not None: - d._axes = axes + super(Data, d).reshape( + *shape, merge_chunks=merge_chunks, limit=limit, inplace=True + ) - d._set_dask(dx) + # Clear cyclic axes, as we can't help but lose them in this + # 
operation + d._cyclic = _empty_set return d @@ -10842,7 +6928,7 @@ def round(self, decimals=0, inplace=False, i=False): def stats( self, all=False, - compute=True, + values=True, minimum=True, mean=True, median=True, @@ -10881,7 +6967,7 @@ def stats( Calculate all possible statistics, regardless of the value of individual metric parameters. - compute: `bool`, optional + values: `bool`, optional If True (the default), returned values for the statistical calculations in the output dictionary are computed, else each is given in the form of a delayed `Data` operation. @@ -10946,11 +7032,11 @@ def stats( :Returns: `dict` - The statistics, with keys giving the operation names and - values being the result of the corresponding statistical - calculation, which are either the computed numerical - values if `compute` is True, else the delayed `Data` - operations which encapsulate those. + The statistics, with keys giving the operation names + and values being the result of the corresponding + statistical calculation, which are either the computed + numerical values if *values*` is True, else the + delayed `Data` operations which encapsulate those. **Examples** @@ -10998,7 +7084,7 @@ def stats( To ask for delayed operations instead of computed values: - >>> d.stats(compute=False) + >>> d.stats(values=False) {'minimum': , 'mean': , 'median': , @@ -11050,8 +7136,8 @@ def stats( if all or sample_size: out["sample_size"] = delayed(lambda: self.sample_size())() - data_values = globals()["compute"](out)[0] # noqa: F811 - if compute: + data_values = compute(out)[0] + if values: # Convert cf.Data objects holding the scalars (or scalar array # for the case of sample_size only) to scalar values return {op: val.array.item() for op, val in data_values.items()} @@ -11359,9 +7445,9 @@ def where( # Missing values could be affected, so make sure that the mask # hardness has been applied. # - # 'cf_where' has its own calls to 'cf_asanyarray', so we can - # set '_asanyarray=False'. - dx = d.to_dask_array(apply_mask_hardness=True, _asanyarray=False) + # 'cf_where' has its own calls to 'cfdm_to_memory', so we can + # set '_force_to_memory=False'. + dx = d.to_dask_array(_force_to_memory=False) units = d.Units @@ -11376,9 +7462,9 @@ def where( condition = type(self).asdata(condition) condition = where_broadcastable(d, condition, "condition") - # 'cf_where' has its own calls to 'cf_asanyarray', so we can - # set '_asanyarray=False'. - condition = condition.to_dask_array(_asanyarray=False) + # 'cf_where' has its own calls to 'cfdm_to_memory', so we can + # set '_force_to_memory=False'. + condition = condition.to_dask_array(_force_to_memory=False) # If x or y is self then change it to None. This prevents an # unnecessary copy; and, at compute time, an unncessary numpy @@ -11400,7 +7486,7 @@ def where( xy.append(arg) continue - if arg is cf_masked: + if arg is masked: # Replace masked constant with array xy.append(scalar_masked_array(self.dtype)) continue @@ -11422,7 +7508,7 @@ def where( x, y = xy # Apply the where operation - dx = da.core.elemwise(cf_where, dx, condition, x, y, d.hardmask) + dx = da.core.elemwise(cfdm_where, dx, condition, x, y, d.hardmask) d._set_dask(dx) # Don't know (yet) if 'x' and 'y' have a deterministic names @@ -11610,48 +7696,6 @@ def cosh(self, inplace=False): return d - def cull_graph(self): - """Remove unnecessary tasks from the dask graph in-place. - - **Performance** - - An unnecessary task is one which does not contribute to the - computed result. 
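The renamed `stats` keyword in use (*values* replaces *compute*); a sketch, assuming that with ``values=False`` the dictionary holds uncomputed scalar `cf.Data` objects as the old docstring describes:

>>> import cf
>>> d = cf.Data([[0, 1, 2], [3, -99, 5]], mask=[[0, 0, 0], [0, 1, 0]])
>>> d.stats(values=True)['mean']  # a plain Python scalar
2.2
>>> lazy = d.stats(values=False)  # delayed cf.Data operations
>>> float(lazy['mean'])           # computed on demand
2.2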
Such tasks are always automatically removed - (culled) at compute time, but removing them beforehand might - improve performance by reducing the amount of work done in - later steps. - - .. versionadded:: 3.14.0 - - .. seealso:: `dask.optimization.cull` - - :Returns: - - `None` - - **Examples** - - >>> d = cf.Data([1, 2, 3, 4, 5], chunks=3) - >>> d = d[:2] - >>> dict(d.to_dask_array().dask) - {('array-21ea057f160746a3d3f0943bba945460', 0): array([1, 2, 3]), - ('array-21ea057f160746a3d3f0943bba945460', 1): array([4, 5]), - ('getitem-3e4edac0a632402f6b45923a6b9d215f', - 0): (, ('array-21ea057f160746a3d3f0943bba945460', - 0), (slice(0, 2, 1),))} - >>> d.cull_graph() - >>> dict(d.to_dask_array().dask) - {('getitem-3e4edac0a632402f6b45923a6b9d215f', - 0): (, ('array-21ea057f160746a3d3f0943bba945460', - 0), (slice(0, 2, 1),)), - ('array-21ea057f160746a3d3f0943bba945460', 0): array([1, 2, 3])} - - """ - dx = self.to_dask_array(_asanyarray=False) - dsk, _ = cull(dx.dask, dx.__dask_keys__()) - dx = da.Array(dsk, name=dx.name, chunks=dx.chunks, dtype=dx.dtype) - self._set_dask(dx, clear=_NONE, asanyarray=None) - @_deprecated_kwarg_check("i", version="3.0.0", removed_at="4.0.0") @_inplace_enabled(default=False) def tanh(self, inplace=False): @@ -11727,141 +7771,29 @@ def log(self, base=None, inplace=False, i=False): {{i: deprecated at version 3.0.0}} - :Returns: - - `Data` or `None` - - """ - d = _inplace_enabled_define_and_cleanup(self) - dx = d.to_dask_array() - - if base is None: - dx = da.log(dx) - elif base == 10: - dx = da.log10(dx) - elif base == 2: - dx = da.log2(dx) - else: - dx = da.log(dx) - dx /= da.log(base) - - d._set_dask(dx) - - d.override_units( - _units_1, inplace=True - ) # all logarithm outputs are unitless - - return d - - @_deprecated_kwarg_check("i", version="3.0.0", removed_at="4.0.0") - @_inplace_enabled(default=False) - def squeeze(self, axes=None, inplace=False, i=False): - """Remove size 1 axes from the data array. - - By default all size 1 axes are removed, but particular axes - may be selected with the keyword arguments. - - .. seealso:: `flatten`, `insert_dimension`, `flip`, - `swapaxes`, `transpose` - - :Parameters: - - axes: (sequence of) int, optional - Select the axes. By default all size 1 axes are - removed. The *axes* argument may be one, or a - sequence, of integers that select the axis - corresponding to the given position in the list of - axes of the data array. - - No axes are removed if *axes* is an empty sequence. - - {{inplace: `bool`, optional}} - - {{i: deprecated at version 3.0.0}} - - :Returns: - - `Data` or `None` - The squeezed data array. 
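The arbitrary-base branch of `log` relies on the change-of-base identity, log_b(x) = ln(x) / ln(b); shown standalone:

>>> import numpy as np
>>> x = np.array([1.0, 5.0, 25.0])
>>> print(np.log(x) / np.log(5.0))  # log base 5
[0. 1. 2.]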
- - **Examples** - - >>> v.shape - (1,) - >>> v.squeeze() - >>> v.shape - () - - >>> v.shape - (1, 2, 1, 3, 1, 4, 1, 5, 1, 6, 1) - >>> v.squeeze((0,)) - >>> v.shape - (2, 1, 3, 1, 4, 1, 5, 1, 6, 1) - >>> v.squeeze(1) - >>> v.shape - (2, 3, 1, 4, 1, 5, 1, 6, 1) - >>> v.squeeze([2, 4]) - >>> v.shape - (2, 3, 4, 5, 1, 6, 1) - >>> v.squeeze([]) - >>> v.shape - (2, 3, 4, 5, 1, 6, 1) - >>> v.squeeze() - >>> v.shape - (2, 3, 4, 5, 6) - - """ - d = _inplace_enabled_define_and_cleanup(self) - - if not d.ndim: - if axes or axes == 0: - raise ValueError( - "Can't squeeze: Can't remove an axis from " - f"scalar {d.__class__.__name__}" - ) - - if inplace: - d = None - - return d - - shape = d.shape - - if axes is None: - iaxes = tuple([i for i, n in enumerate(shape) if n == 1]) - else: - iaxes = d._parse_axes(axes) - - # Check the squeeze axes - for i in iaxes: - if shape[i] > 1: - raise ValueError( - f"Can't squeeze {d.__class__.__name__}: " - f"Can't remove axis of size {shape[i]}" - ) + :Returns: - if not iaxes: - # Short circuit if the squeeze is a null operation - return d + `Data` or `None` - # Still here? Then the data array is not scalar and at least - # one size 1 axis needs squeezing. + """ + d = _inplace_enabled_define_and_cleanup(self) dx = d.to_dask_array() - dx = dx.squeeze(axis=iaxes) - # Squeezing a dimension doesn't affect the cached elements - d._set_dask(dx, clear=_ALL ^ _CACHE) + if base is None: + dx = da.log(dx) + elif base == 10: + dx = da.log10(dx) + elif base == 2: + dx = da.log2(dx) + else: + dx = da.log(dx) + dx /= da.log(base) - # Remove the squeezed axes names - d._axes = [axis for i, axis in enumerate(d._axes) if i not in iaxes] + d._set_dask(dx) - # Update the HDF5 chunking strategy - chunksizes = d.nc_hdf5_chunksizes() - if isinstance(chunksizes, tuple): - chunksizes = [ - size for i, size in enumerate(chunksizes) if i not in iaxes - ] - d.nc_set_hdf5_chunksizes(chunksizes) + d.override_units( + _units_1, inplace=True + ) # all logarithm outputs are unitless return d @@ -11926,109 +7858,6 @@ def tan(self, inplace=False, i=False): return d - def todict( - self, optimize_graph=True, apply_mask_hardness=False, _asanyarray=True - ): - """Return a dictionary of the dask graph key/value pairs. - - .. versionadded:: 3.15.0 - - .. seealso:: `to_dask_array` - - :Parameters: - - optimize_graph: `bool` - If True, the default, then prior to being converted to - a dictionary, the graph is optimised to remove unused - chunks. Note that optimising the graph can add a - considerable performance overhead. - - apply_mask_hardness: `bool`, optional - If True then force the mask hardness of the returned - array to be that given by the `hardmask` attribute. - - .. versionadded:: NEXTVERSION - - _asanyarray: `bool`, optional - If True (the default) and the `__asanyarray__` - attribute is also `True`, then a `cf_asanyarray` - operation is added to the dictionary representation of - the Dask graph. If False then this operation is not - added. See `to_dask_array` for details. - - .. versionadded:: NEXTVERSION - - :Returns: - - `dict` - The dictionary of the dask graph key/value pairs. 
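The removed `squeeze` kept the HDF5 chunking strategy aligned with the new shape by dropping the chunk sizes of the squeezed axes; the same list filter, as a standalone sketch:

# Per-axis HDF5 chunk sizes before the squeeze
chunksizes = (1, 75, 1, 1207)
# Positions of the size-1 axes being removed
iaxes = (0, 2)
print([n for i, n in enumerate(chunksizes) if i not in iaxes])
# [75, 1207]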
- - **Examples** - - >>> d = cf.Data([1, 2, 3, 4], chunks=2) - >>> d.todict() - {('array-2f41b21b4cd29f757a7bfa932bf67832', 0): array([1, 2]), - ('array-2f41b21b4cd29f757a7bfa932bf67832', 1): array([3, 4])} - >>> e = d[0] - >>> e.todict() - {('getitem-153fd24082bc067cf438a0e213b41ce6', - 0): (, ('array-2f41b21b4cd29f757a7bfa932bf67832', - 0), (slice(0, 1, 1),)), - ('array-2f41b21b4cd29f757a7bfa932bf67832', 0): array([1, 2])} - >>> e.todict(optimize_graph=False) - {('array-2f41b21b4cd29f757a7bfa932bf67832', 0): array([1, 2]), - ('array-2f41b21b4cd29f757a7bfa932bf67832', 1): array([3, 4]), - ('getitem-153fd24082bc067cf438a0e213b41ce6', - 0): (, ('array-2f41b21b4cd29f757a7bfa932bf67832', - 0), (slice(0, 1, 1),))} - - """ - dx = self.to_dask_array( - apply_mask_hardness=apply_mask_hardness, _asanyarray=_asanyarray - ) - - if optimize_graph: - return collections_to_dsk((dx,), optimize_graph=True) - - return dict(collections_to_dsk((dx,), optimize_graph=False)) - - def tolist(self): - """Return the data as a scalar or (nested) list. - - Returns the data as an ``N``-levels deep nested list of Python - scalars, where ``N`` is the number of data dimensions. - - If ``N`` is 0 then, since the depth of the nested list is 0, - it will not be a list at all, but a simple Python scalar. - - .. sealso:: `todict` - - :Returns: - - `list` or scalar - The (nested) list of array elements, or a scalar if - the data has 0 dimensions. - - **Examples** - - >>> d = cf.Data(9) - >>> d.tolist() - 9 - - >>> d = cf.Data([1, 2]) - >>> d.tolist() - [1, 2] - - >>> d = cf.Data(([[1, 2], [3, 4]])) - >>> d.tolist() - [[1, 2], [3, 4]] - - >>> d.equals(cf.Data(d.tolist())) - True - - """ - return self.array.tolist() - def to_memory(self): """Bring data on disk into memory. @@ -12040,81 +7869,6 @@ def to_memory(self): "Consider using 'Data.persist' instead." ) - @_deprecated_kwarg_check("i", version="3.0.0", removed_at="4.0.0") - @_inplace_enabled(default=False) - def transpose(self, axes=None, inplace=False, i=False): - """Permute the axes of the data array. - - .. seealso:: `flatten`, `insert_dimension`, `flip`, `squeeze`, - `swapaxes` - - :Parameters: - - axes: (sequence of) `int` - The new axis order of the data array. By default the order - is reversed. Each axis of the new order is identified by - its original integer position. - - {{inplace: `bool`, optional}} - - {{i: deprecated at version 3.0.0}} - - :Returns: - - `Data` or `None` - - **Examples** - - >>> d.shape - (19, 73, 96) - >>> d.transpose() - >>> d.shape - (96, 73, 19) - >>> d.transpose([1, 0, 2]) - >>> d.shape - (73, 96, 19) - >>> d.transpose((-1, 0, 1)) - >>> d.shape - (19, 73, 96) - - """ - d = _inplace_enabled_define_and_cleanup(self) - - ndim = d.ndim - if axes is None: - iaxes = tuple(range(ndim - 1, -1, -1)) - else: - iaxes = d._parse_axes(axes) - - if iaxes == tuple(range(ndim)): - # Short circuit if the transpose is a null operation - return d - - # Note: The _axes attribute is important because e.g. axes - # labelled as cyclic by the _cyclic attribute use it to - # determine their position (see #discussion_r694096462 - # on PR #247). 
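The axis bookkeeping that the removed `transpose` performed: the internal axis identifiers (important for cyclic axes, per the comment above) and, further below, the HDF5 chunk sizes both follow the requested permutation. A standalone sketch:

axes = ['dim0', 'dim1', 'dim2']  # internal axis identifiers
chunksizes = (19, 73, 96)        # per-axis HDF5 chunk sizes
iaxes = (2, 0, 1)                # new axis order
print([axes[i] for i in iaxes])        # ['dim2', 'dim0', 'dim1']
print([chunksizes[i] for i in iaxes])  # [96, 19, 73]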
- data_axes = d._axes - d._axes = [data_axes[i] for i in iaxes] - - dx = d.to_dask_array() - try: - dx = da.transpose(dx, axes=axes) - except ValueError: - raise ValueError( - f"Can't transpose: Axes don't match array: {axes}" - ) - - d._set_dask(dx) - - # Update the HDF5 chunking strategy - chunksizes = d.nc_hdf5_chunksizes() - if isinstance(chunksizes, tuple): - chunksizes = [chunksizes[i] for i in axes] - d.nc_set_hdf5_chunksizes(chunksizes) - - return d - @_deprecated_kwarg_check("i", version="3.0.0", removed_at="4.0.0") @_inplace_enabled(default=False) def trunc(self, inplace=False, i=False): @@ -12153,237 +7907,6 @@ def trunc(self, inplace=False, i=False): d._set_dask(dx) return d - @classmethod - def empty( - cls, - shape, - dtype=None, - units=None, - calendar=None, - fill_value=None, - chunks=_DEFAULT_CHUNKS, - ): - """Return a new array of given shape and type, without - initialising entries. - - .. seealso:: `full`, `ones`, `zeros` - - :Parameters: - - shape: `int` or `tuple` of `int` - The shape of the new array. e.g. ``(2, 3)`` or ``2``. - - dtype: data-type - The desired output data-type for the array, e.g. - `numpy.int8`. The default is `numpy.float64`. - - units: `str` or `Units` - The units for the new data array. - - calendar: `str`, optional - The calendar for reference time units. - - {{chunks: `int`, `tuple`, `dict` or `str`, optional}} - - .. versionadded:: 3.14.0 - - fill_value: deprecated at version 3.14.0 - Use `set_fill_value` instead. - - :Returns: - - `Data` - Array of uninitialised (arbitrary) data of the given - shape and dtype. - - **Examples** - - >>> d = cf.Data.empty((2, 2)) - >>> print(d.array) - [[ -9.74499359e+001 6.69583040e-309], - [ 2.13182611e-314 3.06959433e-309]] #uninitialised - - >>> d = cf.Data.empty((2,), dtype=bool) - >>> print(d.array) - [ False True] #uninitialised - - """ - dx = da.empty(shape, dtype=dtype, chunks=chunks) - return cls(dx, units=units, calendar=calendar) - - @classmethod - def full( - cls, - shape, - fill_value, - dtype=None, - units=None, - calendar=None, - chunks=_DEFAULT_CHUNKS, - ): - """Return a new array of given shape and type, filled with a - fill value. - - .. seealso:: `empty`, `ones`, `zeros` - - :Parameters: - - shape: `int` or `tuple` of `int` - The shape of the new array. e.g. ``(2, 3)`` or ``2``. - - fill_value: scalar - The fill value. - - dtype: data-type - The desired data-type for the array. The default, `None`, - means ``np.array(fill_value).dtype``. - - units: `str` or `Units` - The units for the new data array. - - calendar: `str`, optional - The calendar for reference time units. - - {{chunks: `int`, `tuple`, `dict` or `str`, optional}} - - .. versionadded:: 3.14.0 - - :Returns: - - `Data` - Array of *fill_value* with the given shape and data - type. - - **Examples** - - >>> d = cf.Data.full((2, 3), -99) - >>> print(d.array) - [[-99 -99 -99] - [-99 -99 -99]] - - >>> d = cf.Data.full(2, 0.0) - >>> print(d.array) - [0. 0.] 
- - >>> d = cf.Data.full((2,), 0, dtype=bool) - >>> print(d.array) - [False False] - - """ - if dtype is None: - # Need to explicitly set the default because dtype is not - # a named keyword of da.full - dtype = getattr(fill_value, "dtype", None) - if dtype is None: - dtype = np.array(fill_value).dtype - - dx = da.full(shape, fill_value, dtype=dtype, chunks=chunks) - return cls(dx, units=units, calendar=calendar) - - @classmethod - def ones( - cls, - shape, - dtype=None, - units=None, - calendar=None, - chunks=_DEFAULT_CHUNKS, - ): - """Returns a new array filled with ones of set shape and type. - - .. seealso:: `empty`, `full`, `zeros` - - :Parameters: - - shape: `int` or `tuple` of `int` - The shape of the new array. e.g. ``(2, 3)`` or ``2``. - - dtype: data-type - The desired data-type for the array, e.g. - `numpy.int8`. The default is `numpy.float64`. - - units: `str` or `Units` - The units for the new data array. - - calendar: `str`, optional - The calendar for reference time units. - - {{chunks: `int`, `tuple`, `dict` or `str`, optional}} - - .. versionadded:: 3.14.0 - - :Returns: - - `Data` - Array of ones with the given shape and data type. - - **Examples** - - >>> d = cf.Data.ones((2, 3)) - >>> print(d.array) - [[1. 1. 1.] - [1. 1. 1.]] - - >>> d = cf.Data.ones((2,), dtype=bool) - >>> print(d.array) - [ True True] - - """ - dx = da.ones(shape, dtype=dtype, chunks=chunks) - return cls(dx, units=units, calendar=calendar) - - @classmethod - def zeros( - cls, - shape, - dtype=None, - units=None, - calendar=None, - chunks=_DEFAULT_CHUNKS, - ): - """Returns a new array filled with zeros of set shape and type. - - .. seealso:: `empty`, `full`, `ones` - - :Parameters: - - shape: `int` or `tuple` of `int` - The shape of the new array. - - dtype: data-type - The data-type of the new array. By default the - data-type is ``float``. - - units: `str` or `Units` - The units for the new data array. - - calendar: `str`, optional - The calendar for reference time units. - - {{chunks: `int`, `tuple`, `dict` or `str`, optional}} - - .. versionadded:: 3.14.0 - - :Returns: - - `Data` - Array of zeros with the given shape and data type. - - **Examples** - - >>> d = cf.Data.zeros((2, 3)) - >>> print(d.array) - [[0. 0. 0.] - [0. 0. 
0.]] - - >>> d = cf.Data.zeros((2,), dtype=bool) - >>> print(d.array) - [False False] - - """ - dx = da.zeros(shape, dtype=dtype, chunks=chunks) - return cls(dx, units=units, calendar=calendar) - @_deprecated_kwarg_check("out", version="3.14.0", removed_at="5.0.0") @_deprecated_kwarg_check("i", version="3.0.0", removed_at="4.0.0") @_inplace_enabled(default=False) @@ -13386,11 +8909,6 @@ def sqrt(self, dtype=None, inplace=False): # ---------------------------------------------------------------- # Aliases # ---------------------------------------------------------------- - @property - def dtarray(self): - """Alias for `datetime_array`""" - return self.datetime_array - @_inplace_enabled(default=False) @_deprecated_kwarg_check("i", version="3.0.0", removed_at="4.0.0") def maximum( diff --git a/cf/data/fragment/netcdffragmentarray.py b/cf/data/fragment/netcdffragmentarray.py index ee34501e94..4719741904 100644 --- a/cf/data/fragment/netcdffragmentarray.py +++ b/cf/data/fragment/netcdffragmentarray.py @@ -1,7 +1,7 @@ import cfdm from ..array.abstract import Array -from ..array.mixin import FileArrayMixin, IndexMixin +from ..array.mixin import FileArrayMixin from .h5netcdffragmentarray import H5netcdfFragmentArray from .mixin import FragmentArrayMixin from .netcdf4fragmentarray import NetCDF4FragmentArray @@ -9,9 +9,9 @@ class NetCDFFragmentArray( FragmentArrayMixin, - IndexMixin, cfdm.data.mixin.NetCDFFileMixin, FileArrayMixin, + cfdm.data.mixin.IndexMixin, cfdm.data.mixin.FileArrayMixin, Array, ): diff --git a/cf/data/utils.py b/cf/data/utils.py index c7982fc857..c1b1a63920 100644 --- a/cf/data/utils.py +++ b/cf/data/utils.py @@ -1,323 +1,17 @@ """General functions useful for `Data` functionality.""" -from functools import lru_cache, partial, reduce -from itertools import product +from functools import partial, reduce from operator import mul -import dask.array as da import numpy as np -from ..cfdatetime import ( - canonical_calendar, - default_calendar, - dt, - dt2rt, - rt2dt, - st2rt, -) +from ..cfdatetime import canonical_calendar, default_calendar from ..units import Units from .dask_utils import cf_YMDhms _units_None = Units(None) -def is_numeric_dtype(array): - """True if the given array is of a numeric or boolean data type. - - .. versionadded:: 3.14.0 - - :Parameters: - - array: numpy-like array - - :Returns: - - `bool` - Whether or not the array holds numeric elements. - - **Examples** - - >>> a = np.array([0, 1, 2]) - >>> cf.data.utils.is_numeric_dtype(a) - True - >>> a = np.array([False, True, True]) - >>> cf.data.utils.is_numeric_dtype(a) - True - >>> a = np.array(["a", "b", "c"], dtype="S1") - >>> cf.data.utils.is_numeric_dtype(a) - False - >>> a = np.ma.array([10.0, 2.0, 3.0], mask=[1, 0, 0]) - >>> cf.data.utils.is_numeric_dtype(a) - True - >>> a = np.array(10) - >>> cf.data.utils.is_numeric_dtype(a) - True - >>> a = np.empty(1, dtype=object) - >>> cf.data.utils.is_numeric_dtype(a) - False - - """ - dtype = array.dtype - - # This checks if the dtype is either a standard "numeric" type (i.e. - # int types, floating point types or complex floating point types) - # or Boolean, which are effectively a restricted int type (0 or 1). 
- # We determine the former by seeing if it sits under the 'np.number' - # top-level dtype in the NumPy dtype hierarchy; see the - # 'Hierarchy of type objects' figure diagram under: - # https://numpy.org/doc/stable/reference/arrays.scalars.html#scalars - return np.issubdtype(dtype, np.number) or np.issubdtype(dtype, np.bool_) - - -def convert_to_datetime(a, units): - """Convert a dask array of numbers to one of date-time objects. - - .. versionadded:: 3.14.0 - - .. seealso `convert_to_reftime` - - :Parameters: - - a: `dask.array.Array` - The input numeric reference time values. - - units: `Units` - The reference time units that define the output - date-time objects. - - :Returns: - - `dask.array.Array` - A new dask array containing date-time objects. - - **Examples** - - >>> import dask.array as da - >>> d = da.from_array(2.5) - >>> e = cf.data.utils.convert_to_datetime(d, cf.Units("days since 2000-12-01")) - >>> print(e.compute()) - 2000-12-03 12:00:00 - - """ - return a.map_blocks( - partial(rt2dt, units_in=units), - dtype=object, - meta=np.array((), dtype=object), - ) - - -def convert_to_reftime(a, units=None, first_value=None): - """Convert a dask array of string or object date-times to floating - point reference times. - - .. versionadded:: 3.14.0 - - .. seealso `convert_to_datetime` - - :Parameters: - - a: `dask.array.Array` - - units: `Units`, optional - Specify the units for the output reference time - values. By default the units are inferred from the first - non-missing value in the array, or set to ```` if all values are missing. - - first_value: optional - If set, then assumed to be equal to the first non-missing - value of the array, thereby removing the need to find it - by inspection of *a*, which may be expensive. By default - the first non-missing value is found from *a*. - - :Returns: - - (`dask.array.Array`, `Units`) - The reference times, and their units. 
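With the private `convert_to_datetime` helper gone, the equivalent result is still available through the public API; a sketch reproducing the removed example:

>>> import cf
>>> d = cf.Data(2.5, units="days since 2000-12-01")
>>> print(d.datetime_array)
2000-12-03 12:00:00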
- - >>> import dask.array as da - >>> d = da.from_array(2.5) - >>> e = cf.data.utils.convert_to_datetime(d, cf.Units("days since 2000-12-01")) - - >>> f, u = cf.data.utils.convert_to_reftime(e) - >>> f.compute() - 0.5 - >>> u - - - >>> f, u = cf.data.utils.convert_to_reftime(e, cf.Units("days since 1999-12-01")) - >>> f.compute() - 368.5 - >>> u - - - """ - kind = a.dtype.kind - if kind in "US": - # Convert date-time strings to reference time floats - if not units: - first_value = first_non_missing_value(a, cached=first_value) - if first_value is not None: - YMD = str(first_value).partition("T")[0] - else: - YMD = "1970-01-01" - - units = Units( - "days since " + YMD, - getattr(units, "calendar", default_calendar), - ) - - a = a.map_blocks( - partial(st2rt, units_in=units, units_out=units), dtype=float - ) - - elif kind == "O": - # Convert date-time objects to reference time floats - first_value = first_non_missing_value(a, cached=first_value) - if first_value is not None: - x = first_value - else: - x = dt(1970, 1, 1, calendar=default_calendar) - - x_since = "days since " + "-".join(map(str, (x.year, x.month, x.day))) - x_calendar = getattr(x, "calendar", default_calendar) - - d_calendar = getattr(units, "calendar", None) - d_units = getattr(units, "units", None) - - if x_calendar != "": - if not units: - d_calendar = x_calendar - elif not units.equivalent(Units(x_since, x_calendar)): - raise ValueError( - "Incompatible units: " - f"{units!r}, {Units(x_since, x_calendar)!r}" - ) - - if not units: - # Set the units to something that is (hopefully) close to - # all of the datetimes, in an attempt to reduce errors - # arising from the conversion to reference times - units = Units(x_since, calendar=d_calendar) - else: - units = Units(d_units, calendar=d_calendar) - - # Convert the date-time objects to reference times - a = a.map_blocks(dt2rt, units_in=None, units_out=units, dtype=float) - - if not units.isreftime: - raise ValueError( - f"Can't create a reference time array with units {units!r}" - ) - - return a, units - - -def first_non_missing_value(a, cached=None, method="index"): - """Return the first non-missing value of a dask array. - - .. versionadded:: 3.14.0 - - :Parameters: - - a: `dask.array.Array` - The array to be inspected. - - cached: scalar, optional - If set to a value other than `None`, then return without - inspecting the array. This allows a previously found first - value to be used instead of a potentially costly array - access. - - method: `str`, optional - Select the method used to find the first non-missing - value. - - The default ``'index'`` method evaulates sequentially the - elements of the flattened array and returns when the first - non-missing value is found. - - The ``'mask'`` method finds the first non-missing value of - the flattened array as that which has the same location as - the first False element of the flattened array mask. - - It is considered likely that the ``'index'`` method is - fastest for data for which the first element is not - missing, but this may not always be the case. - - :Returns: - - If set, then *cached* is returned. Otherwise returns the - first non-missing value of *a*, or `None` if there isn't - one. 
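The ``'mask'`` method described above can be reproduced directly with dask primitives: the first non-missing value lives at the position of the first False element of the flattened mask. A standalone sketch of that lookup:

>>> import dask.array as da
>>> import numpy as np
>>> a = da.ma.masked_array(np.arange(6), mask=[1, 1, 0, 0, 0, 0])
>>> mask = da.ma.getmaskarray(a)
>>> i = np.unravel_index(int(mask.argmin().compute()), a.shape)
>>> a[i].compute()  # first element whose mask is False
2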
- - **Examples** - - >>> import dask.array as da - >>> d = da.arange(8).reshape(2, 4) - >>> print(d.compute()) - [[0 1 2 3] - [4 5 6 7]] - >>> cf.data.utils.first_non_missing_value(d) - 0 - >>> cf.data.utils.first_non_missing_value(d, cached=99) - 99 - >>> d[0, 0] = cf.masked - >>> cf.data.utils.first_non_missing_value(d) - 1 - >>> d[0, :] = cf.masked - >>> cf.data.utils.first_non_missing_value(d) - 4 - >>> cf.data.utils.first_non_missing_value(d, cached=99) - 99 - >>> d[...] = cf.masked - >>> print(cf.data.utils.first_non_missing_value(d)) - None - >>> print(cf.data.utils.first_non_missing_value(d, cached=99)) - 99 - - """ - if cached is not None: - return cached - - if method == "index": - shape = a.shape - for i in range(a.size): - index = np.unravel_index(i, shape) - x = a[index].compute() - if not (x is np.ma.masked or np.ma.getmask(x)): - try: - return x.item() - except AttributeError: - return x - - return - - if method == "mask": - mask = da.ma.getmaskarray(a) - if not a.ndim: - # Scalar data - if mask: - return - - a = a.compute() - try: - return a.item() - except AttributeError: - return a - - x = a[da.unravel_index(mask.argmin(), a.shape)].compute() - if x is np.ma.masked: - return - - try: - return x.item() - except AttributeError: - return x - - raise ValueError(f"Unknown value of 'method': {method!r}") - - def unique_calendars(a): """Find the unique calendars from a dask array of date-time objects. @@ -360,162 +54,6 @@ def _get_calendar(x): return set(out) -@lru_cache(maxsize=32) -def new_axis_identifier(existing_axes=(), basename="dim"): - """Return a new, unique axis identifier. - - The name is arbitrary and has no semantic meaning. - - .. versionadded:: 3.14.0 - - :Parameters: - - existing_axes: sequence of `str`, optional - Any existing axis names that are not to be duplicated. - - basename: `str`, optional - The root of the new axis identifier. The new axis - identifier will be this root followed by an integer. - - :Returns: - - `str` - The new axis idenfifier. - - **Examples** - - >>> cf.data.utils.new_axis_identifier() - 'dim0' - >>> cf.data.utils.new_axis_identifier(['dim0']) - 'dim1' - >>> cf.data.utils.new_axis_identifier(['dim3']) - 'dim1' - >>> cf.data.utils.new_axis_identifier(['dim1']) - 'dim2' - >>> cf.data.utils.new_axis_identifier(['dim1', 'dim0']) - 'dim2' - >>> cf.data.utils.new_axis_identifier(['dim3', 'dim4']) - 'dim2' - >>> cf.data.utils.new_axis_identifier(['dim2', 'dim0']) - 'dim3' - >>> cf.data.utils.new_axis_identifier(['dim3', 'dim4', 'dim0']) - 'dim5' - >>> cf.data.utils.new_axis_identifier(basename='axis') - 'axis0' - >>> cf.data.utils.new_axis_identifier(basename='axis') - 'axis0' - >>> cf.data.utils.new_axis_identifier(['dim0'], basename='axis') - 'axis1' - >>> cf.data.utils.new_axis_identifier(['dim0', 'dim1'], basename='axis') - 'axis2' - - """ - n = len(existing_axes) - axis = f"{basename}{n}" - while axis in existing_axes: - n += 1 - axis = f"{basename}{n}" - - return axis - - -def chunk_positions(chunks): - """Find the position of each chunk. - - .. versionadded:: 3.14.0 - - .. seealso:: `chunk_indices`, `chunk_locations`, `chunk_shapes` - - :Parameters: - - chunks: `tuple` - The chunk sizes along each dimension, as output by - `dask.array.Array.chunks`. - - **Examples** - - >>> chunks = ((1, 2), (9,), (44, 55, 66)) - >>> for position in cf.data.utils.chunk_positions(chunks): - ... print(position) - ... 
- (0, 0, 0) - (0, 0, 1) - (0, 0, 2) - (1, 0, 0) - (1, 0, 1) - (1, 0, 2) - - """ - return product(*(range(len(bds)) for bds in chunks)) - - -def chunk_shapes(chunks): - """Find the shape of each chunk. - - .. versionadded:: 3.14.0 - - .. seealso:: `chunk_indices`, `chunk_locations`, `chunk_positions` - - :Parameters: - - chunks: `tuple` - The chunk sizes along each dimension, as output by - `dask.array.Array.chunks`. - - **Examples** - - >>> chunks = ((1, 2), (9,), (4, 5, 6)) - >>> for shape in cf.data.utils.chunk_shapes(chunks): - ... print(shape) - ... - (1, 9, 4) - (1, 9, 5) - (1, 9, 6) - (2, 9, 4) - (2, 9, 5) - (2, 9, 6) - - """ - return product(*chunks) - - -def chunk_locations(chunks): - """Find the shape of each chunk. - - .. versionadded:: 3.15.0 - - .. seealso:: `chunk_indices`, `chunk_positions`, `chunk_shapes` - - :Parameters: - - chunks: `tuple` - The chunk sizes along each dimension, as output by - `dask.array.Array.chunks`. - - **Examples** - - >>> chunks = ((1, 2), (9,), (4, 5, 6)) - >>> for location in cf.data.utils.chunk_locations(chunks): - ... print(location) - ... - ((0, 1), (0, 9), (0, 4)) - ((0, 1), (0, 9), (4, 9)) - ((0, 1), (0, 9), (9, 15)) - ((1, 3), (0, 9), (0, 4)) - ((1, 3), (0, 9), (4, 9)) - ((1, 3), (0, 9), (9, 15)) - - """ - from dask.utils import cached_cumsum - - cumdims = [cached_cumsum(bds, initial_zero=True) for bds in chunks] - locations = [ - [(s, s + dim) for s, dim in zip(starts, shapes)] - for starts, shapes in zip(cumdims, chunks) - ] - return product(*locations) - - def scalar_masked_array(dtype=float): """Return a scalar masked array. @@ -879,10 +417,10 @@ def collapse( kwargs["ddof"] = ddof # The applicable chunk function will have its own call to - # 'cf_asanyarray', so we can set '_asanyarray=False'. Also, setting - # _asanyarray=False will ensure that any active storage operations - # are not compromised. - dx = d.to_dask_array(_asanyarray=False) + # 'cfdm_to_memory', so we can set '_force_to_memory=False'. Also, + # setting _force_to_memory=False will ensure that any active + # storage operations are not compromised. + dx = d.to_dask_array(_force_to_memory=False) dx = func(dx, **kwargs) d._set_dask(dx) @@ -1011,52 +549,3 @@ def parse_weights(d, weights, axis=None): # Return the product of the weights components, which will be # broadcastable to d return reduce(mul, w) - - -def normalize_chunks(chunks, shape=None, dtype=None): - """Normalize chunks to tuple of tuples. - - The shape may contain sizes of ``nan``. This could occur when the - underlying data is compressed in a way which makes the shape - impossible to infer without actually uncompressing the data. - - If *shape* contains no ``nan`` sizes then this function is - identical to `dask.array.core.normalize_chunks`. If it does, then - the output chunks for each such axis will be ``(nan,)``. - - .. versionadded 3.16.0 - - :Parameters: - - chunks: tuple, int, dict, or string - The chunks to be normalized. See - `dask.array.core.normalize_chunks` for details. - - shape: `tuple` - The shape of the data. - - dtype: data-type - The data-type for the data. - - :Returns: - - `tuple` - The normalized chunks. 
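The three removed chunk-geometry helpers all derive from the same `chunks` tuple; a standalone sketch of their relationship, using the `dask.utils.cached_cumsum` call that the removed `chunk_locations` itself used:

from itertools import product

from dask.utils import cached_cumsum

chunks = ((1, 2), (9,), (4, 5, 6))
positions = product(*(range(len(bds)) for bds in chunks))  # chunk_positions
shapes = product(*chunks)                                  # chunk_shapes
cumdims = [cached_cumsum(bds, initial_zero=True) for bds in chunks]
locations = product(*[
    [(s, s + n) for s, n in zip(starts, bds)]
    for starts, bds in zip(cumdims, chunks)
])                                                         # chunk_locations
print(next(positions), next(shapes), next(locations))
# (0, 0, 0) (1, 9, 4) ((0, 1), (0, 9), (0, 4))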
- - """ - from math import isnan, nan - - from dask.array.core import normalize_chunks - - if not any(map(isnan, shape)): - return normalize_chunks(chunks, shape=shape, dtype=dtype) - - out = [ - ( - (nan,) - if isnan(size) - else normalize_chunks(chunk, shape=(size,), dtype=dtype)[0] - ) - for chunk, size in zip(chunks, shape) - ] - return tuple(out) diff --git a/cf/docstring/docstring.py b/cf/docstring/docstring.py index c24ec0d436..f3a12ff306 100644 --- a/cf/docstring/docstring.py +++ b/cf/docstring/docstring.py @@ -179,42 +179,6 @@ value given by the *radius* parameter is used instead. A value of ``'earth'`` is equivalent to a default value of 6371229 metres.""", - # chunks - "{{chunks: `int`, `tuple`, `dict` or `str`, optional}}": """chunks: `int`, `tuple`, `dict` or `str`, optional - Specify the chunking of the underlying dask array. - - Any value accepted by the *chunks* parameter of the - `dask.array.from_array` function is allowed. - - By default, ``"auto"`` is used to specify the array - chunking, which uses a chunk size in bytes defined by - the `cf.chunksize` function, preferring square-like - chunk shapes. - - *Parameter example:* - A blocksize like ``1000``. - - *Parameter example:* - A blockshape like ``(1000, 1000)``. - - *Parameter example:* - Explicit sizes of all blocks along all dimensions - like ``((1000, 1000, 500), (400, 400))``. - - *Parameter example:* - A size in bytes, like ``"100MiB"`` which will choose - a uniform block-like shape, preferring square-like - chunk shapes. - - *Parameter example:* - A blocksize of ``-1`` or `None` in a tuple or - dictionary indicates the size of the corresponding - dimension. - - *Parameter example:* - Blocksizes of some or all dimensions mapped to - dimension positions, like ``{1: 200}``, or ``{0: -1, - 1: (400, 400)}``.""", # Returns formula "{{Returns formula}}": """5-`tuple` * The standard name of the parametric coordinates. @@ -233,28 +197,6 @@ domain axis. If the vertical axis does not appear in the computed non-parametric coordinates then this an empty tuple.""", - # collapse axes - "{{collapse axes: (sequence of) `int`, optional}}": """axes: (sequence of) `int`, optional - The axes to be collapsed. By default all axes are - collapsed, resulting in output with size 1. Each axis - is identified by its integer position. If *axes* is an - empty sequence then the collapse is applied to each - scalar element and the result has the same shape as - the input data.""", - # collapse squeeze - "{{collapse squeeze: `bool`, optional}}": """squeeze: `bool`, optional - By default, the axes which are collapsed are left in - the result as dimensions with size one, so that the - result will broadcast correctly against the input - array. If set to True then collapsed axes are removed - from the data.""", - # collapse keepdims - "{{collapse keepdims: `bool`, optional}}": """keepdims: `bool`, optional - By default, the axes which are collapsed are left in - the result as dimensions with size one, so that the - result will broadcast correctly against the input - array. If set to False then collapsed axes are removed - from the data.""", # weights "{{weights: data_like, `dict`, or `None`, optional}}": """weights: data_like, `dict`, or `None`, optional Weights associated with values of the data. By default @@ -304,26 +246,6 @@ non-missing elements. A value of 1 applies Bessel's correction. 
If the calculation is weighted then *ddof* can only be 0 or 1.""", - # split_every - "{{split_every: `int` or `dict`, optional}}": """split_every: `int` or `dict`, optional - Determines the depth of the recursive aggregation. If - set to or more than the number of input chunks, the - aggregation will be performed in two steps, one - partial collapse per input chunk and a single - aggregation at the end. If set to less than that, an - intermediate aggregation step will be used, so that - any of the intermediate or final aggregation steps - operates on no more than ``split_every`` inputs. The - depth of the aggregation graph will be - :math:`log_{split\_every}}(\textnormal{input chunks - along reduced axes})`. Setting to a low value can reduce - cache size and network transfers, at the cost of more - CPU and a larger dask graph. - - By default, `dask` heuristically decides on a good - value. A default can also be set globally with the - ``split_every`` key in `dask.config`. See - `dask.array.reduction` for details.""", # active_storage "{{active_storage: `bool`, optional}}": """{{active_storage: `bool`, optional}} If True then attempt to perform the collapse using diff --git a/cf/field.py b/cf/field.py index 5325b41241..ec65d5be61 100644 --- a/cf/field.py +++ b/cf/field.py @@ -1002,7 +1002,7 @@ def _binary_operation(self, other, method): for axis in f.domain_axes(todict=True): identity = None - if self.is_discrete_axis(axis): + if f.is_discrete_axis(axis): # This is a discrete axis whose identity is # inferred from all of its auxiliary coordinates x = {} @@ -1320,7 +1320,9 @@ def _binary_operation(self, other, method): # ------------------------------------------------------------ # Operate on the data # ------------------------------------------------------------ - new_data = field0.data._binary_operation(field1.data, method) + new_data = field0.data._binary_operation( + field0.data, field1.data, method + ) field0.set_data(new_data, set_axes=False, copy=False) @@ -4087,7 +4089,7 @@ def weights( # ------------------------------------------------------------ # Still here? Return a weights field which is the outer - # product of the component weights + # product of the component weights. # ------------------------------------------------------------ pp = sorted(comp.items()) waxes, wdata = pp.pop(0) diff --git a/cf/functions.py b/cf/functions.py index 22820bc3db..bdfe306f00 100644 --- a/cf/functions.py +++ b/cf/functions.py @@ -11,7 +11,6 @@ from collections.abc import Iterable from itertools import product from math import isnan -from numbers import Integral from os import mkdir from os.path import abspath as _os_path_abspath from os.path import dirname as _os_path_dirname @@ -24,9 +23,7 @@ import cfdm import netCDF4 import numpy as np -from dask import config as _config from dask.base import is_dask_collection -from dask.utils import parse_bytes from psutil import virtual_memory from . import __cfa_version__, __file__, __version__ @@ -548,7 +545,7 @@ def FREE_MEMORY(): # We can inherit the generic logic for the cf-python log_level() # function as contained in _log_level, but can't inherit the # user-facing log_level() from cfdm as it operates on cfdm's CONSTANTS -# dict. Define cf-python's own. This also means the log_level +# dict. Define cf-python's own. This also means the log_level # dostrings are independent which is important for providing # module-specific documentation links and directives, etc. 
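The "outer product of the component weights" mentioned in the `weights` comment above is the plain `Data.outerproduct` operation; a sketch with two hypothetical 1-d weights components (values and units both multiply, assuming standard outer-product behaviour):

>>> import cf
>>> wy = cf.Data([1.0, 2.0], 'm')
>>> wx = cf.Data([3.0, 4.0, 5.0], 's')
>>> w = wy.outerproduct(wx)  # shape (2, 3), units m.s
>>> print(w.array)
[[ 3.  4.  5.]
 [ 6.  8. 10.]]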
_reset_log_emergence_level = cfdm._reset_log_emergence_level @@ -573,6 +570,10 @@ class atol(ConstantAccess, cfdm.atol): pass +class chunksize(ConstantAccess, cfdm.chunksize): + pass + + class rtol(ConstantAccess, cfdm.rtol): pass @@ -778,74 +779,6 @@ def _parse(cls, arg): return bool(arg) -class chunksize(ConstantAccess): - """Set the default chunksize used by `dask` arrays. - - If called without any arguments then the existing chunksize is - returned. - - .. note:: Setting the chunk size will also change the `dask` - global configuration value ``'array.chunk-size'``. If - `chunksize` is used in a context manager then the `dask` - configuration value is only altered within that context. - Setting the chunk size directly from the `dask` - configuration API will affect subsequent data creation, - but will *not* change the value of `chunksize`. - - :Parameters: - - arg: number or `str` or `Constant`, optional - The chunksize in bytes. Any size accepted by - `dask.utils.parse_bytes` is accepted, for instance - ``100``, ``'100'``, ``'1e6'``, ``'100 MB'``, ``'100M'``, - ``'5kB'``, ``'5.4 kB'``, ``'1kiB'``, ``'1e6 kB'``, and - ``'MB'`` are all valid sizes. - - Note that if *arg* is a `float`, or a string that implies - a non-integral amount of bytes, then the integer part - (rounded down) will be used. - - *Parameter example:* - A chunksize of 2 MiB may be specified as ``'2097152'`` - or ``'2 MiB'`` - - *Parameter example:* - Chunksizes of ``'2678.9'`` and ``'2.6789 KB'`` are both - equivalent to ``2678``. - - :Returns: - - `Constant` - The value prior to the change, or the current value if no - new value was specified. - - """ - - _name = "CHUNKSIZE" - - def _parse(cls, arg): - """Parse a new constant value. - - .. versionaddedd:: 3.8.0 - - :Parameters: - - cls: - This class. - - arg: - The given new constant value. - - :Returns: - - A version of the new constant value suitable for insertion - into the `CONSTANTS` dictionary. - - """ - _config.set({"array.chunk-size": arg}) - return parse_bytes(arg) - - class tempdir(ConstantAccess): """The directory for internally generated temporary files. @@ -2191,6 +2124,10 @@ def parse_indices(shape, indices, cyclic=False, keepdims=True): indices: `tuple` The indices to be applied. + cyclic: `bool`, optional + If True then allow cyclic slices (such as ``slice(-4, 3, + 1)``). + keepdims: `bool`, optional If True then an integral index is converted to a slice. For instance, ``3`` would become ``slice(3, 4)``. 
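A cyclic slice such as ``slice(-4, 3, 1)`` decomposes into an axis roll plus an ordinary slice, which is exactly what the cyclic branch of `parse_indices` computes (``roll[i] = -start`` and ``slice(0, stop - start)`` for the positive-step case); a worked sketch:

>>> import numpy as np
>>> x = np.arange(10)
>>> # cyclic x[-4:3] == roll by 4, then take slice(0, 7)
>>> print(np.roll(x, 4)[slice(0, 7)])
[6 7 8 9 0 1 2]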
@@ -2219,108 +2156,55 @@ def parse_indices(shape, indices, cyclic=False, keepdims=True): >>> cf.parse_indices((5, 8), (cf.Data([1, 3]),)) [dask.array, slice(None, None, None)] - """ - parsed_indices = [] - roll = {} - - if not isinstance(indices, tuple): - indices = (indices,) - - # Initialise the list of parsed indices as the input indices with any - # Ellipsis objects expanded - length = len(indices) - n = len(shape) - ndim = n - for index in indices: - if index is Ellipsis: - m = n - length + 1 - parsed_indices.extend([slice(None)] * m) - n -= m - else: - parsed_indices.append(index) - n -= 1 - - length -= 1 - - len_parsed_indices = len(parsed_indices) - - if ndim and len_parsed_indices > ndim: - raise IndexError( - f"Invalid indices {parsed_indices} for array with shape {shape}" - ) + parsed_indices = cfdm.parse_indices(shape, indices, keepdims=keepdims) - if len_parsed_indices < ndim: - parsed_indices.extend([slice(None)] * (ndim - len_parsed_indices)) - - if not ndim and parsed_indices: - raise IndexError( - "Scalar array can only be indexed with () or Ellipsis" - ) + if not cyclic: + return parsed_indices + roll = {} for i, (index, size) in enumerate(zip(parsed_indices, shape)): - if cyclic and isinstance(index, slice): - # Check for a cyclic slice - try: - index = normalize_slice(index, size, cyclic=True) - except IndexError: - # Non-cyclic slice - pass - else: - # Cyclic slice - start = index.start - stop = index.stop - step = index.step - if ( - step > 0 - and -size <= start < 0 - and 0 <= stop <= size + start - ): - # x = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9] - # x[ -1:0:1] => [9] - # x[ -1:1:1] => [9, 0] - # x[ -1:3:1] => [9, 0, 1, 2] - # x[ -1:9:1] => [9, 0, 1, 2, 3, 4, 5, 6, 7, 8] - # x[ -4:0:1] => [6, 7, 8, 9] - # x[ -4:1:1] => [6, 7, 8, 9, 0] - # x[ -4:3:1] => [6, 7, 8, 9, 0, 1, 2] - # x[ -4:6:1] => [6, 7, 8, 9, 0, 1, 2, 3, 4, 5] - # x[ -9:0:1] => [1, 2, 3, 4, 5, 6, 7, 8, 9] - # x[ -9:1:1] => [1, 2, 3, 4, 5, 6, 7, 8, 9, 0] - # x[-10:0:1] => [0, 1, 2, 3, 4, 5, 6, 7, 8, 9] - index = slice(0, stop - start, step) - roll[i] = -start - - elif ( - step < 0 and 0 <= start < size and start - size <= stop < 0 - ): - # x = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9] - # x[0: -4:-1] => [0, 9, 8, 7] - # x[6: -1:-1] => [6, 5, 4, 3, 2, 1, 0] - # x[6: -2:-1] => [6, 5, 4, 3, 2, 1, 0, 9] - # x[6: -4:-1] => [6, 5, 4, 3, 2, 1, 0, 9, 8, 7] - # x[0: -2:-1] => [0, 9] - # x[0:-10:-1] => [0, 9, 8, 7, 6, 5, 4, 3, 2, 1] - index = slice(start - stop - 1, None, step) - roll[i] = -1 - stop - - elif keepdims and isinstance(index, Integral): - # Convert an integral index to a slice - if index == -1: - index = slice(-1, None, None) - else: - index = slice(index, index + 1, 1) - - elif hasattr(index, "to_dask_array"): - to_dask_array = index.to_dask_array - if callable(to_dask_array): - # Replace index with its Dask array - index = to_dask_array() - - parsed_indices[i] = index + if not isinstance(index, slice): + continue - if not cyclic: - return parsed_indices + try: + index = normalize_slice(index, size, cyclic=True) + except IndexError: + # Non-cyclic slice + pass + else: + # Cyclic slice + start = index.start + stop = index.stop + step = index.step + if step > 0 and -size <= start < 0 and 0 <= stop <= size + start: + # x = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9] + # x[ -1:0:1] => [9] + # x[ -1:1:1] => [9, 0] + # x[ -1:3:1] => [9, 0, 1, 2] + # x[ -1:9:1] => [9, 0, 1, 2, 3, 4, 5, 6, 7, 8] + # x[ -4:0:1] => [6, 7, 8, 9] + # x[ -4:1:1] => [6, 7, 8, 9, 0] + # x[ -4:3:1] => [6, 7, 8, 9, 0, 1, 2] + # x[ -4:6:1] => [6, 7, 8, 9, 0, 1, 2, 
3, 4, 5] + # x[ -9:0:1] => [1, 2, 3, 4, 5, 6, 7, 8, 9] + # x[ -9:1:1] => [1, 2, 3, 4, 5, 6, 7, 8, 9, 0] + # x[-10:0:1] => [0, 1, 2, 3, 4, 5, 6, 7, 8, 9] + index = slice(0, stop - start, step) + roll[i] = -start + + elif step < 0 and 0 <= start < size and start - size <= stop < 0: + # x = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9] + # x[0: -4:-1] => [0, 9, 8, 7] + # x[6: -1:-1] => [6, 5, 4, 3, 2, 1, 0] + # x[6: -2:-1] => [6, 5, 4, 3, 2, 1, 0, 9] + # x[6: -4:-1] => [6, 5, 4, 3, 2, 1, 0, 9, 8, 7] + # x[0: -2:-1] => [0, 9] + # x[0:-10:-1] => [0, 9, 8, 7, 6, 5, 4, 3, 2, 1] + index = slice(start - stop - 1, None, step) + roll[i] = -1 - stop + + parsed_indices[i] = index return parsed_indices, roll diff --git a/cf/mixin/propertiesdata.py b/cf/mixin/propertiesdata.py index 269b9d79ef..6437ed3fc7 100644 --- a/cf/mixin/propertiesdata.py +++ b/cf/mixin/propertiesdata.py @@ -7,7 +7,6 @@ from ..cfdatetime import dt from ..data import Data -from ..data.data import _DEFAULT_CHUNKS from ..decorators import ( _deprecated_kwarg_check, _inplace_enabled, @@ -622,11 +621,11 @@ def _binary_operation(self, y, method): if not inplace: new = self.copy() # data=False) TODO - new_data = data._binary_operation(y, method) + new_data = data._binary_operation(data, y, method) new.set_data(new_data, copy=False) else: new = self - new.data._binary_operation(y, method) + new.data._binary_operation(new.data, y, method) if method in _relational_methods: # Booleans have no units @@ -5392,7 +5391,7 @@ def override_units(self, units, inplace=False, i=False): @_inplace_enabled(default=False) def rechunk( self, - chunks=_DEFAULT_CHUNKS, + chunks="auto", threshold=None, block_size_limit=None, balance=False, diff --git a/cf/mixin/propertiesdatabounds.py b/cf/mixin/propertiesdatabounds.py index a5581ca478..b369db4336 100644 --- a/cf/mixin/propertiesdatabounds.py +++ b/cf/mixin/propertiesdatabounds.py @@ -4,7 +4,6 @@ from cfdm import is_log_level_debug, is_log_level_info from ..data import Data -from ..data.data import _DEFAULT_CHUNKS from ..decorators import ( _deprecated_kwarg_check, _inplace_enabled, @@ -4052,7 +4051,7 @@ def persist(self, bounds=True, inplace=False): @_inplace_enabled(default=False) def rechunk( self, - chunks=_DEFAULT_CHUNKS, + chunks="auto", threshold=None, block_size_limit=None, balance=False, diff --git a/cf/mixin2/container.py b/cf/mixin2/container.py index 44397301f4..c5f0081462 100644 --- a/cf/mixin2/container.py +++ b/cf/mixin2/container.py @@ -6,6 +6,7 @@ """ from ..docstring import _docstring_substitution_definitions +from ..functions import atol, rtol class Container: @@ -54,3 +55,23 @@ def __docstring_package_depth__(self): """ return 0 + + @property + def _atol(self): + """Internal alias for `{{package}}.atol`. + + An alias is necessary to avoid a name clash with the keyword + argument of identical name (`atol`) in calling functions. + + """ + return atol().value + + @property + def _rtol(self): + """Internal alias for `{{package}}.rtol`. + + An alias is necessary to avoid a name clash with the keyword + argument of identical name (`rtol`) in calling functions. 
+ + """ + return rtol().value diff --git a/cf/read_write/netcdf/netcdfread.py b/cf/read_write/netcdf/netcdfread.py index 883cc7b5a2..4ed7e3462d 100644 --- a/cf/read_write/netcdf/netcdfread.py +++ b/cf/read_write/netcdf/netcdfread.py @@ -1,6 +1,4 @@ import cfdm -import netCDF4 -import numpy as np from packaging.version import Version @@ -209,22 +207,6 @@ def _create_data( if data.npartitions == 1: data._cfa_set_write(True) - if ( - not compression_index - and self.read_vars.get("cache") - and self.implementation.get_construct_type(construct) - != "field" - ): - # Only cache values from non-field data and - # non-compression-index data, on the assumptions that: - # - # a) Field data is, in general, so large that finding - # the cached values takes too long. - # - # b) Cached values are never really required for - # compression index data. - self._cache_data_elements(data, ncvar) - return data # ------------------------------------------------------------ @@ -314,99 +296,6 @@ def _is_cfa_variable(self, ncvar): and ncvar not in g["external_variables"] ) - def _create_Data( - self, - array, - ncvar, - units=None, - calendar=None, - ncdimensions=(), - **kwargs, - ): - """Create a Data object from a netCDF variable. - - .. versionadded:: 3.0.0 - - :Parameters: - - array: `Array` - The file array. - - ncvar: `str` - The netCDF variable containing the array. - - units: `str`, optional - The units of *array*. By default, or if `None`, it is - assumed that there are no units. - - calendar: `str`, optional - The calendar of *array*. By default, or if `None`, it is - assumed that there is no calendar. - - ncdimensions: sequence of `str`, optional - The netCDF dimensions spanned by the array. - - .. versionadded:: 3.14.0 - - kwargs: optional - Extra parameters to pass to the initialisation of the - returned `Data` object. - - :Returns: - - `Data` - - """ - if array.dtype is None: - # The array is based on a netCDF VLEN variable, and - # therefore has unknown data type. To find the correct - # data type (e.g. "=1) netCDF string type variable comes out - # as a numpy object array, so convert it to numpy - # string array. - array = array.astype("U", copy=False) - # NetCDF4 doesn't auto-mask VLEN variables - array = np.ma.where(array == "", np.ma.masked, array) - - # Parse dask chunks - chunks = self._parse_chunks(ncvar) - - data = super()._create_Data( - array, - ncvar, - units=units, - calendar=calendar, - chunks=chunks, - **kwargs, - ) - - return data - def _customise_read_vars(self): """Customise the read parameters. @@ -467,160 +356,6 @@ def _customise_read_vars(self): for term_ncvar in parsed_aggregated_data.values(): g["do_not_create_field"].add(term_ncvar) - def _cache_data_elements(self, data, ncvar): - """Cache selected element values. - - Updates *data* in-place to store its first, second, - penultimate, and last element values (as appropriate). - - These values are used by, amongst other things, - `cf.Data.equals`, `cf.aggregate` and for inspection. - - Doing this here is quite cheap because only the individual - elements are read from the already-open file, as opposed to - being retrieved from *data* (which would require a whole dask - chunk to be read to get each single value). - - However, empirical evidence shows that using netCDF4 to access - the first and last elements of a large array on disk - (e.g. shape (1, 75, 1207, 1442)) is slow (e.g. ~2 seconds) and - doesn't scale well with array size (i.e. it takes - disproportionally longer for larger arrays). 
Such arrays are - usually in field constructs, for which `cf.aggregate` does not - need to know any array values, so this method should be used - with caution, if at all, on field construct data. - - .. versionadded:: 3.14.0 - - :Parameters: - - data: `Data` - The data to be updated with its cached values. - - ncvar: `str` - The name of the netCDF variable that contains the - data. - - :Returns: - - `None` - - """ - - if data.data.get_compression_type(): - # Don't get cached elements from arrays compressed by - # convention, as they'll likely be wrong. - return - - g = self.read_vars - - # Get the netCDF4.Variable for the data - if g["has_groups"]: - group, name = self._netCDF4_group( - g["variable_grouped_dataset"][ncvar], ncvar - ) - variable = group.variables.get(name) - else: - variable = g["variables"].get(ncvar) - - # Get the required element values - size = data.size - ndim = data.ndim - - char = False - if variable.ndim == ndim + 1: - dtype = variable.dtype - if dtype is not str and dtype.kind in "SU": - # This variable is a netCDF classic style char array - # with a trailing dimension that needs to be collapsed - char = True - - if ndim == 1: - # Also cache the second element for 1-d data, on the - # assumption that they may well be dimension coordinate - # data. - if size == 1: - indices = (0, -1) - value = variable[...] - values = (value, value) - elif size == 2: - indices = (0, 1, -1) - value = variable[-1:] - values = (variable[:1], value, value) - else: - indices = (0, 1, -1) - values = (variable[:1], variable[1:2], variable[-1:]) - elif ndim == 2 and data.shape[-1] == 2: - # Assume that 2-d data with a last dimension of size 2 - # contains coordinate bounds, for which it is useful to - # cache the upper and lower bounds of the the first and - # last cells. - indices = (0, 1, -2, -1) - ndim1 = ndim - 1 - values = ( - variable[(slice(0, 1),) * ndim1 + (slice(0, 1),)], - variable[(slice(0, 1),) * ndim1 + (slice(1, 2),)], - ) - if data.size == 2: - values = values + values - else: - values += ( - variable[(slice(-1, None, 1),) * ndim1 + (slice(0, 1),)], - variable[(slice(-1, None, 1),) * ndim1 + (slice(1, 2),)], - ) - elif size == 1: - indices = (0, -1) - value = variable[...] - values = (value, value) - elif size == 3: - indices = (0, 1, -1) - if char: - values = variable[...].reshape(3, variable.shape[-1]) - else: - values = variable[...].flatten() - else: - indices = (0, -1) - values = ( - variable[(slice(0, 1),) * ndim], - variable[(slice(-1, None, 1),) * ndim], - ) - - # Create a dictionary of the element values - elements = {} - for index, value in zip(indices, values): - if char: - # Variable is a netCDF classic style char array, so - # collapse (by concatenation) the outermost (fastest - # varying) dimension. E.g. [['a','b','c']] becomes - # ['abc'] - if value.dtype.kind == "U": - value = value.astype("S") - - a = netCDF4.chartostring(value) - shape = a.shape - a = np.array([x.rstrip() for x in a.flat]) - a = np.reshape(a, shape) - value = np.ma.masked_where(a == "", a) - - if np.ma.is_masked(value): - value = np.ma.masked - else: - try: - value = value.item() - except (AttributeError, ValueError): - # AttributeError: A netCDF string type scalar - # variable comes out as Python str object, which - # has no 'item' method. - # - # ValueError: A size-0 array can't be converted to - # a Python scalar. 
- pass - - elements[index] = value - - # Store the elements in the data object - data._set_cached_elements(elements) - def _create_cfanetcdfarray( self, ncvar, @@ -671,7 +406,6 @@ def _create_cfanetcdfarray( # Get rid of the incorrect shape. This will end up getting set # correctly by the CFANetCDFArray instance. kwargs.pop("shape", None) - aggregated_data = g["cfa_aggregated_data"][ncvar] standardised_terms = ("location", "file", "address", "format") @@ -774,71 +508,6 @@ def _create_cfanetcdfarray_term( return array, kwargs - def _parse_chunks(self, ncvar): - """Parse the dask chunks. - - .. versionadded:: 3.14.0 - - :Parameters: - - ncvar: `str` - The name of the netCDF variable containing the array. - - :Returns: - - `str`, `int` or `dict` - The parsed chunks that are suitable for passing to a - `Data` object containing the variable's array. - - """ - g = self.read_vars - - default_chunks = "auto" - chunks = g.get("chunks", default_chunks) - - if chunks is None: - return -1 - - if isinstance(chunks, dict): - if not chunks: - return default_chunks - - # For ncdimensions = ('time', 'lat'): - # - # chunks={} -> ["auto", "auto"] - # chunks={'ncdim%time': 12} -> [12, "auto"] - # chunks={'ncdim%time': 12, 'ncdim%lat': 10000} -> [12, 10000] - # chunks={'ncdim%time': 12, 'ncdim%lat': "20MB"} -> [12, "20MB"] - # chunks={'ncdim%time': 12, 'latitude': -1} -> [12, -1] - # chunks={'ncdim%time': 12, 'Y': None} -> [12, None] - # chunks={'ncdim%time': 12, 'ncdim%lat': (30, 90)} -> [12, (30, 90)] - # chunks={'ncdim%time': 12, 'ncdim%lat': None, 'X': 5} -> [12, None] - attributes = g["variable_attributes"] - chunks2 = [] - for ncdim in g["variable_dimensions"][ncvar]: - key = f"ncdim%{ncdim}" - if key in chunks: - chunks2.append(chunks[key]) - continue - - found_coord_attr = False - dim_coord_attrs = attributes.get(ncdim) - if dim_coord_attrs is not None: - for attr in ("standard_name", "axis"): - key = dim_coord_attrs.get(attr) - if key in chunks: - found_coord_attr = True - chunks2.append(chunks[key]) - break - - if not found_coord_attr: - # Use default chunks for this dimension - chunks2.append(default_chunks) - - chunks = chunks2 - - return chunks - def _customise_field_ancillaries(self, parent_ncvar, f): """Create customised field ancillary constructs. diff --git a/cf/read_write/netcdf/netcdfwrite.py b/cf/read_write/netcdf/netcdfwrite.py index e2c8da0236..c8bc9e254e 100644 --- a/cf/read_write/netcdf/netcdfwrite.py +++ b/cf/read_write/netcdf/netcdfwrite.py @@ -3,8 +3,8 @@ import cfdm import dask.array as da import numpy as np +from cfdm.data.dask_utils import cfdm_to_memory -from ...data.dask_utils import cf_asanyarray from .netcdfread import NetCDFRead @@ -747,9 +747,9 @@ def _cfa_write_non_standard_terms( # more than one unique value then the fragment's value is # missing data. # - # '_cfa_unique' has its own call to 'cf_asanyarray', so - # we can set '_asanyarray=False'. - dx = data.to_dask_array(_asanyarray=False) + # '_cfa_unique' has its own call to 'cfdm_to_memory', so + # we can set '_force_to_memory=False'. + dx = data.to_dask_array(_force_to_memory=False) dx_ind = tuple(range(dx.ndim)) out_ind = dx_ind dx = da.blockwise( @@ -807,7 +807,7 @@ def _cfa_unique(cls, a): data if there is not a unique value. 
""" - a = cf_asanyarray(a) + a = cfdm_to_memory(a) out_shape = (1,) * a.ndim a = np.unique(a) @@ -963,7 +963,7 @@ def _cfa_aggregation_instructions(self, data, cfvar): # ------------------------------------------------------------ dtype = np.dtype(np.int32) if ( - max(data.to_dask_array(_asanyarray=False).chunksize) + max(data.to_dask_array(_force_to_memory=False).chunksize) > np.iinfo(dtype).max ): dtype = np.dtype(np.int64) diff --git a/cf/read_write/read.py b/cf/read_write/read.py index c6ad881db9..9b18944e87 100644 --- a/cf/read_write/read.py +++ b/cf/read_write/read.py @@ -2,7 +2,6 @@ import os import tempfile from glob import glob -from numbers import Integral from os.path import isdir from re import Pattern from urllib.parse import urlparse @@ -60,12 +59,14 @@ def read( mask=True, unpack=True, warn_valid=False, - chunks="auto", + dask_chunks="storage-aligned", + store_hdf5_chunks=True, domain=False, cfa=None, netcdf_backend=None, storage_options=None, cache=True, + chunks="auto", ): """Read field or domain constructs from files. @@ -557,80 +558,201 @@ def read( .. versionadded:: 1.5 - chunks: `str`, `int`, `None`, or `dict`, optional - Specify the `dask` chunking of dimensions for data in - the input files. - - By default, ``'auto'`` is used to specify the array - chunking, which uses a chunk size in bytes defined by - the `cf.chunksize` function, preferring square-like - chunk shapes across all data dimensions. - - If *chunks* is a `str` then each data array uses this - chunk size in bytes, preferring square-like chunk - shapes across all data dimensions. Any string value - accepted by the *chunks* parameter of the - `dask.array.from_array` function is permitted. - - *Parameter example:* - A chunksize of 2 MiB may be specified as - ``'2097152'`` or ``'2 MiB'``. - - If *chunks* is `-1` or `None` then for each there is no - chunking, i.e. every data array has one chunk - regardless of its size. - - If *chunks* is a positive `int` then each data array - dimension has chunks with this number of elements. - - If *chunks* is a `dict`, then each of its keys - identifies dimension in the file, with a value that - defines the chunking for that dimension whenever it is - spanned by data. - - Each dictionary key identifies a file dimension in one - of three ways: 1. the netCDF dimension name, preceded - by ``ncdim%`` (e.g. ``'ncdim%lat'``); 2. the "standard - name" attribute of a CF-netCDF coordinate variable - that spans the dimension (e.g. ``'latitude'``); or - 3. the "axis" attribute of a CF-netCDF coordinate - variable that spans the dimension (e.g. ``'Y'``). - - The dictionary values may be `str`, `int` or `None`, - with the same meanings as those types for the *chunks* - parameter but applying only to the specified - dimension. A `tuple` or `list` of integers that sum to - the dimension size may also be given. - - Not specifying a file dimension in the dictionary is - equivalent to it being defined with a value of - ``'auto'``. - - *Parameter example:* - ``{'T': '0.5 MiB', 'Y': [36, 37], 'X': None}`` - - *Parameter example:* - If a netCDF file contains dimensions ``time``, - ``z``, ``lat`` and ``lon``, then ``{'ncdim%time': - 12, 'ncdim%lat', None, 'ncdim%lon': None}`` will - ensure that all ``time`` axes have a chunksize of - 12; and all ``lat`` and ``lon`` axes are not - chunked; and all ``z`` axes are chunked to comply as - closely as possible with the default chunks size. 
- - If the netCDF also contains a ``time`` coordinate - variable with a ``standard_name`` attribute of - ``'time'`` and an ``axis`` attribute of - ``'T'``, - then the same chunking could be specified with - either ``{'time': 12, 'ncdim%lat', None, 'ncdim%lon': - None}`` or ``{'T': 12, 'ncdim%lat', None, - 'ncdim%lon': None}``. - - .. note:: The *chunks* parameter is ignored for PP and - UM fields files, for which the chunking is - pre-determined by the file format. - - .. versionadded:: 3.14.0 + dask_chunks: `str`, `int`, `None`, or `dict`, optional + Specify the Dask chunking for data. May be one of the + following: + + * ``'storage-aligned'`` + + This is the default. The Dask chunk size in bytes will + be as close as possible to the size given by + `cf.chunksize`, favouring square-like chunk shapes, + with the added restriction that the entirety of each + storage chunk must also lie within exactly one Dask + chunk. + + When reading the data from disk, an entire storage chunk + will be read once per Dask chunk that contains + any part of it, so ensuring that a storage chunk lies + within only one Dask chunk can increase performance by + reducing the amount of disk access (particularly when + the data are stored remotely to the client). + + For instance, consider a file variable that has an array + of 64-bit floats with shape (400, 300, 60) and a storage + chunk shape of (100, 5, 60), giving 240 storage chunks + each of size 100*5*60*8 bytes = 0.23 MiB. Then: + + * If `cf.chunksize` returned 134217728 (i.e. 128 MiB), + then the storage-aligned Dask chunks will have shape + (400, 300, 60), giving 1 Dask chunk with a size of 54.93 + MiB (compare with a Dask chunk shape of (400, 300, 60) + and size 54.93 MiB, if *dask_chunks* were ``'auto'``.) + + * If `cf.chunksize` returned 33554432 (i.e. 32 MiB), + then the storage-aligned Dask chunks will have shape + (200, 260, 60), giving 4 Dask chunks with a maximum + size of 23.80 MiB (compare with a Dask chunk shape of + (264, 264, 60) and maximum size 31.90 MiB, if + *dask_chunks* were ``'auto'``.) + + * If `cf.chunksize` returned 4194304 (i.e. 4 MiB), + then the storage-aligned Dask chunks will have shape + (100, 85, 60), giving 16 Dask chunks with a maximum + size of 3.89 MiB (compare with a Dask chunk shape of + (93, 93, 60) and maximum size 3.96 MiB, if + *dask_chunks* were ``'auto'``.) + + There are, however, some occasions when, for particular + data arrays in the file, the ``'auto'`` option will + automatically be used instead of storage-aligned Dask + chunks. This occurs when: + + * The data array in the file is stored contiguously. + + * The data array in the file is compressed by convention + (e.g. ragged array representations, compression by + gathering, subsampled coordinates, etc.). In this case + the Dask chunks are for the uncompressed data, and so + cannot be aligned with the storage chunks of the + compressed array in the file. + + * ``'storage-exact'`` + + Each Dask chunk will contain exactly one storage chunk + and each storage chunk will lie within exactly one Dask + chunk. + + For instance, consider a file variable that has an array + of 64-bit floats with shape (400, 300, 60) and a storage + chunk shape of (100, 5, 60) (i.e. there are 240 storage + chunks, each of size 0.23 MiB). Then the storage-exact + Dask chunks will also have shape (100, 5, 60), giving 240 + Dask chunks with a maximum size of 0.23 MiB.
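+ + *Example:* + A hypothetical usage sketch (``file.nc`` is an assumed + file name; the `~cf.Data.chunks` attribute is used to + inspect the result):: + + >>> f = cf.read('file.nc', dask_chunks='storage-exact')[0] + >>> f.data.chunks # one Dask chunk per storage chunk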
+ + There are, however, some occasions when, for particular + data arrays in the file, the ``'auto'`` option will + automatically be used instead of storage-exact Dask + chunks. This occurs when: + + * The data array in the file is stored contiguously. + + * The data array in the file is compressed by convention + (e.g. ragged array representations, compression by + gathering, subsampled coordinates, etc.). In this case + the Dask chunks are for the uncompressed data, and so + cannot be aligned with the storage chunks of the + compressed array in the file. + + * ``'auto'`` + + The Dask chunk size in bytes will be as close as + possible to the size given by `cf.chunksize`, + favouring square-like chunk shapes. This may give + Dask chunk shapes similar to those of the + ``'storage-aligned'`` option, but without the guarantee + that each storage chunk will lie within exactly one Dask + chunk. + + * A byte-size given by a `str` + + The Dask chunk size in bytes will be as close as + possible to the given byte-size, favouring square-like + chunk shapes. Any string value accepted by the *chunks* + parameter of the `dask.array.from_array` function is + permitted. + + *Example:* + A Dask chunksize of 2 MiB may be specified as + ``'2097152'`` or ``'2 MiB'``. + + * `-1` or `None` + + There is no Dask chunking, i.e. every data array has one + Dask chunk regardless of its size. + + * Positive `int` + + Every dimension of all Dask chunks has this number of + elements. + + *Example:* + For 3-dimensional data, *dask_chunks* of `10` will + give Dask chunks with shape (10, 10, 10). + + * `dict` + + Each dictionary key identifies a file dimension, with + a value that defines the Dask chunking for that + dimension whenever it is spanned by a data array. A file + dimension is identified in one of three ways: + + 1. the netCDF dimension name, preceded by ``ncdim%`` + (e.g. ``'ncdim%lat'``); + + 2. the value of the "standard_name" attribute of a + CF-netCDF coordinate variable that spans the + dimension (e.g. ``'latitude'``); + + 3. the value of the "axis" attribute of a CF-netCDF + coordinate variable that spans the dimension + (e.g. ``'Y'``). + + The dictionary values may be a byte-size string, + ``'auto'``, `int` or `None`, with the same meanings as + those types for the *dask_chunks* parameter itself, but + applying only to the specified dimension. In addition, a + dictionary value may be a `tuple` or `list` of integers + that sum to the dimension size. + + Not specifying a file dimension in the dictionary is + equivalent to it being defined with a value of + ``'auto'``. + + *Example:* + ``{'T': '0.5 MiB', 'Z': 'auto', 'Y': [36, 37], 'X': + None}`` + + *Example:* + If a netCDF file contains dimensions ``time``, ``z``, + ``lat`` and ``lon``, then ``{'ncdim%time': 12, + 'ncdim%lat': None, 'ncdim%lon': None}`` will ensure + that, for all applicable data arrays, all ``time`` + axes have a Dask chunksize of 12; all ``lat`` and + ``lon`` axes are not Dask chunked; and all ``z`` + axes are Dask chunked to comply as closely as + possible with the default Dask chunk size. + + If the netCDF file also contains a ``time`` coordinate + variable with a "standard_name" attribute of + ``'time'`` and an "axis" attribute of ``'T'``, then + the same Dask chunking could be specified with + either ``{'time': 12, 'ncdim%lat': None, 'ncdim%lon': + None}`` or ``{'T': 12, 'ncdim%lat': None, 'ncdim%lon': + None}``. + + .. 
versionadded:: NEXTVERSION + + store_hdf5_chunks: `bool`, optional + If True (the default) then store the HDF5 chunking + strategy for each returned data array. The HDF5 chunking + strategy is then accessible via an object's + `nc_hdf5_chunksizes` method. When the HDF5 chunking + strategy is stored, it will be used when the data is + written to a new netCDF4 file with `cf.write` (unless + the strategy was modified prior to writing). + + If False, or if the file being read is not in netCDF4 + format, then no HDF5 chunking strategy is stored + (i.e. an `nc_hdf5_chunksizes` method will return `None` + for all `Data` objects). In this case, when the data is + written to a new netCDF4 file, the HDF5 chunking strategy + will be determined by `cf.write`. + + See the `cf.write` *hdf5_chunks* parameter for details + on how the HDF5 chunking strategy is determined at the + time of writing. + + .. versionadded:: NEXTVERSION domain: `bool`, optional If True then return only the domain constructs that are @@ -767,7 +889,10 @@ Use methods on the returned `FieldList` instead. chunk: deprecated at version 3.14.0 - Use the *chunks* parameter instead. + Use the *dask_chunks* parameter instead. + + chunks: deprecated at version NEXTVERSION + Use the *dask_chunks* parameter instead. :Returns: @@ -848,7 +973,16 @@ _DEPRECATION_ERROR_FUNCTION_KWARGS( "cf.read", {"chunk": chunk}, - "Use keyword 'chunks' instead.", + "Use keyword 'dask_chunks' instead.", + version="3.14.0", + removed_at="5.0.0", + ) # pragma: no cover + + if chunks != "auto": + _DEPRECATION_ERROR_FUNCTION_KWARGS( + "cf.read", + {"chunks": chunks}, + "Use keyword 'dask_chunks' instead.", version="3.14.0", removed_at="5.0.0", ) # pragma: no cover @@ -857,13 +991,6 @@ if isinstance(select, (str, Query, Pattern)): select = (select,) - # Check chunks - if chunks is not None and not isinstance(chunks, (str, Integral, dict)): - raise ValueError( - "'chunks' parameter must be of type str, int, None or dict. 
" - f"Got: {chunks!r}" - ) - # Manage input parameters where contradictions are possible: if cdl_string and fmt: if fmt == "CDL": @@ -910,8 +1037,6 @@ def read( cfa_options["substitutions"] = substitutions - cache = bool(cache) - # Initialise the output list of fields/domains if domain: out = DomainList() @@ -1038,7 +1163,8 @@ def read( um=um, extra=extra, height_at_top_of_model=height_at_top_of_model, - chunks=chunks, + dask_chunks=dask_chunks, + store_hdf5_chunks=store_hdf5_chunks, mask=mask, unpack=unpack, warn_valid=warn_valid, @@ -1159,7 +1285,8 @@ def _read_a_file( mask=True, unpack=True, warn_valid=False, - chunks="auto", + dask_chunks="storage-aligned", + store_hdf5_chunks=True, select=None, domain=False, cfa_options=None, @@ -1249,11 +1376,9 @@ def _read_a_file( umversion = float(str(umversion).replace(".", "0", 1)) extra_read_vars = { - "chunks": chunks, "fmt": selected_fmt, "ignore_read_error": ignore_read_error, "cfa_options": cfa_options, - "cache": cache, } # ---------------------------------------------------------------- @@ -1296,6 +1421,9 @@ def _read_a_file( domain=domain, storage_options=storage_options, netcdf_backend=netcdf_backend, + dask_chunks=dask_chunks, + store_hdf5_chunks=store_hdf5_chunks, + cache=cache, ) except MaskError: # Some data required for field interpretation is missing, diff --git a/cf/regrid/regrid.py b/cf/regrid/regrid.py index 51132e9364..cbe53e616a 100644 --- a/cf/regrid/regrid.py +++ b/cf/regrid/regrid.py @@ -2470,10 +2470,10 @@ def create_esmpy_weights( # Write the weights to a netCDF file (copying the # dimension and variable names and structure of a weights # file created by ESMF). + from cfdm.data.locks import netcdf_lock from netCDF4 import Dataset from .. import __version__ - from ..data.array.locks import netcdf_lock if ( max(dst_esmpy_field.data.size, src_esmpy_field.data.size) @@ -2499,48 +2499,51 @@ def create_esmpy_weights( if src_grid.ln_z: regrid_method += f", ln {src_grid.method} in vertical" - netcdf_lock.acquire() - nc = Dataset(weights_file, "w", format="NETCDF4") + with netcdf_lock: + nc = Dataset(weights_file, "w", format="NETCDF4") - nc.title = ( - f"Regridding weights from source {src_grid.type} " - f"with shape {src_shape} to destination " - f"{dst_grid.type} with shape {dst_shape}" - ) - nc.source = f"cf v{__version__}, esmpy v{esmpy.__version__}" - nc.history = f"Created at {datetime.now()}" - nc.regrid_method = regrid_method - nc.ESMF_unmapped_action = r.unmapped_action - nc.ESMF_ignore_degenerate = int(r.ignore_degenerate) - - nc.createDimension("n_s", weights.size) - nc.createDimension("src_grid_rank", src_esmpy_grid.rank) - nc.createDimension("dst_grid_rank", dst_esmpy_grid.rank) - - v = nc.createVariable("src_grid_dims", i_dtype, ("src_grid_rank",)) - v.long_name = "Source grid shape" - v[...] = src_shape - - v = nc.createVariable("dst_grid_dims", i_dtype, ("dst_grid_rank",)) - v.long_name = "Destination grid shape" - v[...] = dst_shape - - v = nc.createVariable("S", weights.dtype, ("n_s",)) - v.long_name = "Weights values" - v[...] = weights - - v = nc.createVariable("row", i_dtype, ("n_s",), zlib=True) - v.long_name = "Destination/row indices" - v.start_index = start_index - v[...] = row - - v = nc.createVariable("col", i_dtype, ("n_s",), zlib=True) - v.long_name = "Source/col indices" - v.start_index = start_index - v[...] 
= col - - nc.close() - netcdf_lock.release() + nc.title = ( + f"Regridding weights from source {src_grid.type} " + f"with shape {src_shape} to destination " + f"{dst_grid.type} with shape {dst_shape}" + ) + nc.source = f"cf v{__version__}, esmpy v{esmpy.__version__}" + nc.history = f"Created at {datetime.now()}" + nc.regrid_method = regrid_method + nc.ESMF_unmapped_action = r.unmapped_action + nc.ESMF_ignore_degenerate = int(r.ignore_degenerate) + + nc.createDimension("n_s", weights.size) + nc.createDimension("src_grid_rank", src_esmpy_grid.rank) + nc.createDimension("dst_grid_rank", dst_esmpy_grid.rank) + + v = nc.createVariable( + "src_grid_dims", i_dtype, ("src_grid_rank",) + ) + v.long_name = "Source grid shape" + v[...] = src_shape + + v = nc.createVariable( + "dst_grid_dims", i_dtype, ("dst_grid_rank",) + ) + v.long_name = "Destination grid shape" + v[...] = dst_shape + + v = nc.createVariable("S", weights.dtype, ("n_s",)) + v.long_name = "Weights values" + v[...] = weights + + v = nc.createVariable("row", i_dtype, ("n_s",), zlib=True) + v.long_name = "Destination/row indices" + v.start_index = start_index + v[...] = row + + v = nc.createVariable("col", i_dtype, ("n_s",), zlib=True) + v.long_name = "Source/col indices" + v.start_index = start_index + v[...] = col + + nc.close() if esmpy_regrid_operator is None: # Destroy esmpy objects (the esmpy.Grid objects exist even if diff --git a/cf/regrid/regridoperator.py b/cf/regrid/regridoperator.py index 10a77bc641..7621addc7e 100644 --- a/cf/regrid/regridoperator.py +++ b/cf/regrid/regridoperator.py @@ -725,28 +725,26 @@ def tosparse(self): weights_file = self.weights_file if weights_file is not None: # Read the weights from the weights file + from cfdm.data.locks import netcdf_lock from netCDF4 import Dataset - from ..data.array.locks import netcdf_lock + with netcdf_lock: + nc = Dataset(weights_file, "r") + weights = nc.variables["S"][...] + row = nc.variables["row"][...] + col = nc.variables["col"][...] - netcdf_lock.acquire() - nc = Dataset(weights_file, "r") - weights = nc.variables["S"][...] - row = nc.variables["row"][...] - col = nc.variables["col"][...] + try: + col_start_index = nc.variables["col"].start_index + except AttributeError: + col_start_index = 1 - try: - col_start_index = nc.variables["col"].start_index - except AttributeError: - col_start_index = 1 + try: + row_start_index = nc.variables["row"].start_index + except AttributeError: + row_start_index = 1 - try: - row_start_index = nc.variables["row"].start_index - except AttributeError: - row_start_index = 1 - - nc.close() - netcdf_lock.release() + nc.close() else: raise ValueError( "Conversion to sparse array format requires at least " diff --git a/cf/test/individual_tests.sh b/cf/test/individual_tests.sh index 425c7dd435..f67383e173 100755 --- a/cf/test/individual_tests.sh +++ b/cf/test/individual_tests.sh @@ -5,9 +5,6 @@ do echo "Running $file" python $file rc=$? -# if [[ $rc != 0 ]]; then -# exit $rc -# fi done file=setup_create_field.py @@ -18,14 +15,30 @@ if [[ $rc != 0 ]]; then exit $rc fi +style="lots" + for file in test_*.py do echo "Running $file" python $file rc=$? 
if [[ $rc != 0 ]]; then - exit $rc - # echo -e "\n\n$file FAILED \n\n" + if [[ "$file" == "test_style.py" ]] ; then + style="none" + else + exit $rc + # echo -e "\n\n$file FAILED \n\n" + fi fi done +echo +if [[ "$style" == "none" ]] ; then + echo "------------------------------------------" + echo "All tests passed, APART FROM test_style.py" + echo "------------------------------------------" +else + echo "================" + echo "All tests passed" + echo "================" +fi diff --git a/cf/test/test_Data.py b/cf/test/test_Data.py index e29c192062..dcb28cc85f 100644 --- a/cf/test/test_Data.py +++ b/cf/test/test_Data.py @@ -39,7 +39,8 @@ def _remove_tmpfiles(): atexit.register(_remove_tmpfiles) -# To facilitate the testing of logging outputs (see comment tag 'Logging note') +# To facilitate the testing of logging outputs (see comment tag +# 'Logging note') logger = cf.logging.getLogger(__name__) @@ -60,11 +61,6 @@ def _remove_tmpfiles(): mw = np.ma.array(w, mask=ma.mask) -# If True, all tests that will not pass temporarily due to the LAMA-to-Dask -# migration will be skipped. These skips will be incrementally removed as the -# migration progresses. TODODASK: ensure all skips are removed once complete. -TEST_DASKIFIED_ONLY = True - def reshape_array(a, axes): """Reshape array reducing given axes' dimensions to a final axis.""" @@ -77,24 +73,19 @@ def reshape_array(a, axes): return b -def axis_combinations(a): - """Return a list of axes combinations to iterate over.""" +def axis_combinations(ndim): + """Create axes permutations for `test_Data_flatten`""" return [ axes - for n in range(1, a.ndim + 1) - for axes in itertools.combinations(range(a.ndim), n) + for n in range(1, ndim + 1) + for axes in itertools.permutations(range(ndim), n) ] class DataTest(unittest.TestCase): """Unit test for the Data class.""" - axes_combinations = axis_combinations(a) - # [ - # axes - # for n in range(1, a.ndim + 1) - # for axes in itertools.combinations(range(a.ndim), n) - # ] + axes_combinations = axis_combinations(a.ndim) filename = os.path.join( os.path.dirname(os.path.abspath(__file__)), "test_file.nc" @@ -751,7 +742,7 @@ def test_Data_stats(self): # Test outputs covering a representative selection of parameters s1 = d.stats() - s1_lazy = d.stats(compute=False) + s1_lazy = d.stats(values=False) exp_result = { "minimum": 1, "mean": 1.0, @@ -771,7 +762,7 @@ def test_Data_stats(self): ) s2 = d.stats(all=True) - s2_lazy = d.stats(compute=False, all=True) + s2_lazy = d.stats(values=False, all=True) exp_result = { "minimum": 1, "mean": 1.0, @@ -798,7 +789,7 @@ def test_Data_stats(self): ) s3 = d.stats(sum=True, weights=1) - s3_lazy = d.stats(compute=False, sum=True, weights=1) + s3_lazy = d.stats(values=False, sum=True, weights=1) exp_result = { "minimum": 1, "mean": 1.0, @@ -820,7 +811,7 @@ def test_Data_stats(self): s4 = d.stats(mean_of_upper_decile=True, range=False, weights=2.0) s4_lazy = d.stats( - compute=False, mean_of_upper_decile=True, range=False, weights=2.0 + values=False, mean_of_upper_decile=True, range=False, weights=2.0 ) exp_result = { "minimum": 1, @@ -1002,21 +993,25 @@ def test_Data_cumsum(self): self.assertTrue(cf.functions._numpy_allclose(e.array, b)) def test_Data_flatten(self): - """Test the `flatten` Data method.""" - d = cf.Data(self.ma.copy()) - self.assertTrue(d.equals(d.flatten([]), verbose=2)) + """Test Data.flatten.""" + ma = np.ma.arange(24).reshape(1, 2, 3, 4) + ma[0, 1, 1, 2] = cf.masked + ma[0, 0, 2, 1] = cf.masked + + d = cf.Data(ma.copy()) + 
self.assertTrue(d.equals(d.flatten([]), verbose=3)) self.assertIsNone(d.flatten(inplace=True)) - d = cf.Data(self.ma.copy()) + d = cf.Data(ma.copy()) - b = self.ma.flatten() + b = ma.flatten() for axes in (None, list(range(d.ndim))): e = d.flatten(axes) self.assertEqual(e.ndim, 1) self.assertEqual(e.shape, b.shape) - self.assertTrue(cf.functions._numpy_allclose(e.array, b)) + self.assertTrue(e.equals(cf.Data(b), verbose=3)) - for axes in self.axes_combinations: + for axes in axis_combinations(d.ndim): e = d.flatten(axes) if len(axes) <= 1: @@ -1028,10 +1023,25 @@ def test_Data_flatten(self): np.prod([n for i, n in enumerate(d.shape) if i in axes]), ) - self.assertEqual(e.shape, tuple(shape)) + self.assertEqual(e.shape, tuple(shape), axes) self.assertEqual(e.ndim, d.ndim - len(axes) + 1) self.assertEqual(e.size, d.size) + for n in range(4): + e = d.flatten(n) + f = d.flatten([n]) + self.assertTrue(e.equals(f)) + + with self.assertRaises(ValueError): + d.flatten(99) + + d = cf.Data(9) + self.assertTrue(d.equals(d.flatten())) + self.assertTrue(d.equals(d.flatten([]))) + + with self.assertRaises(ValueError): + d.flatten(0) + def test_Data_cached_arithmetic_units(self): """Test arithmetic with, and units of, Data cached to disk.""" d = cf.Data(self.a, "m") @@ -3212,19 +3222,20 @@ def test_Data_compute(self): self.assertEqual(d.compute(), 2.5) def test_Data_persist(self): - """Test the `persist` Data method.""" + """Test Data.persist.""" d = cf.Data(9, "km") self.assertIsNone(d.persist(inplace=True)) - d = cf.Data([1, 2, 3.0, 4], "km", mask=[0, 1, 0, 0], chunks=2) - self.assertGreater(len(d.to_dask_array().dask.layers), 1) + d = cf.Data([[1, 2, 3.0, 4]], "km", chunks=2) + self.assertEqual(len(d.to_dask_array().dask.layers), 2) + d.transpose(inplace=True) + self.assertEqual(len(d.to_dask_array().dask.layers), 3) e = d.persist() self.assertIsInstance(e, cf.Data) - self.assertEqual(len(e.to_dask_array().dask.layers), 1) - self.assertEqual( - e.to_dask_array().npartitions, d.to_dask_array().npartitions - ) + self.assertEqual(len(e.to_dask_array().dask.layers), 2) + self.assertEqual(d.npartitions, 2) + self.assertEqual(e.npartitions, d.npartitions) self.assertTrue(e.equals(d)) def test_Data_cyclic(self): @@ -3412,7 +3423,7 @@ def test_Data_integral(self): weights = self.w d = cf.Data(a, "m", chunks=(2, 3, 2, 5)) - for axis in axis_combinations(a): + for axis in axis_combinations(a.ndim): b = reshape_array(a, axis) w = reshape_array(weights, axis) b = np.sum(b * w, axis=-1) @@ -3430,7 +3441,7 @@ def test_Data_max(self): a = self.ma d = cf.Data(a, "m", chunks=(2, 3, 2, 5)) - for axis in axis_combinations(a): + for axis in axis_combinations(a.ndim): b = reshape_array(a, axis) b = np.max(b, axis=-1) b = np.ma.asanyarray(b) @@ -3447,7 +3458,7 @@ def test_Data_maximum_absolute_value(self): a = self.ma d = cf.Data(a, "m", chunks=(2, 3, 2, 5)) - for axis in axis_combinations(a): + for axis in axis_combinations(a.ndim): b = reshape_array(a, axis) b = np.max(abs(b), axis=-1) b = np.ma.asanyarray(b) @@ -3465,7 +3476,7 @@ def test_Data_mean(self): weights = self.w d = cf.Data(a, "m", chunks=(2, 3, 2, 5)) - for axis in axis_combinations(a): + for axis in axis_combinations(a.ndim): b = reshape_array(a, axis) w = reshape_array(weights, axis) b = np.ma.average(b, axis=-1, weights=w) @@ -3484,7 +3495,7 @@ def test_Data_mean_absolute_value(self): weights = self.w d = cf.Data(a, "m", chunks=(2, 3, 2, 5)) - for axis in axis_combinations(a): + for axis in axis_combinations(a.ndim): b = reshape_array(a, axis) w = 
reshape_array(weights, axis) b = np.ma.average(abs(b), axis=-1, weights=w) @@ -3502,7 +3513,7 @@ def test_Data_mid_range(self): a = self.ma d = cf.Data(a, "m", chunks=(2, 3, 2, 5)) - for axis in axis_combinations(a): + for axis in axis_combinations(a.ndim): b = reshape_array(a, axis) b = (np.max(b, axis=-1) + np.min(b, axis=-1)) / 2.0 b = np.ma.asanyarray(b) @@ -3522,7 +3533,7 @@ def test_Data_min(self): a = self.ma d = cf.Data(a, "m", chunks=(2, 3, 2, 5)) - for axis in axis_combinations(a): + for axis in axis_combinations(a.ndim): b = reshape_array(a, axis) b = np.min(b, axis=-1) b = np.ma.asanyarray(b) @@ -3539,7 +3550,7 @@ def test_Data_minimum_absolute_value(self): a = self.ma d = cf.Data(a, "m", chunks=(2, 3, 2, 5)) - for axis in axis_combinations(a): + for axis in axis_combinations(a.ndim): b = reshape_array(a, axis) b = np.min(abs(b), axis=-1) b = np.ma.asanyarray(b) @@ -3557,7 +3568,7 @@ def test_Data_range(self): d = cf.Data(a, "m", chunks=(2, 3, 2, 5)) - for axis in axis_combinations(a): + for axis in axis_combinations(a.ndim): b = reshape_array(a, axis) b = np.max(b, axis=-1) - np.min(b, axis=-1) b = np.ma.asanyarray(b) @@ -3578,7 +3589,7 @@ def test_Data_root_mean_square(self): weights = self.w d = cf.Data(a, "m", chunks=(2, 3, 2, 5)) - for axis in axis_combinations(a): + for axis in axis_combinations(a.ndim): b = reshape_array(a, axis) w = reshape_array(weights, axis) b = np.ma.average(b * b, axis=-1, weights=w) ** 0.5 @@ -3596,7 +3607,7 @@ def test_Data_sample_size(self): a = self.ma d = cf.Data(a, "m", chunks=(2, 3, 2, 5)) - for axis in axis_combinations(a): + for axis in axis_combinations(a.ndim): b = reshape_array(a, axis) b = np.sum(np.ones_like(b), axis=-1) b = np.ma.asanyarray(b) @@ -3611,7 +3622,7 @@ def test_Data_sample_size(self): a = self.a d = cf.Data(a, "m", chunks=(2, 3, 2, 5)) - for axis in axis_combinations(a): + for axis in axis_combinations(a.ndim): b = reshape_array(a, axis) b = np.sum(np.ones_like(b), axis=-1) b = np.asanyarray(b) @@ -3640,7 +3651,7 @@ def test_Data_sum(self): weights = self.w d = cf.Data(a, "m", chunks=(2, 3, 2, 5)) - for axis in axis_combinations(a): + for axis in axis_combinations(a.ndim): b = reshape_array(a, axis) w = reshape_array(weights, axis) b = np.sum(b * w, axis=-1) @@ -3659,7 +3670,7 @@ def test_Data_sum_of_squares(self): weights = self.w d = cf.Data(a, "m", chunks=(2, 3, 2, 5)) - for axis in axis_combinations(a): + for axis in axis_combinations(a.ndim): b = reshape_array(a, axis) w = reshape_array(weights, axis) b = np.sum(b * b * w, axis=-1) @@ -3679,7 +3690,7 @@ def test_Data_sum_of_weights(self): d = cf.Data(a, "m", chunks=(2, 3, 2, 5)) # Weights=None - for axis in axis_combinations(a): + for axis in axis_combinations(a.ndim): b = reshape_array(a, axis) b = np.sum(np.ones_like(b), axis=-1) b = np.ma.asanyarray(b) @@ -3690,7 +3701,7 @@ def test_Data_sum_of_weights(self): self.assertTrue((e.mask == b.mask).all()) self.assertTrue(np.allclose(e, b)) - for axis in axis_combinations(a): + for axis in axis_combinations(a.ndim): b = reshape_array(a, axis) w = reshape_array(weights, axis) w = np.ma.masked_where(b.mask, w) @@ -3711,12 +3722,12 @@ def test_Data_sum_of_weights2(self): d = cf.Data(a, "m", chunks=(2, 3, 2, 5)) # Weights=None - for axis in axis_combinations(a): + for axis in axis_combinations(a.ndim): e = d.sum_of_weights2(axes=axis) f = d.sum_of_weights(axes=axis) self.assertTrue(e.equals(f)) - for axis in axis_combinations(a): + for axis in axis_combinations(a.ndim): b = reshape_array(a, axis) w = reshape_array(weights, 
axis) w = np.ma.masked_where(b.mask, w) @@ -3737,7 +3748,7 @@ def test_Data_var(self): d = cf.Data(a, "m", chunks=(2, 3, 2, 5)) # Weighted ddof = 0 - for axis in axis_combinations(a): + for axis in axis_combinations(a.ndim): b = reshape_array(a, axis) w = reshape_array(weights, axis) mu, V1 = np.ma.average(b, axis=-1, weights=w, returned=True) @@ -3755,7 +3766,7 @@ def test_Data_var(self): self.assertTrue(np.allclose(e, b), f"e={e}\nb={b}\ne-b={e - b}") # Weighted ddof = 1 - for axis in axis_combinations(a): + for axis in axis_combinations(a.ndim): b = reshape_array(a, axis) w = reshape_array(weights, axis) mu, V1 = np.ma.average(b, axis=-1, weights=w, returned=True) @@ -3774,7 +3785,7 @@ def test_Data_var(self): self.assertTrue(np.allclose(e, b)) # Unweighted ddof = 1 - for axis in axis_combinations(a): + for axis in axis_combinations(a.ndim): b = reshape_array(a, axis) mu, V1 = np.ma.average(b, axis=-1, returned=True) mu = mu.reshape(mu.shape + (1,)) @@ -3796,7 +3807,7 @@ def test_Data_mean_of_upper_decile(self): weights = self.w d = cf.Data(a, "m", chunks=(2, 3, 2, 5)) - for axis in axis_combinations(a): + for axis in axis_combinations(a.ndim): b = reshape_array(a, axis) w = reshape_array(weights, axis) b = np.ma.filled(b, np.nan) @@ -3944,12 +3955,12 @@ def test_Data_collapse_keepdims(self): d.var, d.mean_of_upper_decile, ): - for axis in axis_combinations(d): + for axis in axis_combinations(d.ndim): e = func(axes=axis, squeeze=False) s = [1 if i in axis else n for i, n in enumerate(d.shape)] self.assertEqual(e.shape, tuple(s)) - for axis in axis_combinations(d): + for axis in axis_combinations(d.ndim): e = func(axes=axis, squeeze=True) s = [n for i, n in enumerate(d.shape) if i not in axis] self.assertEqual(e.shape, tuple(s)) @@ -4130,10 +4141,9 @@ def test_Data_to_dask_array(self): dx = d.to_dask_array() self.assertIsInstance(dx, da.Array) self.assertTrue((d.array == dx.compute()).all()) - self.assertIs(da.asanyarray(d), dx) def test_Data_flat(self): - """Test the `flat` Data method.""" + """Test the Data.flat.""" d = cf.Data([[1, 2], [3, 4]], mask=[[0, 1], [0, 0]]) self.assertEqual(list(d.flat()), [1, 3, 4]) self.assertEqual( @@ -4141,7 +4151,7 @@ def test_Data_flat(self): ) def test_Data_tolist(self): - """Test the `tolist` Data method.""" + """Test the Data.tolist""" for x in (1, [1, 2], [[1, 2], [3, 4]]): d = cf.Data(x) e = d.tolist() @@ -4246,23 +4256,21 @@ def test_Data_atol(self): """Test the `_atol` Data property.""" d = cf.Data(1) self.assertEqual(d._atol, cf.atol()) - cf.atol(0.001) - self.assertEqual(d._atol, 0.001) + with cf.atol(0.001): + self.assertEqual(d._atol, 0.001) def test_Data_rtol(self): """Test the `_rtol` Data property.""" d = cf.Data(1) self.assertEqual(d._rtol, cf.rtol()) - cf.rtol(0.001) - self.assertEqual(d._rtol, 0.001) + with cf.rtol(0.001): + self.assertEqual(d._rtol, 0.001) def test_Data_hardmask(self): - """Test the `hardmask` Data property.""" + """Test Data.hardmask.""" d = cf.Data([1, 2, 3]) d.hardmask = True self.assertTrue(d.hardmask) - self.assertEqual(len(d.to_dask_array().dask.layers), 1) - d[0] = cf.masked self.assertTrue((d.array.mask == [True, False, False]).all()) d[...] 
= 999 @@ -4273,18 +4281,24 @@ def test_Data_hardmask(self): self.assertTrue((d.array.mask == [False, False, False]).all()) def test_Data_harden_mask(self): - """Test the `harden_mask` Data method.""" + """Test Data.harden_mask.""" d = cf.Data([1, 2, 3], hardmask=False) d.harden_mask() self.assertTrue(d.hardmask) - self.assertEqual(len(d.to_dask_array().dask.layers), 2) + d[0] = cf.masked + self.assertEqual(d[0].array, np.ma.masked) + d[0] = 99 + self.assertEqual(d[0].array, np.ma.masked) def test_Data_soften_mask(self): - """Test the `soften_mask` Data method.""" + """Test Data.soften_mask.""" d = cf.Data([1, 2, 3], hardmask=True) d.soften_mask() self.assertFalse(d.hardmask) - self.assertEqual(len(d.to_dask_array().dask.layers), 2) + d[0] = cf.masked + self.assertEqual(d[0].array, np.ma.masked) + d[0] = 99 + self.assertEqual(d[0].array, 99) def test_Data_compressed_array(self): """Test the `compressed_array` Data property.""" @@ -4448,8 +4462,8 @@ def test_Data_get_filenames(self): cf.write(f, file_A) cf.write(f, file_B) - a = cf.read(file_A, chunks=4)[0].data - b = cf.read(file_B, chunks=4)[0].data + a = cf.read(file_A, dask_chunks=4)[0].data + b = cf.read(file_B, dask_chunks=4)[0].data b += 999 c = cf.Data(b.array, units=b.Units, chunks=4) @@ -4519,16 +4533,32 @@ def test_Data__str__(self): self.assertNotIn(element, d._get_cached_elements()) def test_Data_cull_graph(self): - """Test `Data.cull`""" - # Note: The number of layers in the culled graphs include a - # `cf_asanyarray` layer + """Test Data.cull_graph.""" d = cf.Data([1, 2, 3, 4, 5], chunks=3) d = d[:2] - self.assertEqual(len(dict(d.to_dask_array(_asanyarray=False).dask)), 3) + self.assertEqual( + len( + dict( + d.to_dask_array( + _force_mask_hardness=False, _force_to_memory=False + ).dask + ) + ), + 3, + ) # Check that there are fewer keys after culling d.cull_graph() - self.assertEqual(len(dict(d.to_dask_array(_asanyarray=False).dask)), 2) + self.assertEqual( + len( + dict( + d.to_dask_array( + _force_mask_hardness=False, _force_to_memory=False + ).dask + ) + ), + 2, + ) def test_Data_npartitions(self): """Test the `npartitions` Data property.""" @@ -4559,7 +4589,7 @@ def test_Data_convert_reference_time(self): self.assertTrue((e.array == [72, 48, 24, 0]).all()) def test_Data_clear_after_dask_update(self): - """Test Data._clear_after_dask_update""" + """Test Data._clear_after_dask_update.""" d = cf.Data([1, 2, 3], "m") dx = d.to_dask_array() @@ -4569,8 +4599,8 @@ def test_Data_clear_after_dask_update(self): self.assertTrue(d._get_cached_elements()) - _ALL = cf.data.data._ALL - _CACHE = cf.data.data._CACHE + _ALL = cf.Data._ALL + _CACHE = cf.Data._CACHE d._set_dask(dx, clear=_ALL ^ _CACHE) self.assertTrue(d._get_cached_elements()) @@ -4674,7 +4704,7 @@ def test_Data_file_location(self): ) cf.write(f, file_A) - d = cf.read(file_A, chunks=4)[0].data + d = cf.read(file_A, dask_chunks=4)[0].data self.assertGreater(d.npartitions, 1) e = d.copy() @@ -4693,9 +4723,9 @@ def test_Data_file_location(self): self.assertEqual(d.file_locations(), set((location,))) def test_Data_todict(self): - """Test Data.todict""" + """Test Data.todict.""" d = cf.Data([1, 2, 3, 4], chunks=2) - key = d.to_dask_array().name + key = d.to_dask_array(_force_mask_hardness=False).name x = d.todict() self.assertIsInstance(x, dict) diff --git a/cf/test/test_Data_utils.py b/cf/test/test_Data_utils.py index 874e2f8c84..f6f75dcc65 100644 --- a/cf/test/test_Data_utils.py +++ b/cf/test/test_Data_utils.py @@ -12,156 +12,6 @@ class DataUtilsTest(unittest.TestCase): - def 
test_Data_Utils__da_ma_allclose(self): - """TODO.""" - # Create a range of inputs to test against. - # Note that 'a' and 'a2' should be treated as 'allclose' for this - # method, the same result as np.ma.allclose would give because all - # of the *unmasked* elements are 'allclose', whereas in our - # Data.equals method that builds on this method, we go even further - # and insist on the mask being identical as well as the data - # (separately, i.e. unmasked) all being 'allclose', so inside our - # cf.Data objects 'a' and 'a2' would instead *not* be considered equal. - a_np = np.ma.array([1.0, 2.0, 3.0], mask=[1, 0, 0]) - a = da.from_array(a_np) - a2 = da.from_array(np.ma.array([10.0, 2.0, 3.0], mask=[1, 0, 0])) - b_np = np.ma.array([1.0, 2.0, 3.0], mask=[0, 1, 0]) - b = da.from_array(b_np) - c_np = np.ma.array([1.0, 2.0, 100.0], mask=[1, 0, 0]) - c = da.from_array(c_np) - d = da.from_array(np.array([1.0, 2.0, 3.0])) - e = a + 5e-04 # outside of tolerance to set, namely rtol=1e-05 - f = a + 5e-06 # within set tolerance to be specified, as above - - # Test the function with these inputs as both numpy and dask arrays... - allclose = cf.data.dask_utils._da_ma_allclose - - self.assertTrue(allclose(a, a).compute()) - self.assertTrue(allclose(a2, a).compute()) - self.assertTrue(allclose(b, a).compute()) - - # ...including testing the 'masked_equal' parameter - self.assertFalse(allclose(b, a, masked_equal=False).compute()) - - self.assertFalse(allclose(c, a).compute()) - self.assertTrue(allclose(d, a).compute()) - self.assertFalse(allclose(e, a).compute()) - - self.assertTrue(allclose(f, a, rtol=1e-05).compute()) - - # Test when array inputs have different chunk sizes - a_chunked = da.from_array(a_np, chunks=(1, 2)) - self.assertTrue( - allclose(da.from_array(b_np, chunks=(3,)), a_chunked).compute() - ) - self.assertFalse( - allclose( - da.from_array(b_np, chunks=(3,)), a_chunked, masked_equal=False - ).compute() - ) - self.assertFalse( - allclose(da.from_array(c_np, chunks=(3,)), a_chunked).compute() - ) - - # Test the 'rtol' and 'atol' parameters: - self.assertFalse(allclose(e, a, rtol=1e-06).compute()) - b1 = e / 10000 - b2 = a / 10000 - self.assertTrue(allclose(b1, b2, atol=1e-05).compute()) - - def test_Data_Utils_is_numeric_dtype(self): - """TODO.""" - is_numeric_dtype = cf.data.utils.is_numeric_dtype - for a in [ - np.array([0, 1, 2]), - np.array([False, True, True]), - np.ma.array([10.0, 2.0, 3.0], mask=[1, 0, 0]), - np.array(10), - ]: - self.assertTrue(is_numeric_dtype(a)) - - for b in [ - np.array(["a", "b", "c"], dtype="S1"), - np.empty(1, dtype=object), - ]: - self.assertFalse(is_numeric_dtype(b)) - - def test_Data_Utils_convert_to_datetime(self): - """TODO.""" - a = cftime.DatetimeGregorian(2000, 12, 3, 12) - for x in (2.5, [2.5]): - d = da.from_array(x) - e = cf.data.utils.convert_to_datetime( - d, cf.Units("days since 2000-12-01") - ) - self.assertEqual(e.compute(), a) - - a = [ - cftime.DatetimeGregorian(2000, 12, 1), - cftime.DatetimeGregorian(2000, 12, 2), - cftime.DatetimeGregorian(2000, 12, 3), - ] - for x in ([0, 1, 2], [[0, 1, 2]]): - d = da.from_array([0, 1, 2], chunks=2) - e = cf.data.utils.convert_to_datetime( - d, cf.Units("days since 2000-12-01") - ) - self.assertTrue((e.compute() == a).all()) - - def test_Data_Utils_convert_to_reftime(self): - """TODO.""" - a = cftime.DatetimeGregorian(2000, 12, 3, 12) - d = da.from_array(np.array(a, dtype=object)) - - e, u = cf.data.utils.convert_to_reftime(d) - self.assertEqual(e.compute(), 0.5) - self.assertEqual(u, cf.Units("days 
since 2000-12-03", "standard")) - - units = cf.Units("days since 2000-12-01") - e, u = cf.data.utils.convert_to_reftime(d, units=units) - self.assertEqual(e.compute(), 2.5) - self.assertEqual(u, units) - - a = "2000-12-03T12:00" - d = da.from_array(np.array(a, dtype=str)) - - e, u = cf.data.utils.convert_to_reftime(d) - self.assertEqual(e.compute(), 0.5) - self.assertEqual(u, cf.Units("days since 2000-12-03", "standard")) - - units = cf.Units("days since 2000-12-01") - e, u = cf.data.utils.convert_to_reftime(d, units=units) - self.assertEqual(e.compute(), 2.5) - self.assertEqual(u, units) - - a = [ - [ - cftime.DatetimeGregorian(2000, 12, 1), - cftime.DatetimeGregorian(2000, 12, 2), - cftime.DatetimeGregorian(2000, 12, 3), - ] - ] - d = da.from_array(np.ma.array(a, mask=[[1, 0, 0]]), chunks=2) - - e, u = cf.data.utils.convert_to_reftime(d) - self.assertTrue((e.compute() == [-99, 0, 1]).all()) - self.assertEqual(u, cf.Units("days since 2000-12-02", "standard")) - - units = cf.Units("days since 2000-12-03") - e, u = cf.data.utils.convert_to_reftime(d, units=units) - self.assertTrue((e.compute() == [-99, -1, 0]).all()) - self.assertEqual(u, units) - - d = cf.Data( - ["2004-02-29", "2004-02-30", "2004-03-01"], calendar="360_day" - ) - self.assertEqual(d.Units, cf.Units("days since 2004-02-29", "360_day")) - self.assertTrue((d.array == [0, 1, 2]).all()) - - d = cf.Data(["2004-02-29", "2004-03-01"], dt=True) - self.assertEqual(d.Units, cf.Units("days since 2004-02-29")) - self.assertTrue((d.array == [0, 1]).all()) - def test_Data_Utils_unique_calendars(self): """TODO.""" a = [ @@ -198,93 +48,6 @@ def test_Data_Utils_unique_calendars(self): c = cf.data.utils.unique_calendars(d) self.assertEqual(c, set(["all_leap", "standard"])) - def test_Data_Utils_first_non_missing_value(self): - """TODO.""" - for method in ("index", "mask"): - # Scalar data - d = da.from_array(0) - self.assertEqual( - cf.data.utils.first_non_missing_value(d, method=method), 0 - ) - self.assertEqual( - cf.data.utils.first_non_missing_value( - d, cached=99, method=method - ), - 99, - ) - - d[()] = np.ma.masked - self.assertEqual( - cf.data.utils.first_non_missing_value(d, method=method), None - ) - self.assertEqual( - cf.data.utils.first_non_missing_value( - d, cached=99, method=method - ), - 99, - ) - - # 1-d data - d = da.arange(8) - self.assertEqual( - cf.data.utils.first_non_missing_value(d, method=method), 0 - ) - self.assertEqual( - cf.data.utils.first_non_missing_value( - d, cached=99, method=method - ), - 99, - ) - - d[0] = np.ma.masked - self.assertEqual( - cf.data.utils.first_non_missing_value(d, method=method), 1 - ) - self.assertEqual( - cf.data.utils.first_non_missing_value( - d, cached=99, method=method - ), - 99, - ) - - # 2-d data - d = da.arange(8).reshape(2, 4) - self.assertEqual( - cf.data.utils.first_non_missing_value(d, method=method), 0 - ) - self.assertEqual( - cf.data.utils.first_non_missing_value( - d, cached=99, method=method - ), - 99, - ) - - d[0] = np.ma.masked - self.assertEqual( - cf.data.utils.first_non_missing_value(d, method=method), 4 - ) - self.assertEqual( - cf.data.utils.first_non_missing_value( - d, cached=99, method=method - ), - 99, - ) - - d[...] 
= np.ma.masked - self.assertEqual( - cf.data.utils.first_non_missing_value(d, method=method), None - ) - self.assertEqual( - cf.data.utils.first_non_missing_value( - d, cached=99, method=method - ), - 99, - ) - - # Bad method - with self.assertRaises(ValueError): - cf.data.utils.first_non_missing_value(d, method="bad") - def test_Data_Utils_conform_units(self): for x in (1, [1, 2], "foo", np.array([[1]])): self.assertEqual(cf.data.utils.conform_units(x, cf.Units("m")), x) diff --git a/cf/test/test_DimensionCoordinate.py b/cf/test/test_DimensionCoordinate.py index b39181b245..35694b6aeb 100644 --- a/cf/test/test_DimensionCoordinate.py +++ b/cf/test/test_DimensionCoordinate.py @@ -611,7 +611,14 @@ def test_DimensiconCoordinate_persist(self): e = d.persist() self.assertIsInstance(e, cf.DimensionCoordinate) - self.assertEqual(len(e.to_dask_array().dask.layers), 1) + self.assertEqual( + len( + e.data.to_dask_array( + _force_mask_hardness=False, _force_to_memory=False + ).dask.layers + ), + 1, + ) self.assertTrue(e.equals(d)) self.assertIsNone(d.persist(inplace=True)) diff --git a/cf/test/test_Field.py b/cf/test/test_Field.py index b1df9ef1f1..e282389ec0 100644 --- a/cf/test/test_Field.py +++ b/cf/test/test_Field.py @@ -2562,13 +2562,19 @@ def test_Field_percentile(self): def test_Field_grad_xy(self): f = cf.example_field(0) - # Spherical polar coordinates + # theta=0 is at the north pole theta = 90 - f.convert("Y", full_domain=True) sin_theta = theta.sin() radius = 2 r = f.radius(radius) + g = f.copy() + lat = g.dimension_coordinate("latitude") + lon = g.dimension_coordinate("longitude") + lat.Units = cf.Units("radians") + lon.Units = cf.Units("radians") + for wrap in (False, True, None): for one_sided in (True, False): x, y = f.grad_xy( @@ -2578,23 +2584,24 @@ self.assertEqual(x.Units, y.Units) self.assertEqual(y.Units, cf.Units("m-1")) - x0 = f.derivative( + x0 = g.derivative( "X", wrap=wrap, one_sided_at_boundary=one_sided, + ignore_coordinate_units=True, ) / (sin_theta * r) y0 = ( - f.derivative( + g.derivative( "Y", one_sided_at_boundary=one_sided, + ignore_coordinate_units=True, ) / r ) # Check the data - with cf.rtol(1e-10): - self.assertTrue((x.data == x0.data).all()) - self.assertTrue((y.data == y0.data).all()) + self.assertTrue(x.data.allclose(x0.data)) + self.assertTrue(y.data.allclose(y0.data)) # Check that x and y have the same metadata as f # (except standard_name, long_name, and units).
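+ + # For reference: with colatitude theta (zero at the north + # pole), longitude lambda, and radius r, the spherical + # gradient components reconstructed above are + # + # x0 = (1 / (r * sin(theta))) * df/dlambda + # y0 = (1 / r) * df/dtheta + # + # which is why x0 divides the X derivative by sin_theta * r + # and y0 divides the Y derivative by r.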
@@ -2626,7 +2633,9 @@ def test_Field_grad_xy(self):
                 self.assertEqual(y.Units, cf.Units("m-1"))
 
                 x0 = f.derivative(
-                    "X", wrap=wrap, one_sided_at_boundary=one_sided
+                    "X",
+                    wrap=wrap,
+                    one_sided_at_boundary=one_sided,
                 )
                 y0 = f.derivative("Y", one_sided_at_boundary=one_sided)
 
@@ -2725,7 +2734,7 @@ def test_Field_laplacian_xy(self):
 
     def test_Field_to_dask_array(self):
         f = self.f0.copy()
-        self.assertIs(f.to_dask_array(), f.data.to_dask_array())
+        self.assertTrue((f.array == f.to_dask_array().compute()).all())
 
         f.del_data()
         with self.assertRaises(ValueError):
@@ -2834,11 +2843,25 @@ def test_Field_persist(self):
         f = cf.example_field(0)
         f *= 2
 
-        self.assertGreater(len(f.to_dask_array().dask.layers), 1)
+        self.assertGreater(
+            len(
+                f.data.to_dask_array(
+                    _force_mask_hardness=False, _force_to_memory=False
+                ).dask.layers
+            ),
+            2,
+        )
 
         g = f.persist()
         self.assertIsInstance(g, cf.Field)
-        self.assertEqual(len(g.to_dask_array().dask.layers), 1)
+        self.assertEqual(
+            len(
+                g.data.to_dask_array(
+                    _force_mask_hardness=False, _force_to_memory=False
+                ).dask.layers
+            ),
+            1,
+        )
 
         self.assertTrue(g.equals(f))
         self.assertIsNone(g.persist(inplace=True))
diff --git a/cf/test/test_Maths.py b/cf/test/test_Maths.py
index 349bd495dd..add50ae710 100644
--- a/cf/test/test_Maths.py
+++ b/cf/test/test_Maths.py
@@ -45,7 +45,7 @@ def test_curl_xy(self):
 
                 # Check the data
                 with cf.rtol(1e-10):
-                    self.assertTrue((c.data == c0.data).all())
+                    self.assertTrue(c.data.allclose(c0.data))
 
                 del c.long_name
                 c0.set_data(c.data)
@@ -120,21 +120,23 @@ def test_div_xy(self):
                     x_wrap=wrap,
                     one_sided_at_boundary=one_sided,
                 )
-                self.assertEqual(d.Units, cf.Units("m-2"))
 
                 term1 = x.derivative(
-                    "X", wrap=wrap, one_sided_at_boundary=one_sided
+                    "X",
+                    wrap=wrap,
+                    one_sided_at_boundary=one_sided,
                 )
                 term2 = (y * sin_theta).derivative(
-                    "Y", one_sided_at_boundary=one_sided
+                    "Y",
+                    one_sided_at_boundary=one_sided,
                 )
 
                 d0 = (term1 + term2) / (sin_theta * r)
 
                 # Check the data
                 with cf.rtol(1e-10):
-                    self.assertTrue((d.data == d0.data).all())
+                    self.assertTrue(d.data.allclose(d0.data))
 
                 del d.long_name
                 d0.set_data(d.data)
 
diff --git a/cf/test/test_active_storage.py b/cf/test/test_active_storage.py
index f14e063849..4770368808 100644
--- a/cf/test/test_active_storage.py
+++ b/cf/test/test_active_storage.py
@@ -41,7 +41,9 @@ def test_active_storage(self):
         f = cf.example_field(0)
         cf.write(f, tmpfile)
 
-        f = cf.read(tmpfile, chunks={"latitude": (4, 1), "longitude": (3, 5)})
+        f = cf.read(
+            tmpfile, dask_chunks={"latitude": (4, 1), "longitude": (3, 5)}
+        )
         f = f[0]
         self.assertEqual(f.data.chunks, ((4, 1), (3, 5)))
 
diff --git a/cf/test/test_functions.py b/cf/test/test_functions.py
index f6cce13ae0..ad7a59c4f3 100644
--- a/cf/test/test_functions.py
+++ b/cf/test/test_functions.py
@@ -31,12 +31,10 @@ def test_example_field_example_fields(self):
     def test_keyword_deprecation(self):
        # Use as a test case the 'i' kwarg, the deprecated old name
         # for 'inplace':
-        a = cf.Data([list(range(100))])
-        a.squeeze(inplace=True)  # new way to specify operation tested below
-
-        b = cf.Data([list(range(100))])
+        f = cf.example_field(0)
+        f.squeeze(inplace=True)  # new way to specify operation tested below
         with self.assertRaises(cf.functions.DeprecationError):
-            b.squeeze(i=True)
+            f.squeeze(i=True)
 
     def test_aliases(self):
         self.assertEqual(cf.log_level(), cf.LOG_LEVEL())
@@ -197,6 +195,10 @@ def test_configuration(self):
         # messages:
         cf.log_level("DISABLE")
 
+        # Reset configuration
+        cf.configuration(**org)
+        self.assertEqual(cf.configuration(), org)
+
     def test_context_managers(self):
         # rtol, atol
         for func in (cf.atol, cf.rtol):
diff --git a/cf/test/test_read_write.py b/cf/test/test_read_write.py
index f0bf697fac..613280eaf3 100644
--- a/cf/test/test_read_write.py
+++ b/cf/test/test_read_write.py
@@ -844,49 +844,6 @@ def test_read_write_domain(self):
         self.assertIsInstance(e[1], cf.Domain)
         self.assertTrue(e[0].equals(e[1]))
 
-    def test_read_chunks(self):
-        f = cf.example_field(0)
-        f.construct("latitude").axis = "Y"
-        cf.write(f, tmpfile)
-
-        f = cf.read(tmpfile, chunks={})[0]
-        self.assertEqual(f.data.chunks, ((5,), (8,)))
-
-        f = cf.read(tmpfile, chunks=-1)[0]
-        self.assertEqual(f.data.chunks, ((5,), (8,)))
-
-        f = cf.read(tmpfile, chunks=None)[0]
-        self.assertEqual(f.data.chunks, ((5,), (8,)))
-
-        f = cf.read(tmpfile, chunks={"foo": 2, "bar": 3})[0]
-        self.assertEqual(f.data.chunks, ((5,), (8,)))
-
-        with cf.chunksize("200GB"):
-            f = cf.read(tmpfile)[0]
-            self.assertEqual(f.data.chunks, ((5,), (8,)))
-
-        with cf.chunksize("150B"):
-            f = cf.read(tmpfile)[0]
-            self.assertEqual(f.data.chunks, ((4, 1), (4, 4)))
-
-        f = cf.read(tmpfile, chunks="150B")[0]
-        self.assertEqual(f.data.chunks, ((4, 1), (4, 4)))
-
-        f = cf.read(tmpfile, chunks=3)[0]
-        self.assertEqual(f.data.chunks, ((3, 2), (3, 3, 2)))
-
-        y = f.construct("Y")
-        self.assertEqual(y.data.chunks, ((3, 2),))
-
-        f = cf.read(tmpfile, chunks={"ncdim%lon": 3})[0]
-        self.assertEqual(f.data.chunks, ((5,), (3, 3, 2)))
-
-        f = cf.read(tmpfile, chunks={"longitude": 5, "Y": "150B"})[0]
-        self.assertEqual(f.data.chunks, ((3, 2), (5, 3)))
-
-        y = f.construct("Y")
-        self.assertEqual(y.data.chunks, ((5,),))
-
     def test_write_omit_data(self):
         """Test the `omit_data` parameter to `write`."""
         f = cf.example_field(1)
diff --git a/cf/test/test_regrid.py b/cf/test/test_regrid.py
index f171e83994..2001b4cce6 100644
--- a/cf/test/test_regrid.py
+++ b/cf/test/test_regrid.py
@@ -756,7 +756,9 @@ def test_Field_regrid_chunks(self):
         filename = os.path.join(
             os.path.dirname(os.path.abspath(__file__)), "regrid.nc"
         )
-        dst, src = cf.read(filename, chunks={"latitude": 20, "longitude": 30})
+        dst, src = cf.read(
+            filename, dask_chunks={"latitude": 20, "longitude": 30}
+        )
         self.assertEqual(src.data.numblocks, (1, 2, 2))
         self.assertEqual(dst.data.numblocks, (1, 4, 4))
 
diff --git a/cf/units.py b/cf/units.py
index 09c2d79c51..49486800a8 100644
--- a/cf/units.py
+++ b/cf/units.py
@@ -22,8 +22,10 @@ class Units:
     """
 
     def __new__(cls, *args, **kwargs):
+        """Return a new Units instance."""
         return cfUnits(*args, **kwargs)
 
     @staticmethod
     def conform(*args, **kwargs):
+        """Conform values to equivalent values in a compatible unit."""
         return cfUnits.conform(*args, **kwargs)
diff --git a/docs/source/check_docs_api_coverage.py b/docs/source/check_docs_api_coverage.py
index 5b8ab98f99..8cdd5f8aa2 100644
--- a/docs/source/check_docs_api_coverage.py
+++ b/docs/source/check_docs_api_coverage.py
@@ -72,7 +72,7 @@
                     print(f"Method {method} not in {rst_file}")
         except FileNotFoundError:
             n_missing_files += 1
-            print("File {rst_file} does not exist")
+            print(f"File {rst_file} does not exist")
 
 if n_undocumented_methods or n_missing_files:
     raise ValueError(
diff --git a/docs/source/class/cf.Data.rst b/docs/source/class/cf.Data.rst
index eb1fc87826..625daccbe2 100644
--- a/docs/source/class/cf.Data.rst
+++ b/docs/source/class/cf.Data.rst
@@ -86,6 +86,7 @@ Dask
    :template: attribute.rst
 
    ~cf.Data.chunks
+   ~cf.Data.chunksize
    ~cf.Data.npartitions
    ~cf.Data.numblocks
 
@@ -332,6 +333,7 @@ Mask support
    ~cf.Data.has_fill_value
    ~cf.Data.set_fill_value
    ~cf.Data.soften_mask
+   ~cf.Data.masked_where
 
 .. rubric:: Attributes
 
diff --git a/docs/source/installation.rst b/docs/source/installation.rst
index 601fc487fc..a39acd77d6 100644
--- a/docs/source/installation.rst
+++ b/docs/source/installation.rst
@@ -192,9 +192,11 @@ Required
 
 * `Python `_, 3.8.0 or newer.
 
-* `numpy `_, 1.22.0 or newer.
+* `numpy `_, versions 1.22.0 up to, but not
+  including, 2.0.
 
-* `dask `_, 2022.12.1 or newer.
+* `dask `_, versions 2024.6.0 to
+  2024.7.1 inclusive.
 
 * `netCDF4 `_, 1.6.5 or newer.
 
@@ -206,7 +208,7 @@ Required
 
 * `h5py `_, version 3.10.0 or newer.
 
-* `s3fs `_, version 2024.2.0 or newer.
+* `s3fs `_, version 2024.6.0 or newer.
 
 * `scipy `_, version 1.10.0 or newer.
diff --git a/docs/source/recipes/plot_17_recipe.py b/docs/source/recipes/plot_17_recipe.py
index 0738c62a3a..656f7c8717 100644
--- a/docs/source/recipes/plot_17_recipe.py
+++ b/docs/source/recipes/plot_17_recipe.py
@@ -91,9 +91,7 @@
     if i == 0:
         set_title = "Perceptually uniform\ncolour maps"
     elif i == 1:
-        set_title = (
-            "NCL colour maps enhanced to \nhelp with colour blindness"
-        )
+        set_title = "NCL colour maps enhanced to \nhelp with colour blindness"
     elif i == 2:
         set_title = "Orography/bathymetry\ncolour maps"
     else:
diff --git a/docs/source/recipes/plot_18_recipe.py b/docs/source/recipes/plot_18_recipe.py
index d219bdfe19..3e306906ed 100644
--- a/docs/source/recipes/plot_18_recipe.py
+++ b/docs/source/recipes/plot_18_recipe.py
@@ -13,11 +13,11 @@
 # %%
 # 1. Import cf-python, cf-plot and other required packages:
 import cfplot as cfp
-import cf
-
 import matplotlib.pyplot as plt
 import scipy.stats.mstats as mstats
 
+import cf
+
 # %%
 # 2. Read the data in and unpack the Fields from FieldLists using indexing.
 # In our example we are investigating the influence of the land height on
@@ -62,7 +62,7 @@
 # unitless fraction, but the values are in the tens, so we need to
 # normalise these to all lie between 0 and 1 and change the units
 # appropriately:
-sub_snow = ((sub_snow - sub_snow.minimum()) / (sub_snow.range()))
+sub_snow = (sub_snow - sub_snow.minimum()) / (sub_snow.range())
 sub_snow.override_units("1", inplace=True)
 
 # %%
@@ -93,7 +93,9 @@
 # and its strength visually. We use 'gpos' to position the plots in two
 # columns and apply some specific axes ticks and labels for clarity.
 cfp.gopen(
-    rows=1, columns=2, top=0.85,
+    rows=1,
+    columns=2,
+    top=0.85,
     file="snow_and_orog_on_same_grid.png",
     user_position=True,
 )
@@ -131,10 +133,12 @@
 
 # Don't add extensions on the colourbar since it can only be 0 to 1 inclusive
 cfp.levs(min=0, max=1, step=0.1, extend="neither")
 cfp.cscale("precip_11lev", ncols=11, reverse=1)
-cfp.con(sub_snow, lines=False,
-    title="Snow cover extent (from satellite imagery)",
-    colorbar_drawedges=False,
-    **label_info
+cfp.con(
+    sub_snow,
+    lines=False,
+    title="Snow cover extent (from satellite imagery)",
+    colorbar_drawedges=False,
+    **label_info,
 )
 cfp.gclose()
diff --git a/docs/source/recipes/plot_20_recipe.py b/docs/source/recipes/plot_20_recipe.py
index 11c3250842..1745652afc 100644
--- a/docs/source/recipes/plot_20_recipe.py
+++ b/docs/source/recipes/plot_20_recipe.py
@@ -10,6 +10,7 @@
 # %%
 # 1. Import cf-python and cf-plot:
 import cfplot as cfp
+
 import cf
 
 # %%
@@ -81,7 +82,8 @@
 cfp.mapset(resolution="10m")
 cfp.cscale("ncl_default")
 cfp.gopen(
-    file=f"irish-sea-currents-divergence-{chosen_time.replace(' ', '-')}.png")
+    file=f"irish-sea-currents-divergence-{chosen_time.replace(' ', '-')}.png"
+)
 cfp.vect(u=u_2, v=v_2, stride=6, scale=3, key_length=1)
 cfp.con(
     div,
@@ -89,6 +91,6 @@
     title=(
         f"Depth-averaged Irish Sea currents at {chosen_time} with "
         "their divergence"
-    )
+    ),
 )
 cfp.gclose()
diff --git a/docs/source/tutorial.rst b/docs/source/tutorial.rst
index ceafb8815e..57f17bd07d 100644
--- a/docs/source/tutorial.rst
+++ b/docs/source/tutorial.rst
@@ -624,7 +624,7 @@ retrieved with the `~Field.properties` method:
 .. code-block:: python
    :caption: *Retrieve all of the descriptive properties*
 
-   >>> q, t = cf.read('file.nc')[1]
+   >>> t = cf.read('file.nc')[1]
    >>> t.properties()
    {'Conventions': 'CF-1.11',
     'project': 'research',
diff --git a/requirements.txt b/requirements.txt
index 8b01daddca..94886c0d57 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,12 +1,12 @@
 netCDF4>=1.6.5
 cftime>=1.6.2
-numpy>=1.22
+numpy>=1.22,<2.0
 cfdm>=1.11.2.0, <1.11.3.0
 psutil>=0.6.0
 cfunits>=3.3.7
-dask>=2024.4.0
+dask>=2024.6.0,<=2024.7.1
 packaging>=20.0
 scipy>=1.10.0
 h5netcdf>=1.3.0
 h5py>=3.10.0
-s3fs>=2024.2.0
+s3fs>=2024.6.0
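
----

Reviewer note, illustrative only and not part of the patch: the test
changes above exercise two user-visible behaviours, the renaming of
``cf.read``'s ``chunks`` keyword to ``dask_chunks``, and the revised
contract that ``Field.to_dask_array`` returns an equivalent (not
necessarily identical) dask array. A minimal sketch follows, assuming a
hypothetical scratch file ``tmp.nc``; every call used here appears
elsewhere in this diff.

import cf

# Write a small example field to a scratch file (hypothetical path)
f = cf.example_field(0)
cf.write(f, "tmp.nc")

# Per-axis Dask chunking is now requested with 'dask_chunks'
g = cf.read(
    "tmp.nc", dask_chunks={"latitude": (4, 1), "longitude": (3, 5)}
)[0]
assert g.data.chunks == ((4, 1), (3, 5))

# 'to_dask_array' need not return the underlying dask array object
# itself, but computing it still reproduces the field's values
assert (g.array == g.to_dask_array().compute()).all()

# 'persist' still returns an equal field with a concretised dask graph
h = g.persist()
assert h.equals(g)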