diff --git a/Changelog.rst b/Changelog.rst index 233264fd9b..b419a7fe2d 100644 --- a/Changelog.rst +++ b/Changelog.rst @@ -3,6 +3,14 @@ Version NEXTVERSION **2025-??-??** +* Implement lossy compression via quantization + (https://github.com/NCAS-CMS/cf-python/issues/870) +* New quantization class: `cf.Quantization` + (https://github.com/NCAS-CMS/cf-python/issues/870) +* New quantization methods: `cf.Field.get_quantization`, + `cf.Field.get_quantize_on_write`, `cf.Field.set_quantize_on_write`, + `cf.Field.del_quantize_on_write` + (https://github.com/NCAS-CMS/cf-python/issues/870) * New keyword parameter to `cf.write`: ``chunk_cache`` (https://github.com/NCAS-CMS/cf-python/issues/871) * Read Zarr datasets with `cf.read` diff --git a/README.md b/README.md index 9f5c75a8b6..ce752fd0af 100644 --- a/README.md +++ b/README.md @@ -116,6 +116,8 @@ of its array manipulation and can: * create running means from field constructs, * apply differential operators to field constructs, * create derived quantities (such as relative vorticity). +* read and write that data that are quantized to eliminate false + precision. Visualization ============= diff --git a/RELEASE.md b/RELEASE.md index 070b545b3d..b3cf931b4a 100644 --- a/RELEASE.md +++ b/RELEASE.md @@ -17,10 +17,11 @@ - [ ] Change the version and date in `cf/__init__.py` (`__version__` and `__date__` variables) -- [ ] Ensure that the requirements on dependencies & their versions are - up-to-date and consistent in both the `requirements.txt` and in - `docs/source/installation.rst`; and in the `_requires` list and - `Version` checks in `cf/__init__.py`. +- [ ] Ensure that the requirements on dependencies & their versions + are up-to-date and consistent in both the `requirements.txt` and in + `docs/source/installation.rst` (paying particular attention to + `cfdm`); and in the `_requires` list and `Version` checks in + `cf/__init__.py`. - [ ] Make sure that `README.md` is up to date. 
diff --git a/cf/__init__.py b/cf/__init__.py index 2df23b25e5..4004955cbb 100644 --- a/cf/__init__.py +++ b/cf/__init__.py @@ -241,6 +241,7 @@ from .nodecountproperties import NodeCountProperties from .partnodecountproperties import PartNodeCountProperties from .interiorring import InteriorRing +from .quantization import Quantization from .tiepointindex import TiePointIndex from .bounds import Bounds diff --git a/cf/cfimplementation.py b/cf/cfimplementation.py index ece90bde2c..543a1fca9b 100644 --- a/cf/cfimplementation.py +++ b/cf/cfimplementation.py @@ -23,6 +23,7 @@ List, NodeCountProperties, PartNodeCountProperties, + Quantization, TiePointIndex, ) from .data import Data @@ -147,6 +148,7 @@ def set_construct(self, parent, construct, axes=None, copy=True, **kwargs): H5netcdfArray=H5netcdfArray, NetCDF4Array=NetCDF4Array, PointTopologyArray=PointTopologyArray, + Quantization=Quantization, RaggedContiguousArray=RaggedContiguousArray, RaggedIndexedArray=RaggedIndexedArray, RaggedIndexedContiguousArray=RaggedIndexedContiguousArray, @@ -203,6 +205,7 @@ def implementation(): 'H5netcdfArray': cf.data.array.h5netcdfarray.H5netcdfArray, 'NetCDF4Array': cf.data.array.netcdf4array.NetCDF4Array, 'PointTopologyArray': , + 'Quantization': cf.quantization.Quantization, 'RaggedContiguousArray': cf.data.array.raggedcontiguousarray.RaggedContiguousArray, 'RaggedIndexedArray': cf.data.array.raggedindexedarray.RaggedIndexedArray, 'RaggedIndexedContiguousArray': cf.data.array.raggedindexedcontiguousarray.RaggedIndexedContiguousArray, diff --git a/cf/field.py b/cf/field.py index c9eb0c19bd..c8b2bc9a48 100644 --- a/cf/field.py +++ b/cf/field.py @@ -22,6 +22,7 @@ Flags, Index, List, + Quantization, mixin, ) from .constants import masked as cf_masked @@ -280,7 +281,7 @@ def __new__(cls, *args, **kwargs): instance._Domain = Domain instance._DomainAncillary = DomainAncillary instance._DomainAxis = DomainAxis - # instance._Data = Data + instance._Quantization = Quantization 
instance._RaggedContiguousArray = RaggedContiguousArray instance._RaggedIndexedArray = RaggedIndexedArray instance._RaggedIndexedContiguousArray = RaggedIndexedContiguousArray diff --git a/cf/fieldancillary.py b/cf/fieldancillary.py index ca32eeda3c..6df6dcaa7a 100644 --- a/cf/fieldancillary.py +++ b/cf/fieldancillary.py @@ -1,7 +1,42 @@ import cfdm -from . import mixin +from . import Quantization, mixin class FieldAncillary(mixin.PropertiesData, cfdm.FieldAncillary): - pass + """A field ancillary construct of the CF data model. + + The field ancillary construct provides metadata which are + distributed over the same sampling domain as the field itself. For + example, if a data variable holds a variable retrieved from a + satellite instrument, a related ancillary data variable might + provide the uncertainty estimates for those retrievals (varying + over the same spatiotemporal domain). + + The field ancillary construct consists of an array of the + ancillary data, which is zero-dimensional or which depends on one + or more of the domain axes, and properties to describe the + data. It is assumed that the data do not depend on axes of the + domain which are not spanned by the array, along which the values + are implicitly propagated. CF-netCDF ancillary data variables + correspond to field ancillary constructs. Note that a field + ancillary construct is constrained by the domain definition of the + parent field construct but does not contribute to the domain's + definition, unlike, for instance, an auxiliary coordinate + construct or domain ancillary construct. + + **NetCDF interface** + + {{netCDF variable}} + + {{netCDF dataset chunks}} + + .. 
versionadded:: 2.0 + + """ + + def __new__(cls, *args, **kwargs): + """Store component classes.""" + instance = super().__new__(cls) + instance._Quantization = Quantization + return instance diff --git a/cf/quantization.py b/cf/quantization.py new file mode 100644 index 0000000000..26e2861ac6 --- /dev/null +++ b/cf/quantization.py @@ -0,0 +1,59 @@ +import cfdm + + +class Quantization(cfdm.Quantization): + """A quantization variable. + + A quantization variable describes a quantization algorithm via a + collection of parameters. + + The ``algorithm`` parameter names a specific quantization + algorithm via one of the keys in the `algorithm_parameters` + dictionary. + + The ``implementation`` parameter contains unstandardised text that + concisely conveys the algorithm provenance including the name of + the library or client that performed the quantization, the + software version, and any other information required to + disambiguate the source of the algorithm employed. The text must + take the form ``software-name version version-string + [(optional-information)]``. + + The retained precision of the algorithm is defined with either the + ``quantization_nsb`` or ``quantization_nsd`` parameter. + + For instance, the following parameters describe quantization via + the BitRound algorithm, retaining 6 significant bits, and + implemented by libnetcdf:: + + >>> q = {{package}}.{{class}}( + ... parameters={'algorithm': 'bitround', + ... 'quantization_nsb': 6, + ... 'implementation': 'libnetcdf version 4.9.4'} + ... ) + >>> q.parameters() + {'algorithm': 'bitround', + 'quantization_nsb': 6, + 'implementation': 'libnetcdf version 4.9.4'} + + See CF section 8.4. "Lossy Compression via Quantization". + + **NetCDF interface** + + {{netCDF variable}} + + {{netCDF group attributes}} + + .. versionadded:: NEXTVERSION + + """ + + def __repr__(self): + """Called by the `repr` built-in function. + + x.__repr__() <==> repr(x) + + .. 
versionadded:: NEXTVERSION + + """ + return super().__repr__().replace("<", "` for all of its array -manipulation and can: +The `cf` package can: * read :term:`field constructs ` and :term:`domain constructs ` from netCDF, CDL, PP and UM datasets @@ -120,7 +119,10 @@ manipulation and can: * apply differential operators to field constructs, -* create derived quantities (such as relative vorticity). +* create derived quantities (such as relative vorticity), + +* read and write that data that are quantized to eliminate false + precision. ---- @@ -146,9 +148,9 @@ of plotting possibilities with example code. **Performance** --------------- -As of version 3.14.0 (released 2023-01-31), cf uses :ref:`Dask -` for all of its data manipulations, which provides lazy, -parallelised, and out-of-core computations of array operations. +The `cf` package uses :ref:`Dask ` for all of its data +manipulations, which provides lazy, parallelised, and out-of-core +computations of array operations. ---- diff --git a/docs/source/spelling_false_positives.txt b/docs/source/spelling_false_positives.txt index 6d324f615e..6f7ff66d65 100644 --- a/docs/source/spelling_false_positives.txt +++ b/docs/source/spelling_false_positives.txt @@ -1,6 +1,7 @@ absoluted absolutised abspath +actify addattr aggregatable al @@ -293,6 +294,7 @@ lbproc lbtim le len +libnetcdf linebreaks Lineplot ln @@ -379,6 +381,8 @@ programmatically pseudolevels py pyplot +Quantization +quantization radd RaggedContiguousArray RaggedIndexedArray @@ -521,6 +525,7 @@ unmapped unselected Unselected unsqueeze +unstandardised url uri var @@ -534,8 +539,11 @@ Voronoi vorticity wi wo +xarray xor xy +Zarr +ZarrArray Zimmermann δu δv diff --git a/docs/source/tutorial.rst b/docs/source/tutorial.rst index c741aed4e1..4b6806b619 100644 --- a/docs/source/tutorial.rst +++ b/docs/source/tutorial.rst @@ -6692,16 +6692,201 @@ The content of the new file is: 4, 0, 5 ; } ----- - .. 
_Coordinate-subampling: Coordinate subsampling ^^^^^^^^^^^^^^^^^^^^^^ `Lossy compression by coordinate subsampling`_ was introduced into the -CF conventions at CF-1.9, but is not yet available in cfdm. It will be -ready in a future 3.x.0 release. +CF conventions at CF-1.10 for applications for which the coordinates +can require considerably more storage than the data itself. Space may +be saved in the netCDF file by storing a subsample of the coordinates +that describe the data, and the uncompressed coordinate and auxiliary +coordinate variables are reconstituted by interpolation, from the +subsampled coordinate values to the domain of the data + +This is illustrated with the file ``subsampled.nc`` (found in the +:ref:`sample datasets `): + + +.. code-block:: console + :caption: *Inspect the compressed dataset with the ncdump command + line tool.* + + $ ncdump -h subsampled.nc + netcdf subsampled { + dimensions: + time = 2 ; + lat = 18 ; + lon = 12 ; + tp_lat = 4 ; + tp_lon = 5 ; + variables: + float time(time) ; + time:standard_name = "time" ; + time:units = "days since 2000-01-01" ; + float lat(tp_lat, tp_lon) ; + lat:standard_name = "latitude" ; + lat:units = "degrees_north" ; + lat:bounds_tie_points = "lat_bounds" ; + float lon(tp_lat, tp_lon) ; + lon:standard_name = "longitude" ; + lon:units = "degrees_east" ; + lon:bounds_tie_points = "lon_bounds" ; + float lat_bounds(tp_lat, tp_lon) ; + float lon_bounds(tp_lat, tp_lon) ; + int lat_indices(tp_lat) ; + lat_indices:long_name = "Tie point indices for latitude dimension" ; + int lon_indices(tp_lon) ; + lon_indices:long_name = "Tie point indices for longitude dimension" ; + int bilinear ; + bilinear:interpolation_name = "bi_linear" ; + bilinear:computational_precision = "64" ; + bilinear:tie_point_mapping = + "lat: lat_indices tp_lat lon: lon_indices tp_lon" ; + float q(time, lat, lon) ; + q:standard_name = "specific_humidity" ; + q:units = "1" ; + q:coordinate_interpolation = "lat: lon: bilinear" ; + + // global 
attributes: + :Conventions = "CF-1.11" ; + } + + +Reading and inspecting this file shows the latitude and longitude +coordinates in uncompressed form, whilst their underlying arrays are +still in subsampled representation described in the file: + +.. code-block:: python + :caption: *Read a field construct from a dataset that has been + compressed by corodinate subsampling, and inspect + coordinates.* + + >>> f = cf.read('subsampled.nc')[0] + >>> print(f) + Field: specific_humidity (ncvar%q) + ---------------------------------- + Data : specific_humidity(time(2), ncdim%lat(18), ncdim%lon(12)) 1 + Dimension coords: time(2) = [2000-01-01 00:00:00, 2000-02-01 00:00:00] + Auxiliary coords: latitude(ncdim%lat(18), ncdim%lon(12)) = [[-85.0, ..., 85.0]] degrees_north + : longitude(ncdim%lat(18), ncdim%lon(12)) = [[15.0, ..., 345.0]] degrees_east + >>> lon = f.construct('longitude') + >>> lon + + >>> lon.data.source() + + >>> print(lon.array) + [[15.0 45.0 75.0 105.0 135.0 165.0 195.0 225.0 255.0 285.0 315.0 345.0] + [15.0 45.0 75.0 105.0 135.0 165.0 195.0 225.0 255.0 285.0 315.0 345.0] + [15.0 45.0 75.0 105.0 135.0 165.0 195.0 225.0 255.0 285.0 315.0 345.0] + [15.0 45.0 75.0 105.0 135.0 165.0 195.0 225.0 255.0 285.0 315.0 345.0] + [15.0 45.0 75.0 105.0 135.0 165.0 195.0 225.0 255.0 285.0 315.0 345.0] + [15.0 45.0 75.0 105.0 135.0 165.0 195.0 225.0 255.0 285.0 315.0 345.0] + [15.0 45.0 75.0 105.0 135.0 165.0 195.0 225.0 255.0 285.0 315.0 345.0] + [15.0 45.0 75.0 105.0 135.0 165.0 195.0 225.0 255.0 285.0 315.0 345.0] + [15.0 45.0 75.0 105.0 135.0 165.0 195.0 225.0 255.0 285.0 315.0 345.0] + [15.0 45.0 75.0 105.0 135.0 165.0 195.0 225.0 255.0 285.0 315.0 345.0] + [15.0 45.0 75.0 105.0 135.0 165.0 195.0 225.0 255.0 285.0 315.0 345.0] + [15.0 45.0 75.0 105.0 135.0 165.0 195.0 225.0 255.0 285.0 315.0 345.0] + [15.0 45.0 75.0 105.0 135.0 165.0 195.0 225.0 255.0 285.0 315.0 345.0] + [15.0 45.0 75.0 105.0 135.0 165.0 195.0 225.0 255.0 285.0 315.0 345.0] + [15.0 45.0 75.0 105.0 
135.0 165.0 195.0 225.0 255.0 285.0 315.0 345.0] + [15.0 45.0 75.0 105.0 135.0 165.0 195.0 225.0 255.0 285.0 315.0 345.0] + [15.0 45.0 75.0 105.0 135.0 165.0 195.0 225.0 255.0 285.0 315.0 345.0] + [15.0 45.0 75.0 105.0 135.0 165.0 195.0 225.0 255.0 285.0 315.0 345.0]] + >>> lon.data.source().source() + + >>> print(lon.data.source().source().array) + [[ 15. 135. 225. 255. 345.] + [ 15. 135. 225. 255. 345.] + [ 15. 135. 225. 255. 345.] + [ 15. 135. 225. 255. 345.]] + +As with all other forms of compression, the field may be treated as if +were not compressed: + +.. code-block:: python + :caption: *Get subspaces based on indices of the uncompressed + data.* + + >>> g = f[0, 6, :] + >>> print(g) + Field: specific_humidity (ncvar%q) + ---------------------------------- + Data : specific_humidity(time(1), ncdim%lat(1), ncdim%lon(12)) 1 + Dimension coords: time(1) = [2000-01-01 00:00:00] + Auxiliary coords: latitude(ncdim%lat(1), ncdim%lon(12)) = [[-25.0, ..., -25.0]] degrees_north + : longitude(ncdim%lat(1), ncdim%lon(12)) = [[15.0, ..., 345.0]] degrees_east + >>> print(g.construct('longitude').array) + [[15.0 45.0 75.0 105.0 135.0 165.0 195.0 225.0 255.0 285.0 315.0 345.0]] + + +The metadata that define the subsampling are contained within the +coordinate's `Data` object: + +.. code-block:: python + :caption: *Get subspaces based on indices of the uncompressed + data.* + + >>> lon = f.construct('longitude') + >>> d = lon.data.source() + >>> d.get_tie_point_indices() + {0: , + 1: } + >>> d.get_computational_precision() + '64' + +It is not yet, as of version 1.10.0.0, possible to write to disk a +field construct with compression by coordinate subsampling. + +.. 
_Lossy-compression-via-quantization: + +Lossy compression via quantization +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +`Lossy compression via quantization`_ eliminates false precision, +usually by rounding the least significant bits of floating-point +mantissas to zeros, so that a subsequent compression on disk is more +efficient. Quantization is described by the following parameters: + +* The ``algorithm`` parameter names a specific quantization algorithm. + +* The ``implementation`` parameter contains unstandardised text that + concisely conveys the algorithm provenance including the name of the + library or client that performed the quantization, the software + version, and any other information required to disambiguate the + source of the algorithm employed. The text must take the form + ``software-name version version-string [(optional-information)]``. + +* The retained precision of the algorithm is defined with either the + ``quantization_nsb`` or ``quantization_nsd`` parameter. + +If quantization has been applied to the data, then it may be described +in a `Quantization` object, accessed via the construct's +`!get_quantization` method. To apply quantization at the time of +writing the data to disk, use the construct's `!set_quantize_on_write` +method: + +.. 
code-block:: python + :caption: *Lossy compression via quantization.* + + >>> q, t = cf.read('file.nc') + >>> t.set_quantize_on_write(algorithm='bitgroom', quantization_nsd=1) + >>> cf.write(t, 'quantized.nc') + >>> quantized = cf.read('quantized.nc')[0] + >>> c = quantized.get_quantization() + >>> c + + >>> c.parameters() + {'algorithm': 'bitgroom', + 'implementation': 'libnetcdf version 4.9.4-development', + '_QuantizeBitGroomNumberOfSignificantDigits': np.int32(1), + 'quantization_nsd': np.int64(1)} + >>> t[0, 0, 0].array + array([[[262.8]]]) + >>> quantized[0, 0, 0].array + array([[[256.]]]) ---- diff --git a/requirements.txt b/requirements.txt index 9d875bb541..1bfcaec175 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,7 @@ netCDF4>=1.7.2 cftime>=1.6.4 numpy>=2.0.0 -cfdm>=1.12.1.0, <1.12.2.0 +cfdm>=1.12.2.0, <1.12.3.0 psutil>=0.6.0 cfunits>=3.3.7 dask>=2025.5.1 diff --git a/setup.py b/setup.py index 706d89af63..495dfa08a8 100755 --- a/setup.py +++ b/setup.py @@ -227,7 +227,10 @@ def compile(): * apply differential operators to field constructs, -* create derived quantities (such as relative vorticity). +* create derived quantities (such as relative vorticity), + +* read and write data that are quantized to eliminate false + precision. """