diff --git a/Changelog.rst b/Changelog.rst index b2a939d604..90108a6409 100644 --- a/Changelog.rst +++ b/Changelog.rst @@ -9,6 +9,8 @@ version NEXT to regrid the vertical axis in logarithmic coordinates to `cf.Field.regrids` and `cf.Field.regridc` (https://github.com/NCAS-CMS/cf-python/issues/715) +* Improve the performance of reading and accessing the data of PP and + UM fields files (https://github.com/NCAS-CMS/cf-python/issues/746) * Improve `cf.Field.collapse` performance by lazily computing reduced axis coordinates (https://github.com/NCAS-CMS/cf-python/issues/741) * Improve `cf.Field.__getitem__` performance by not re-calculating diff --git a/cf/data/array/umarray.py b/cf/data/array/umarray.py index ab5d0d857f..a5e24ddb74 100644 --- a/cf/data/array/umarray.py +++ b/cf/data/array/umarray.py @@ -8,7 +8,7 @@ load_stash2standard_name, parse_indices, ) -from ...umread_lib.umfile import File +from ...umread_lib.umfile import File, Rec from .abstract import Array from .mixin import FileArrayMixin @@ -272,13 +272,22 @@ def _get_rec(self, f, header_offset): The record container. """ - # TODOCFA: This method doesn't require data_offset and disk_length, - # so plays nicely with CFA. Is it fast enough that we can - # use this method always? - for v in f.vars: - for r in v.recs: - if r.hdr_offset == header_offset: - return r + return Rec.from_file_and_offsets(f, header_offset) + + # ------------------------------------------------------------ + # Leave the following commented code here for debugging + # purposes. Replacing the above line with this code moves the + # calculation of the data offset and disk length from pure + # Python to the C library, at the expense of completely + # parsing the file. Note: If you do replace the above line + # with the commented code, then you *must* also set + # 'parse=True' in the `open` method. 
+ # ------------------------------------------------------------ + + # for v in f.vars: + # for r in v.recs: + # if r.hdr_offset == header_offset: + # return r def _set_units(self, int_hdr): """The units and calendar properties. @@ -666,16 +675,18 @@ def get_word_size(self): return self._get_component("word_size", None) def open(self): - """Returns an open dataset containing the data array. + """Returns an open dataset and the address of the data. :Returns: - `umfile_lib.File`, `int` + `umread_lib.umfile.File`, `int` + The open file object, and the start address in bytes + of the lookup header. **Examples** >>> f.open() - (, 44567) + (, 4) """ return super().open( @@ -683,4 +694,5 @@ def open(self): byte_ordering=self.get_byte_ordering(), word_size=self.get_word_size(), fmt=self.get_fmt(), + parse=False, ) diff --git a/cf/field.py b/cf/field.py index 7efe404212..3f37b4843d 100644 --- a/cf/field.py +++ b/cf/field.py @@ -454,8 +454,11 @@ def __getitem__(self, indices): # below. if org_cyclic: new_cyclic = new_data.cyclic() - [new.cyclic(i, iscyclic=False) for i in org_cyclic if i not in new_cyclic] - + [ + new.cyclic(i, iscyclic=False) + for i in org_cyclic + if i not in new_cyclic + ] # ------------------------------------------------------------ # Subspace constructs with data diff --git a/cf/read_write/um/umread.py b/cf/read_write/um/umread.py index c073adb2fa..a409ba2bd6 100644 --- a/cf/read_write/um/umread.py +++ b/cf/read_write/um/umread.py @@ -2469,9 +2469,9 @@ def data_type_in_file(self, rec): if rec.int_hdr.item(lbuser2) == 3: # Boolean return np.dtype(bool) - else: - # Int or float - return rec.get_type_and_num_words()[0] + + # Int or float + return rec.get_type_and_num_words()[0] def printfdr(self, display=False): """Print out the contents of PP field headers. 
@@ -3439,7 +3439,7 @@ def read( else: byte_ordering = None - f = self.file_open(filename) + f = self.file_open(filename, parse=True) info = is_log_level_info(logger) @@ -3472,6 +3472,7 @@ def _open_um_file( fmt=None, word_size=None, byte_ordering=None, + parse=True, ): """Open a UM fields file or PP file. @@ -3480,10 +3481,18 @@ def _open_um_file( filename: `str` The file to be opened. + parse: `bool`, optional + If True, the default, then parse the contents. If + False then the contents are not parsed, which can be + considerably faster in cases when the contents are not + required. + + .. versionadded:: NEXTVERSION + :Returns: - `umread.umfile.File` - The opened file with an open file descriptor. + `umread_lib.umfile.File` + The open PP or FF file object. """ self.file_close() @@ -3493,6 +3502,7 @@ def _open_um_file( byte_ordering=byte_ordering, word_size=word_size, fmt=fmt, + parse=parse, ) except Exception as error: try: @@ -3527,7 +3537,9 @@ def is_um_file(self, filename): """ try: - self.file_open(filename) + # Note: No need to completely parse the file to ascertain + # if it's PP or FF. + self.file_open(filename, parse=False) except Exception: self.file_close() return False @@ -3549,7 +3561,7 @@ def file_close(self): self._um_file = None - def file_open(self, filename): + def file_open(self, filename, parse=True): """Open the file for reading. :Paramters: @@ -3557,8 +3569,19 @@ def file_open(self, filename): filename: `str` The file to be read. + parse: `bool`, optional + If True, the default, then parse the contents. If + False then the contents are not parsed, which can be + considerably faster in cases when the contents are not + required. + + .. versionadded:: NEXTVERSION + :Returns: + `umread_lib.umfile.File` + The open PP or FF file object. 
+ """ g = getattr(self, "read_vars", {}) @@ -3567,6 +3590,7 @@ def file_open(self, filename): byte_ordering=g.get("byte_ordering"), word_size=g.get("word_size"), fmt=g.get("fmt"), + parse=parse, ) diff --git a/cf/test/test_pp.py b/cf/test/test_pp.py index 9391a21288..0db5f9049e 100644 --- a/cf/test/test_pp.py +++ b/cf/test/test_pp.py @@ -112,9 +112,8 @@ def test_PP_WGDOS_UNPACKING(self): f = cf.read(self.ppfile)[0] - # TODO: reinstate "CFA4" at version>3.14 - for fmt in ("NETCDF4",): # "CFA4"): - cf.write(f, tmpfile, fmt=fmt) + for cfa in (False, True): + cf.write(f, tmpfile, cfa=cfa) g = cf.read(tmpfile)[0] self.assertTrue((f.array == array).all()) diff --git a/cf/test/test_read_write.py b/cf/test/test_read_write.py index 0eefa1b2ac..3347fee8ea 100644 --- a/cf/test/test_read_write.py +++ b/cf/test/test_read_write.py @@ -551,7 +551,7 @@ def test_write_netcdf_mode(self): def test_read_write_netCDF4_compress_shuffle(self): f = cf.read(self.filename)[0] - # TODO: reinstate "CFA4" at version > 3.14 + # TODODASK: reinstate "CFA4" at version > 3.14 for fmt in ("NETCDF4", "NETCDF4_CLASSIC"): # , "CFA4"): cf.write(f, tmpfile, fmt=fmt, compress=1, shuffle=True) g = cf.read(tmpfile)[0] @@ -920,6 +920,9 @@ def test_write_omit_data(self): self.assertFalse(g.array.count()) self.assertTrue(g.construct("grid_latitude").array.count()) + @unittest.skipUnless( + False, "URL TEST: UNRELIABLE FLAKEY URL DESTINATION. 
TODO REPLACE URL" + ) def test_read_url(self): """Test reading urls.""" for scheme in ("http", "https"): diff --git a/cf/umread_lib/umfile.py b/cf/umread_lib/umfile.py index b16aadb682..94fbb33043 100644 --- a/cf/umread_lib/umfile.py +++ b/cf/umread_lib/umfile.py @@ -11,6 +11,12 @@ class UMFileException(Exception): pass +# Lookup header pointers +LBLREC = 14 # Length of data record (including any extra data) +LBPACK = 20 # Packing method indicator +LBEGIN = 28 # Disk address/Start Record + + class File: """A class for a UM file that gives a view of the file including sets of PP records combined into variables.""" @@ -34,7 +40,7 @@ def __init__( 'little_endian' or 'big_endian' word_size: `int`, optional - 4 or 8 + The size in bytes of one word. Either ``4`` or ``8``. fmt: `str`, optional 'FF' or 'PP' @@ -281,12 +287,14 @@ def __init__( self.file = file @classmethod - def from_file_and_offsets(cls, file, hdr_offset, data_offset, disk_length): + def from_file_and_offsets( + cls, file, hdr_offset, data_offset=None, disk_length=None + ): """Instantiate a `Rec` object from the `File` object and the header and data offsets. - The headers are read in, and also the record object is ready for - calling `get_data`. + The lookup header is read from disk immediately, and the + returned record object is ready for calling `get_data`. :Parameters: @@ -295,13 +303,17 @@ def from_file_and_offsets(cls, file, hdr_offset, data_offset, disk_length): into variables. hdr_offset: `int` - The start word in the file of the header. + The file start address of the header, in bytes. - data_offset: `int` - The start word in the file of the data. + data_offset: `int`, optional + The file start address of the data, in bytes. If + `None`, the default, then the data offset will be + calculated from the integer header. disk_length: `int` - The length in words of the data in the file. + The length in bytes of the data in the file. 
If + `None`, the default, then the disk length will be + calculated from the integer header. :Returns: @@ -309,12 +321,38 @@ def from_file_and_offsets(cls, file, hdr_offset, data_offset, disk_length): """ c = file._c_interface + word_size = file.word_size int_hdr, real_hdr = c.read_header( - file.fd, hdr_offset, file.byte_ordering, file.word_size + file.fd, hdr_offset, file.byte_ordering, word_size ) + if data_offset is None: + # Calculate the data offset from the integer header + if file.fmt == "PP": + # We only support 64-word headers, so the data starts + # 66 words after the header_offset, i.e. after 64 + # words of the header, plus 2 block control words. + data_offset = hdr_offset + 66 * word_size + else: + # Fields file + data_offset = int_hdr[LBEGIN] * word_size + + if disk_length is None: + # Calculate the disk length from the integer header + disk_length = int_hdr[LBLREC] + if int_hdr[LBPACK] % 10 == 2: + # Cray 32-bit packing + disk_length = disk_length * 4 + else: + disk_length = disk_length * word_size + return cls( - int_hdr, real_hdr, hdr_offset, data_offset, disk_length, file=file + int_hdr, + real_hdr, + hdr_offset, + data_offset, + disk_length, + file=file, ) def read_extra_data(self): @@ -325,8 +363,8 @@ def read_extra_data(self): `numpy.ndarray` """ - c = self.file._c_interface file = self.file + c = file._c_interface ( extra_data_offset, @@ -389,9 +427,10 @@ def get_data(self): `numpy.ndarray` """ - c = self.file._c_interface file = self.file - data_type, nwords = c.get_type_and_num_words(self.int_hdr) + c = file._c_interface + int_hdr = self.int_hdr + data_type, nwords = c.get_type_and_num_words(int_hdr) return c.read_record_data( file.fd, @@ -399,7 +438,7 @@ def get_data(self): self.disk_length, file.byte_ordering, file.word_size, - self.int_hdr, + int_hdr, self.real_hdr, data_type, nwords,