Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions Changelog.rst
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@ version NEXT
to regrid the vertical axis in logarithmic coordinates to
`cf.Field.regrids` and `cf.Field.regridc`
(https://github.com/NCAS-CMS/cf-python/issues/715)
* Improve the performance of reading and accessing the data of PP and
UM fields files (https://github.com/NCAS-CMS/cf-python/issues/746)
* Improve `cf.Field.collapse` performance by lazily computing reduced
axis coordinates (https://github.com/NCAS-CMS/cf-python/issues/741)
* Improve `cf.Field.__getitem__` performance by not re-calculating
Expand Down
34 changes: 23 additions & 11 deletions cf/data/array/umarray.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
load_stash2standard_name,
parse_indices,
)
from ...umread_lib.umfile import File
from ...umread_lib.umfile import File, Rec
from .abstract import Array
from .mixin import FileArrayMixin

Expand Down Expand Up @@ -272,13 +272,22 @@ def _get_rec(self, f, header_offset):
The record container.

"""
# TODOCFA: This method doesn't require data_offset and disk_length,
# so plays nicely with CFA. Is it fast enough that we can
# use this method always?
for v in f.vars:
for r in v.recs:
if r.hdr_offset == header_offset:
return r
return Rec.from_file_and_offsets(f, header_offset)

# ------------------------------------------------------------
# Leave the following commented code here for debugging
# purposes. Replacing the above line with this code moves the
# calculation of the data offset and disk length from pure
# Python to the C library, at the expense of completely
# parsing the file. Note: If you do replace the above line
# with the commented code, then you *must* also set
# 'parse=True' in the `open` method.
# ------------------------------------------------------------

# for v in f.vars:
# for r in v.recs:
# if r.hdr_offset == header_offset:
# return r

def _set_units(self, int_hdr):
"""The units and calendar properties.
Expand Down Expand Up @@ -666,21 +675,24 @@ def get_word_size(self):
return self._get_component("word_size", None)

def open(self):
"""Returns an open dataset containing the data array.
"""Returns an open dataset and the address of the data.

:Returns:

`umfile_lib.File`, `int`
`umread_lib.umfile.File`, `int`
The open file object, and the start address in bytes
of the lookup header.

**Examples**

>>> f.open()
(<cf.umread_lib.umfile.File object at 0x7fdc25056380>, 44567)
(<cf.umread_lib.umfile.File object at 0x7fdc25056380>, 4)

"""
return super().open(
File,
byte_ordering=self.get_byte_ordering(),
word_size=self.get_word_size(),
fmt=self.get_fmt(),
parse=False,
)
7 changes: 5 additions & 2 deletions cf/field.py
Original file line number Diff line number Diff line change
Expand Up @@ -454,8 +454,11 @@ def __getitem__(self, indices):
# below.
if org_cyclic:
new_cyclic = new_data.cyclic()
[new.cyclic(i, iscyclic=False) for i in org_cyclic if i not in new_cyclic]

[
new.cyclic(i, iscyclic=False)
for i in org_cyclic
if i not in new_cyclic
]

# ------------------------------------------------------------
# Subspace constructs with data
Expand Down
40 changes: 32 additions & 8 deletions cf/read_write/um/umread.py
Original file line number Diff line number Diff line change
Expand Up @@ -2469,9 +2469,9 @@ def data_type_in_file(self, rec):
if rec.int_hdr.item(lbuser2) == 3:
# Boolean
return np.dtype(bool)
else:
# Int or float
return rec.get_type_and_num_words()[0]

# Int or float
return rec.get_type_and_num_words()[0]

def printfdr(self, display=False):
"""Print out the contents of PP field headers.
Expand Down Expand Up @@ -3439,7 +3439,7 @@ def read(
else:
byte_ordering = None

f = self.file_open(filename)
f = self.file_open(filename, parse=True)

info = is_log_level_info(logger)

Expand Down Expand Up @@ -3472,6 +3472,7 @@ def _open_um_file(
fmt=None,
word_size=None,
byte_ordering=None,
parse=True,
):
"""Open a UM fields file or PP file.

Expand All @@ -3480,10 +3481,18 @@ def _open_um_file(
filename: `str`
The file to be opened.

parse: `bool`, optional
If True, the default, then parse the contents. If
False then the contents are not parsed, which can be
considerably faster in cases when the contents are not
required.

.. versionadded:: NEXTVERSION

:Returns:

`umread.umfile.File`
The opened file with an open file descriptor.
`umread_lib.umfile.File`
The open PP or FF file object.

"""
self.file_close()
Expand All @@ -3493,6 +3502,7 @@ def _open_um_file(
byte_ordering=byte_ordering,
word_size=word_size,
fmt=fmt,
parse=parse,
)
except Exception as error:
try:
Expand Down Expand Up @@ -3527,7 +3537,9 @@ def is_um_file(self, filename):

"""
try:
self.file_open(filename)
# Note: No need to completely parse the file to ascertain
# if it's PP or FF.
self.file_open(filename, parse=False)
except Exception:
self.file_close()
return False
Expand All @@ -3549,16 +3561,27 @@ def file_close(self):

self._um_file = None

def file_open(self, filename):
def file_open(self, filename, parse=True):
"""Open the file for reading.

:Parameters:

filename: `str`
The file to be read.

parse: `bool`, optional
If True, the default, then parse the contents. If
False then the contents are not parsed, which can be
considerably faster in cases when the contents are not
required.

.. versionadded:: NEXTVERSION

:Returns:

`umread_lib.umfile.File`
The open PP or FF file object.

"""
g = getattr(self, "read_vars", {})

Expand All @@ -3567,6 +3590,7 @@ def file_open(self, filename):
byte_ordering=g.get("byte_ordering"),
word_size=g.get("word_size"),
fmt=g.get("fmt"),
parse=parse,
)


Expand Down
5 changes: 2 additions & 3 deletions cf/test/test_pp.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,9 +112,8 @@ def test_PP_WGDOS_UNPACKING(self):

f = cf.read(self.ppfile)[0]

# TODO: reinstate "CFA4" at version>3.14
for fmt in ("NETCDF4",): # "CFA4"):
cf.write(f, tmpfile, fmt=fmt)
for cfa in (False, True):
cf.write(f, tmpfile, cfa=cfa)
g = cf.read(tmpfile)[0]

self.assertTrue((f.array == array).all())
Expand Down
5 changes: 4 additions & 1 deletion cf/test/test_read_write.py
Original file line number Diff line number Diff line change
Expand Up @@ -551,7 +551,7 @@ def test_write_netcdf_mode(self):

def test_read_write_netCDF4_compress_shuffle(self):
f = cf.read(self.filename)[0]
# TODO: reinstate "CFA4" at version > 3.14
# TODODASK: reinstate "CFA4" at version > 3.14
for fmt in ("NETCDF4", "NETCDF4_CLASSIC"): # , "CFA4"):
cf.write(f, tmpfile, fmt=fmt, compress=1, shuffle=True)
g = cf.read(tmpfile)[0]
Expand Down Expand Up @@ -920,6 +920,9 @@ def test_write_omit_data(self):
self.assertFalse(g.array.count())
self.assertTrue(g.construct("grid_latitude").array.count())

@unittest.skipUnless(
False, "URL TEST: UNRELIABLE FLAKEY URL DESTINATION. TODO REPLACE URL"
)
def test_read_url(self):
"""Test reading urls."""
for scheme in ("http", "https"):
Expand Down
67 changes: 53 additions & 14 deletions cf/umread_lib/umfile.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,12 @@ class UMFileException(Exception):
pass


# Lookup header pointers
LBLREC = 14 # Length of data record (including any extra data)
LBPACK = 20 # Packing method indicator
LBEGIN = 28 # Disk address/Start Record


class File:
"""A class for a UM file that gives a view of the file including
sets of PP records combined into variables."""
Expand All @@ -34,7 +40,7 @@ def __init__(
'little_endian' or 'big_endian'

word_size: `int`, optional
4 or 8
The size in bytes of one word. Either ``4`` or ``8``.

fmt: `str`, optional
'FF' or 'PP'
Expand Down Expand Up @@ -281,12 +287,14 @@ def __init__(
self.file = file

@classmethod
def from_file_and_offsets(cls, file, hdr_offset, data_offset, disk_length):
def from_file_and_offsets(
cls, file, hdr_offset, data_offset=None, disk_length=None
):
"""Instantiate a `Rec` object from the `File` object and the
header and data offsets.

The headers are read in, and also the record object is ready for
calling `get_data`.
The lookup header is read from disk immediately, and the
returned record object is ready for calling `get_data`.

:Parameters:

Expand All @@ -295,26 +303,56 @@ def from_file_and_offsets(cls, file, hdr_offset, data_offset, disk_length):
into variables.

hdr_offset: `int`
The start word in the file of the header.
The file start address of the header, in bytes.

data_offset: `int`
The start word in the file of the data.
data_offset: `int`, optional
The file start address of the data, in bytes. If
`None`, the default, then the data offset will be
calculated from the integer header.

disk_length: `int`, optional
The length in words of the data in the file.
The length in bytes of the data in the file. If
`None`, the default, then the disk length will be
calculated from the integer header.

:Returns:

`Rec`

"""
c = file._c_interface
word_size = file.word_size
int_hdr, real_hdr = c.read_header(
file.fd, hdr_offset, file.byte_ordering, file.word_size
file.fd, hdr_offset, file.byte_ordering, word_size
)

if data_offset is None:
# Calculate the data offset from the integer header
if file.fmt == "PP":
# We only support 64-word headers, so the data starts
# 66 words after the header_offset, i.e. after 64
# words of the header, plus 2 block control words.
data_offset = hdr_offset + 66 * word_size
else:
# Fields file
data_offset = int_hdr[LBEGIN] * word_size

if disk_length is None:
# Calculate the disk length from the integer header
disk_length = int_hdr[LBLREC]
if int_hdr[LBPACK] % 10 == 2:
# Cray 32-bit packing
disk_length = disk_length * 4
else:
disk_length = disk_length * word_size

return cls(
int_hdr, real_hdr, hdr_offset, data_offset, disk_length, file=file
int_hdr,
real_hdr,
hdr_offset,
data_offset,
disk_length,
file=file,
)

def read_extra_data(self):
Expand All @@ -325,8 +363,8 @@ def read_extra_data(self):
`numpy.ndarray`

"""
c = self.file._c_interface
file = self.file
c = file._c_interface

(
extra_data_offset,
Expand Down Expand Up @@ -389,17 +427,18 @@ def get_data(self):
`numpy.ndarray`

"""
c = self.file._c_interface
file = self.file
data_type, nwords = c.get_type_and_num_words(self.int_hdr)
c = file._c_interface
int_hdr = self.int_hdr
data_type, nwords = c.get_type_and_num_words(int_hdr)

return c.read_record_data(
file.fd,
self.data_offset,
self.disk_length,
file.byte_ordering,
file.word_size,
self.int_hdr,
int_hdr,
self.real_hdr,
data_type,
nwords,
Expand Down