Skip to content

Commit 87bd422

Browse files
authored
Feature/speed up resample (#455)
1 parent 67e13a5 commit 87bd422

2 files changed

Lines changed: 75 additions & 20 deletions

File tree

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,7 @@ If upgrading from v2.x, see the [v3.0.0 release notes](https://github.com/flixOp
6060
### 💥 Breaking Changes
6161
6262
### ♻️ Changed
63+
- Greatly sped up the resampling of a FlowSystem (x20 - x40) by converting to dataarray internally
6364
6465
### 🗑️ Deprecated
6566

flixopt/flow_system.py

Lines changed: 74 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66

77
import logging
88
import warnings
9+
from collections import defaultdict
910
from itertools import chain
1011
from typing import TYPE_CHECKING, Any, Literal, Optional
1112

@@ -982,6 +983,65 @@ def isel(
982983
selected_dataset = ds.isel(**indexers)
983984
return self.__class__.from_dataset(selected_dataset)
984985

986+
def _resample_by_dimension_groups(
987+
self,
988+
time_dataset: xr.Dataset,
989+
time: str,
990+
method: str,
991+
**kwargs: Any,
992+
) -> xr.Dataset:
993+
"""
994+
Resample variables grouped by their dimension structure to avoid broadcasting.
995+
996+
This method groups variables by their non-time dimensions before resampling,
997+
which provides two key benefits:
998+
999+
1. **Performance**: Resampling many variables with the same dimensions together
1000+
is significantly faster than resampling each variable individually.
1001+
1002+
2. **Safety**: Prevents xarray from broadcasting variables with different
1003+
dimensions into a larger dimensional space filled with NaNs, which would
1004+
cause memory bloat and computational inefficiency.
1005+
1006+
Example:
1007+
Without grouping (problematic):
1008+
var1: (time, location, tech) shape (8000, 10, 2)
1009+
var2: (time, region) shape (8000, 5)
1010+
concat → (variable, time, location, tech, region) ← Unwanted broadcasting!
1011+
1012+
With grouping (safe and fast):
1013+
Group 1: [var1, var3, ...] with dims (time, location, tech)
1014+
Group 2: [var2, var4, ...] with dims (time, region)
1015+
Each group resampled separately → No broadcasting, optimal performance!
1016+
1017+
Args:
1018+
time_dataset: Dataset containing only variables with time dimension
1019+
time: Resampling frequency (e.g., '2h', '1D', '1M')
1020+
method: Resampling method name (e.g., 'mean', 'sum', 'first')
1021+
**kwargs: Additional arguments passed to xarray.resample()
1022+
1023+
Returns:
1024+
Resampled dataset with original dimension structure preserved
1025+
"""
1026+
# Group variables by dimensions (excluding time)
1027+
dim_groups = defaultdict(list)
1028+
for var_name, var in time_dataset.data_vars.items():
1029+
dims_key = tuple(sorted(d for d in var.dims if d != 'time'))
1030+
dim_groups[dims_key].append(var_name)
1031+
1032+
# Handle empty case: no time-dependent variables
1033+
if not dim_groups:
1034+
return getattr(time_dataset.resample(time=time, **kwargs), method)()
1035+
1036+
# Resample each group separately
1037+
resampled_groups = []
1038+
for var_names in dim_groups.values():
1039+
grouped_dataset = time_dataset[var_names]
1040+
resampled_group = getattr(grouped_dataset.resample(time=time, **kwargs), method)()
1041+
resampled_groups.append(resampled_group)
1042+
1043+
return xr.merge(resampled_groups)
1044+
9851045
def resample(
9861046
self,
9871047
time: str,
@@ -1007,34 +1067,28 @@ def resample(
10071067
if not self.connected_and_transformed:
10081068
self.connect_and_transform()
10091069

1010-
dataset = self.to_dataset()
1070+
# Validate method before resampling
1071+
available_methods = ['mean', 'sum', 'max', 'min', 'first', 'last', 'std', 'var', 'median', 'count']
1072+
if method not in available_methods:
1073+
raise ValueError(f'Unsupported resampling method: {method}. Available: {available_methods}')
10111074

1012-
# Separate variables with and without time dimension
1013-
time_vars = {}
1014-
non_time_vars = {}
1075+
dataset = self.to_dataset()
10151076

1016-
for var_name, var in dataset.data_vars.items():
1017-
if 'time' in var.dims:
1018-
time_vars[var_name] = var
1019-
else:
1020-
non_time_vars[var_name] = var
1077+
time_var_names = [v for v in dataset.data_vars if 'time' in dataset[v].dims]
1078+
non_time_var_names = [v for v in dataset.data_vars if v not in time_var_names]
10211079

10221080
# Only resample variables that have time dimension
1023-
time_dataset = dataset[list(time_vars.keys())]
1024-
resampler = time_dataset.resample(time=time, **kwargs)
1081+
time_dataset = dataset[time_var_names]
10251082

1026-
if hasattr(resampler, method):
1027-
resampled_time_data = getattr(resampler, method)()
1028-
else:
1029-
available_methods = ['mean', 'sum', 'max', 'min', 'first', 'last', 'std', 'var', 'median', 'count']
1030-
raise ValueError(f'Unsupported resampling method: {method}. Available: {available_methods}')
1083+
# Resample with dimension grouping to avoid broadcasting
1084+
resampled_time_dataset = self._resample_by_dimension_groups(time_dataset, time, method, **kwargs)
10311085

10321086
# Combine resampled time variables with non-time variables
1033-
if non_time_vars:
1034-
non_time_dataset = dataset[list(non_time_vars.keys())]
1035-
resampled_dataset = xr.merge([resampled_time_data, non_time_dataset])
1087+
if non_time_var_names:
1088+
non_time_dataset = dataset[non_time_var_names]
1089+
resampled_dataset = xr.merge([resampled_time_dataset, non_time_dataset])
10361090
else:
1037-
resampled_dataset = resampled_time_data
1091+
resampled_dataset = resampled_time_dataset
10381092

10391093
# Let FlowSystem recalculate or use explicitly set value
10401094
resampled_dataset.attrs['hours_of_last_timestep'] = hours_of_last_timestep

0 commit comments

Comments
 (0)