66
77import logging
88import warnings
9+ from collections import defaultdict
910from itertools import chain
1011from typing import TYPE_CHECKING , Any , Literal , Optional
1112
@@ -982,6 +983,65 @@ def isel(
982983 selected_dataset = ds .isel (** indexers )
983984 return self .__class__ .from_dataset (selected_dataset )
984985
986+ def _resample_by_dimension_groups (
987+ self ,
988+ time_dataset : xr .Dataset ,
989+ time : str ,
990+ method : str ,
991+ ** kwargs : Any ,
992+ ) -> xr .Dataset :
993+ """
994+ Resample variables grouped by their dimension structure to avoid broadcasting.
995+
996+ This method groups variables by their non-time dimensions before resampling,
997+ which provides two key benefits:
998+
999+ 1. **Performance**: Resampling many variables with the same dimensions together
1000+ is significantly faster than resampling each variable individually.
1001+
1002+ 2. **Safety**: Prevents xarray from broadcasting variables with different
1003+ dimensions into a larger dimensional space filled with NaNs, which would
1004+ cause memory bloat and computational inefficiency.
1005+
1006+ Example:
1007+ Without grouping (problematic):
1008+ var1: (time, location, tech) shape (8000, 10, 2)
1009+ var2: (time, region) shape (8000, 5)
1010+ concat → (variable, time, location, tech, region) ← Unwanted broadcasting!
1011+
1012+ With grouping (safe and fast):
1013+ Group 1: [var1, var3, ...] with dims (time, location, tech)
1014+ Group 2: [var2, var4, ...] with dims (time, region)
1015+ Each group resampled separately → No broadcasting, optimal performance!
1016+
1017+ Args:
1018+ time_dataset: Dataset containing only variables with time dimension
1019+ time: Resampling frequency (e.g., '2h', '1D', '1M')
1020+ method: Resampling method name (e.g., 'mean', 'sum', 'first')
1021+ **kwargs: Additional arguments passed to xarray.resample()
1022+
1023+ Returns:
1024+ Resampled dataset with original dimension structure preserved
1025+ """
1026+ # Group variables by dimensions (excluding time)
1027+ dim_groups = defaultdict (list )
1028+ for var_name , var in time_dataset .data_vars .items ():
1029+ dims_key = tuple (sorted (d for d in var .dims if d != 'time' ))
1030+ dim_groups [dims_key ].append (var_name )
1031+
1032+ # Handle empty case: no time-dependent variables
1033+ if not dim_groups :
1034+ return getattr (time_dataset .resample (time = time , ** kwargs ), method )()
1035+
1036+ # Resample each group separately
1037+ resampled_groups = []
1038+ for var_names in dim_groups .values ():
1039+ grouped_dataset = time_dataset [var_names ]
1040+ resampled_group = getattr (grouped_dataset .resample (time = time , ** kwargs ), method )()
1041+ resampled_groups .append (resampled_group )
1042+
1043+ return xr .merge (resampled_groups )
1044+
9851045 def resample (
9861046 self ,
9871047 time : str ,
@@ -1007,34 +1067,28 @@ def resample(
10071067 if not self .connected_and_transformed :
10081068 self .connect_and_transform ()
10091069
1010- dataset = self .to_dataset ()
1070+ # Validate method before resampling
1071+ available_methods = ['mean' , 'sum' , 'max' , 'min' , 'first' , 'last' , 'std' , 'var' , 'median' , 'count' ]
1072+ if method not in available_methods :
1073+ raise ValueError (f'Unsupported resampling method: { method } . Available: { available_methods } ' )
10111074
1012- # Separate variables with and without time dimension
1013- time_vars = {}
1014- non_time_vars = {}
1075+ dataset = self .to_dataset ()
10151076
1016- for var_name , var in dataset .data_vars .items ():
1017- if 'time' in var .dims :
1018- time_vars [var_name ] = var
1019- else :
1020- non_time_vars [var_name ] = var
1077+ time_var_names = [v for v in dataset .data_vars if 'time' in dataset [v ].dims ]
1078+ non_time_var_names = [v for v in dataset .data_vars if v not in time_var_names ]
10211079
10221080 # Only resample variables that have time dimension
1023- time_dataset = dataset [list (time_vars .keys ())]
1024- resampler = time_dataset .resample (time = time , ** kwargs )
1081+ time_dataset = dataset [time_var_names ]
10251082
1026- if hasattr (resampler , method ):
1027- resampled_time_data = getattr (resampler , method )()
1028- else :
1029- available_methods = ['mean' , 'sum' , 'max' , 'min' , 'first' , 'last' , 'std' , 'var' , 'median' , 'count' ]
1030- raise ValueError (f'Unsupported resampling method: { method } . Available: { available_methods } ' )
1083+ # Resample with dimension grouping to avoid broadcasting
1084+ resampled_time_dataset = self ._resample_by_dimension_groups (time_dataset , time , method , ** kwargs )
10311085
10321086 # Combine resampled time variables with non-time variables
1033- if non_time_vars :
1034- non_time_dataset = dataset [list ( non_time_vars . keys ()) ]
1035- resampled_dataset = xr .merge ([resampled_time_data , non_time_dataset ])
1087+ if non_time_var_names :
1088+ non_time_dataset = dataset [non_time_var_names ]
1089+ resampled_dataset = xr .merge ([resampled_time_dataset , non_time_dataset ])
10361090 else :
1037- resampled_dataset = resampled_time_data
1091+ resampled_dataset = resampled_time_dataset
10381092
10391093 # Let FlowSystem recalculate or use explicitly set value
10401094 resampled_dataset .attrs ['hours_of_last_timestep' ] = hours_of_last_timestep
0 commit comments