diff --git a/flixopt/transform_accessor.py b/flixopt/transform_accessor.py index 46e0e4c6a..9274c41f0 100644 --- a/flixopt/transform_accessor.py +++ b/flixopt/transform_accessor.py @@ -1694,6 +1694,16 @@ def cluster( else: ds_for_clustering = ds + # Validate user-provided weight keys against the selected clustering input + if cluster is not None and cluster.weights is not None: + selected_vars = set(ds_for_clustering.data_vars) + unknown = sorted(set(cluster.weights) - selected_vars) + if unknown: + raise ValueError( + f'ClusterConfig weights reference unknown variables: {unknown}. ' + f'Available variables can be found via transform.clustering_data().' + ) + # Filter constant arrays once on the full dataset (not per slice) # This ensures all slices have the same variables for consistent metrics ds_for_clustering = drop_constant_arrays(ds_for_clustering, dim='time') diff --git a/tests/test_clustering/test_integration.py b/tests/test_clustering/test_integration.py index 622df94f8..f5d23c691 100644 --- a/tests/test_clustering/test_integration.py +++ b/tests/test_clustering/test_integration.py @@ -335,13 +335,12 @@ def test_tsam_kwargs_passthrough(self, basic_flow_system): ) assert len(fs_clustered.clusters) == 2 - def test_extra_weight_keys_filtered(self, basic_flow_system): - """Test that extra keys in ClusterConfig.weights are filtered out. + def test_unknown_weight_keys_raise(self, basic_flow_system): + """Test that unknown keys in ClusterConfig.weights raise ValueError. - Regression test: tsam raises errors when weights contain keys not present - in the clustering DataFrame. Extra keys can arise when constant columns - are dropped before clustering, or when the user specifies weights for - variables not in the FlowSystem. + Regression test: weight keys that don't match any variable in the + FlowSystem are likely typos and should be caught early with a clear + error message. """ from tsam import ClusterConfig @@ -354,13 +353,36 @@ def test_extra_weight_keys_filtered(self, basic_flow_system): weights['nonexistent_variable'] = 0.5 weights['another_missing_col'] = 0.3 - # Must not raise despite extra weight keys - fs_clustered = basic_flow_system.transform.cluster( - n_clusters=2, - cluster_duration='1D', - cluster=ClusterConfig(weights=weights), - ) - assert len(fs_clustered.clusters) == 2 + with pytest.raises(ValueError, match='unknown variables'): + basic_flow_system.transform.cluster( + n_clusters=2, + cluster_duration='1D', + cluster=ClusterConfig(weights=weights), + ) + + def test_weight_keys_excluded_by_data_vars_raise(self, basic_flow_system): + """Test that weight keys excluded by the data_vars allow-list raise ValueError. + + A variable may exist on the FlowSystem but be intentionally omitted from + the clustering input via data_vars. Weights referencing such excluded + variables should be rejected. + """ + from tsam import ClusterConfig + + ds = basic_flow_system.to_dataset(include_solution=False) + clustering_columns = list(basic_flow_system.transform.clustering_data().data_vars) + excluded_var = sorted(set(ds.data_vars) - set(clustering_columns))[0] + + # Weight references both a selected var and an excluded var + weights = {clustering_columns[0]: 1.0, excluded_var: 0.5} + + with pytest.raises(ValueError, match='unknown variables'): + basic_flow_system.transform.cluster( + n_clusters=2, + cluster_duration='1D', + data_vars=clustering_columns, + cluster=ClusterConfig(weights=weights), + ) def test_extra_weight_keys_filtered_with_constant_column(self): """Test that weights for constant (dropped) columns are filtered out. @@ -398,16 +420,29 @@ def test_extra_weight_keys_filtered_with_constant_column(self): constant_sink = Sink('constant_load', inputs=[constant_flow]) fs.add_elements(source, sink, constant_sink, bus) - # The constant column name (find it from clustering_data) - all_data = fs.transform.clustering_data() + # Use to_dataset() to get ALL columns including the constant one + # (clustering_data() already strips constants, so it wouldn't test the path) + all_data = fs.to_dataset(include_solution=False) all_columns = set(all_data.data_vars) - - # Build weights that reference ALL columns (including the constant one - # that will be dropped) plus an extra nonexistent one + clustering_columns = set(fs.transform.clustering_data().data_vars) + + # Identify constant columns: variables with a single unique value across time + constant_columns = set() + for name in all_data.data_vars: + var = all_data[name] + if 'time' not in var.dims or np.nanmax(var.values) - np.nanmin(var.values) < 1e-10: + constant_columns.add(name) + + assert len(constant_columns) > 0, 'Test requires at least one constant column' + assert constant_columns <= all_columns, 'Constant columns must be in the full dataset' + for col in constant_columns: + assert col not in clustering_columns, f'Constant column {col!r} should not be in clustering_data()' + + # Build weights that reference ALL columns including the constant one + # that will be dropped — these are valid variables, just constant over time weights = {col: 1.0 for col in all_columns} - weights['totally_fake_column'] = 0.5 - # Before the fix, this would raise in tsam due to extra weight keys + # Must not raise: constant columns are silently filtered, not rejected fs_clustered = fs.transform.cluster( n_clusters=2, cluster_duration='1D', @@ -415,11 +450,49 @@ def test_extra_weight_keys_filtered_with_constant_column(self): ) assert len(fs_clustered.clusters) == 2 - def test_extra_weight_keys_filtered_multiperiod(self): - """Test that extra weight keys are filtered in multi-period clustering. + def test_unknown_weight_keys_raise_multiperiod(self): + """Test that unknown weight keys raise ValueError in multi-period clustering.""" + pytest.importorskip('tsam') + from tsam import ClusterConfig - Each period is clustered independently; weights must be filtered per - slice so no extra keys leak through to tsam. + from flixopt import Bus, Flow, Sink, Source + from flixopt.core import TimeSeriesData + + n_hours = 168 # 7 days + fs = FlowSystem( + timesteps=pd.date_range('2024-01-01', periods=n_hours, freq='h'), + periods=pd.Index([2025, 2030], name='period'), + ) + + demand_data = np.sin(np.linspace(0, 14 * np.pi, n_hours)) + 2 + bus = Bus('electricity') + grid_flow = Flow('grid_in', bus='electricity', size=100) + demand_flow = Flow( + 'demand_out', + bus='electricity', + size=100, + fixed_relative_profile=TimeSeriesData(demand_data / 100), + ) + source = Source('grid', outputs=[grid_flow]) + sink = Sink('demand', inputs=[demand_flow]) + fs.add_elements(source, sink, bus) + + clustering_data = fs.transform.clustering_data() + weights = {col: 1.0 for col in clustering_data.data_vars} + weights['nonexistent_period_var'] = 0.7 + + with pytest.raises(ValueError, match='unknown variables'): + fs.transform.cluster( + n_clusters=2, + cluster_duration='1D', + cluster=ClusterConfig(weights=weights), + ) + + def test_valid_weight_keys_multiperiod(self): + """Test that valid weight keys work in multi-period clustering. + + Each period is clustered independently; weights for valid columns + must be filtered per slice so no extra keys leak through to tsam. """ pytest.importorskip('tsam') from tsam import ClusterConfig @@ -446,10 +519,8 @@ def test_extra_weight_keys_filtered_multiperiod(self): sink = Sink('demand', inputs=[demand_flow]) fs.add_elements(source, sink, bus) - # Weights with extra keys that don't exist in any period slice clustering_data = fs.transform.clustering_data() weights = {col: 1.0 for col in clustering_data.data_vars} - weights['nonexistent_period_var'] = 0.7 fs_clustered = fs.transform.cluster( n_clusters=2,