diff --git a/pymrio/core/mriosystem.py b/pymrio/core/mriosystem.py index 4929dc20..9781071e 100644 --- a/pymrio/core/mriosystem.py +++ b/pymrio/core/mriosystem.py @@ -55,6 +55,11 @@ # warnings.warn(message, DeprecationWarning, stacklevel=2) +# Constants for column name validation +ALTERNATIVE_REGION_NAMES = ["country", "countries", "regions", "reg", "location"] +ALTERNATIVE_SECTOR_NAMES = ["sectors", "industry", "industries", "sec", "activity"] + + # Exceptions class ResetError(Exception): """Base class for errors while reseting the system.""" @@ -1788,12 +1793,48 @@ def characterize( ) return ret_value(validation=validation, extension=None) - fac_calc = ( - factors.set_index(index_col + [characterized_name_column]) - .loc[:, characterization_factors_column] - .unstack(characterized_name_column) - .fillna(0) - ) + # Check for duplicate indices before unstacking + # This can happen if region/sector columns are not named correctly + factors_indexed = factors.set_index(index_col + [characterized_name_column]) + + if factors_indexed.index.duplicated().any(): + # Build helpful error message + error_msg = ( + "Duplicate indices found in characterization factors. " + "This typically occurs when region or sector specific characterization factors " + "are provided but the column names don't match pymrio's expectations.\n\n" + "Expected column names:\n" + " - 'region' (lowercase) for region-specific factors\n" + " - 'sector' (lowercase) for sector-specific factors\n\n" + f"Current columns in factors dataframe: {list(factors.columns)}\n\n" + ) + + # Check for case mismatches + possible_region_cols = [col for col in factors.columns if col.lower() == "region" and col != "region"] + possible_sector_cols = [col for col in factors.columns if col.lower() == "sector" and col != "sector"] + + if possible_region_cols: + error_msg += f"Found possible region column with different case: {possible_region_cols}\n" + error_msg += "Please rename it to 'region' (lowercase).\n" + if possible_sector_cols: + error_msg += f"Found possible sector column with different case: {possible_sector_cols}\n" + error_msg += "Please rename it to 'sector' (lowercase).\n" + + # Check for alternative column names if no case mismatch found + if not possible_region_cols and not possible_sector_cols: + found_region_alternatives = [col for col in factors.columns if col.lower() in ALTERNATIVE_REGION_NAMES] + found_sector_alternatives = [col for col in factors.columns if col.lower() in ALTERNATIVE_SECTOR_NAMES] + + if found_region_alternatives: + error_msg += f"Found possible alternative region column names: {found_region_alternatives}\n" + error_msg += "Please rename to 'region' (lowercase) if these are region identifiers.\n" + if found_sector_alternatives: + error_msg += f"Found possible alternative sector column names: {found_sector_alternatives}\n" + error_msg += "Please rename to 'sector' (lowercase) if these are sector identifiers.\n" + + raise ValueError(error_msg) + + fac_calc = factors_indexed.loc[:, characterization_factors_column].unstack(characterized_name_column).fillna(0) new_ext = Extension(name=name) diff --git a/tests/test_core.py b/tests/test_core.py index 9a616b01..6b280a36 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -747,6 +747,56 @@ def test_characterize_extension_over_extensions(fix_testmrio): pdt.assert_frame_equal(ex_reg_one.unit, ex_reg_method.unit) +def test_characterize_error_wrong_column_names(fix_testmrio): + """Test improved error message when region/sector column names are wrong.""" + tmrio = fix_testmrio.testmrio + + # Load region-specific factors + factors_reg_spec = pd.read_csv( + Path(PYMRIO_PATH["test_mrio"] / Path("concordance") / "emissions_charact_reg_spec.tsv"), + sep="\t", + ) + + # Test case 1: Wrong case for region column (Region instead of region) + factors_wrong_case = factors_reg_spec.copy() + factors_wrong_case = factors_wrong_case.rename(columns={"region": "Region"}) + + with pytest.raises(ValueError) as exc_info: + tmrio.emissions.characterize(factors_wrong_case) + + error_msg = str(exc_info.value) + assert "Duplicate indices found" in error_msg + assert "column names don't match pymrio's expectations" in error_msg + assert "'region' (lowercase)" in error_msg + assert "Found possible region column with different case: ['Region']" in error_msg + + # Test case 2: Alternative column name (country instead of region) + factors_country = factors_reg_spec.copy() + factors_country = factors_country.rename(columns={"region": "country"}) + + with pytest.raises(ValueError) as exc_info: + tmrio.emissions.characterize(factors_country) + + error_msg = str(exc_info.value) + assert "Duplicate indices found" in error_msg + assert "Found possible alternative region column names: ['country']" in error_msg + + # Test case 3: Abbreviated column name (reg instead of region) + factors_reg = factors_reg_spec.copy() + factors_reg = factors_reg.rename(columns={"region": "reg"}) + + with pytest.raises(ValueError) as exc_info: + tmrio.emissions.characterize(factors_reg) + + error_msg = str(exc_info.value) + assert "Duplicate indices found" in error_msg + assert "Found possible alternative region column names: ['reg']" in error_msg + + # Test case 4: Verify correct column names still work + result = tmrio.emissions.characterize(factors_reg_spec) + assert result.extension is not None + + def test_extension_convert_simple(fix_testmrio): """Testing the convert function within extensions object.""" tt_pre = fix_testmrio.testmrio.copy()