Source code for dtcg.datacube.update_metadata

"""Copyright 2026 DTCG Contributors

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.


=====

Functionality for ensuring metadata is CF compliant: https://cfconventions.org/.
"""

from __future__ import annotations

import os
import warnings
from datetime import datetime
from importlib import resources

import rioxarray  # noqa: F401
import xarray as xr
import yaml
from schema import Optional, Schema


[docs] class MetadataMapper: """Class for applying CF-compliant metadata to xarray Datasets. Attributes ---------- METADATA_SCHEMA_DATA : schema.Schema Validation schema for data variable metadata. METADATA_SCHEMA_COORDS : schema.Schema Validation schema for coordinates metadata. metadata_mappings_data : dict Dictionary of metadata mappings for data variables loaded from a YAML file. metadata_mappings_coords : dict Dictionary of metadata mappings for coordinates loaded from a YAML file. """ metadata_mappings_data: dict # as this is not explicitly passed to __init__(). metadata_mappings_coords: dict # as this is not explicitly passed to __init__().
[docs] def __init__( self: MetadataMapper, metadata_mapping_data: str = "", metadata_mapping_coords: str = "", ): """Initialise MetadataMapper with a given or default mapping file. Parameters ---------- metadata_mapping_data : str, optional Path to the YAML file containing variable metadata mappings. If empty, defaults to 'metadata_mapping_data.yaml' provided by the ``dtcg`` package. metadata_mapping_coords : str, optional Path to the YAML file containing variable metadata mappings. If empty, defaults to 'metadata_mapping_coords.yaml' provided by the ``dtcg`` package. """ if not metadata_mapping_data: metadata_mapping_data = resources.files("dtcg.datacube").joinpath( "metadata_mapping_data.yaml" ) self.METADATA_SCHEMA_DATA = Schema( { "standard_name": str, "long_name": str, "units": str, Optional("author"): str, "institution": str, "source": str, "comment": str, "references": str, } ) self.metadata_mappings_data = self.read_metadata_mappings( self.METADATA_SCHEMA_DATA, metadata_mapping_data ) if not metadata_mapping_coords: metadata_mapping_coords = resources.files("dtcg.datacube").joinpath( "metadata_mapping_coords.yaml" ) self.METADATA_SCHEMA_COORDS = Schema( { "standard_name": str, "long_name": str, "units": str, } ) self.metadata_mappings_coords = self.read_metadata_mappings( self.METADATA_SCHEMA_COORDS, metadata_mapping_coords )
[docs] def read_metadata_mappings( self: MetadataMapper, schema: Schema, map_file: str ) -> dict: """Load and validate metadata mappings from a YAML file. Parameters ---------- schema : Schema The schema structure used for validation map_file : str Path to the YAML file containing metadata mappings. Return ------ dict Metadata mappings loaded from YAML file. Raises ------ schema.SchemaError If any of the metadata entries fail schema validation. """ with open(map_file) as f: config_dict = yaml.safe_load(f) for _, metadata in config_dict.items(): schema.validate(metadata) return config_dict
@staticmethod def _update_shared_metadata(dataset: xr.Dataset, ds_name: str) -> None: """Add shared metadata attributes to the dataset and ensure CRS is set. Parameters ---------- dataset : xarray.Dataset The dataset to which shared metadata and CRS should be applied. ds_name : str Name of dataset. Notes ----- If a CRS is not present, it is set from the dataset's `pyproj_srs` attribute. Shared metadata includes CF conventions, title, and summary. """ # update metadata shared across all variables shared_metadata = { "Conventions": "CF-1.12", "comment": ( "The DTC Glaciers project is developed under the European Space " "Agency's Digital Twin Earth initiative, as part of the Digital Twin " "Components (DTC) Early Development Actions." ), "date_created": datetime.now().isoformat(), "RGI-ID": dataset.attrs["RGI-ID"], "glacier_attributes": dataset.attrs.get("glacier_attributes", {}), } if "L1" in ds_name: if not ( "spatial_ref" in dataset.data_vars or "spatial_ref" in dataset.coords ): # create a spatial_ref layer in the dataset if not dataset.rio.crs and not {"x", "y"}.isdisjoint(dataset.dims): dataset.rio.write_crs(dataset.pyproj_srs, inplace=True) shared_metadata.update( { "title": "Datacube of glacier-domain variables.", "summary": ( "Resampled glacier-domain variables from multiple sources " f"for RGI6-ID '{dataset.attrs['RGI-ID']}'. " "Generated for the DTC Glaciers project." ), } ) elif "L2" in ds_name: shared_metadata.update( { "title": "Datacube of observation-informed modelled variables.", "summary": ( "Observation-informed modelled variables for RGI6-ID " f"'{dataset.attrs['RGI-ID']}'. " "Generated for the DTC Glaciers project." ), } ) # L2 must contain a description of the applied calibration strategy if "calibration_strategy" not in dataset.attrs: raise ValueError( "Missing required attribute 'calibration_strategy' in" "dataset.attrs. Add a description of the applied " "calibration strategy." ) shared_metadata["calibration_strategy"] = dataset.attrs[ "calibration_strategy" ] dataset.attrs.clear() # clear old metadata dataset.attrs.update(shared_metadata)
[docs] def update_metadata( self: MetadataMapper, dataset: xr.Dataset, ds_name: str ) -> xr.Dataset: """Apply variable and shared metadata to an xarray Dataset. Parameters ---------- dataset : xarray.Dataset Dataset to which the metadata should be applied. ds_name : str Name of dataset. Returns ------- xarray.Dataset The input dataset with updated metadata. Warns ----- UserWarning If any dataset variables are missing in the metadata mapping. Notes ----- This function adds both per-variable and global metadata attributes. Missing variable mappings are reported as warnings, not errors. """ # check there are mappings for all variables in the dataset difference_data = set(dataset.data_vars) - set( self.metadata_mappings_data.keys() ) difference_coords = set(dataset.coords) - set( self.metadata_mappings_coords.keys() ) for difference in [difference_data, difference_coords]: # remove eolis check as they contain the metadata not_needed = [ "eolis_elevation_change_sigma_timeseries", "eolis_elevation_change_timeseries", "eolis_gridded_elevation_change", "eolis_gridded_elevation_change_sigma", "spatial_ref", ] difference = [x for x in difference if x not in not_needed] if difference: warning_msg = ( "Metadata mapping is missing for the following variables: " f"{sorted(difference)}. The metadata for these variables " "might not be compliant with Climate and Forecast " "conventions https://cfconventions.org/." ) with warnings.catch_warnings(): warnings.simplefilter("always") warnings.warn(warning_msg, UserWarning, stacklevel=2) # special treatment for model parameters, to convert some of their attrs model_variables = [ "volume", "area", "length", "off_area", "on_area", "melt_off_glacier", "melt_on_glacier", "liq_prcp_off_glacier", "liq_prcp_on_glacier", "snowfall_off_glacier", "snowfall_on_glacier", "melt_off_glacier_monthly", "melt_on_glacier_monthly", "liq_prcp_off_glacier_monthly", "liq_prcp_on_glacier_monthly", "snowfall_off_glacier_monthly", "snowfall_on_glacier_monthly", "runoff_monthly", "runoff_monthly_cumulative", "runoff", "specific_mb", "specific_mb_calendar_cum", "snowline", ] model_coordinates = [ "member", "time", "rgi_id", "hydro_year", "hydro_month", "calendar_year", "calendar_month", "month_2d", "calendar_month_2d", ] # small helper function to rename some model output attributes def _rename_key(attrs, new_key, old_key): if new_key not in attrs: default = "N/A" else: default = attrs[new_key] attrs[new_key] = attrs.pop(old_key, default) # simple function to apply metadata to all layers in an xarray dataset for metadata_mappings in [ self.metadata_mappings_data, self.metadata_mappings_coords, ]: for data_name, metadata in metadata_mappings.items(): if data_name in dataset.data_vars or data_name in dataset.coords: dataset[data_name].attrs.update(metadata) # special treatment of model output attributes if data_name in model_variables: _rename_key(dataset[data_name].attrs, "units", "unit") if data_name in model_coordinates + model_variables: _rename_key( dataset[data_name].attrs, "long_name", "description" ) self._update_shared_metadata(dataset, ds_name) return dataset