Source code for disdrodb.metadata.checks
#!/usr/bin/env python3
# -----------------------------------------------------------------------------.
# Copyright (c) 2021-2023 DISDRODB developers
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
# -----------------------------------------------------------------------------.
"""Check metadata."""
import os
from typing import Optional, Union
import numpy as np
from disdrodb.api.info import (
infer_campaign_name_from_path,
infer_data_source_from_path,
)
from disdrodb.configs import get_metadata_archive_dir
from disdrodb.metadata.reader import read_station_metadata
from disdrodb.metadata.search import get_list_metadata
from disdrodb.metadata.standards import get_valid_metadata_keys
from disdrodb.utils.yaml import read_yaml
#### --------------------------------------------------------------------------.
#### Check Station Metadata
[docs]
def get_metadata_missing_keys(metadata):
    """Return the DISDRODB metadata keys which are missing.

    Parameters
    ----------
    metadata : dict
        Station metadata dictionary.

    Returns
    -------
    list
        The keys returned by ``get_valid_metadata_keys()`` that are not
        keys of ``metadata``, in the order of the valid keys.
    """
    present_keys = set(metadata.keys())
    # Plain membership test: keys are compared as they are, without NumPy
    # first coercing both sequences to a common (string) dtype.
    return [key for key in get_valid_metadata_keys() if key not in present_keys]
[docs]
def get_metadata_invalid_keys(metadata):
    """Return the DISDRODB metadata keys which are not valid.

    Parameters
    ----------
    metadata : dict
        Station metadata dictionary.

    Returns
    -------
    list
        The keys of ``metadata`` that are not among the keys returned by
        ``get_valid_metadata_keys()``, in the order they appear in ``metadata``.
        Keys are returned unchanged (a non-string key is not converted to a string).
    """
    valid_keys = set(get_valid_metadata_keys())
    # Plain membership test: avoids building a NumPy array of the keys, which
    # would coerce keys of mixed types to a common (string) dtype.
    return [key for key in metadata.keys() if key not in valid_keys]
def _check_metadata_keys(metadata):
    """Ensure the metadata keys are exactly the valid DISDRODB metadata keys.

    Raises
    ------
    ValueError
        If ``metadata`` contains invalid keys or, when all keys are valid,
        if some of the valid keys are missing.
    """
    # Invalid keys are reported before missing ones
    unknown_keys = get_metadata_invalid_keys(metadata)
    if unknown_keys:
        raise ValueError(f"Invalid metadata keys: {unknown_keys}")

    absent_keys = get_metadata_missing_keys(metadata)
    if absent_keys:
        raise ValueError(f"Missing metadata keys: {absent_keys}")
def _check_metadata_values(metadata):
"""Check validity of metadata values.
If null is specified in the YAML files (or None in the dict) raise error.
"""
for key, value in metadata.items():
if isinstance(value, type(None)):
raise ValueError(f"The metadata key {key} has None or null value. Use '' instead.")
def _check_metadata_campaign_name(metadata, expected_name):
"""Check metadata ``campaign_name``."""
if "campaign_name" not in metadata:
raise ValueError("The metadata file does not contain the 'campaign_name' key.")
campaign_name = metadata["campaign_name"]
if campaign_name == "":
raise ValueError("The 'campaign_name' key in the metadata is empty.")
if campaign_name != expected_name:
raise ValueError(
f"The campaign_name in the metadata is '{campaign_name}' but the campaign directory is '{expected_name}'",
)
def _check_metadata_data_source(metadata, expected_name):
"""Check metadata ``data_source``."""
if "data_source" not in metadata:
raise ValueError("The metadata file does not contain the 'data_source' key.")
data_source = metadata["data_source"]
if data_source == "":
raise ValueError("The 'data_source' key in the metadata is empty.")
if data_source != expected_name:
raise ValueError(
f"The data_source in the metadata is '{data_source}' but the data_source directory is '{expected_name}'",
)
def _check_metadata_station_name(metadata, expected_name):
"""Check metadata ``station_name``.
This function does not check that data are available for the station!
"""
if "station_name" not in metadata:
raise ValueError("The metadata file does not contain the 'station_name' key.")
station_name = metadata["station_name"]
if not isinstance(station_name, str):
raise ValueError("The 'station_name' key in the metadata is not defined as a string!")
if station_name == "":
raise ValueError("The 'station_name' key in the metadata is empty.")
if station_name != expected_name:
raise ValueError(
f"The station_name in the metadata is '{station_name}' but the metadata file is named"
f" '{expected_name}.yml'",
)
def _check_metadata_measurement_interval(metadata):
    """Validate the ``measurement_interval`` entry of the metadata.

    The value is handed to ``disdrodb.api.checks.check_measurement_intervals``;
    whatever that function raises is propagated.

    Raises
    ------
    ValueError
        If the ``measurement_interval`` key is absent.
    """
    from disdrodb.api.checks import check_measurement_intervals

    if "measurement_interval" in metadata:
        # The return value of the checker is not needed here
        check_measurement_intervals(metadata["measurement_interval"])
        return
    raise ValueError("The metadata file does not contain the 'measurement_interval' key.")
def _check_metadata_sensor_name(metadata):
from disdrodb.api.checks import check_sensor_name
sensor_name = metadata["sensor_name"]
check_sensor_name(sensor_name)
[docs]
def check_station_metadata(data_source, campaign_name, station_name, metadata_archive_dir=None):
    """Check the compliance of a station metadata file with the DISDRODB standards.

    The metadata are read with ``read_station_metadata`` and then validated
    step by step; the first failing check raises and stops the validation.

    Parameters
    ----------
    data_source : str
        Expected value of the ``data_source`` metadata key.
    campaign_name : str
        Expected value of the ``campaign_name`` metadata key.
    station_name : str
        Expected value of the ``station_name`` metadata key.
    metadata_archive_dir : str (optional)
        Forwarded to ``read_station_metadata``. The default is ``None``.
    """
    from disdrodb.l0.l0_reader import check_metadata_reader

    station_metadata = read_station_metadata(
        metadata_archive_dir=metadata_archive_dir,
        data_source=data_source,
        campaign_name=campaign_name,
        station_name=station_name,
    )

    # Keys and values
    _check_metadata_keys(station_metadata)
    _check_metadata_values(station_metadata)

    # Names must match the ones used to locate the metadata file
    name_checks = (
        (_check_metadata_campaign_name, campaign_name),
        (_check_metadata_data_source, data_source),
        (_check_metadata_station_name, station_name),
    )
    for check, expected in name_checks:
        check(station_metadata, expected_name=expected)

    # Sensor, measurement interval and reader
    _check_metadata_sensor_name(station_metadata)
    _check_metadata_measurement_interval(station_metadata)
    check_metadata_reader(station_metadata)
#### --------------------------------------------------------------------------.
#### Metadata Archive Missing Information
def _check_lonlat_type(longitude, latitude):
# Check type validity
if isinstance(longitude, str):
raise TypeError("longitude is not defined as numeric.")
if isinstance(latitude, str):
raise TypeError("latitude is not defined as numeric.")
# Check is not none
if isinstance(longitude, type(None)) or isinstance(latitude, type(None)):
raise ValueError("Unspecified longitude and latitude coordinates.")
def _check_lonlat_validity(longitude, latitude):
if longitude == -9999 or latitude == -9999:
raise ValueError("Missing lat lon coordinates (-9999).")
if longitude > 180 or longitude < -180:
raise ValueError("Invalid longitude (outside [-180, 180])")
if latitude > 90 or latitude < -90:
raise ValueError("Invalid latitude (outside [-90, 90])")
[docs]
def check_station_metadata_geolocation(metadata) -> None:
    """Raise an error if the metadata geolocation is missing or wrong.

    The ``longitude``, ``latitude`` and ``platform_type`` entries are read
    with ``dict.get``, so absent keys are handled as ``None``.

    Raises
    ------
    TypeError, ValueError
        If the coordinates fail the type check, if a ``"mobile"`` platform
        does not have both coordinates set to ``-9999``, or if the coordinates
        of any other platform fail the validity check.
    """
    lon = metadata.get("longitude")
    lat = metadata.get("latitude")
    platform = metadata.get("platform_type")

    # The type check applies to every platform type
    _check_lonlat_type(longitude=lon, latitude=lat)

    # Every platform which is not mobile must have valid coordinates
    if platform != "mobile":
        _check_lonlat_validity(longitude=lon, latitude=lat)
        return

    # Mobile platforms must flag both coordinates with -9999
    if not (lon == -9999 and lat == -9999):
        raise ValueError("For mobile platform_type, specify latitude and longitude -9999")
[docs]
def identify_missing_metadata_coords(metadata_filepaths: Union[str, list]) -> None:
    """Identify missing coordinates.

    Each file is read with ``read_yaml`` and its geolocation is checked with
    ``check_station_metadata_geolocation``. The first non-compliant file
    raises and interrupts the iteration.

    Parameters
    ----------
    metadata_filepaths : Union[str, list]
        Path, or list of paths, of the metadata YAML files to check.

    Raises
    ------
    TypeError
        Error if ``latitude`` or ``longitude`` coordinates are defined as strings.
    ValueError
        Error if ``latitude`` or ``longitude`` coordinates are not specified or are not valid.
    """
    # A single path must be wrapped in a list: iterating a bare string
    # would yield its characters instead of the file path.
    if isinstance(metadata_filepaths, (str, os.PathLike)):
        metadata_filepaths = [metadata_filepaths]
    for filepath in metadata_filepaths:
        check_station_metadata_geolocation(read_yaml(filepath))
[docs]
def identify_empty_metadata_keys(metadata_filepaths: list, keys: Union[str, list]) -> None:
    """Identify empty metadata keys.

    For every file, a line is printed for each requested key that is either
    absent from the metadata or whose value has an empty string representation.

    Parameters
    ----------
    metadata_filepaths : list
        Paths of the metadata YAML files to inspect.
    keys : Union[str,list]
        Attributes to verify the presence.
    """
    if isinstance(keys, str):
        keys = [keys]
    for filepath in metadata_filepaths:
        # Read each file once, instead of once per key
        metadata = read_yaml(filepath)
        for key in keys:
            # str() allows len() on non-string values (e.g. numbers).
            # NOTE: a null value becomes the string "None" and is therefore not reported as empty.
            if len(str(metadata.get(key, ""))) == 0:
                print(f"Empty {key} at: ", filepath)
#### --------------------------------------------------------------------------.
#### Check Metadata Archive
[docs]
def check_metadata_archive_keys(metadata_archive_dir: Optional[str] = None) -> bool:
    """Check that all metadata files have valid keys.

    Every metadata file of the archive is checked. Failures are printed and
    do not interrupt the loop.

    Parameters
    ----------
    metadata_archive_dir : str (optional)
        The directory path where the DISDRODB Metadata Archive is located.
        The directory path must end with ``<...>/DISDRODB``.
        If ``None``, it uses the ``metadata_archive_dir`` path specified
        in the DISDRODB active configuration.

    Returns
    -------
    bool
        If the check succeeds, the result is ``True``, otherwise ``False``.
    """
    is_valid = True
    metadata_archive_dir = get_metadata_archive_dir(metadata_archive_dir)
    list_metadata_paths = get_list_metadata(
        metadata_archive_dir=metadata_archive_dir,
        data_sources=None,
        campaign_names=None,
        station_names=None,
        product=None,  # --> Search in DISDRODB Metadata Archive
        available_data=False,  # --> Select all metadata matching the filtering criteria
    )
    for filepath in list_metadata_paths:
        data_source = infer_data_source_from_path(filepath)
        campaign_name = infer_campaign_name_from_path(filepath)
        # Drop only the file extension: str.replace(".yml", "") would also
        # remove a ".yml" occurring inside the station name.
        station_name = os.path.splitext(os.path.basename(filepath))[0]

        metadata = read_station_metadata(
            metadata_archive_dir=metadata_archive_dir,
            data_source=data_source,
            campaign_name=campaign_name,
            station_name=station_name,
        )
        try:
            _check_metadata_keys(metadata)
        except Exception as e:
            is_valid = False
            print(f"Error for {data_source} {campaign_name} {station_name}.")
            print(f"The error is: {e}.")
    return is_valid
[docs]
def check_metadata_archive_campaign_name(metadata_archive_dir: Optional[str] = None) -> bool:
    """Check metadata ``campaign_name``.

    For every metadata file of the archive, the ``campaign_name`` key is
    compared with the campaign name inferred from the file path. Failures are
    printed and do not interrupt the loop.

    Parameters
    ----------
    metadata_archive_dir : str (optional)
        The directory path where the DISDRODB Metadata Archive is located.
        The directory path must end with ``<...>/DISDRODB``.
        If ``None``, it uses the ``metadata_archive_dir`` path specified
        in the DISDRODB active configuration.

    Returns
    -------
    bool
        If the check succeeds, the result is ``True``, otherwise ``False``.
    """
    is_valid = True
    metadata_archive_dir = get_metadata_archive_dir(metadata_archive_dir)
    list_metadata_paths = get_list_metadata(
        metadata_archive_dir=metadata_archive_dir,
        data_sources=None,
        campaign_names=None,
        station_names=None,
        product=None,  # --> Search in DISDRODB Metadata Archive
        available_data=False,  # --> Select all metadata matching the filtering criteria
    )
    for filepath in list_metadata_paths:
        data_source = infer_data_source_from_path(filepath)
        campaign_name = infer_campaign_name_from_path(filepath)
        # Drop only the file extension: str.replace(".yml", "") would also
        # remove a ".yml" occurring inside the station name.
        station_name = os.path.splitext(os.path.basename(filepath))[0]

        metadata = read_station_metadata(
            metadata_archive_dir=metadata_archive_dir,
            data_source=data_source,
            campaign_name=campaign_name,
            station_name=station_name,
        )
        try:
            _check_metadata_campaign_name(metadata, expected_name=campaign_name)
        except Exception as e:
            is_valid = False
            print(f"Error for {data_source} {campaign_name} {station_name}.")
            print(f"The error is: {e}.")
    return is_valid
[docs]
def check_metadata_archive_data_source(metadata_archive_dir: Optional[str] = None) -> bool:
    """Check metadata ``data_source``.

    For every metadata file of the archive, the ``data_source`` key is
    compared with the data source inferred from the file path. Failures are
    printed and do not interrupt the loop.

    Parameters
    ----------
    metadata_archive_dir : str (optional)
        The directory path where the DISDRODB Metadata Archive is located.
        The directory path must end with ``<...>/DISDRODB``.
        If ``None``, it uses the ``metadata_archive_dir`` path specified
        in the DISDRODB active configuration.

    Returns
    -------
    bool
        If the check succeeds, the result is ``True``, otherwise ``False``.
    """
    is_valid = True
    metadata_archive_dir = get_metadata_archive_dir(metadata_archive_dir)
    list_metadata_paths = get_list_metadata(
        metadata_archive_dir=metadata_archive_dir,
        data_sources=None,
        campaign_names=None,
        station_names=None,
        product=None,  # --> Search in DISDRODB Metadata Archive
        available_data=False,  # --> Select all metadata matching the filtering criteria
    )
    for filepath in list_metadata_paths:
        data_source = infer_data_source_from_path(filepath)
        campaign_name = infer_campaign_name_from_path(filepath)
        # Drop only the file extension: str.replace(".yml", "") would also
        # remove a ".yml" occurring inside the station name.
        station_name = os.path.splitext(os.path.basename(filepath))[0]

        metadata = read_station_metadata(
            metadata_archive_dir=metadata_archive_dir,
            data_source=data_source,
            campaign_name=campaign_name,
            station_name=station_name,
        )
        try:
            _check_metadata_data_source(metadata, expected_name=data_source)
        except Exception as e:
            is_valid = False
            print(f"Error for {data_source} {campaign_name} {station_name}.")
            print(f"The error is: {e}.")
    return is_valid
[docs]
def check_metadata_archive_sensor_name(metadata_archive_dir: Optional[str] = None) -> bool:
    """Check metadata ``sensor_name``.

    The ``sensor_name`` key of every metadata file of the archive is checked.
    Failures are printed and do not interrupt the loop.

    Parameters
    ----------
    metadata_archive_dir : str (optional)
        The directory path where the DISDRODB Metadata Archive is located.
        The directory path must end with ``<...>/DISDRODB``.
        If ``None``, it uses the ``metadata_archive_dir`` path specified
        in the DISDRODB active configuration.

    Returns
    -------
    bool
        If the check succeeds, the result is ``True``, otherwise ``False``.
    """
    is_valid = True
    metadata_archive_dir = get_metadata_archive_dir(metadata_archive_dir)
    list_metadata_paths = get_list_metadata(
        metadata_archive_dir=metadata_archive_dir,
        data_sources=None,
        campaign_names=None,
        station_names=None,
        product=None,  # --> Search in DISDRODB Metadata Archive
        available_data=False,  # --> Select all metadata matching the filtering criteria
    )
    for filepath in list_metadata_paths:
        data_source = infer_data_source_from_path(filepath)
        campaign_name = infer_campaign_name_from_path(filepath)
        # Drop only the file extension: str.replace(".yml", "") would also
        # remove a ".yml" occurring inside the station name.
        station_name = os.path.splitext(os.path.basename(filepath))[0]

        metadata = read_station_metadata(
            metadata_archive_dir=metadata_archive_dir,
            data_source=data_source,
            campaign_name=campaign_name,
            station_name=station_name,
        )
        try:
            _check_metadata_sensor_name(metadata)
        except Exception as e:
            is_valid = False
            print(f"Error for {data_source} {campaign_name} {station_name}.")
            print(f"The error is: {e}.")
    return is_valid
[docs]
def check_metadata_archive_station_name(metadata_archive_dir: Optional[str] = None) -> bool:
    """Check metadata ``station_name``.

    For every metadata file of the archive, the ``station_name`` key is
    compared with the name of the metadata file (without extension).
    Failures are printed and do not interrupt the loop.

    Parameters
    ----------
    metadata_archive_dir : str (optional)
        The directory path where the DISDRODB Metadata Archive is located.
        The directory path must end with ``<...>/DISDRODB``.
        If ``None``, it uses the ``metadata_archive_dir`` path specified
        in the DISDRODB active configuration.

    Returns
    -------
    bool
        If the check succeeds, the result is ``True``, otherwise ``False``.
    """
    is_valid = True
    metadata_archive_dir = get_metadata_archive_dir(metadata_archive_dir)
    list_metadata_paths = get_list_metadata(
        metadata_archive_dir=metadata_archive_dir,
        data_sources=None,
        campaign_names=None,
        station_names=None,
        product=None,  # --> Search in DISDRODB Metadata Archive
        available_data=False,  # --> Select all metadata matching the filtering criteria
    )
    for filepath in list_metadata_paths:
        data_source = infer_data_source_from_path(filepath)
        campaign_name = infer_campaign_name_from_path(filepath)
        # Drop only the file extension: str.replace(".yml", "") would also
        # remove a ".yml" occurring inside the station name.
        station_name = os.path.splitext(os.path.basename(filepath))[0]

        metadata = read_station_metadata(
            metadata_archive_dir=metadata_archive_dir,
            data_source=data_source,
            campaign_name=campaign_name,
            station_name=station_name,
        )
        try:
            _check_metadata_station_name(metadata, expected_name=station_name)
        except Exception as e:
            is_valid = False
            print(f"Error for {data_source} {campaign_name} {station_name}.")
            print(f"The error is: {e}.")
    return is_valid
[docs]
def check_metadata_archive_reader(metadata_archive_dir: Optional[str] = None) -> bool:
    """Check if the ``reader`` key is available and there is the associated reader.

    Every metadata file of the archive is passed to
    ``disdrodb.l0.l0_reader.check_metadata_reader``. Failures are printed and
    do not interrupt the loop.

    Parameters
    ----------
    metadata_archive_dir : str (optional)
        The directory path where the DISDRODB Metadata Archive is located.
        The directory path must end with ``<...>/DISDRODB``.
        If ``None``, it uses the ``metadata_archive_dir`` path specified
        in the DISDRODB active configuration.

    Returns
    -------
    bool
        If the check succeeds, the result is ``True``, otherwise ``False``.
    """
    from disdrodb.l0.l0_reader import check_metadata_reader

    is_valid = True
    metadata_archive_dir = get_metadata_archive_dir(metadata_archive_dir)
    list_metadata_paths = get_list_metadata(
        metadata_archive_dir=metadata_archive_dir,
        data_sources=None,
        campaign_names=None,
        station_names=None,
        product=None,  # --> Search in DISDRODB Metadata Archive
        available_data=False,  # --> Select all metadata matching the filtering criteria
    )
    for filepath in list_metadata_paths:
        data_source = infer_data_source_from_path(filepath)
        campaign_name = infer_campaign_name_from_path(filepath)
        # Drop only the file extension: str.replace(".yml", "") would also
        # remove a ".yml" occurring inside the station name.
        station_name = os.path.splitext(os.path.basename(filepath))[0]

        metadata = read_station_metadata(
            metadata_archive_dir=metadata_archive_dir,
            data_source=data_source,
            campaign_name=campaign_name,
            station_name=station_name,
        )
        try:
            check_metadata_reader(metadata)
        except Exception as e:
            is_valid = False
            print(f"Error for {data_source} {campaign_name} {station_name}.")
            print(f"The error is: {e}.")
    return is_valid
[docs]
def check_metadata_archive(metadata_archive_dir: Optional[str] = None, raise_error=False) -> bool:
    """Check the archive metadata compliance.

    ``check_station_metadata`` is run on every metadata file of the archive.

    Parameters
    ----------
    metadata_archive_dir : str (optional)
        The directory path where the DISDRODB Metadata Archive is located.
        The directory path must end with ``<...>/DISDRODB``.
        If ``None``, it uses the ``metadata_archive_dir`` path specified
        in the DISDRODB active configuration.
    raise_error: bool (optional)
        Whether to raise an error and interrupt the archive check if a
        metadata is not compliant. The default value is ``False``.

    Returns
    -------
    bool
        If the check succeeds, the result is ``True``, otherwise ``False``.

    Raises
    ------
    ValueError
        If ``raise_error=True`` and a metadata file is not compliant.
    """
    is_valid = True
    metadata_archive_dir = get_metadata_archive_dir(metadata_archive_dir)
    list_metadata_paths = get_list_metadata(
        metadata_archive_dir=metadata_archive_dir,
        data_sources=None,
        campaign_names=None,
        station_names=None,
        product=None,  # --> Search in DISDRODB Metadata Archive
        available_data=False,  # --> Select all metadata matching the filtering criteria
    )
    for filepath in list_metadata_paths:
        data_source = infer_data_source_from_path(filepath)
        campaign_name = infer_campaign_name_from_path(filepath)
        # Drop only the file extension: str.replace(".yml", "") would also
        # remove a ".yml" occurring inside the station name.
        station_name = os.path.splitext(os.path.basename(filepath))[0]

        # Check compliance
        try:
            check_station_metadata(
                metadata_archive_dir=metadata_archive_dir,
                data_source=data_source,
                campaign_name=campaign_name,
                station_name=station_name,
            )
        except Exception as e:
            is_valid = False
            # The two sentences are separated by a space
            msg = f"Error for {data_source} {campaign_name} {station_name}. The error is: {e}."
            if raise_error:
                # Keep the original exception as the explicit cause
                raise ValueError(msg) from e
            print(msg)
    return is_valid
[docs]
def check_metadata_archive_geolocation(metadata_archive_dir: Optional[str] = None) -> bool:
    """Check whether the metadata files have missing or wrong geolocation.

    ``check_station_metadata_geolocation`` is run on every metadata file of
    the archive. Failures are printed and do not interrupt the loop.

    Parameters
    ----------
    metadata_archive_dir : str (optional)
        The directory path where the DISDRODB Metadata Archive is located.
        The directory path must end with ``<...>/DISDRODB``.
        If ``None``, it uses the ``metadata_archive_dir`` path specified
        in the DISDRODB active configuration.

    Returns
    -------
    bool
        If the check succeeds, the result is ``True``, otherwise ``False``.
    """
    is_valid = True
    metadata_archive_dir = get_metadata_archive_dir(metadata_archive_dir)
    list_metadata_paths = get_list_metadata(
        metadata_archive_dir=metadata_archive_dir,
        data_sources=None,
        campaign_names=None,
        station_names=None,
        product=None,  # --> Search in DISDRODB Metadata Archive
        available_data=False,  # --> Select all metadata matching the filtering criteria
    )
    for filepath in list_metadata_paths:
        data_source = infer_data_source_from_path(filepath)
        campaign_name = infer_campaign_name_from_path(filepath)
        # Drop only the file extension: str.replace(".yml", "") would also
        # remove a ".yml" occurring inside the station name.
        station_name = os.path.splitext(os.path.basename(filepath))[0]

        metadata = read_station_metadata(
            metadata_archive_dir=metadata_archive_dir,
            data_source=data_source,
            campaign_name=campaign_name,
            station_name=station_name,
        )
        try:
            check_station_metadata_geolocation(metadata)
        except Exception as e:
            is_valid = False
            print(f"Missing information for {data_source} {campaign_name} {station_name}.")
            print(f"The error is: {e}.")
    return is_valid