Source code for disdrodb.psd.gof_metrics

# -----------------------------------------------------------------------------.
# Copyright (c) 2021-2026 DISDRODB developers
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.
# -----------------------------------------------------------------------------.
"""Define Goodness-Of-Fit metrics for xarray objects."""

import numpy as np
import xarray as xr

from disdrodb.constants import DIAMETER_DIMENSION
from disdrodb.utils.warnings import suppress_warnings



[docs]
def compute_kl_divergence(pk, qk, dim, eps=1e-12):
    """Compute Kullback-Leibler (KL) divergence.

    Compares two probability distributions.
    When KL < 0.1 the two distributions are similar.
    When KL < 0.01 the two distributions are nearly indistinguishable.

    Note that when qk is 0 but pk > 0 at some bin, KL divergence explodes.

    Parameters
    ----------
    pk : xarray.DataArray
        Observed / true / empirical probability distribution
    qk : xarray.DataArray
        Predicted / model / approximating probability distribution
    dim : str
        Name of the bin dimension.
    eps : float, optional
        Small value for numerical stability. Default is 1e-12.

    Returns
    -------
    xarray.DataArray
        Kullback-Leibler (KL) divergence.

    """
    # Get row masses before computing CDFs
    pk_mass = pk.sum(dim=dim)
    qk_mass = qk.sum(dim=dim)

    # Regularize probability to avoid division by zero (or log of 0)
    qk_regularized = xr.where(qk == 0, eps, qk)
    pk_regularized = xr.where(pk == 0, eps, pk)

    # Compute log probability ratio
    log_prob_ratio = np.log(pk_regularized / qk_regularized)

    # Compute divergence (zero out where pk=0)
    kl = (pk * log_prob_ratio).sum(dim=dim, skipna=False)

    # Clip tiny negative values due to numerical noise
    kl = xr.where(kl >= 0.0, kl, 0.0)

    # Handle edge cases when P=0 or Q=0
    kl = xr.where((pk_mass == 0) | (qk_mass == 0), np.nan, kl)
    return kl




[docs]
def compute_jensen_shannon_distance(pk, qk, dim, eps=1e-12):
    """Compute Jensen-Shannon distance.

    Symmetric and finite version of KL divergence.
    The Jensen-Shannon distance is the square root of the Jensen-Shannon divergence.
    Values are defined between 0 and np.sqrt(ln(2)) = 0.83256

    Parameters
    ----------
    pk : xarray.DataArray
        Observed / true probability distribution
    qk : xarray.DataArray
        Predicted / model probability distribution
    dim : str
        Name of the bin dimension
    eps : float, optional
        Small value for numerical stability. Default is 1e-12.

    Returns
    -------
    xarray.DataArray
        Jensen-Shannon distance
    """
    # Mixture distribution
    mk = 0.5 * (pk + qk)

    # KL(P || M)
    kl_pm = compute_kl_divergence(pk=pk, qk=mk, dim=dim, eps=eps)

    # KL(Q || M)
    kl_qm = compute_kl_divergence(pk=qk, qk=mk, dim=dim, eps=eps)

    # Jensen-Shannon divergence [0, ln(2)]
    js_div = 0.5 * (kl_pm + kl_qm)
    js_div = np.maximum(js_div, 0.0)  # clip tiny negative values to zero (numerical safety)

    # Jensen-Shannon distance
    js_distance = np.sqrt(js_div)
    return js_distance




[docs]
def compute_wasserstein_distance(
    pk,
    qk,
    D,
    dD,
    dim,
    integration="bin",
):
    """Compute Wasserstein-1 distance (Earth Mover's Distance) between two distributions.

    Parameters
    ----------
    pk : xarray.DataArray
        Observed / true probability distribution
    qk : xarray.DataArray
        Predicted / model probability distribution
    D : xarray.DataArray
        Bin centers
    dD : xarray.DataArray
        Bin widths
    dim : str
        Name of the bin dimension
    integration : str, optional
        Integration scheme used to compute the Wasserstein integral.
        Supported options are ``"bin"`` and ``"left_riemann"``.

        ``"bin"`` compute Histogram-based Wasserstein distance. Distributions are interpreted as
        piecewise-constant densities over bins of width ``dD``. The distance is
        computed by integrating the difference between cumulative distribution
        functions over each bin. This is the default.

        ``"left_riemann"`` computes Discrete-support Wasserstein distance. Probability mass is assumed to be
        concentrated at bin centers ``D``, and the integral is approximated using
        the spacing between support points, consistent with :func:`scipy.stats.wasserstein_distance`.


    Returns
    -------
    xarray.DataArray
        Wasserstein-1 distance
    """
    # Get row masses before computing CDFs
    pk_mass = pk.sum(dim=dim)
    qk_mass = qk.sum(dim=dim)

    # CDFs
    cdf_p = pk.cumsum(dim)
    cdf_q = qk.cumsum(dim)

    # Absolute CDF difference
    diff = abs(cdf_p - cdf_q)

    if integration == "bin":
        # Histogram-based Wasserstein (density interpretation)
        wd = (diff * dD).sum(dim=dim)

    elif integration == "left_riemann":
        # Discrete-support Wasserstein (SciPy-style)
        # Evaluate |CDF difference| at left support points D_i
        diff_left = diff.isel({dim: slice(None, -1)})

        # Compute spacing between support points and
        # explicitly assign left coordinates to avoid misalignment
        dx = D.diff(dim)
        dx = dx.assign_coords({dim: D.isel({dim: slice(None, -1)})})
        wd = (diff_left * dx).sum(dim=dim)
    else:
        raise ValueError("integration must be 'bin' or 'left_riemann'")

    # Handle edge cases (Wasserstein distance is undefined when P=0 or Q=0)
    wd = xr.where((pk_mass == 0) | (qk_mass == 0), np.nan, wd)
    return wd




[docs]
def compute_kolmogorov_smirnov_distance(pk, qk, dim):
    """Compute Kolmogorov-Smirnov (KS) distance.

    The Kolmogorov-Smirnov (KS) distance is bounded between 0 and 1,
    where 0 indicates that the two distributions are identical.
    The associated KS test p-value ranges from 0 to 1,
    with a value of 1 indicating no evidence against the null hypothesis that the distributions are identical.
    When the p value is smaller than the significance level (e.g. < 0.05) the model is rejected.

    If model parameters are estimated from the same data to which the model is compared,
    the standard KS p-values are invalid.
    The solution is to use a parametric bootstrap:
    1. Fit model to your data
    2. Simulate many datasets from that fitted gamma
    3. Refit gamma for each simulated dataset
    4. Compute KS statistic each time
    5. Compare your observed KS statistic to the bootstrap distribution

    Parameters
    ----------
    pk : xarray.DataArray
        Observed / true probability distribution
    qk : xarray.DataArray
        Predicted / model probability distribution
    dim : str
        Name of the bin dimension

    Returns
    -------
    ks_statistic : xarray.DataArray
        Kolmogorov-Smirnov statistic (maximum CDF difference)
        If 0, the two distributions are identical.
    ks_p_value : xarray.DataArray
        Kolmogorov-Smirnov Test p-value (asymptotic approximation)
        A p-value of 0 means “strong evidence against equality.”
        A p-value of 1 means “no evidence against equality.”
        Identical distributions show a pvalue of 1.
        Similar distributions show a pvalue close to 1.
    """
    # Get row masses before computing CDFs
    pk_mass = pk.sum(dim=dim)
    qk_mass = qk.sum(dim=dim)

    # CDFs
    cdf_p = pk.cumsum(dim)
    cdf_q = qk.cumsum(dim)

    # KS statistic
    ks = np.abs(cdf_p - cdf_q).max(dim=dim)

    # Effective sample sizes (Rényi-2 effective N)
    n_eff_p = 1.0 / (pk**2).sum(dim=dim)
    n_eff_q = 1.0 / (qk**2).sum(dim=dim)

    # Combined effective sample size
    n_eff = (n_eff_p * n_eff_q) / (n_eff_p + n_eff_q)

    # Asymptotic KS p-value approximation
    p_value = 2.0 * np.exp(-2.0 * (ks * np.sqrt(n_eff)) ** 2)
    p_value = p_value.clip(0.0, 1.0)

    # Handle edge cases (KS distance is undefined when P=0 or Q=0)
    ks = xr.where((pk_mass == 0) | (qk_mass == 0), np.nan, ks)
    p_value = xr.where((pk_mass == 0) | (qk_mass == 0), np.nan, p_value)

    return ks, p_value




[docs]
def compute_gof_stats(obs, pred, dim=DIAMETER_DIMENSION):
    """
    Compute various goodness-of-fit (GoF) statistics between observed and predicted values.

    Computes a comprehensive set of GOF metrics including Pearson correlation,
    error statistics, and distribution distance metrics (KL, JS, Wasserstein, KS).

    Parameters
    ----------
    obs : xarray.DataArray
        Observations DataArray with at least dimension ``dim``.
        Should contain 'diameter_bin_center' and 'diameter_bin_width' coordinates.
    pred : xarray.DataArray
        Predictions DataArray with at least dimension ``dim``.
    dim : str, optional
        DataArray dimension over which to compute GOF statistics.
        Default is DIAMETER_DIMENSION.

    Returns
    -------
    ds : xarray.Dataset
        Dataset containing the following computed GoF statistics:

        - R2: Coefficient of determination (squared Pearson correlation)
        - MAE: Mean Absolute Error
        - MaxAE: Maximum Absolute Error
        - RelMaxAE: Relative Maximum Absolute Error
        - PeakDiff: Difference at distribution peak (N(D) max)
        - RelPeakDiff: Relative difference at peak
        - DmodeDiff: Difference in mode diameters
        - NtDiff: Difference in total number concentration
        - KLDiv: Kullback-Leibler divergence
        - JSD: Jensen-Shannon distance
        - WD: Wasserstein-1 distance
        - KS: Kolmogorov-Smirnov statistic
    """
    # TODO: add censoring option (by setting values to np.nan?)
    from disdrodb.l2.empirical_dsd import get_mode_diameter

    # Retrieve diameter and diameter bin width
    diameter = obs["diameter_bin_center"]
    diameter_bin_width = obs["diameter_bin_width"]

    # Compute errors
    error = obs - pred

    # Compute max obs and pred
    obs_max = obs.max(dim=dim, skipna=False)
    pred_max = pred.max(dim=dim, skipna=False)

    # Compute NaN mask
    mask_nan = np.logical_or(np.isnan(obs_max), np.isnan(pred_max))

    # Compute GOF statistics
    with suppress_warnings():
        # Compute Pearson Correlation
        # - if same values and a single values return NaN (std of constant value is not defined)
        pearson_r = xr.corr(obs, pred, dim=dim)

        # Compute Mean Absolute Error (MAE)
        mae = np.abs(error).mean(dim=dim, skipna=False)

        # Compute maximum absolute error
        max_error = np.abs(error).max(dim=dim, skipna=False)
        relative_max_error = xr.where(max_error == 0, 0, xr.where(obs_max == 0, np.nan, max_error / obs_max))

        # Compute deviation of N(D) at distribution mode
        mode_deviation = obs_max - pred_max
        mode_relative_deviation = xr.where(
            mode_deviation == 0,
            0,
            xr.where(obs_max == 0, np.nan, mode_deviation / obs_max),
        )

        # Compute diameter difference of the distribution mode
        diameter_mode_pred = get_mode_diameter(pred, diameter)
        diameter_mode_obs = get_mode_diameter(obs, diameter)
        diameter_mode_deviation = diameter_mode_obs - diameter_mode_pred

        # Compute difference in total number concentration
        total_number_concentration_obs = (obs * diameter_bin_width).sum(dim=dim, skipna=False)
        total_number_concentration_pred = (pred * diameter_bin_width).sum(dim=dim, skipna=False)
        total_number_concentration_difference = total_number_concentration_pred - total_number_concentration_obs

        # Compute pdf per bin
        pk_pdf = obs / total_number_concentration_obs
        qk_pdf = pred / total_number_concentration_pred

        # Compute probabilities per bin
        pk = pk_pdf * diameter_bin_width
        pk = pk / pk.sum(dim=dim, skipna=False)  # this might not be necessary
        qk = qk_pdf * diameter_bin_width
        qk = qk / qk.sum(dim=dim, skipna=False)  # this might not be necessary

        # Keep prob mass to 0 if total concentration is 0
        pk = xr.where(total_number_concentration_obs == 0, 0, pk)
        qk = xr.where(total_number_concentration_pred == 0, 0, qk)

        # Compute Kullback-Leibler divergence
        kl_divergence = compute_kl_divergence(pk=pk, qk=qk, dim=dim)

        # Compute Jensen-Shannon distance
        js_distance = compute_jensen_shannon_distance(pk=pk, qk=qk, dim=dim)

        # Compute Wasserstein-1 distance
        wd = compute_wasserstein_distance(pk=pk, qk=qk, D=diameter, dD=diameter_bin_width, dim=dim)

        # Compute Kolmogorov-Smirnov distance
        ks_stat, ks_pvalue = compute_kolmogorov_smirnov_distance(pk=pk, qk=qk, dim=dim)

    # Create an xarray.Dataset to hold the computed statistics
    ds = xr.Dataset(
        {
            "R2": pearson_r**2,  # Squared Pearson correlation coefficient
            "MAE": mae,  # Mean Absolute Error
            "MaxAE": max_error,  # Maximum Absolute Error
            "RelMaxAE": relative_max_error,  # Relative Maximum Absolute Error
            "PeakDiff": mode_deviation,  # Difference at distribution peak
            "RelPeakDiff": mode_relative_deviation,  # Relative difference at peak
            "DmodeDiff": diameter_mode_deviation,  # Difference in mode diameters
            "NtDiff": total_number_concentration_difference,
            "KLDiv": kl_divergence,  # Kullback-Leibler divergence
            "JSD": js_distance,  # Jensen-Shannon distance
            "WD": wd,  # Wasserstein-1 distance
            "KS": ks_stat,  # Kolmogorov-Smirnov statistic
            # "KS_pvalue": ks_p_value,  # Kolmogorov-Smirnov Test p-value
        },
    )
    # Round
    ds = ds.round(2)
    # Mask where input obs or pred is NaN
    ds = ds.where(~mask_nan)
    return ds