Source code for pyrfu.pyrf.optimize_nbins_2d

#!/usr/bin/env python

# Built-in imports
import itertools

# 3rd party imports
import numpy as np

__author__ = "Louis Richard"
__email__ = "louisr@irfu.se"
__copyright__ = "Copyright 2020-2023"
__license__ = "MIT"
__version__ = "2.4.2"
__status__ = "Prototype"


[docs]def optimize_nbins_2d(x, y, n_min: list = None, n_max: list = None):
    r"""Estimates the number of bins for 2d histogram that minimizes
    the risk function in [1]_ , obtained by direct decomposition of the
    MISE following the method described in [2]_ .

    Parameters
    ----------
    x : xarray.DataArray
        Input time series of the first variable.
    y : xarray.DataArray
        Input time series of the second variable.
    n_min : array_like, Optional
        Minimum number of bins for each time series. Default is [1, 1]
    n_max : array_like, Optional
        Maximum number of bins for each time series. Default is [100, 100]

    Returns
    -------
    opt_n_x : int
        Number of bins of the first variable that minimizes the cost function.
    opt_n_y : int
        Number of bins of the second variable that minimizes the cost function.

    References
    ----------
    .. [1]  Rudemo, M. (1982) Empirical Choice of Histograms and Kernel
            Density Estimators. Scandinavian Journal of Statistics, 9, 65-78.

    .. [2]  Shimazaki H. and Shinomoto S., A method for selecting the bin
            size of a time histogram Neural Computation (2007) Vol. 19(6),
            1503-1527

    """

    n_min = n_min or [1, 1]
    n_max = n_max or [100, 100]

    x_min, x_max = [np.min(x.data), np.max(x.data)]
    y_min, y_max = [np.min(y.data), np.max(y.data)]

    # #of Bins
    n_x, n_y = [np.arange(min_, max_) for min_, max_ in zip(n_min, n_max)]

    d_x = (x_max - x_min) / n_x  # Bin size vector
    d_y = (y_max - y_min) / n_y  # Bin size vector

    d_xy = [[(i, j) for j in d_y] for i in d_x]

    # matrix of bin size vector
    d_xy = np.array(d_xy, dtype=[("x", float), ("y", float)])

    # Computation of the cost function to x and y
    c_xy = np.zeros(d_xy.shape)

    for i, j in itertools.product(range(len(n_x)), range(len(n_y))):
        k_i = np.histogram2d(x, y, bins=(n_x[i], n_y[j]))
        # The mean and the variance are simply computed from the event
        # counts in all the bins of the 2-dimensional histogram.
        k_i = k_i[0]
        # The cost Function
        c_xy[i, j] = (2 * np.mean(k_i) - np.var(k_i)) / (
            (d_xy[i, j][0] * d_xy[i, j][1]) ** 2
        )

    # Optimal Bin Size Selection
    # get the index in x and y that produces the minimum cost function
    n_x = int(n_x[np.where(c_xy == np.min(c_xy))[0][0]])
    n_y = int(n_y[np.where(c_xy == np.min(c_xy))[1][0]])

    return n_x, n_y