Source code for pyrfu.pyrf.optimize_nbins_2d
#!/usr/bin/env python
# Built-in imports
import itertools
# 3rd party imports
import numpy as np
__author__ = "Louis Richard"
__email__ = "louisr@irfu.se"
__copyright__ = "Copyright 2020-2023"
__license__ = "MIT"
__version__ = "2.4.2"
__status__ = "Prototype"
[docs]def optimize_nbins_2d(x, y, n_min: list = None, n_max: list = None):
r"""Estimates the number of bins for 2d histogram that minimizes
the risk function in [1]_ , obtained by direct decomposition of the
MISE following the method described in [2]_ .
Parameters
----------
x : xarray.DataArray
Input time series of the first variable.
y : xarray.DataArray
Input time series of the second variable.
n_min : array_like, Optional
Minimum number of bins for each time series. Default is [1, 1]
n_max : array_like, Optional
Maximum number of bins for each time series. Default is [100, 100]
Returns
-------
opt_n_x : int
Number of bins of the first variable that minimizes the cost function.
opt_n_y : int
Number of bins of the second variable that minimizes the cost function.
References
----------
.. [1] Rudemo, M. (1982) Empirical Choice of Histograms and Kernel
Density Estimators. Scandinavian Journal of Statistics, 9, 65-78.
.. [2] Shimazaki H. and Shinomoto S., A method for selecting the bin
size of a time histogram Neural Computation (2007) Vol. 19(6),
1503-1527
"""
n_min = n_min or [1, 1]
n_max = n_max or [100, 100]
x_min, x_max = [np.min(x.data), np.max(x.data)]
y_min, y_max = [np.min(y.data), np.max(y.data)]
# #of Bins
n_x, n_y = [np.arange(min_, max_) for min_, max_ in zip(n_min, n_max)]
d_x = (x_max - x_min) / n_x # Bin size vector
d_y = (y_max - y_min) / n_y # Bin size vector
d_xy = [[(i, j) for j in d_y] for i in d_x]
# matrix of bin size vector
d_xy = np.array(d_xy, dtype=[("x", float), ("y", float)])
# Computation of the cost function to x and y
c_xy = np.zeros(d_xy.shape)
for i, j in itertools.product(range(len(n_x)), range(len(n_y))):
k_i = np.histogram2d(x, y, bins=(n_x[i], n_y[j]))
# The mean and the variance are simply computed from the event
# counts in all the bins of the 2-dimensional histogram.
k_i = k_i[0]
# The cost Function
c_xy[i, j] = (2 * np.mean(k_i) - np.var(k_i)) / (
(d_xy[i, j][0] * d_xy[i, j][1]) ** 2
)
# Optimal Bin Size Selection
# get the index in x and y that produces the minimum cost function
n_x = int(n_x[np.where(c_xy == np.min(c_xy))[0][0]])
n_y = int(n_y[np.where(c_xy == np.min(c_xy))[1][0]])
return n_x, n_y