NNPDF · t7phy · Mar 10, 2023 · Mar 10, 2023 · Mar 22, 2023 · Mar 22, 2023
diff --git a/validphys2/src/validphys/datafiles/new_commondata/commondata_utils.py b/validphys2/src/validphys/datafiles/new_commondata/commondata_utils.py
@@ -0,0 +1,361 @@
+"""
+Python utilities for commondata implementation.
+
+This module provides helpful functions that automate a few
+tasks that are regularly needed for the implementation of
+experimental data to the commondata format. If there are 
+any additional functions that could be added here as they
+could simplify some repetitve tasks, please do suggest.
+
+Before the usage of any functions, it is recommended to read
+the docstrings of the function to understand the inputs and
+outputs.
+
+@author: Tanishq Sharma
+"""
+
+import numpy as np
+
+from math import sqrt
+from numpy.linalg import eig
+
+def symmetrize_errors(delta_plus, delta_minus):
+    r"""Compute the symmterized uncertainty and the shift in data point.
+
+    Parameters
+    ----------
+    delta_plus : float
+        The top/plus uncertainty with sign
+    delta_minus : float
+        The bottom/minus uncertainty with sign
+
+    Returns
+    -------
+    se_delta : float
+        The value to be added to the data point
+    se_sigma : float
+        The symmetrized uncertainty to be used in commondata
+
+    """
+    semi_diff = (delta_plus + delta_minus)/2
+    average = (delta_plus - delta_minus)/2
+    se_delta = semi_diff
+    se_sigma = sqrt(average*average + 2*semi_diff*semi_diff)
+    return se_delta, se_sigma
+
+def percentage_to_absolute(percentage, value):
+    r"""Compute the absolute value of uncertainty from percentage.
+
+    Parameters
+    ----------
+    percentage : string/float
+        Experimental datasets can provide the percentage 
+        uncertainties with a % sign or without one. 
+        The function will autostrip % sign and convert to
+        a float type in case the percentage uncertainty 
+        comes with a % sign. Else, it will directly perform 
+        the computation.
+    value : float
+        The data point
+
+    Returns
+    -------
+    absolute : float
+        The absolute value of the uncertainty
+
+    """
+    if type(percentage) is str:
+        percentage = float(percentage.replace("%", ""))
+        absolute = percentage * value * 0.01
+        return absolute 
+    else:
+        absolute = percentage * value * 0.01
+        return absolute
+
+def cormat_to_covmat(err_list, cormat_list):
+    r"""Convert correlation matrix elements to covariance
+    matrix elements.
+
+    Parameters
+    ----------
+    err_list : list
+        A one dimensional list which contains the uncertainty
+        associated to each data point in order.
+    cormat_list : list
+        A one dimensional list which contains the elements of 
+        the correlation matrix row by row. Since experimental
+        datasets provide these matrices in a list form, this 
+        simplifies the implementation for the user.
+
+    Returns
+    -------
+    covmat_list : list
+        A one dimensional list which contains the elements of
+        the covariance matrix row by row.
+
+    """
+    covmat_list = []
+    for i in range(len(cormat_list)):
+        a = i // len(err_list)
+        b = i % len(err_list)
+        covmat_list.append(cormat_list[i] * err_list[a] * err_list[b])
+    return covmat_list
+
+def covmat_to_artunc(ndata, covmat_list, no_of_norm_mat=0):
+    r"""Convert the covariance matrix to a matrix of 
+    artificial uncertainties.
+
+    Parameters
+    ----------
+    ndata : integer
+        Number of data points
+    covmat_list : list
+        A one dimensional list which contains the elements of
+        the covariance matrix row by row. Since experimental
+        datasets provide these matrices in a list form, this 
+        simplifies the implementation for the user.
+    no_of_norm_mat : int
+        Normalized covariance matrices may have an eigenvalue
+        of 0 due to the last data point not being linearly
+        independent. To allow for this, the user should input
+        the number of normalized matrices that are being treated
+        in an instance. For example, if a single covariance matrix
+        of a normalized distribution is being processed, the input
+        would be 1. If a covariance matrix contains pertains to
+        3 normalized datasets (i.e. cross covmat for 3 
+        distributions), the input would be 3. The default value is
+        0 for when the covariance matrix pertains to an absolute 
+        distribution.
+
+    Returns
+    -------
+    artunc : list
+        A two dimensional matrix (given as a list of lists)
+        which contains artificial uncertainties to be added 
+        to the commondata. i^th row (or list) contains the 
+        artificial uncertainties of the i^th data point.
+
+    """
+    epsilon = -0.0000000001
+    neg_eval_count = 0
+    psd_check = True
+    covmat = np.zeros((ndata, ndata))
+    artunc = np.zeros((ndata, ndata))
+    for i in range(len(covmat_list)):
+        a = i // ndata
+        b = i % ndata
+        covmat[a][b] = covmat_list[i]
+    eigval, eigvec = eig(covmat)
+    for j in range(len(eigval)):
+        if eigval[j] < epsilon:
+            psd_check = False
+        elif eigval[j] > epsilon and eigval[j] <= 0:
+            neg_eval_count = neg_eval_count + 1
+            if neg_eval_count == (no_of_norm_mat + 1):
+                psd_check = False
+        elif eigval[j] > 0:
+            continue
+    if psd_check == False:
+        raise ValueError('The covariance matrix is not positive-semidefinite')
+    else:
+        for i in range(ndata):
+            for j in range(ndata):
+                if eigval[j] < 0:
+                    continue
+                else:
+                    artunc[i][j] = eigvec[i][j] * sqrt(eigval[j]) 
+    return artunc.tolist()
+
+def cross_cormat_to_covmat(row_err_list, col_err_list, cormat_list):
+    r"""Convert cross correlation matrix elements 
+    (i.e. those between different different variables or 
+    observables) to covariance matrix elements.
+
+    Parameters
+    ----------
+    row_err_list : list
+        A one dimensional list which contains the uncertainty
+        associated to each data point of the variable that is
+        given on the vertical axis.
+    col_err_list : list
+        A one dimensional list which contains the uncertainty
+        associated to each data point of the variable that is
+        given on the horizontal axis.
+    cormat_list : list
+        A one dimensional list which contains the elements of 
+        the correlation matrix row by row. Since experimental
+        datasets provide these matrices in a list form, this 
+        simplifies the implementation for the user.
+
+    Returns
+    -------
+    covmat_list : list
+        A one dimensional list which contains the elements of
+        the covariance matrix row by row.
+
+    """
+    covmat_list = []
+    for i in range(len(cormat_list)):
+        a = i // len(col_err_list)
+        b = i % len(col_err_list)
+        covmat_list.append(cormat_list[i] * row_err_list[a] * col_err_list[b])
+    return covmat_list
+
+def matlist_to_matrix(rows, columns, mat_list):
+    r"""Convert a 1d list to a 2d matrix.
+
+    Note: This utils function is not strictly needed for
+    data implementation, however, it is provided for
+    the aid of the user due to how matrices are treated
+    throughout all the other functions. This function 
+    allows the user to convert a list that contains the
+    elemnets of matrix row by row to a proper matrix, if
+    need be for any reason.
+
+    Parameters
+    ----------
+    rows : int
+        No. of rows in the matrix
+    columns : int
+        No. of columns in the matrix
+    mat_list : list
+        A one dimensional list which contains the elements of
+        the matrix row by row.
+
+    Returns
+    -------
+    matrix : numpy.ndarray
+        The matrix as a numpy 2d array.
+
+    """
+    if rows * columns == len(mat_list):
+        matrix = np.zeros((rows, columns))
+        for i in range(rows):
+            for j in range(columns):
+                matrix[i][j] = mat_list[j + i * columns]
+        matrix = np.array(matrix)
+        return matrix
+    else:
+        raise Exception('rows * columns != len(mat_list)')
+
+def concat_matrices(rows, columns, list_of_matrices):
+    r"""Join smaller matrices into a large matrix.
+
+    This function aims to simplify the process of joining multiple
+    smaller matrices into one large matrix. Such a need could arise,
+    for instance, when cross variable covariance matrices are provided
+    but the user needs to join all the matrices to generate the full
+    covariance matrix corresponding to the entire dataset.
+
+    Parameters
+    ----------
+    rows : int
+        No. of rows of matrices to be concatenated. E.g., if 6 
+        matrices: A, B, C, D, E, F need to be joined as
+        [[A, B, C],
+        [D, E, F]],
+        the number of rows would be 2.
+    columns : int
+        No. of columns of matrices to be concatenated. In the 
+        above example, this would be 3.
+    list_of_matrices : list
+        A list of the matrices that have to concatenated row by 
+        row. In the above example, this would be [A, B, C, D, E, F].
+        The matrices themselves need to be provided as a list of lists, 
+        or a numpy 2d array. If the user has the matrix in a 1d row by
+        row form, use matList_to_matrix() to convert it. It is assumed 
+        the user verifies that all the input matrices have the correct 
+        dimensions. Matrices with incompatible dimensions will lead to 
+        undesired behavior.
+
+    Returns
+    -------
+    final_mat_list : list
+        A one dimensional list which contains the elements of
+        the final, fully concatenated matrix row by row.
+
+    """
+    for i in range(len(list_of_matrices)):
+        list_of_matrices[i] = np.array(list_of_matrices[i])
+    col_list = []
+    for i in range(rows):
+        row_list = []
+        for j in range(columns):
+            row_list.append(list_of_matrices[j + i * columns])
+        col_list.append(np.concatenate(tuple(row_list), axis=1))
+    final_mat = np.concatenate(tuple(col_list), axis=0)
+    final_mat_list = []
+    for i in range(len(final_mat)):
+        for j in range(len(final_mat[i])):
+            final_mat_list.append(final_mat[i][j])
+    return final_mat_list
+
+def trimat_to_fullmat(mode, tri_mat_list):
+    r"""Convert a list of values of a triangular matrix
+    to a symmetric matrix.
+
+    Experimental datasets can provide the entries of 
+    correlation or covariance matrices as a triangular
+    matrix, as these matrices are symmetric by their 
+    very nature. This function can convert these list to
+    a complete symmetric matrix, that can be used for the
+    dataset implementation.
+
+    mode : bool
+        Enter 0 or 1 based on the following scenarios:
+        Use mode 0 if matrix entries are given row by 
+        row such as:
+        0 1 2 3
+          4 5 6
+            7 8
+              9
+        Use mode 1 if the matrix entries are given column
+        by column such as:
+        0 1 3 6
+          2 4 7
+            5 8
+              9
+        Please note that the numbers above (0-9) are not
+        entries of the matrix but rather the index of the 
+        entries of the list which contains the elements of 
+        the triangular matrix.
+    tri_mat_list : list
+        A list containing the elements of the triangular matrix,
+        for example, for a 4*4 matrix, the list of 
+        triangular matrix entries could be: 
+        [a, b, c, d, e, f, g, h, i, j] 
+
+    Returns
+    -------
+    mat_list : list
+        A one dimensional list which contains the elements of
+        the fully populated, symmetric matrix row by row.    
+
+    """
+    dim = int((np.sqrt(1 + 8*len(tri_mat_list)) - 1)/2)
+    matrix = np.zeros((dim, dim))
+    if mode == 0:
+        for i in range(dim):
+            for j in range(i + 1):
+                list_el = len(tri_mat_list) - 1 - ((i*(i + 1))//2 + j)
+                if i == j:
+                    matrix[dim - 1 - i][dim - 1 - j] = tri_mat_list[list_el]
+                else:
+                    matrix[dim - 1 - i][dim - 1 - j] = tri_mat_list[list_el]
+                    matrix[dim - 1 - j][dim - 1 - i] = tri_mat_list[list_el]
+    elif mode == 1:
+        for i in range(dim):
+            for j in range(i + 1):
+                list_el = (i*(i + 1))//2 + j
+                if i == j:
+                    matrix[i][j] = tri_mat_list[list_el]
+                else:
+                    matrix[i][j] = tri_mat_list[list_el]
+                    matrix[j][i] = tri_mat_list[list_el]
+    else:
+        raise Exception('Mode should be 0 or 1, refer to the function for usage')
+    mat_list = []
+    for i in range(dim):
+        for j in range(dim):
+            mat_list.append(matrix[i][j])
+    return mat_list