# data.py

#######################################################
###       Code for observational data loading       ###
###           by Manuel A. Buen-Abad, 2020          ###
###                and Chen Sun, 2020               ###
#######################################################

import os
import random
import numpy as np
import scipy.linalg as la
from numpy import pi, sqrt, log, log10, exp, power


# numexpr: according to the Pantheon likelihood code, it is much faster than numpy
try:
    import numexpr as ne
except ImportError:
    raise Exception(
        "This likelihood has intensive array manipulations. You "
        "have to install the numexpr Python package. Please type:\n"
        "(sudo) pip install numexpr --user")


# CONSTANTS:
# Conversion factor from arcseconds to radians: one full turn is 2*pi rad,
# i.e. 360 deg x 60 arcmin/deg x 60 arcsec/arcmin.
_rads_over_arcsec_ = (2.*pi)/(360.*60.*60.)  # [rad/arcsec]


##########################
# auxiliary functions
##########################

def read_matrix(path):
    """
    Read a square matrix from a text file and return it as a 2-D array.

    The first line of the file must hold the matrix dimension ``N`` (an
    integer).  The rest of the file is parsed with ``pandas.read_table``,
    which consumes that first line as the column header, so only the matrix
    entries remain; they are then reshaped to ``(N, N)``.

    pandas is imported inside the function and is required.

    This function is adopted from MontePython.

    :param path: path of the file holding the matrix
    :return: array of shape ``(N, N)``
    """
    from pandas import read_table

    # The dimension is written on the first line of the file.
    with open(path, 'r') as handle:
        header = handle.readline()
    size = int(header)

    # No skiprows needed: read_table takes the first line as the header.
    # ``.values`` replaces the DataFrame.as_matrix() call deprecated in
    # pandas 0.23.0.
    entries = read_table(path).values
    return entries.reshape((size, size))


##########################
# data loading functions
##########################

def load_quasars(dir_lkl, anchor_lkl, z_low=0., z_up=1000., Gamma_low=0., Gamma_up=10., get_dm=False):
    """Load a quasar catalogue and apply redshift and photon-index cuts.

    Two file layouts are recognised, selected purely by the file name:
    'quasars_Bisogni2021.txt' ("The Chandra view of the relation between
    X-ray and UV emission in quasars", Bisogni S. et al., Astron. Astrophys.
    655, A109 (2021), BibCode 2021A&A...655A.109B) and
    'quasars_Lusso2020.txt'.

    :param dir_lkl: folder of likelihood
    :param anchor_lkl: file of likelihood, either 'quasars_Bisogni2021.txt' or 'quasars_Lusso2020.txt'
    :param z_low: lower cut of the quasar redshift (strict: z > z_low)
    :param z_up: upper cut of the quasar redshift (strict: z < z_up)
    :param Gamma_low: lower cut of photon index (strict)
    :param Gamma_up: upper cut of photon index (strict)
    :param get_dm: if True, also return the distance modulus and its error
        (only present in the Lusso2020 file; filled with None for
        Bisogni2021). (Default: False)
    :return: tuple of arrays restricted to the quasars that pass the cuts:
        (name, z, logf2500, dlogf2500, logf2keV, dlogf2keV_low,
        dlogf2keV_up, Gamma), followed by (dist_mod, ddist_mod) when
        get_dm is True
    :raises Exception: if anchor_lkl is not one of the two supported names

    """
    catalogue = os.path.join(dir_lkl, anchor_lkl)

    if anchor_lkl == "quasars_Lusso2020.txt":
        table = np.loadtxt(catalogue, usecols=[3, 4, 5, 6, 7, 9, 11, 12]).T
        (qso_z_arr,
         qso_logf2500_arr,
         qso_dlogf2500_arr,
         qso_logf2keV_arr,
         qso_dlogf2keV_low_arr,
         qso_Gamma_arr,
         qso_dist_mod_arr,
         qso_ddist_mod_arr) = table
        # Lusso 2020 uses symmetric error bars
        qso_dlogf2keV_up_arr = qso_dlogf2keV_low_arr

    elif anchor_lkl == "quasars_Bisogni2021.txt":
        table = np.loadtxt(catalogue, usecols=[1, 6, 7, 8, 9, 10, 11]).T
        (qso_z_arr,
         qso_logf2500_arr,
         qso_dlogf2500_arr,
         qso_logf2keV_arr,
         qso_dlogf2keV_low_arr,
         qso_dlogf2keV_up_arr,
         qso_Gamma_arr) = table
        # there is no distance modulus column in the Bisogni data set.
        n_quasars = len(qso_z_arr)
        qso_dist_mod_arr = np.full(n_quasars, None, dtype=object)
        qso_ddist_mod_arr = np.full(n_quasars, None, dtype=object)

    else:
        raise Exception(
            "The choice of anchor_lkl can be either 'quasars_Bisogni2021.txt' or 'quasars_Lusso2020.txt'. You chose '%s' instead. " % anchor_lkl)

    # the object names sit in the first column of both layouts
    qso_name_arr = np.loadtxt(catalogue, usecols=[0], dtype='str')

    print("---%d quasars before cut---" % len(qso_z_arr))

    # boolean selection: redshift window first, then photon-index window
    keep = (qso_z_arr > z_low) & (qso_z_arr < z_up)
    print("---%d quasars remain after z cut---" % keep.sum())

    keep &= (qso_Gamma_arr > Gamma_low) & (qso_Gamma_arr < Gamma_up)
    print("---%d quasars remain after Gamma cut---" % keep.sum())

    selected = [qso_name_arr,
                qso_z_arr,
                qso_logf2500_arr,
                qso_dlogf2500_arr,
                qso_logf2keV_arr,
                qso_dlogf2keV_low_arr,
                qso_dlogf2keV_up_arr,
                qso_Gamma_arr]
    if get_dm:
        selected += [qso_dist_mod_arr, qso_ddist_mod_arr]

    return tuple(column[keep] for column in selected)


def load_shoes(dir_lkl, anchor_lkl, aB, aBsig):
    """
    Load SH0ES.

    The anchor file is read as comma-separated numbers after skipping its
    first two lines.  The loaded table is unpacked along its first axis into
    six quantities, so the file must provide exactly six data rows (one per
    quantity), in the order listed in the return value.

    :param dir_lkl: folder of likelihood
    :param anchor_lkl: file name of the SH0ES anchor data inside dir_lkl
    :param aB: value subtracted (as 5*aB) from the first row of the table
    :param aBsig: passed through unchanged (presumably the uncertainty on
        aB -- confirm with the caller)

    return: Anchor_SN, Anchor_SNsig, Anchor_Ceph, Anchor_Cephsig, Anchor_M, Anchor_Msig, aB, aBsig
    """

    # An explicit float dtype is passed on purpose: older NumPy releases
    # treated dtype=None as float, but recent ones reject dtype=None in
    # np.loadtxt with a TypeError.
    anchor_table = np.loadtxt(os.path.join(dir_lkl, anchor_lkl),
                              skiprows=2,
                              delimiter=",",
                              dtype=float)

    (Anchor_SN, Anchor_SNsig, Anchor_Ceph,
     Anchor_Cephsig, Anchor_M, Anchor_Msig) = anchor_table

    Anchor_SN = Anchor_SN - 5 * aB  # this is the measured m_SN

    return (Anchor_SN, Anchor_SNsig, Anchor_Ceph, Anchor_Cephsig, Anchor_M, Anchor_Msig, aB, aBsig)


def load_pantheon(dir_lkl, Pantheon_lkl, Pantheon_covmat, Pantheon_subset, verbose):
    """
    Load Pantheon.

    Reads columns 1, 4 and 5 of the data file (first line skipped) and the
    covariance matrix, keeps a random subset of ``Pantheon_subset`` entries
    (the same entries are dropped from the data and from both axes of the
    covariance), adds the square of the third selected data column to the
    diagonal of the covariance, and returns its lower Cholesky factor and
    log-determinant.

    :param dir_lkl: folder of likelihood
    :param Pantheon_lkl: file name of the data table inside dir_lkl
    :param Pantheon_covmat: file name of the covariance matrix inside
        dir_lkl (read with read_matrix)
    :param Pantheon_subset: number of entries to keep (converted with int());
        the entries to drop are chosen with random.sample, so the selection
        is only reproducible if the caller seeds ``random``
    :param verbose: sizes and shapes are printed when verbose >= 2
    :raises ValueError: from random.sample when Pantheon_subset exceeds the
        number of rows in the data table

    return: PAN_lkl, PAN_cov_sqrt, PAN_cov_logdet
    """

    PAN_lkl = np.loadtxt(os.path.join(dir_lkl, Pantheon_lkl),
                         skiprows=1,
                         usecols=(1, 4, 5))

    C00 = read_matrix(os.path.join(dir_lkl, Pantheon_covmat))

    # choose a subset of covmat and lkl
    full_length = len(PAN_lkl)
    subset_length = int(Pantheon_subset)
    del_length = full_length - subset_length
    del_idx = random.sample(range(full_length), del_length)

    # covmat: drop the same indices from columns and rows
    C00 = np.delete(C00, del_idx, axis=1)
    C00 = np.delete(C00, del_idx, axis=0)

    # lkl
    PAN_lkl = np.delete(PAN_lkl, del_idx, axis=0)
    if verbose >= 2:
        print('full_length=%s' % full_length)
        print('subset_length=%s' % subset_length)
        print('del_length=%s' % del_length)
        print('C00.shape=%s' % str(C00.shape))
        print('PAN_lkl.shape=%s' % str(PAN_lkl.shape))
    # end of choice

    # numexpr resolves "C00" from the local variable of that name
    PAN_cov = ne.evaluate("C00")
    PAN_cov += np.diag(PAN_lkl[:, 2]**2)

    # The log-determinant must be taken BEFORE the Cholesky factorisation:
    # overwrite_a=True allows scipy to destroy the contents of PAN_cov, so
    # evaluating slogdet afterwards could operate on garbage.
    _, PAN_cov_logdet = np.linalg.slogdet(PAN_cov)
    PAN_cov_sqrt = la.cholesky(PAN_cov, lower=True, overwrite_a=True)

    return (PAN_lkl, PAN_cov_sqrt, PAN_cov_logdet)


def load_boss_dr12(dir_lkl, BOSSDR12_rsfid, BOSSDR12_meas, BOSSDR12_covmat):
    """
    Load BOSS DR12.

    The measurement file is read line by line as whitespace-separated
    ``<z> <label> <value>`` records.  Lines labelled 'dM(rsfid/rs)' provide
    both the redshift and the dM value; lines labelled 'Hz(rs/rsfid)' provide
    the Hz value only (their redshift is not stored).  Blank lines, lines
    whose first token starts with '#', and lines with any other label are
    ignored.

    :param dir_lkl: folder of likelihood
    :param BOSSDR12_rsfid: returned unchanged as BOSS_rsfid
    :param BOSSDR12_meas: file name of the measurements inside dir_lkl
    :param BOSSDR12_covmat: file name of the covariance matrix inside dir_lkl
        (plain text, read with np.loadtxt)

    return: BOSS_rsfid, BOSS_meas_z, BOSS_meas_dM, BOSS_meas_Hz, BOSS_cov, BOSS_icov, BOSS_cov_logdet
    """

    BOSS_rsfid = BOSSDR12_rsfid

    # accumulate in plain lists; converted to float64 arrays once at the end
    z_values, dM_values, Hz_values = [], [], []

    with open(os.path.join(dir_lkl, BOSSDR12_meas)) as f:
        for line in f:
            words = line.split()
            # skip blank lines and comments
            if not words or words[0].startswith('#'):
                continue
            if words[1] == 'dM(rsfid/rs)':
                z_values.append(float(words[0]))
                dM_values.append(float(words[2]))
            elif words[1] == 'Hz(rs/rsfid)':
                Hz_values.append(float(words[2]))

    BOSS_meas_z = np.array(z_values, 'float64')
    BOSS_meas_dM = np.array(dM_values, 'float64')
    BOSS_meas_Hz = np.array(Hz_values, 'float64')

    BOSS_cov = np.loadtxt(os.path.join(dir_lkl, BOSSDR12_covmat))
    BOSS_icov = np.linalg.inv(BOSS_cov)
    _, BOSS_cov_logdet = np.linalg.slogdet(BOSS_cov)

    return (BOSS_rsfid, BOSS_meas_z, BOSS_meas_dM, BOSS_meas_Hz, BOSS_cov, BOSS_icov, BOSS_cov_logdet)


def load_bao_lowz(dir_lkl, BAOlowz_lkl):
    """
    Load BAOlowz (6DFs + DR7 MGS)

    Each data line holds five whitespace-separated fields:
    ``<experiment> <z> <rs/dV or dV/rs> <sigma> <type>``.  Blank lines and
    lines whose first token starts with '#' are skipped.

    :param dir_lkl: folder of likelihood
    :param BAOlowz_lkl: file name of the measurements inside dir_lkl

    return: BAOlowz_meas_exp, BAOlowz_meas_z, BAOlowz_meas_rs_dV, BAOlowz_meas_sigma, BAOlowz_meas_type
    """

    # accumulate in plain lists; converted to arrays once at the end
    experiments = []
    redshifts = []
    ratios = []   # rs/dV or dV/rs
    sigmas = []
    kinds = []    # type 3, dV/rs, type 7 rs/dV

    with open(os.path.join(dir_lkl, BAOlowz_lkl)) as f:
        for line in f:
            words = line.split()
            # skip blank lines and comments
            if not words or words[0].startswith('#'):
                continue
            experiments.append(words[0])
            redshifts.append(float(words[1]))
            ratios.append(float(words[2]))
            sigmas.append(float(words[3]))
            kinds.append(int(words[4]))

    # Appending to an empty default array keeps the dtype this function has
    # always produced for the experiment names (including the empty-file case).
    BAOlowz_meas_exp = np.append(np.array([]), experiments)
    BAOlowz_meas_z = np.array(redshifts, 'float64')
    BAOlowz_meas_rs_dV = np.array(ratios, 'float64')
    BAOlowz_meas_sigma = np.array(sigmas, 'float64')
    BAOlowz_meas_type = np.array(kinds, 'int')

    return (BAOlowz_meas_exp, BAOlowz_meas_z, BAOlowz_meas_rs_dV, BAOlowz_meas_sigma, BAOlowz_meas_type)


def load_clusters(dir_lkl, flg_load_err=False):
    """
    Load clusters ADD.

    Reads ``add.txt`` from ``dir_lkl``.  Blank lines and any line containing
    '#' are skipped.  On each remaining line the first two whitespace-separated
    fields form the cluster name, fields 2-15 hold z, DA, its upper and lower
    errors, and then (value, error) pairs for ne0, beta, rc_out, f and rc_in;
    field 20 holds Rvir.

    rc_out, rc_in and Rvir are rescaled by ``DA*1e3*_rads_over_arcsec_``
    (arcsec to kpc according to the in-line comments; this presumably means
    DA is given in Mpc -- confirm against the data file).

    The error on DA combines, in quadrature, the per-cluster error read from
    the file with fractional statistical and systematic contributions taken
    from Bonamente et al., astro-ph/0512349, Table 3.  ``err_cls`` is the mean
    of the upper and lower errors and ``asymm_cls`` their relative difference.

    :param dir_lkl: folder of likelihood (the file name 'add.txt' is joined
        to it with os.path.join, so a trailing separator is optional)
    :param flg_load_err: if True, the errors on ne0, beta, rc_out, f and rc_in
        are appended to the returned tuple. (Default: False)

    return: names, z_cls, DA_cls, err_cls, asymm_cls, ne0_cls, beta_cls, rc_out_cls, f_cls, rc_in_cls, Rvir_cls
            [, ne0_err_cls, beta_err_cls, rc_out_err_cls, f_err_cls, rc_in_err_cls]
    """

    # from Bonamente et al., astro-ph/0512349, Table 3.
    stat = np.array([0.01, 0.15, 0.08, 0.08, 0.01, 0.02])
    sys_p = np.array([0.03, 0.05, 0.075, 0.08])
    sys_n = np.array([0.05, 0.075, 0.08])

    # positions of the numeric fields used from every data line
    numeric_columns = list(range(2, 16)) + [20]

    names = []
    rows = []

    with open(os.path.join(dir_lkl, 'add.txt'), 'r') as filein:
        for line in filein:
            # skip blank lines and any line carrying a comment marker
            if not line.strip() or '#' in line:
                continue

            this_line = line.split()
            names.append(this_line[0]+' '+this_line[1])
            rows.append([float(this_line[k]) for k in numeric_columns])

    # one row per quantity; the reshape keeps this valid for an empty file
    table = np.array(rows, dtype=float).reshape(-1, len(numeric_columns))
    (z_cls,
     DA_cls, p_err_cls, n_err_cls,
     ne0_cls, ne0_err_cls,
     beta_cls, beta_err_cls,
     rc_out_cls, rc_out_err_cls,
     f_cls, f_err_cls,
     rc_in_cls, rc_in_err_cls,
     Rvir_cls) = np.ascontiguousarray(table.T)

    # converting from arcsec to kpc
    rc_out_cls = (DA_cls*1.e3)*(_rads_over_arcsec_*rc_out_cls)
    # converting from arcsec to kpc
    rc_in_cls = (DA_cls*1.e3)*(_rads_over_arcsec_*rc_in_cls)
    # converting from arcsec to kpc
    Rvir_cls = (DA_cls*1.e3)*(_rads_over_arcsec_*Rvir_cls)
    # NOTE(review): rc_out_err_cls and rc_in_err_cls are returned exactly as
    # read from the file, i.e. without the rescaling above -- confirm this is
    # what callers expect.

    sig_p = sqrt(DA_cls*DA_cls*((stat**2.).sum() +
                                sys_p.sum()**2.) + p_err_cls**2.)
    sig_m = sqrt(DA_cls*DA_cls*((stat**2.).sum() +
                                sys_n.sum()**2.) + n_err_cls**2.)

    err_cls = (sig_p + sig_m)/2.
    asymm_cls = (sig_p - sig_m)/(sig_p + sig_m)

    result = (names, z_cls, DA_cls, err_cls, asymm_cls, ne0_cls, beta_cls, rc_out_cls, f_cls, rc_in_cls, Rvir_cls)
    if flg_load_err:
        result += (ne0_err_cls, beta_err_cls, rc_out_err_cls, f_err_cls, rc_in_err_cls)
    return result