Source code for bayesgm.utils.data_io

import numpy as np
import os
import sys
import pandas as pd
from sklearn.preprocessing import MinMaxScaler,MaxAbsScaler,StandardScaler



[docs]
def save_data(fname, data, delimiter='\t'):
    """
    Write an array to disk, choosing the format from the file extension.

    Parameters
    ----------
    fname : str
        Destination file name or path. Must end in ``.npy``, ``.txt`` or
        ``.csv``.
    data : np.ndarray
        The array to write.
    delimiter : str, optional
        Column separator used for ``.txt`` / ``.csv`` output. Defaults to a
        tab character. Ignored for ``.npy`` output.

    Raises
    ------
    ValueError
        If ``fname`` does not end with one of the supported extensions.
    """
    # Binary NumPy format: stored as-is, no text formatting involved.
    if fname.endswith('.npy'):
        np.save(fname, data)
        return

    # Anything that is not a recognised text format is rejected up front.
    if not fname.endswith(('.txt', '.csv')):
        raise ValueError("Wrong saving format, please specify either .npy, .txt, or .csv")

    # Text output: fixed six-decimal formatting for every value.
    np.savetxt(fname, data, fmt='%.6f', delimiter=delimiter)



[docs]
def parse_file(path, sep='\t', header=0, normalize=True):
    """
    Parse an input data file and return a single data matrix.

    This is a general-purpose loader for the BGM model, where the input
    is a single data matrix (as opposed to the causal triplet format with
    treatment, outcome, and covariates).

    Parameters
    ----------
    path : str
        Path to the input file. Supported formats: .npz, .csv, .txt.
    sep : str, optional
        Separator for .csv or .txt files. Default is tab-delimited.
    header : int or None, optional
        Row number to use as column names in .csv files. Default is 0.
        Not used for .txt or .npz input.
    normalize : bool, optional
        If True, the data will be normalized using ``StandardScaler``.

    Returns
    -------
    data : np.ndarray
        The data matrix, cast to float32. Text input (.csv / .txt) always
        yields a 2-D array of shape ``(n_samples, n_features)``.

    Raises
    ------
    FileNotFoundError
        If ``path`` does not exist.
    ValueError
        If the file extension is not one of the supported formats, or if an
        .npz archive contains no arrays.

    Examples
    --------
    >>> data = parse_file("data.csv", sep=',', normalize=True)
    >>> data = parse_file("data.npz", normalize=False)
    """
    # Explicit check instead of ``assert``: assertions are stripped under -O.
    if not os.path.exists(path):
        raise FileNotFoundError(f"File not found: {path}")

    if path.endswith('npz'):
        # np.load on an .npz returns a lazy archive holding an open file
        # handle; the context manager guarantees it is released.
        with np.load(path) as loaded:
            available = loaded.files
            # Prefer common key names: 'data', 'x', 'X'; otherwise fall back
            # to the first array stored in the archive.
            key = next((k for k in ('data', 'x', 'X') if k in available), None)
            if key is None:
                if not available:
                    raise ValueError(f"No arrays found in npz file: {path}")
                key = available[0]
            # Indexing materialises the array, so it outlives the archive.
            data = loaded[key]
    elif path.endswith('csv'):
        data = pd.read_csv(path, header=header, sep=sep).values
    elif path.endswith('txt'):
        # ndmin=2 keeps single-row / single-column files as a 2-D matrix.
        data = np.loadtxt(path, delimiter=sep, ndmin=2)
    else:
        # Raise rather than terminate the interpreter from library code.
        raise ValueError('File format not recognized, please use .npz, .csv or .txt as input.')

    data = data.astype('float32')
    if normalize:
        data = StandardScaler().fit_transform(data)
    return data




[docs]
def parse_file_triplet(path, sep='\t', header=0, normalize=True):
    """
    Parse an input file and extract the (treatment, outcome, covariates) triplet
    for CausalBGM model training or evaluation.

    Parameters
    ----------
    path : str
        Path to the input file. The file can be in .npz, .csv, or .txt format.
    sep : str, optional
        Separator used in .csv or .txt files. Defaults to tab-delimited format.
    header : int or None, optional
        Row number to use as column names in .csv files. Default is 0 (the first row).
        Use `None` if the file does not have a header. Not used for .txt or
        .npz input.
    normalize : bool, optional
        If True, the features in `v` will be normalized using `StandardScaler`.

    Returns
    -------
    data_x : np.ndarray
        The treatment variable. For .csv / .txt input this is the first
        column, reshaped to (-1, 1) and cast to float32. For .npz input the
        array stored under key `x` is returned as stored.
    data_y : np.ndarray
        The outcome variable. For .csv / .txt input this is the second
        column, reshaped to (-1, 1) and cast to float32. For .npz input the
        array stored under key `y` is returned as stored.
    data_v : np.ndarray
        Covariates. For .csv / .txt input these are the remaining columns
        cast to float32; for .npz input the array stored under key `v`.
        Normalized if `normalize=True`.

    Raises
    ------
    FileNotFoundError
        If `path` does not exist.
    ValueError
        If the file extension is not one of the supported formats.

    Notes
    -----
    - Supported file formats:
        - `.npz`: Numpy compressed files with keys `x`, `y`, and `v`.
        - `.csv`: Delimited files with treatment, outcome, and covariates as columns.
        - `.txt`: Tab- or other character-delimited text files with similar structure to .csv.
    - For .csv / .txt input, the first column is taken as the treatment (`x`),
      the second as the outcome (`y`), and the remaining columns as
      covariates (`v`).

    Examples
    --------
    # Example for .csv input
    data_x, data_y, data_v = parse_file_triplet("data.csv", sep=',', header=0, normalize=True)

    # Example for .npz input
    data_x, data_y, data_v = parse_file_triplet("data.npz", normalize=False)
    """
    # Explicit check instead of ``assert``: assertions are stripped under -O.
    if not os.path.exists(path):
        raise FileNotFoundError(f"File not found: {path}")

    if path.endswith('npz'):
        # The archive keeps a file handle open; indexing inside the ``with``
        # block materialises the arrays before the handle is released.
        with np.load(path) as archive:
            data_x, data_y, data_v = archive['x'], archive['y'], archive['v']
    elif path.endswith(('csv', 'txt')):
        if path.endswith('csv'):
            # Honour the caller's ``header`` choice so header-less files do
            # not lose their first data row.
            table = pd.read_csv(path, header=header, sep=sep).values
        else:
            # ndmin=2 keeps a single-row file two-dimensional so the column
            # slicing below works.
            table = np.loadtxt(path, delimiter=sep, ndmin=2)
        data_x = table[:, 0].reshape(-1, 1).astype('float32')
        data_y = table[:, 1].reshape(-1, 1).astype('float32')
        data_v = table[:, 2:].astype('float32')
    else:
        # Raise rather than terminate the interpreter from library code.
        raise ValueError('File format not recognized, please use .npz, .csv or .txt as input.')

    if normalize:
        data_v = StandardScaler().fit_transform(data_v)
    return data_x, data_y, data_v