Source code for ciss_vae.data

"""Data loading utilities for CISS-VAE example dataset."""
import pandas as pd
try:
    from importlib import resources
except ImportError:
    # Python < 3.9 fallback
    import importlib_resources as resources

[docs] def load_example_dataset(): """ Load the complete CISS-VAE example dataset. Returns ------- tuple A tuple containing (df_missing, df_complete, clusters) where: - df_missing : pd.DataFrame DataFrame with missing values - df_complete : pd.DataFrame Complete DataFrame without missing values - clusters : np.ndarray Cluster labels as 1D array Examples -------- >>> from ciss_vae.data import load_example_dataset >>> df_missing, df_complete, clusters = load_example_dataset() """ # Load df_missing.csv with index_col=[0] with resources.open_text(__package__, "df_missing.csv") as f: df_missing = pd.read_csv(f, index_col=[0]) # Load df_complete.csv with index_col=[0] with resources.open_text(__package__, "df_complete.csv") as f: df_complete = pd.read_csv(f, index_col=[0]) # Load clusters.csv with index_col=[0] and squeeze to 1D array with resources.open_text(__package__, "clusters.csv") as f: clusters_df = pd.read_csv(f, index_col=[0]) clusters = clusters_df.values.squeeze() return df_missing, df_complete, clusters
[docs] def load_missing_data(): """Load only the missing data DataFrame.""" with resources.open_text(__package__, "df_missing.csv") as f: return pd.read_csv(f, index_col=[0])
[docs] def load_complete_data(): """Load only the complete data DataFrame.""" with resources.open_text(__package__, "df_complete.csv") as f: return pd.read_csv(f, index_col=[0])
[docs] def load_clusters(): """Load only the cluster labels as 1D array.""" with resources.open_text(__package__, "clusters.csv") as f: clusters_df = pd.read_csv(f, index_col=[0]) return clusters_df.values.squeeze()
def load_dni(): """Load dni matrix for df_missing""" with resources.open_text(__package__, "dni.csv") as f: return pd.read_csv(f, index_col=[0]) # For convenience, make the main function available at package level __all__ = ["load_example_dataset", "load_missing_data", "load_complete_data", "load_clusters"]