# Source code for bca_survival.preprocessing

"""
Survival Data Preprocessing Module.

This module provides utility functions for preprocessing survival analysis data, including
calculating time-to-event durations, handling missing or invalid data, creating event indicators,
and computing tissue ratios from the BCA values.

Requires: pandas
"""

from typing import Tuple

import pandas as pd


def calculate_days(
    df: pd.DataFrame, start_date_col: str, event_date_col: str, event_col: str
) -> pd.DataFrame:
    """
    Add a time-to-event column ('days') and an integer event indicator ('event').

    Args:
        df (pd.DataFrame): The input dataframe. It is modified in place and
            also returned.
        start_date_col (str): Name of the column holding the start dates.
        event_date_col (str, optional): Name of the column holding the event
            dates. When this is falsy (e.g. None), the 'days' column is not
            computed and only the 'event' column is written.
        event_col (str): Name of the column holding the event indicators
            (1/0 or True/False).

    Returns:
        pd.DataFrame: The same dataframe with the 'event' column added and,
        if `event_date_col` was given, the 'days' column as well.

    Note:
        Both date columns are parsed with the format '%d.%m.%Y'
        (e.g., '31.12.2020'). 'days' is the whole-day difference
        event date minus start date, so it is negative when the event date
        precedes the start date. 'event' is `event_col` cast to int.
    """
    date_format = "%d.%m.%Y"

    if event_date_col:
        event_dates = pd.to_datetime(df[event_date_col], format=date_format)
        start_dates = pd.to_datetime(df[start_date_col], format=date_format)
        elapsed = event_dates - start_dates
        df["days"] = elapsed.dt.days

    indicator = df[event_col]
    df["event"] = indicator.astype(int)
    return df


def check_and_remove_negative_days(df: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    Checks for and removes rows with negative or NaN values in the 'days' column.

    Args:
        df (pd.DataFrame): The input dataframe with a 'days' column.

    Returns:
        tuple: A tuple containing:
            - pd.DataFrame: DataFrame with negative and NaN 'days' values removed.
            - pd.DataFrame or None: DataFrame containing only the removed rows,
              or None if no rows were removed.

    Note:
        Negative 'days' values arise when the event date lies before the start
        date. Rows whose 'days' value is NaN are treated the same way. A
        warning is printed whenever at least one row is removed, regardless of
        whether it was removed for being negative or for being NaN.
    """
    negative_mask = df["days"] < 0
    nan_mask = df["days"].isna()
    invalid_mask = negative_mask | nan_mask
    dropped_count = int(invalid_mask.sum())

    df_negative = None
    # Gate on the combined mask: NaN-only rows are dropped as well, so they
    # must also trigger the warning and be reported back to the caller.
    if dropped_count > 0:
        print(
            f"Warning: {dropped_count} rows will be dropped because they contain values below 0 "
            f"or nan in the 'days' column."
        )
        df_negative = df[invalid_mask]

    df = df[~invalid_mask]
    return df, df_negative


def create_event_date_column(
    df: pd.DataFrame, date_death: str, date_disease_death: str, date_followup: str
) -> pd.DataFrame:
    """
    Creates an event date column and event indicator based on multiple date columns.

    This is used to prepare for Overall Survival analysis.

    Args:
        df (pd.DataFrame): The input dataframe. It is modified in place and
            also returned.
        date_death (str): Column name containing the date of death.
        date_disease_death (str): Column name containing the date of
            disease-specific death.
        date_followup (str): Column name containing the date of last follow-up.

    Returns:
        pd.DataFrame: The same dataframe with 'event_date' and 'event' columns
        added (both stored with object dtype).

    Note:
        For each row the first non-missing value among `date_death`,
        `date_disease_death` and `date_followup` (in that order) becomes
        'event_date'. 'event' is True when the value came from one of the two
        death columns and False when it came from the follow-up column. When
        all three are missing, both 'event_date' and 'event' are set to pd.NA.

        Values are computed positionally and assigned once per column, so the
        columns are created even for an empty dataframe and rows that share an
        index label do not overwrite one another.
    """
    event_dates = []
    events = []

    row_values = zip(df[date_death], df[date_disease_death], df[date_followup])
    for death, disease_death, followup in row_values:
        if not pd.isna(death):
            event_dates.append(death)
            events.append(True)
        elif not pd.isna(disease_death):
            event_dates.append(disease_death)
            events.append(True)
        elif not pd.isna(followup):
            event_dates.append(followup)
            events.append(False)
        else:
            event_dates.append(pd.NA)
            events.append(pd.NA)

    # Build the Series on df's own index so the assignment is positional
    # (identical indexes are not re-aligned, which keeps duplicate labels safe).
    # object dtype keeps True/False/pd.NA exactly as stored.
    df["event_date"] = pd.Series(event_dates, index=df.index, dtype=object)
    df["event"] = pd.Series(events, index=df.index, dtype=object)
    return df


def compute_ratios(df: pd.DataFrame) -> pd.DataFrame:
    """
    Add tissue-ratio columns for every body part / metric where both inputs exist.

    Args:
        df (pd.DataFrame): The input dataframe containing tissue measurement
            columns. It is modified in place and also returned.

    Returns:
        pd.DataFrame: The same dataframe with one extra column per ratio that
        could be computed.

    Note:
        Measurement columns are looked up as
            '{body_part}::WL::{tissue_type}::{metric}'
        and each ratio is written to
            '{body_part}::WL::{numerator}/{denominator}::{metric}'.
        For example, 'l5::WL::imat/tat::mean_ml' is the column
        'l5::WL::imat::mean_ml' divided element-wise by 'l5::WL::tat::mean_ml'.
        A ratio is skipped when either of its two source columns is absent
        from the dataframe.
    """
    # Fixed lookup tables: anatomical regions, per-region statistics, and the
    # (numerator, denominator) tissue pairs to divide.
    body_parts = (
        "ventral_cavity",
        "abdominal_cavity",
        "thoracic_cavity",
        "mediastinum",
        "pericardium",
        "l5",
        "l4",
        "l3",
        "l2",
        "l1",
        "t12",
        "t11",
        "t10",
        "t9",
        "t8",
        "t7",
        "t6",
        "t5",
        "t4",
        "t3",
        "t2",
        "t1",
    )
    metrics = (
        "mean_ml",
        "std_ml",
        "min_ml",
        "q1_ml",
        "q2_ml",
        "q3_ml",
        "max_ml",
        "sum_ml",
        "mean_hu",
    )
    ratios = (
        ("imat", "tat"),
        ("vat", "tat"),
        ("eat", "tat"),
        ("sat", "tat"),
        ("pat", "tat"),
        ("muscle", "bone"),
        ("imat", "muscle"),
    )

    # Flatten the three nested levels into one stream, keeping the same
    # body part -> metric -> ratio ordering for the created columns.
    combinations = (
        (body_part, metric, numerator, denominator)
        for body_part in body_parts
        for metric in metrics
        for numerator, denominator in ratios
    )

    for body_part, metric, numerator, denominator in combinations:
        num_col = f"{body_part}::WL::{numerator}::{metric}"
        den_col = f"{body_part}::WL::{denominator}::{metric}"

        # Skip ratios whose inputs are not both present.
        if num_col not in df.columns or den_col not in df.columns:
            continue

        new_col = f"{body_part}::WL::{numerator}/{denominator}::{metric}"
        df[new_col] = df[num_col] / df[den_col]

    return df