"""
Survival Data Preprocessing Module.
This module provides utility functions for preprocessing survival analysis data, including
calculating time-to-event durations, handling missing or invalid data, creating event indicators,
and computing tissue ratios from the BCA values.
Requires: pandas
"""
from typing import Tuple
import pandas as pd
[docs]
def calculate_days(
    df: pd.DataFrame, start_date_col: str, event_date_col: str, event_col: str
) -> pd.DataFrame:
    """
    Adds a time-to-event column ('days') and an integer event flag ('event') to ``df``.

    Args:
        df (pd.DataFrame): Frame to extend. It is modified in place and also returned.
        start_date_col (str): Column holding the start dates.
        event_date_col (str, optional): Column holding the event dates. When this is
            None (or otherwise falsy) no 'days' column is computed and only the
            'event' column is written.
        event_col (str): Column holding the event indicator (1/0 or True/False).

    Returns:
        pd.DataFrame: The same frame, now carrying 'event' and, if an event date
        column was given, 'days'.

    Note:
        Both date columns are parsed with the format '%d.%m.%Y' (e.g. '31.12.2020').
        'days' is the whole-day difference "event date minus start date".
        'event' is the indicator column cast to int.
    """
    if event_date_col:
        # The event date is parsed first, then the start date.
        ended = pd.to_datetime(df[event_date_col], format="%d.%m.%Y")
        started = pd.to_datetime(df[start_date_col], format="%d.%m.%Y")
        elapsed = ended - started
        df["days"] = elapsed.dt.days

    indicator = df[event_col]
    df["event"] = indicator.astype(int)
    return df
[docs]
def check_and_remove_negative_days(df: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    Splits off the rows whose 'days' value is negative or missing.

    Args:
        df (pd.DataFrame): The input dataframe with a 'days' column.

    Returns:
        tuple: A tuple containing:
            - pd.DataFrame: The rows with a non-negative, non-missing 'days' value.
              When nothing has to be removed, the input frame itself is returned.
            - pd.DataFrame or None: The removed rows, or None if no rows were removed.

    Note:
        Negative durations can stem from data entry errors or from an event recorded
        before the start date; missing durations typically come from a missing date.
        A warning with the number of affected rows is printed whenever at least one
        row is removed, whether it is negative, missing, or both.
    """
    days = df["days"]
    # One combined mask, so that frames containing only NaN durations (and no
    # negative ones) are reported and split off as well.
    invalid = days.isna() | (days < 0)
    invalid_count = int(invalid.sum())

    if invalid_count == 0:
        return df, None

    print(
        f"Warning: {invalid_count} rows will be dropped because they contain values below 0 "
        f"or nan in the 'days' column."
    )
    return df[~invalid], df[invalid]
[docs]
def create_event_date_column(
    df: pd.DataFrame, date_death: str, date_disease_death: str, date_followup: str
) -> pd.DataFrame:
    """
    Creates an event date column and event indicator based on multiple date columns.
    This is used to prepare for Overall Survival analysis.

    Args:
        df (pd.DataFrame): The input dataframe. It is modified in place and also returned.
        date_death (str): Column name containing the date of death.
        date_disease_death (str): Column name containing the date of disease-specific death.
        date_followup (str): Column name containing the date of last follow-up.

    Returns:
        pd.DataFrame: The same dataframe with 'event_date' and 'event' columns added
        (or overwritten if they already exist).

    Note:
        For every row the first available date is used, in the priority order
        death, disease-specific death, last follow-up. 'event' is True when either
        death date is present and False when only the follow-up date is present.
        If a row has none of the three dates, 'event_date' stays missing and
        'event' is pd.NA.

        The columns are computed column-wise rather than row by row, so the result
        is correct for frames with duplicate index labels and both columns are
        created even when the frame is empty.
    """
    death = df[date_death]
    disease_death = df[date_disease_death]
    followup = df[date_followup]

    died = death.notna() | disease_death.notna()
    observed = died | followup.notna()

    # Pick the first non-missing date per row in priority order.
    event_date = death.where(death.notna(), disease_death)
    event_date = event_date.where(event_date.notna(), followup)

    # Object dtype so True / False / pd.NA can live side by side.
    event = pd.Series(pd.NA, index=df.index, dtype=object)
    # Positional boolean masks: independent of index labels, so duplicates are safe.
    event[observed.to_numpy()] = False
    event[died.to_numpy()] = True

    df["event_date"] = event_date
    df["event"] = event
    return df
[docs]
def compute_ratios(df: pd.DataFrame) -> pd.DataFrame:
    """
    Adds tissue-to-tissue ratio columns for every body part / metric combination
    for which both source measurements are present.

    Args:
        df (pd.DataFrame): Frame holding the tissue measurement columns. It is
            extended in place and also returned.

    Returns:
        pd.DataFrame: The same frame with one extra column per computable ratio.

    Note:
        Ratios such as intramuscular adipose tissue to total adipose tissue
        (imat/tat) or visceral fat to total fat (vat/tat) are computed per body
        part and per metric. Column names follow the convention
            '{body_part}::WL::{tissue_type}::{metric}' for measurements
            '{body_part}::WL::{numerator}/{denominator}::{metric}' for ratios
        e.g. 'l5::WL::imat/tat::mean_ml' is the imat-to-tat ratio of the mean_ml
        metric at the L5 vertebra level. A combination is skipped when either of
        its two source columns is missing from the frame.
    """
    # Body compartments first, then vertebra levels l5..l1 followed by t12..t1.
    regions = (
        ("ventral_cavity", "abdominal_cavity", "thoracic_cavity", "mediastinum", "pericardium")
        + tuple(f"l{level}" for level in range(5, 0, -1))
        + tuple(f"t{level}" for level in range(12, 0, -1))
    )
    statistics = (
        "mean_ml",
        "std_ml",
        "min_ml",
        "q1_ml",
        "q2_ml",
        "q3_ml",
        "max_ml",
        "sum_ml",
        "mean_hu",
    )
    tissue_pairs = (
        ("imat", "tat"),
        ("vat", "tat"),
        ("eat", "tat"),
        ("sat", "tat"),
        ("pat", "tat"),
        ("muscle", "bone"),
        ("imat", "muscle"),
    )

    # Flattened iteration in the order region -> statistic -> tissue pair, which
    # also determines the order in which new columns are appended.
    combinations = (
        (region, statistic, top, bottom)
        for region in regions
        for statistic in statistics
        for top, bottom in tissue_pairs
    )
    for region, statistic, top, bottom in combinations:
        top_col = f"{region}::WL::{top}::{statistic}"
        bottom_col = f"{region}::WL::{bottom}::{statistic}"
        if top_col not in df.columns or bottom_col not in df.columns:
            continue
        ratio_col = f"{region}::WL::{top}/{bottom}::{statistic}"
        df[ratio_col] = df[top_col] / df[bottom_col]

    return df