Source code for bca_survival.tools.bca_totalseg_extraction

"""
BOA Results extractor

This script processes BOA data by extracting measurements from the individual JSON files
across a directory structure. It targets two types of measurement files:
- total-measurements.json: Contains segmentation measurements for various organs
- bca-measurements.json: Contains body composition analysis measurements

The script consolidates these measurements into Excel spreadsheets for further analysis.

Usage:
    boa-extract base_path output_path
    python -m bca_survival.tools.bca_totalseg_extraction base_path output_path

Author: Eric
"""

import argparse
import json
import os
from pathlib import Path
from typing import Tuple, Union

import pandas as pd



[docs]
def process_json_files(root_dir: str) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    Walks through the directory structure, identifies relevant JSON files,
    processes them, and compiles the data into pandas DataFrames.

    Only directories that contain a ``total-measurements.json`` file are
    processed. For each such directory the file is handed to
    ``process_totalseg_measurements`` and the same directory is passed to
    ``process_bca_measurements``; a ``bca-measurements.json`` located in a
    directory without ``total-measurements.json`` is therefore not picked up.

    Args:
        root_dir (str): The root directory to search for measurement files

    Returns:
        tuple: A tuple containing two DataFrames:
            - final_total_df: DataFrame with organ segmentation measurements
            - final_bca_df: DataFrame with body composition analysis measurements

        Either DataFrame is empty (no rows, no columns) when nothing was found
        for it.
    """
    totalseg_data = []
    bca_data = []
    root_dir = str(Path(root_dir))
    print("Evaluating all files in {}".format(root_dir))

    # Walk through directory; the list of sub-directory names is not needed.
    for dirpath, _, filenames in os.walk(root_dir):
        if "total-measurements.json" not in filenames:
            continue

        file_path = os.path.join(dirpath, "total-measurements.json")
        totalseg_data.append(process_totalseg_measurements(file_path, dirpath))

        # The BCA file is optional; the helper returns None when it is absent.
        bca_df = process_bca_measurements(dirpath)
        if bca_df is not None:
            bca_data.append(bca_df)

        # Report progress
        print(f"Processed: {file_path}")

    # Combine the per-file rows of organ segmentation measurements
    if totalseg_data:
        final_total_df = pd.concat(totalseg_data, ignore_index=True)
    else:
        print("No total-measurements files found.")
        final_total_df = pd.DataFrame()

    # Combine the per-file rows of BCA measurements (writing to disk is the
    # caller's responsibility)
    if bca_data:
        final_bca_df = pd.concat(bca_data, ignore_index=True)
    else:
        print("No Processable bca files found")
        final_bca_df = pd.DataFrame()

    return final_total_df, final_bca_df




[docs]
def process_totalseg_measurements(file_path: str, dirpath: str) -> pd.DataFrame:
    """
    Processes an individual total-measurements.json file to extract organ segmentation measurements.

    Only entries under ``segmentations -> total`` whose ``present`` flag is
    truthy contribute columns. Each of their remaining keys becomes a column
    named ``<organ>::<metric>``. Entries that are not JSON objects are skipped
    because they cannot carry a ``present`` flag.

    Args:
        file_path (str): Path to the JSON file
        dirpath (str): Directory path containing the file (used to extract the study ID)

    Returns:
        pandas.DataFrame: A DataFrame with one row representing the measurements from the file.
            The ``StudyID`` column holds the name of the parent folder of ``dirpath``.
    """
    # JSON is UTF-8 by specification, so do not rely on the platform's locale
    # encoding. The file is closed as soon as it has been parsed.
    with open(file_path, "r", encoding="utf-8") as file:
        content = json.load(file)

    # Get the parent folder name (one level above the folder holding the file)
    study_id = os.path.basename(os.path.dirname(dirpath))

    # Prepare the data for the dataframe
    row_data = {"StudyID": study_id}

    # Iterate through the 'total' key
    if "segmentations" in content and "total" in content["segmentations"]:
        for organ, metrics in content["segmentations"]["total"].items():
            if not isinstance(metrics, dict) or not metrics.get("present"):
                continue
            for metric, value in metrics.items():
                if metric != "present":
                    row_data[f"{organ}::{metric}"] = value

    return pd.DataFrame([row_data])




[docs]
def process_bca_measurements(folder_path: str) -> Union[pd.DataFrame, None]:
    """
    Processes the bca-measurements.json file in the given folder and extracts the measurement data.

    Columns are named ``<scan_key>::<ALL|WL>::<key>::<metric>``. The second
    component is ``ALL`` when the entry is stored under the ``measurements``
    key and ``WL`` for any other key. Metric names that do not already end in
    ``_hu`` get an ``_ml`` suffix appended.

    Args:
        folder_path (str): The path to the folder containing the bca-measurements.json file.

    Returns:
        pandas.DataFrame or None: A one-row DataFrame containing the measurement data with
                                formatted column names, or None if the file doesn't exist.
                                The ``StudyID`` column holds the name of the parent folder
                                of ``folder_path``.
    """
    file_path = os.path.join(folder_path, "bca-measurements.json")
    if not os.path.exists(file_path):
        return None

    # JSON is UTF-8 by specification, so do not rely on the platform's locale
    # encoding.
    with open(file_path, "r", encoding="utf-8") as file:
        data = json.load(file)

    aggregated = data.get("aggregated", {})
    study_id = os.path.basename(os.path.dirname(folder_path))
    row = {"StudyID": study_id}

    for scan_key, scan_value in aggregated.items():
        for measurement_type, measurements in scan_value.items():
            # Scalar entries (counts and any other non-object values) carry no
            # per-key metrics, so only nested objects are expanded.
            if not isinstance(measurements, dict):
                continue
            prefix = f"{scan_key}::{'ALL' if measurement_type == 'measurements' else 'WL'}::"
            for key, metrics in measurements.items():
                for metric, value in metrics.items():
                    suffix = metric if metric.endswith("_hu") else metric + "_ml"
                    row[f"{prefix}{key}::{suffix}"] = value

    return pd.DataFrame([row])




[docs]
def main(root_path: str, output_path: str) -> None:
    """
    Main function to iterate over folders, process the JSON files, and save the results to Excel files.

    When the first ``StudyID`` of the organ segmentation results contains an
    underscore, the ``StudyID`` columns are replaced by the integer value of
    the second underscore-separated component of each ID. This step is skipped
    for a result table that has no ``StudyID`` column, which is the case when
    no matching files were found for it.

    Args:
        root_path (str): The base directory containing the folders with JSON files
        output_path (str): Path to save the resulting Excel files
    """
    total_df, bca_df = process_json_files(root_path)

    # An empty result has no "StudyID" column at all, so it must be checked
    # before the first ID is inspected.
    total_has_ids = "StudyID" in total_df.columns and not total_df.empty
    if total_has_ids and "_" in total_df["StudyID"].iloc[0]:
        # BCA files are optional, so this table can be empty even when
        # total-measurements files were found.
        if "StudyID" in bca_df.columns:
            bca_df["StudyID"] = bca_df["StudyID"].apply(lambda x: x.split("_")[1]).astype(int)
        total_df["StudyID"] = total_df["StudyID"].apply(lambda x: x.split("_")[1]).astype(int)

    # Create output directory if it doesn't exist
    os.makedirs(output_path, exist_ok=True)

    # Save results to Excel files
    total_df.to_excel(os.path.join(output_path, "total-measurements.xlsx"), index=False)
    bca_df.to_excel(os.path.join(output_path, "bca-measurements.xlsx"), index=False)

    print(f"Results saved to {output_path}")
    print(
        f"Found {len(total_df)} total measurement records and {len(bca_df)} BCA measurement records"
    )




[docs]
def main_cli() -> None:
    """
    Console-script entry point for the BOA extractor.

    Reads the two positional command-line arguments ``base_path`` and
    ``output_path`` and forwards them to ``main``. According to the original
    note, this function is the one referenced in pyproject.toml to create the
    console script.
    """
    cli_parser = argparse.ArgumentParser(
        description="Process BOA JSON files and export measurements to Excel files."
    )

    # Both arguments are required positionals, declared in the order they are
    # expected on the command line.
    positional_arguments = (
        ("base_path", "The base directory containing the folders with JSON files."),
        ("output_path", "Path to save the resulting Excel files"),
    )
    for argument_name, help_text in positional_arguments:
        cli_parser.add_argument(argument_name, type=str, help=help_text)

    parsed = cli_parser.parse_args()
    main(parsed.base_path, parsed.output_path)



if __name__ == "__main__":
    # Running the module as a script behaves exactly like the console script:
    # main_cli parses base_path and output_path and hands them to main.
    main_cli()