Source code for bca_survival.tools.bca_totalseg_extraction

"""
BOA Results extractor

This script processes BOA data by extracting measurements from the individual JSON files
across a directory structure. It targets two types of measurement files:
- total-measurements.json: Contains segmentation measurements for various organs
- bca-measurements.json: Contains body composition analysis measurements

The script consolidates these measurements into Excel spreadsheets for further analysis.

Usage:
    boa-extract base_path output_path
    python -m bca_survival.tools.bca_totalseg_extraction base_path output_path

Author: Eric
"""

import argparse
import json
import os
from pathlib import Path
from typing import Tuple, Union

import pandas as pd


def process_json_files(root_dir: str) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    Collect all measurements found below ``root_dir`` into two DataFrames.

    Walks through the directory structure. For every directory that contains a
    ``total-measurements.json`` file, that file is processed and the
    ``bca-measurements.json`` file of the same directory is processed as well
    (if it exists). Directories without a ``total-measurements.json`` file are
    skipped. Progress is reported on stdout.

    Args:
        root_dir (str): The root directory to search for measurement files

    Returns:
        tuple: A tuple containing two DataFrames:
            - final_total_df: DataFrame with organ segmentation measurements
            - final_bca_df: DataFrame with body composition analysis measurements
            Either DataFrame is empty (no rows, no columns) when no matching
            file was processed.
    """
    totalseg_data = []
    bca_data = []

    # Normalise the path representation before walking and reporting it.
    root_dir = str(Path(root_dir))
    print("Evaluating all files in {}".format(root_dir))

    # Walk through directory
    for dirpath, _dirnames, filenames in os.walk(root_dir):
        if "total-measurements.json" not in filenames:
            continue

        file_path = os.path.join(dirpath, "total-measurements.json")
        totalseg_data.append(process_totalseg_measurements(file_path, dirpath))

        # The BCA file is optional; the helper returns None when it is missing.
        bca_df = process_bca_measurements(dirpath)
        if bca_df is not None:
            bca_data.append(bca_df)

        # Report progress
        print(f"Processed: {file_path}")

    # One row per processed total-measurements file
    if totalseg_data:
        final_total_df = pd.concat(totalseg_data, ignore_index=True)
    else:
        print("No total-measurements files found.")
        final_total_df = pd.DataFrame()

    # One row per processed bca-measurements file (saving is left to the caller)
    if bca_data:
        final_bca_df = pd.concat(bca_data, ignore_index=True)
    else:
        print("No Processable bca files found")
        final_bca_df = pd.DataFrame()

    return final_total_df, final_bca_df
def process_totalseg_measurements(file_path: str, dirpath: str) -> pd.DataFrame:
    """
    Processes an individual total-measurements.json file to extract organ
    segmentation measurements.

    Only entries under ``segmentations -> total`` whose ``present`` flag is truthy
    are exported. Every metric of such an entry (except ``present`` itself)
    becomes one column named ``<organ>::<metric>``.

    Args:
        file_path (str): Path to the JSON file
        dirpath (str): Directory path containing the file (used to extract the study ID)

    Returns:
        pandas.DataFrame: A DataFrame with one row representing the measurements
        from the file. The ``StudyID`` column holds the name of the parent folder
        of ``dirpath``.
    """
    # JSON is UTF-8 encoded; do not rely on the platform's default encoding.
    with open(file_path, "r", encoding="utf-8") as file:
        content = json.load(file)

    # Get the parent folder name
    study_id = os.path.basename(os.path.dirname(dirpath))

    # Prepare the data for the dataframe
    row_data = {"StudyID": study_id}

    # A missing 'segmentations' or 'total' key simply yields a row with the ID only.
    total_data = content.get("segmentations", {}).get("total", {})
    for organ, metrics in total_data.items():
        if not metrics.get("present"):
            continue
        for metric, value in metrics.items():
            if metric != "present":
                row_data[f"{organ}::{metric}"] = value

    return pd.DataFrame([row_data])
def process_bca_measurements(folder_path: str) -> Union[pd.DataFrame, None]:
    """
    Processes the bca-measurements.json file in the given folder and extracts
    the measurement data.

    Every metric found under the file's ``aggregated`` key becomes one column named
    ``<scan_key>::<ALL|WL>::<key>::<metric>``. ``ALL`` is used for the
    ``measurements`` group and ``WL`` for every other group. Metric names that
    do not end in ``_hu`` get the suffix ``_ml`` appended.

    Args:
        folder_path (str): The path to the folder containing the
            bca-measurements.json file.

    Returns:
        pandas.DataFrame or None: A one-row DataFrame containing the measurement
        data with formatted column names, or None if the file doesn't exist.
        The ``StudyID`` column holds the name of the parent folder of
        ``folder_path``.
    """
    file_path = os.path.join(folder_path, "bca-measurements.json")
    if not os.path.exists(file_path):
        return None

    # JSON is UTF-8 encoded; do not rely on the platform's default encoding.
    with open(file_path, "r", encoding="utf-8") as file:
        data = json.load(file)

    aggregated = data.get("aggregated", {})
    study_id = os.path.basename(os.path.dirname(folder_path))
    row = {"StudyID": study_id}

    for scan_key, scan_value in aggregated.items():
        for measurement_type, measurements in scan_value.items():
            # Scalar entries (counts and similar) sit next to the measurement
            # groups; only dict-valued groups carry metrics to export.
            if not isinstance(measurements, dict):
                continue

            group = "ALL" if measurement_type == "measurements" else "WL"
            prefix = f"{scan_key}::{group}::"

            for item_key, item_metrics in measurements.items():
                for metric, value in item_metrics.items():
                    metric_name = metric if metric.endswith("_hu") else metric + "_ml"
                    row[f"{prefix}{item_key}::{metric_name}"] = value

    return pd.DataFrame([row])
def _strip_study_id_prefix(frame: pd.DataFrame) -> None:
    """Replace ``StudyID`` values such as ``prefix_123`` by the integer ``123``, in place."""
    # An empty result frame has no columns at all, so there is nothing to convert.
    if "StudyID" in frame.columns:
        frame["StudyID"] = frame["StudyID"].apply(lambda x: x.split("_")[1]).astype(int)


def main(root_path: str, output_path: str) -> None:
    """
    Main function to iterate over folders, process the JSON files, and save the
    results to Excel files.

    If the first total-measurements record has a study ID containing an
    underscore, the study IDs of both result tables are reduced to the part
    after the first underscore and converted to integers. The tables are then
    written to ``total-measurements.xlsx`` and ``bca-measurements.xlsx`` inside
    ``output_path``. When no measurement files were found, empty workbooks are
    written instead of failing.

    Args:
        root_path (str): The base directory containing the folders with JSON files
        output_path (str): Path to save the resulting Excel files

    Raises:
        ValueError: If a study ID is to be converted but the part after the
            first underscore is not an integer.
    """
    total_df, bca_df = process_json_files(root_path)

    # Guard against empty results: an empty DataFrame has no "StudyID" column,
    # so indexing it unconditionally would raise a KeyError.
    has_total_records = "StudyID" in total_df.columns and not total_df.empty
    if has_total_records and "_" in total_df["StudyID"].iloc[0]:
        _strip_study_id_prefix(bca_df)
        _strip_study_id_prefix(total_df)

    # Create output directory if it doesn't exist
    os.makedirs(output_path, exist_ok=True)

    # Save results to Excel files
    total_df.to_excel(os.path.join(output_path, "total-measurements.xlsx"), index=False)
    bca_df.to_excel(os.path.join(output_path, "bca-measurements.xlsx"), index=False)

    print(f"Results saved to {output_path}")
    print(
        f"Found {len(total_df)} total measurement records and {len(bca_df)} BCA measurement records"
    )
def main_cli() -> None:
    """
    Console entry point of the BOA extractor.

    Reads the two positional arguments ``base_path`` and ``output_path`` from
    the command line and hands them to :func:`main`. This function is
    referenced in pyproject.toml to create the console script.
    """
    cli = argparse.ArgumentParser(
        description="Process BOA JSON files and export measurements to Excel files."
    )

    # Both arguments are required positionals, registered in call order.
    positional_help = (
        ("base_path", "The base directory containing the folders with JSON files."),
        ("output_path", "Path to save the resulting Excel files"),
    )
    for arg_name, help_text in positional_help:
        cli.add_argument(arg_name, type=str, help=help_text)

    # NOTE: a --version flag is not wired up yet.
    options = cli.parse_args()
    main(options.base_path, options.output_path)
if __name__ == "__main__":
    # Running the module as a script behaves exactly like the console script:
    # reuse its entry point instead of duplicating the argument parser here.
    main_cli()