Source code for bca_survival.tools.bca_merger

#!/usr/bin/env python3
"""
bca_merger - A CLI tool to merge two Excel files based on ID columns.

Usage:
    bca_merger <first_file> <second_file> <id_column_name>

Arguments:
    first_file      Path to the first Excel file
    second_file     Path to the second Excel file
    id_column_name  Name of the ID column in the first file to match with 'StudyID' in the second file
"""

import sys
from pathlib import Path
from typing import Union

import pandas as pd



[docs]
def merge_files(
    first_file_path: Union[str, Path], second_file_path: Union[str, Path], id_column_name: str
) -> bool:
    """
    Merge two Excel files based on ID columns.

    Rows of the first file are matched on ``id_column_name`` against the
    'StudyID' column of the second file with an outer join, so rows that have
    no partner in the other file are kept (their missing cells are empty).
    The result is written to ``<stem of first file>_merged.xlsx`` in the
    current working directory.

    Args:
        first_file_path: Path to the first Excel file
        second_file_path: Path to the second Excel file
        id_column_name: Name of the ID column in the first file

    Returns:
        bool: True if merge was successful, False otherwise (a required ID
        column is missing, or reading/merging/writing raised an exception).
        An error message is printed in every failure case.
    """
    try:
        # Read the Excel files
        # Use 'openpyxl' engine and parse_dates=False to keep date columns as they are
        # Set index_col=None explicitly to avoid creating an index column
        df1: pd.DataFrame = pd.read_excel(
            first_file_path, engine="openpyxl", parse_dates=False, index_col=None
        )
        df2: pd.DataFrame = pd.read_excel(
            second_file_path, engine="openpyxl", parse_dates=False, index_col=None
        )

        # Verify that the ID column exists in the first file.
        # Return False (rather than exiting the interpreter) so the documented
        # bool contract also holds for callers that use this as a library.
        if id_column_name not in df1.columns:
            print(f"Error: Column '{id_column_name}' not found in the first file.")
            return False

        # Verify that 'StudyID' column exists in the second file
        if "StudyID" not in df2.columns:
            print("Error: Column 'StudyID' not found in the second file.")
            return False

        # Merge the dataframes
        # Using outer merge to keep all rows from both files
        # and fill with NaN where there's no match
        merged_df: pd.DataFrame = pd.merge(
            df1, df2, left_on=id_column_name, right_on="StudyID", how="outer", indicator=False
        )

        # Remove the duplicate StudyID column if it exists and is different from the id_column_name
        if "StudyID" in merged_df.columns and "StudyID" != id_column_name:
            # With differently named keys, rows that exist only in the second
            # file carry their ID in 'StudyID' while the first file's key is
            # NaN. Copy the ID over before dropping 'StudyID', otherwise those
            # rows would end up without any identifier.
            # If the second file also has a (non-key) column called
            # id_column_name, pandas renames the left key with its default
            # '_x' suffix.
            left_key: str = (
                id_column_name
                if id_column_name in merged_df.columns
                else f"{id_column_name}_x"
            )
            if left_key in merged_df.columns:
                key_values = merged_df[left_key]
                merged_df[left_key] = key_values.where(
                    key_values.notna(), merged_df["StudyID"]
                )
            merged_df = merged_df.drop(columns=["StudyID"])

        # Generate output filename
        first_file_name: str = Path(first_file_path).stem
        output_path: str = f"{first_file_name}_merged.xlsx"

        # Convert datetime columns to date only (remove time component)
        for col in merged_df.columns:
            # Check if the column contains datetime values
            if pd.api.types.is_datetime64_any_dtype(merged_df[col]):
                # Convert to date-only strings (removes time component)
                merged_df[col] = merged_df[col].dt.strftime("%d.%m.%Y")

        # Write the merged dataframe to a new Excel file
        # Make sure index=False to prevent unnamed columns
        with pd.ExcelWriter(output_path, engine="openpyxl", date_format="%d.%m.%Y") as writer:
            merged_df.to_excel(writer, index=False)

        print(f"Merged file created: {output_path}")
        return True

    except Exception as e:
        # Top-level boundary for the CLI: report the problem and signal failure.
        print(f"Error: {str(e)}")
        return False




[docs]
def main() -> None:
    """
    Command line entry point: validate the arguments and run the merge.

    Expects exactly three arguments after the program name (first file,
    second file, ID column name). Prints the module usage text and exits
    with status 1 on a wrong argument count; exits with status 1 when an
    input file is missing or when the merge reports failure.
    """
    cli_args = sys.argv[1:]

    # Exactly three positional arguments are required
    if len(cli_args) != 3:
        print(__doc__)
        sys.exit(1)

    first_file, second_file, id_column_name = cli_args

    # Both inputs must be existing regular files; checked in order
    existence_checks = (
        (first_file, f"Error: First file '{first_file}' does not exist."),
        (second_file, f"Error: Second file '{second_file}' does not exist."),
    )
    for candidate, message in existence_checks:
        if Path(candidate).is_file():
            continue
        print(message)
        sys.exit(1)

    # Inputs look fine, hand over to the merge routine
    print("Files are ok. Beginning merging...")
    if not merge_files(first_file, second_file, id_column_name):
        sys.exit(1)



# Run the CLI only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()