Source code for bca_survival.tools.results_converter

#!/usr/bin/env python3
import argparse
import os
import sys
from typing import Any, Dict, List, Literal, Optional, Type, cast

import pandas as pd

# Define PDF converter type
PDF_CONVERTER_TYPE = Literal["win32", "fpdf", None]
PDF_CONVERTER: PDF_CONVERTER_TYPE = None

# Define base FPDF type for later conditional definition
FPDF_CLASS: Optional[Type[Any]] = None
PDFWithWordWrap_CLASS: Optional[Type[Any]] = None

# For PDF conversion
try:
    # Windows approach using COM automation
    import win32com.client  # type: ignore[import-untyped]

    PDF_CONVERTER = "win32"
except ImportError:
    try:
        # Alternative approach using fpdf
        import openpyxl  # type: ignore[import-untyped]
        from fpdf import FPDF  # type: ignore[import-untyped]

        # Store the FPDF class for later use
        FPDF_CLASS = FPDF

        class PDFWithWordWrap(FPDF):  # type: ignore[valid-type]
            """Extended FPDF class with word wrap functionality for cells"""

            def __init__(self, *args: Any, **kwargs: Any) -> None:
                super().__init__(*args, **kwargs)
                self.cell_height_ratio = 1.5  # Default cell height ratio

            def multi_cell_auto_width(
                self,
                width: float,
                height: float,
                text: str,
                border: int = 0,
                align: str = "L",
                fill: bool = False,
            ) -> float:
                """Create a multi-cell with automatic word wrapping and return its height"""
                # Calculate how many lines this text will require
                string_width: float = float(self.get_string_width(text))  # Explicitly cast to float
                lines_float: float = string_width / (width - 2)
                lines_count: int = max(1, int(lines_float) + 1)  # At least one line

                # Calculate total height needed
                total_height: float = height * float(lines_count)

                # Create the multi-cell
                self.multi_cell(width, height, text, border, align, fill)

                # Explicitly ensure we return a float
                return total_height

        # Store the class for later use
        PDFWithWordWrap_CLASS = PDFWithWordWrap

        PDF_CONVERTER = "fpdf"
    except ImportError:
        print("Warning: Neither win32com nor fpdf found. PDF conversion will be disabled.")
        PDF_CONVERTER = None



[docs]
def create_output_folders(directory: str) -> None:
    """Create the output folders if they don't exist"""
    for folder in ["PDF", "CSV", "TXT"]:
        folder_path: str = os.path.join(directory, folder)
        if not os.path.exists(folder_path):
            os.makedirs(folder_path)
            print(f"Created folder: {folder_path}")




[docs]
def convert_to_csv(excel_file: str, output_folder: str) -> None:
    """Convert Excel file to CSV format"""
    filename: str = os.path.basename(excel_file)
    base_name: str = os.path.splitext(filename)[0]

    try:
        # Read all sheets from the Excel file using context manager
        with pd.ExcelFile(excel_file) as excel:
            sheet_names: List[str] = excel.sheet_names

            if len(sheet_names) == 1:
                # If there's only one sheet, convert it directly
                output_file: str = os.path.join(output_folder, f"{base_name}.csv")
                df_single: pd.DataFrame = pd.read_excel(excel, sheet_name=sheet_names[0])
                df_single.to_csv(output_file, index=False)
                print(f"Converted {excel_file} to CSV: {output_file}")
            else:
                # If there are multiple sheets, create separate CSV files
                for sheet in sheet_names:
                    sheet_output: str = os.path.join(output_folder, f"{base_name}_{sheet}.csv")
                    df_multi: pd.DataFrame = pd.read_excel(excel, sheet_name=sheet)
                    df_multi.to_csv(sheet_output, index=False)
                print(f"Converted {excel_file} to multiple CSV files in {output_folder}")

    except Exception as e:
        print(f"Error converting {excel_file} to CSV: {str(e)}")




[docs]
def convert_to_txt(excel_file: str, output_folder: str) -> None:
    """Convert Excel file to TXT format"""
    filename: str = os.path.basename(excel_file)
    base_name: str = os.path.splitext(filename)[0]
    output_file: str = os.path.join(output_folder, f"{base_name}.txt")

    try:
        # Read all sheets from the Excel file using context manager
        with pd.ExcelFile(excel_file) as excel:
            sheet_names: List[str] = excel.sheet_names

            with open(output_file, "w", encoding="utf-8") as txt_file:
                if len(sheet_names) > 1:
                    # If there are multiple sheets, include sheet names in the TXT file
                    for sheet in sheet_names:
                        df_sheet: pd.DataFrame = pd.read_excel(excel, sheet_name=sheet)
                        txt_file.write(f"Sheet: {sheet}\n")
                        txt_file.write("=" * 50 + "\n")
                        txt_file.write(df_sheet.to_string(index=False))
                        txt_file.write("\n\n")
                else:
                    # If there's only one sheet, convert it directly
                    df_txt: pd.DataFrame = pd.read_excel(excel, sheet_name=sheet_names[0])
                    txt_file.write(df_txt.to_string(index=False))

        print(f"Converted {excel_file} to TXT: {output_file}")

    except Exception as e:
        print(f"Error converting {excel_file} to TXT: {str(e)}")




[docs]
def convert_to_pdf_win32(excel_file: str, output_folder: str) -> None:
    """Convert Excel file to PDF format using win32com (Windows only)"""
    filename: str = os.path.basename(excel_file)
    base_name: str = os.path.splitext(filename)[0]
    output_file: str = os.path.join(output_folder, f"{base_name}.pdf")

    excel_app = None
    workbook = None
    try:
        # Initialize Excel application
        excel_app = win32com.client.Dispatch("Excel.Application")
        excel_app.Visible = False  # Run in background
        excel_app.DisplayAlerts = False  # Prevent Excel alerts

        # Open the workbook
        workbook = excel_app.Workbooks.Open(os.path.abspath(excel_file))

        # Process each worksheet
        for sheet in workbook.Worksheets:
            # Set to horizontal (landscape) orientation
            sheet.PageSetup.Orientation = 2  # xlLandscape

            # Fit to page width (1 page wide)
            sheet.PageSetup.FitToPagesWide = 1
            sheet.PageSetup.FitToPagesTall = False

            # Adjust column widths to fit content
            sheet.UsedRange.Columns.AutoFit()

            # Set print area to the used range
            sheet.PageSetup.PrintArea = sheet.UsedRange.Address

            # Set other print settings for better visibility
            sheet.PageSetup.Zoom = False  # Disable zoom to use FitToPages
            sheet.PageSetup.CenterHorizontally = True

        # Save as PDF
        workbook.ExportAsFixedFormat(0, os.path.abspath(output_file))

        print(f"Converted {excel_file} to PDF: {output_file}")

    except Exception as e:
        print(f"Error converting {excel_file} to PDF: {str(e)}")
    finally:
        # Always close workbook and Excel application to release file handles
        if workbook is not None:
            try:
                workbook.Close(False)
            except Exception:
                pass
        if excel_app is not None:
            try:
                excel_app.Quit()
            except Exception:
                pass




[docs]
def convert_to_pdf_fpdf(excel_file: str, output_folder: str) -> None:
    """Convert Excel file to PDF format using fpdf with improved cell visibility"""
    # Check if PDFWithWordWrap_CLASS is available
    if PDFWithWordWrap_CLASS is None:
        print(f"Error: FPDF library not available for PDF conversion of {excel_file}")
        return

    filename: str = os.path.basename(excel_file)
    base_name: str = os.path.splitext(filename)[0]
    output_file: str = os.path.join(output_folder, f"{base_name}.pdf")

    wb = None
    try:
        # Load the Excel file
        wb = openpyxl.load_workbook(excel_file)

        # Create a PDF object - A4 landscape (horizontal)
        pdf = cast(Any, PDFWithWordWrap_CLASS)(orientation="L", unit="mm", format="A4")
        pdf.set_auto_page_break(auto=True, margin=15)

        # For each sheet in the workbook
        for sheet_name in wb.sheetnames:
            sheet = wb[sheet_name]
            pdf.add_page()

            # Add sheet name as a title
            pdf.set_font("Arial", "B", 16)
            pdf.cell(0, 10, f"Sheet: {sheet_name}", 0, 1, "C")
            pdf.ln(5)

            # Get max rows and columns
            max_row: int = sheet.max_row
            max_col: int = sheet.max_column

            # Adjust for empty sheets
            if max_row == 0 or max_col == 0:
                pdf.set_font("Arial", "", 12)
                pdf.cell(0, 10, "Empty sheet", 0, 1, "C")
                continue

            # Calculate optimal column widths based on content
            col_widths: Dict[int, float] = {}
            max_content_lengths: Dict[int, int] = {}

            # First pass: determine maximum content length for each column
            for col in range(1, max_col + 1):
                max_length = 0
                for row in range(1, min(max_row + 1, 101)):  # Sample first 100 rows
                    cell_value = sheet.cell(row=row, column=col).value
                    if cell_value is not None:
                        max_length = max(max_length, len(str(cell_value)))
                max_content_lengths[col] = max_length

            # Available page width (A4 landscape minus margins)
            available_width: float = 277 - 20  # A4 landscape width ~277mm, minus margins

            # Calculate proportional column widths with minimum and maximum constraints
            total_content_length: int = sum(max_content_lengths.values())
            min_col_width: float = 10.0  # Minimum column width in mm
            max_col_width: float = 50.0  # Maximum column width in mm

            for col in range(1, max_col + 1):
                if total_content_length > 0:
                    proportion: float = max_content_lengths[col] / total_content_length
                    col_widths[col] = max(
                        min_col_width, min(max_col_width, proportion * available_width)
                    )
                else:
                    col_widths[col] = min_col_width

            # Adjust if total width exceeds available width
            total_width: float = sum(col_widths.values())
            if total_width > available_width:
                scale_factor: float = available_width / total_width
                for col in col_widths:
                    col_widths[col] *= scale_factor

            # Determine appropriate font size based on table size
            if max_col > 10 or max_row > 100:
                pdf.set_font("Arial", "B", 7)  # Smaller font for large tables
            elif max_col > 6 or max_row > 50:
                pdf.set_font("Arial", "B", 8)
            else:
                pdf.set_font("Arial", "B", 10)

            # Header row
            for col in range(1, max_col + 1):
                cell_value = sheet.cell(row=1, column=col).value
                header_value_str: str = str(cell_value) if cell_value is not None else ""
                pdf.cell(col_widths[col], 8, header_value_str, 1, 0, "C")
            pdf.ln()

            # Data rows
            pdf.set_font("Arial", "", pdf.font_size - 1)  # Slightly smaller for data

            # Calculate how many rows to include
            max_rows_to_show: int = min(max_row, 1000)  # Limit to 1000 rows for large files

            for row in range(2, max_rows_to_show + 1):
                # Save current position
                start_x = pdf.get_x()
                start_y = pdf.get_y()
                max_height = 7  # Default row height

                # First pass: Calculate maximum height needed for this row
                for col in range(1, max_col + 1):
                    cell_value = sheet.cell(row=row, column=col).value
                    cell_text: str = str(cell_value) if cell_value is not None else ""

                    # Calculate cell height based on content
                    if len(cell_text) > 0:
                        cell_width = col_widths[col]
                        lines_needed = max(
                            1, int(pdf.get_string_width(cell_text) / (cell_width - 2)) + 1
                        )
                        height_needed = lines_needed * 5  # 5mm per line
                        max_height = max(max_height, height_needed)

                # Second pass: Draw cells with consistent height
                pdf.set_xy(start_x, start_y)
                for col in range(1, max_col + 1):
                    cell_value = sheet.cell(row=row, column=col).value
                    cell_content: str = str(cell_value) if cell_value is not None else ""

                    # For the last column in the row, use 1 for line break
                    line_break = 0 if col < max_col else 1

                    # Handle long text by using multi_cell if needed
                    if len(cell_content) > 15:  # Arbitrary threshold for "long" text
                        current_x = pdf.get_x()
                        current_y = pdf.get_y()

                        # Draw cell border first
                        pdf.cell(col_widths[col], max_height, "", 1, 0)

                        # Then draw text inside with proper positioning
                        pdf.set_xy(current_x, current_y)
                        pdf.multi_cell(col_widths[col], max_height / 2, cell_content, 0, "L")

                        # Reset position for next cell
                        pdf.set_xy(current_x + col_widths[col], current_y)
                    else:
                        pdf.cell(col_widths[col], max_height, cell_content, 1, line_break, "L")

                # Check if we need a page break
                if pdf.get_y() > pdf.h - 20:  # 20mm from bottom of page
                    pdf.add_page()

            # If there are more rows than we can show
            if max_row > max_rows_to_show:
                pdf.ln(5)
                pdf.set_font("Arial", "I", 8)
                pdf.cell(
                    0,
                    10,
                    f"Note: {max_row - max_rows_to_show} additional rows not shown",
                    0,
                    1,
                    "C",
                )

        # Save the PDF
        pdf.output(output_file)
        print(f"Converted {excel_file} to PDF: {output_file}")

    except Exception as e:
        print(f"Error converting {excel_file} to PDF: {str(e)}")
    finally:
        # Always close the workbook to release file handles
        if wb is not None:
            try:
                wb.close()
            except Exception:
                pass




[docs]
def convert_excel_file(excel_file: str) -> None:
    """Convert an Excel file to multiple formats"""
    directory: str = os.path.dirname(excel_file)

    # Create output folders
    create_output_folders(directory)

    # Convert to different formats
    convert_to_csv(excel_file, os.path.join(directory, "CSV"))
    convert_to_txt(excel_file, os.path.join(directory, "TXT"))

    # Convert to PDF if a converter is available
    if PDF_CONVERTER == "win32":
        convert_to_pdf_win32(excel_file, os.path.join(directory, "PDF"))
    elif PDF_CONVERTER == "fpdf":
        convert_to_pdf_fpdf(excel_file, os.path.join(directory, "PDF"))




[docs]
def process_directory(directory: str) -> None:
    """Process a directory, looking for Excel files"""
    excel_found: bool = False

    # Walk through the directory and its subdirectories
    for root, dirs, files in os.walk(directory):
        excel_files: List[str] = [
            os.path.join(root, file) for file in files if file.lower().endswith(".xlsx")
        ]

        if excel_files:
            excel_found = True
            print(f"Found {len(excel_files)} Excel files in {root}")

            for excel_file in excel_files:
                convert_excel_file(excel_file)

    if not excel_found:
        print(f"No Excel (.xlsx) files found in {directory} or its subdirectories.")




[docs]
def main() -> None:
    """Main function to handle command line arguments"""
    parser = argparse.ArgumentParser(
        description="Convert Excel files to PDF, CSV, and TXT formats."
    )
    parser.add_argument(
        "directory",
        nargs="?",
        default=os.getcwd(),
        help="Directory to scan for Excel files (default: current directory)",
    )
    args = parser.parse_args()

    # Validate directory
    if not os.path.isdir(args.directory):
        print(f"Error: {args.directory} is not a valid directory")
        sys.exit(1)

    print(f"Processing directory: {args.directory}")
    process_directory(args.directory)
    print("Processing complete.")



if __name__ == "__main__":
    main()