Source code for bca_survival.tools.results_converter

#!/usr/bin/env python3
import argparse
import os
import sys
from typing import Any, Dict, List, Literal, Optional, Type, cast

import pandas as pd

# Define PDF converter type
PDF_CONVERTER_TYPE = Literal["win32", "fpdf", None]
PDF_CONVERTER: PDF_CONVERTER_TYPE = None

# Define base FPDF type for later conditional definition
FPDF_CLASS: Optional[Type[Any]] = None
PDFWithWordWrap_CLASS: Optional[Type[Any]] = None

# For PDF conversion
try:
    # Windows approach using COM automation
    import win32com.client  # type: ignore[import-untyped]

    PDF_CONVERTER = "win32"
except ImportError:
    try:
        # Alternative approach using fpdf
        import openpyxl  # type: ignore[import-untyped]
        from fpdf import FPDF  # type: ignore[import-untyped]

        # Store the FPDF class for later use
        FPDF_CLASS = FPDF

        class PDFWithWordWrap(FPDF):  # type: ignore[valid-type]
            """Extended FPDF class with word wrap functionality for cells"""

            def __init__(self, *args: Any, **kwargs: Any) -> None:
                super().__init__(*args, **kwargs)
                self.cell_height_ratio = 1.5  # Default cell height ratio

            def multi_cell_auto_width(
                self,
                width: float,
                height: float,
                text: str,
                border: int = 0,
                align: str = "L",
                fill: bool = False,
            ) -> float:
                """Create a multi-cell with automatic word wrapping and return its height"""
                # Calculate how many lines this text will require
                string_width: float = float(self.get_string_width(text))  # Explicitly cast to float
                lines_float: float = string_width / (width - 2)
                lines_count: int = max(1, int(lines_float) + 1)  # At least one line

                # Calculate total height needed
                total_height: float = height * float(lines_count)

                # Create the multi-cell
                self.multi_cell(width, height, text, border, align, fill)

                # Explicitly ensure we return a float
                return total_height

        # Store the class for later use
        PDFWithWordWrap_CLASS = PDFWithWordWrap

        PDF_CONVERTER = "fpdf"
    except ImportError:
        print("Warning: Neither win32com nor fpdf found. PDF conversion will be disabled.")
        PDF_CONVERTER = None


[docs] def create_output_folders(directory: str) -> None: """Create the output folders if they don't exist""" for folder in ["PDF", "CSV", "TXT"]: folder_path: str = os.path.join(directory, folder) if not os.path.exists(folder_path): os.makedirs(folder_path) print(f"Created folder: {folder_path}")
[docs] def convert_to_csv(excel_file: str, output_folder: str) -> None: """Convert Excel file to CSV format""" filename: str = os.path.basename(excel_file) base_name: str = os.path.splitext(filename)[0] try: # Read all sheets from the Excel file using context manager with pd.ExcelFile(excel_file) as excel: sheet_names: List[str] = excel.sheet_names if len(sheet_names) == 1: # If there's only one sheet, convert it directly output_file: str = os.path.join(output_folder, f"{base_name}.csv") df_single: pd.DataFrame = pd.read_excel(excel, sheet_name=sheet_names[0]) df_single.to_csv(output_file, index=False) print(f"Converted {excel_file} to CSV: {output_file}") else: # If there are multiple sheets, create separate CSV files for sheet in sheet_names: sheet_output: str = os.path.join(output_folder, f"{base_name}_{sheet}.csv") df_multi: pd.DataFrame = pd.read_excel(excel, sheet_name=sheet) df_multi.to_csv(sheet_output, index=False) print(f"Converted {excel_file} to multiple CSV files in {output_folder}") except Exception as e: print(f"Error converting {excel_file} to CSV: {str(e)}")
[docs] def convert_to_txt(excel_file: str, output_folder: str) -> None: """Convert Excel file to TXT format""" filename: str = os.path.basename(excel_file) base_name: str = os.path.splitext(filename)[0] output_file: str = os.path.join(output_folder, f"{base_name}.txt") try: # Read all sheets from the Excel file using context manager with pd.ExcelFile(excel_file) as excel: sheet_names: List[str] = excel.sheet_names with open(output_file, "w", encoding="utf-8") as txt_file: if len(sheet_names) > 1: # If there are multiple sheets, include sheet names in the TXT file for sheet in sheet_names: df_sheet: pd.DataFrame = pd.read_excel(excel, sheet_name=sheet) txt_file.write(f"Sheet: {sheet}\n") txt_file.write("=" * 50 + "\n") txt_file.write(df_sheet.to_string(index=False)) txt_file.write("\n\n") else: # If there's only one sheet, convert it directly df_txt: pd.DataFrame = pd.read_excel(excel, sheet_name=sheet_names[0]) txt_file.write(df_txt.to_string(index=False)) print(f"Converted {excel_file} to TXT: {output_file}") except Exception as e: print(f"Error converting {excel_file} to TXT: {str(e)}")
[docs] def convert_to_pdf_win32(excel_file: str, output_folder: str) -> None: """Convert Excel file to PDF format using win32com (Windows only)""" filename: str = os.path.basename(excel_file) base_name: str = os.path.splitext(filename)[0] output_file: str = os.path.join(output_folder, f"{base_name}.pdf") excel_app = None workbook = None try: # Initialize Excel application excel_app = win32com.client.Dispatch("Excel.Application") excel_app.Visible = False # Run in background excel_app.DisplayAlerts = False # Prevent Excel alerts # Open the workbook workbook = excel_app.Workbooks.Open(os.path.abspath(excel_file)) # Process each worksheet for sheet in workbook.Worksheets: # Set to horizontal (landscape) orientation sheet.PageSetup.Orientation = 2 # xlLandscape # Fit to page width (1 page wide) sheet.PageSetup.FitToPagesWide = 1 sheet.PageSetup.FitToPagesTall = False # Adjust column widths to fit content sheet.UsedRange.Columns.AutoFit() # Set print area to the used range sheet.PageSetup.PrintArea = sheet.UsedRange.Address # Set other print settings for better visibility sheet.PageSetup.Zoom = False # Disable zoom to use FitToPages sheet.PageSetup.CenterHorizontally = True # Save as PDF workbook.ExportAsFixedFormat(0, os.path.abspath(output_file)) print(f"Converted {excel_file} to PDF: {output_file}") except Exception as e: print(f"Error converting {excel_file} to PDF: {str(e)}") finally: # Always close workbook and Excel application to release file handles if workbook is not None: try: workbook.Close(False) except Exception: pass if excel_app is not None: try: excel_app.Quit() except Exception: pass
[docs] def convert_to_pdf_fpdf(excel_file: str, output_folder: str) -> None: """Convert Excel file to PDF format using fpdf with improved cell visibility""" # Check if PDFWithWordWrap_CLASS is available if PDFWithWordWrap_CLASS is None: print(f"Error: FPDF library not available for PDF conversion of {excel_file}") return filename: str = os.path.basename(excel_file) base_name: str = os.path.splitext(filename)[0] output_file: str = os.path.join(output_folder, f"{base_name}.pdf") wb = None try: # Load the Excel file wb = openpyxl.load_workbook(excel_file) # Create a PDF object - A4 landscape (horizontal) pdf = cast(Any, PDFWithWordWrap_CLASS)(orientation="L", unit="mm", format="A4") pdf.set_auto_page_break(auto=True, margin=15) # For each sheet in the workbook for sheet_name in wb.sheetnames: sheet = wb[sheet_name] pdf.add_page() # Add sheet name as a title pdf.set_font("Arial", "B", 16) pdf.cell(0, 10, f"Sheet: {sheet_name}", 0, 1, "C") pdf.ln(5) # Get max rows and columns max_row: int = sheet.max_row max_col: int = sheet.max_column # Adjust for empty sheets if max_row == 0 or max_col == 0: pdf.set_font("Arial", "", 12) pdf.cell(0, 10, "Empty sheet", 0, 1, "C") continue # Calculate optimal column widths based on content col_widths: Dict[int, float] = {} max_content_lengths: Dict[int, int] = {} # First pass: determine maximum content length for each column for col in range(1, max_col + 1): max_length = 0 for row in range(1, min(max_row + 1, 101)): # Sample first 100 rows cell_value = sheet.cell(row=row, column=col).value if cell_value is not None: max_length = max(max_length, len(str(cell_value))) max_content_lengths[col] = max_length # Available page width (A4 landscape minus margins) available_width: float = 277 - 20 # A4 landscape width ~277mm, minus margins # Calculate proportional column widths with minimum and maximum constraints total_content_length: int = sum(max_content_lengths.values()) min_col_width: float = 10.0 # Minimum column width in mm max_col_width: float = 50.0 # Maximum column width in mm for col in range(1, max_col + 1): if total_content_length > 0: proportion: float = max_content_lengths[col] / total_content_length col_widths[col] = max( min_col_width, min(max_col_width, proportion * available_width) ) else: col_widths[col] = min_col_width # Adjust if total width exceeds available width total_width: float = sum(col_widths.values()) if total_width > available_width: scale_factor: float = available_width / total_width for col in col_widths: col_widths[col] *= scale_factor # Determine appropriate font size based on table size if max_col > 10 or max_row > 100: pdf.set_font("Arial", "B", 7) # Smaller font for large tables elif max_col > 6 or max_row > 50: pdf.set_font("Arial", "B", 8) else: pdf.set_font("Arial", "B", 10) # Header row for col in range(1, max_col + 1): cell_value = sheet.cell(row=1, column=col).value header_value_str: str = str(cell_value) if cell_value is not None else "" pdf.cell(col_widths[col], 8, header_value_str, 1, 0, "C") pdf.ln() # Data rows pdf.set_font("Arial", "", pdf.font_size - 1) # Slightly smaller for data # Calculate how many rows to include max_rows_to_show: int = min(max_row, 1000) # Limit to 1000 rows for large files for row in range(2, max_rows_to_show + 1): # Save current position start_x = pdf.get_x() start_y = pdf.get_y() max_height = 7 # Default row height # First pass: Calculate maximum height needed for this row for col in range(1, max_col + 1): cell_value = sheet.cell(row=row, column=col).value cell_text: str = str(cell_value) if cell_value is not None else "" # Calculate cell height based on content if len(cell_text) > 0: cell_width = col_widths[col] lines_needed = max( 1, int(pdf.get_string_width(cell_text) / (cell_width - 2)) + 1 ) height_needed = lines_needed * 5 # 5mm per line max_height = max(max_height, height_needed) # Second pass: Draw cells with consistent height pdf.set_xy(start_x, start_y) for col in range(1, max_col + 1): cell_value = sheet.cell(row=row, column=col).value cell_content: str = str(cell_value) if cell_value is not None else "" # For the last column in the row, use 1 for line break line_break = 0 if col < max_col else 1 # Handle long text by using multi_cell if needed if len(cell_content) > 15: # Arbitrary threshold for "long" text current_x = pdf.get_x() current_y = pdf.get_y() # Draw cell border first pdf.cell(col_widths[col], max_height, "", 1, 0) # Then draw text inside with proper positioning pdf.set_xy(current_x, current_y) pdf.multi_cell(col_widths[col], max_height / 2, cell_content, 0, "L") # Reset position for next cell pdf.set_xy(current_x + col_widths[col], current_y) else: pdf.cell(col_widths[col], max_height, cell_content, 1, line_break, "L") # Check if we need a page break if pdf.get_y() > pdf.h - 20: # 20mm from bottom of page pdf.add_page() # If there are more rows than we can show if max_row > max_rows_to_show: pdf.ln(5) pdf.set_font("Arial", "I", 8) pdf.cell( 0, 10, f"Note: {max_row - max_rows_to_show} additional rows not shown", 0, 1, "C", ) # Save the PDF pdf.output(output_file) print(f"Converted {excel_file} to PDF: {output_file}") except Exception as e: print(f"Error converting {excel_file} to PDF: {str(e)}") finally: # Always close the workbook to release file handles if wb is not None: try: wb.close() except Exception: pass
[docs] def convert_excel_file(excel_file: str) -> None: """Convert an Excel file to multiple formats""" directory: str = os.path.dirname(excel_file) # Create output folders create_output_folders(directory) # Convert to different formats convert_to_csv(excel_file, os.path.join(directory, "CSV")) convert_to_txt(excel_file, os.path.join(directory, "TXT")) # Convert to PDF if a converter is available if PDF_CONVERTER == "win32": convert_to_pdf_win32(excel_file, os.path.join(directory, "PDF")) elif PDF_CONVERTER == "fpdf": convert_to_pdf_fpdf(excel_file, os.path.join(directory, "PDF"))
[docs] def process_directory(directory: str) -> None: """Process a directory, looking for Excel files""" excel_found: bool = False # Walk through the directory and its subdirectories for root, dirs, files in os.walk(directory): excel_files: List[str] = [ os.path.join(root, file) for file in files if file.lower().endswith(".xlsx") ] if excel_files: excel_found = True print(f"Found {len(excel_files)} Excel files in {root}") for excel_file in excel_files: convert_excel_file(excel_file) if not excel_found: print(f"No Excel (.xlsx) files found in {directory} or its subdirectories.")
[docs] def main() -> None: """Main function to handle command line arguments""" parser = argparse.ArgumentParser( description="Convert Excel files to PDF, CSV, and TXT formats." ) parser.add_argument( "directory", nargs="?", default=os.getcwd(), help="Directory to scan for Excel files (default: current directory)", ) args = parser.parse_args() # Validate directory if not os.path.isdir(args.directory): print(f"Error: {args.directory} is not a valid directory") sys.exit(1) print(f"Processing directory: {args.directory}") process_directory(args.directory) print("Processing complete.")
if __name__ == "__main__": main()