#!/usr/bin/env python3
"""
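Script to convert all PDF files in the Niraj_Upload folder (searched recursively)
to individual text files written to ~/Documents/fiscal_data.
When LLM processing is enabled, the extracted text is summarized with
mistral:7b-instruct via Ollama and the summary is what gets written to each file.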
"""

import os
from pathlib import Path
import sys

try:
    from pypdf import PdfReader
except ImportError:
    try:
        from PyPDF2 import PdfReader
    except ImportError:
        print("Error: pypdf or PyPDF2 library is required.")
        print("Please install it using: pip install pypdf")
        sys.exit(1)

# LLM integration with mistral:7b-instruct
# USE_LLM is the user-facing switch; LLM_AVAILABLE records whether the optional
# Ollama client could actually be imported. main() only creates an LLM client
# when both are true.
USE_LLM = True  # Set to True to enable LLM processing, False to keep the raw extracted text
try:
    from llama_index.llms.ollama import Ollama
    LLM_AVAILABLE = True
except ImportError:
    # The dependency is optional: the script keeps running without it and
    # only warns when the user explicitly asked for LLM processing.
    LLM_AVAILABLE = False
    if USE_LLM:
        print("Warning: llama-index-llms-ollama not installed. LLM processing disabled.")
        print("Install with: pip install llama-index-llms-ollama")


def extract_text_from_pdf(pdf_path):
    """Return the text of every non-blank page of a PDF, joined by newlines.

    Each page that yields text is preceded by a ``--- Page N ---`` marker.
    A page whose extraction raises is reported on stdout and skipped. If the
    file cannot be processed at all, the problem is printed and an
    ``[ERROR: ...]`` placeholder string is returned instead of raising.
    """
    try:
        document = PdfReader(pdf_path)
        print(f"Processing: {os.path.basename(pdf_path)} ({len(document.pages)} pages)")

        chunks = []
        for number, page in enumerate(document.pages, start=1):
            try:
                page_text = page.extract_text()
                is_blank = not page_text.strip()
            except Exception as page_error:
                # One unreadable page must not abort the whole document.
                print(f"  Warning: Could not extract text from page {number}: {page_error}")
                continue

            if is_blank:
                continue
            chunks += [f"\n--- Page {number} ---\n", page_text]

        return "\n".join(chunks)

    except Exception as failure:
        print(f"Error processing {pdf_path}: {failure}")
        return f"[ERROR: Could not extract text from {os.path.basename(pdf_path)}: {failure}]"


def process_with_llm(text, llm, task="summarize", max_chars=100000):
    """Run extracted text through the LLM and return the model's answer.

    Args:
        text: Text extracted from a document.
        llm: LLM client exposing ``complete(prompt)``; a falsy value disables
            processing.
        task: ``"summarize"``, ``"extract_key_metrics"``, or anything else for
            a generic "process and clean" prompt.
        max_chars: Maximum number of characters of ``text`` sent to the model;
            longer text is cut and a truncation notice appended. ``None``
            disables truncation.
            NOTE(review): the default is large relative to the context window
            a local mistral:7b-instruct is typically served with -- confirm
            and lower it if the tail of long documents is being ignored.

    Returns:
        The model response as a string, or ``text`` unchanged when there is no
        usable LLM, when ``text`` is empty/whitespace-only, or when the LLM
        call raises.
    """
    # Without a client or without any content there is nothing to ask the
    # model; sending an empty document only produces an invented summary.
    if not llm or not text or not text.strip():
        return text
    if not LLM_AVAILABLE:
        return text

    try:
        # Truncate text if too long (the model has context limits).
        if max_chars is not None and len(text) > max_chars:
            text = text[:max_chars] + "\n\n[Text truncated for LLM processing...]"

        # Prompts are assembled from explicit literals so that significant
        # whitespace (e.g. the trailing space on the first line) stays visible.
        if task == "summarize":
            prompt = (
                "Please provide a concise summary of the following financial document. \n"
                "Focus on key financial metrics, important dates, and major business developments.\n"
                "\n"
                "Document text:\n"
                f"{text}\n"
                "\n"
                "Summary:"
            )
        elif task == "extract_key_metrics":
            prompt = (
                "Extract key financial metrics and important information from the following document.\n"
                "Format as a structured list with clear sections.\n"
                "\n"
                "Document text:\n"
                f"{text}\n"
                "\n"
                "Key Information:"
            )
        else:
            prompt = (
                "Process and clean the following text, maintaining all important information:\n"
                "\n"
                f"{text}\n"
                "\n"
                "Processed text:"
            )

        print("  🤖 Processing with mistral:7b-instruct...")
        response = llm.complete(prompt)
        return str(response)

    except Exception as e:
        # Best effort: an LLM failure must not lose the extracted text.
        print(f"  ⚠️  LLM processing error: {e}")
        print("  Falling back to raw text extraction")
        return text


def _unique_txt_name(pdf_file, target_dir, used_names):
    """Return a ``.txt`` file name for *pdf_file* not yet present in *used_names*.

    The first PDF with a given stem keeps the plain ``<stem>.txt`` name. A later
    PDF with the same stem is prefixed with its folder path relative to
    *target_dir* (parts joined by ``_``); if that is still taken, a numeric
    suffix is appended. Names are compared case-insensitively so the result is
    also safe on case-insensitive filesystems. The chosen name is recorded in
    *used_names* (a set, mutated in place).
    """
    stem = pdf_file.stem
    candidate = f"{stem}.txt"

    if candidate.lower() in used_names:
        parent_parts = pdf_file.relative_to(target_dir).parent.parts
        if parent_parts:
            prefix = "_".join(parent_parts)
            candidate = f"{prefix}_{stem}.txt"

    base = candidate[: -len(".txt")]
    counter = 2
    while candidate.lower() in used_names:
        candidate = f"{base}_{counter}.txt"
        counter += 1

    used_names.add(candidate.lower())
    return candidate


def main():
    """Convert every PDF under ``Niraj_Upload`` into a text file.

    PDFs are searched recursively next to this script and the results are
    written to ``~/Documents/fiscal_data``. When the LLM is enabled and can be
    initialized, each successfully extracted text is summarized before being
    written. Exits with status 1 if the input folder is missing or contains no
    PDF files.
    """
    # Get the folder path
    script_dir = Path(__file__).parent
    target_dir = script_dir / "Niraj_Upload"

    # Output directory in Documents/fiscal_data
    output_dir = Path.home() / "Documents" / "fiscal_data"

    if not target_dir.exists():
        print(f"Error: Directory not found at {target_dir}")
        sys.exit(1)

    # Create output directory if it doesn't exist
    output_dir.mkdir(parents=True, exist_ok=True)
    print(f"Output directory: {output_dir}\n")

    # Initialize LLM if requested
    llm = None
    if USE_LLM and LLM_AVAILABLE:
        try:
            print("🤖 Initializing mistral:7b-instruct via Ollama...")
            llm = Ollama(model="mistral:7b-instruct", request_timeout=1800)
            print("✅ LLM initialized successfully\n")
        except Exception as e:
            print(f"⚠️  Warning: Could not initialize LLM: {e}")
            print("   Continuing without LLM processing...\n")
            llm = None

    # Get all PDF files recursively from subdirectories. The suffix is compared
    # case-insensitively so "REPORT.PDF" is found on case-sensitive
    # filesystems, and directories that merely end in ".pdf" are ignored.
    pdf_files = sorted(
        path
        for path in target_dir.rglob("*")
        if path.is_file() and path.suffix.lower() == ".pdf"
    )

    if not pdf_files:
        print(f"No PDF files found in {target_dir}")
        sys.exit(1)

    print(f"Found {len(pdf_files)} PDF files")
    if llm:
        print("Using mistral:7b-instruct for text processing")
    print("Converting to individual text files...\n")

    # Process each PDF and create individual text files
    success_count = 0
    error_count = 0
    # Output names already handed out in this run; PDFs with the same stem in
    # different sub-folders would otherwise overwrite each other's output.
    used_names = set()

    for pdf_file in pdf_files:
        txt_filename = _unique_txt_name(pdf_file, target_dir, used_names)
        txt_path = output_dir / txt_filename

        print(f"Converting: {pdf_file.name} -> {txt_filename}")

        # Extract text from PDF
        text = extract_text_from_pdf(pdf_file)

        # extract_text_from_pdf reports a whole-file failure through an
        # "[ERROR: ...]" placeholder; real page text always starts with a
        # page marker, so the prefix is unambiguous.
        extraction_failed = text.startswith("[ERROR:")

        # Optionally process with LLM (never summarize an error placeholder)
        if llm and not extraction_failed:
            text = process_with_llm(text, llm, task="summarize")

        # Write to individual text file
        try:
            with open(txt_path, 'w', encoding='utf-8') as f:
                f.write(text)
        except Exception as e:
            print(f"  ✗ Error writing {txt_filename}: {e}\n")
            error_count += 1
            continue

        if extraction_failed:
            # The placeholder is still written so the failure is visible next
            # to the other outputs, but it is not counted as a conversion.
            print(f"  ✗ Extraction failed; error note written to {txt_filename}\n")
            error_count += 1
        else:
            file_size = os.path.getsize(txt_path) / 1024  # Size in KB
            print(f"  ✓ Created: {txt_filename} ({file_size:.2f} KB)\n")
            success_count += 1

    # Summary
    print("=" * 80)
    print("Conversion complete!")
    print(f"  Successfully converted: {success_count} files")
    if error_count > 0:
        print(f"  Errors: {error_count} files")
    if llm:
        print("  LLM processing: Enabled (mistral:7b-instruct)")
    print(f"  Output directory: {output_dir}")
    print("=" * 80)


if __name__ == "__main__":
    main()



