#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
HWPX to TXT Converter (v0.1.0)

Extracts text content from Korean HWPX (Hangul Word Processor XML) files
and writes it out as plain text, HTML, or Markdown.
"""

import html
import os
import sys
import tempfile
import xml.etree.ElementTree as ET
import zipfile
from dataclasses import dataclass
from typing import List, Union, Optional


@dataclass
class HwpxTable:
    """Grid of cell texts extracted from a single table.

    Rows may have different lengths; each cell is the plain text of that
    cell, with multiple paragraphs separated by newlines.
    """

    rows: List[List[str]]  # One inner list per row; each item is a cell's text

    def to_text(self) -> str:
        """Return the table as tab-separated cells, one row per line."""
        return "\n".join("\t".join(row) for row in self.rows)

    def to_html(self) -> str:
        """Return the table as an HTML ``<table>`` fragment.

        Cell text is HTML-escaped so that characters such as ``<`` and ``&``
        in the document cannot corrupt the generated markup.
        """
        lines = ['<table border="1" style="border-collapse: collapse;">']
        for row in self.rows:
            lines.append("  <tr>")
            lines.extend(
                f"    <td>{html.escape(cell, quote=False)}</td>" for cell in row
            )
            lines.append("  </tr>")
        lines.append("</table>")
        return "\n".join(lines)

    @staticmethod
    def _markdown_cell(text: str) -> str:
        """Make *text* safe to place inside a single Markdown table cell."""
        # A literal pipe would start a new column and a raw newline would end
        # the table row, so both have to be neutralised.
        return text.replace("|", "\\|").replace("\n", "<br>")

    def to_markdown(self) -> str:
        """Return the table as a pipe-style Markdown table.

        The first row is used as the header row. Short rows are padded with
        empty cells so every row has the same number of columns. An empty
        table yields an empty string.
        """
        if not self.rows:
            return ""

        # Normalize column count so the header, separator and body agree.
        max_cols = max(len(row) for row in self.rows)
        grid = [
            [self._markdown_cell(cell) for cell in row] + [""] * (max_cols - len(row))
            for row in self.rows
        ]

        lines = ["| " + " | ".join(grid[0]) + " |"]
        lines.append("| " + " | ".join(["---"] * max_cols) + " |")
        lines.extend("| " + " | ".join(row) + " |" for row in grid[1:])
        return "\n".join(lines)


def parse_paragraph(para_elem, namespace):
    """Return the text that belongs directly to a paragraph element.

    Args:
        para_elem: The ``hp:p`` element to read.
        namespace: Prefix map that binds ``hp`` to the paragraph namespace.

    Returns:
        The concatenated text of every ``hp:t`` element found in the
        paragraph's own ``hp:run`` children, or ``""`` if there is none.
    """
    pieces = []
    # Only direct-child runs are read: a descendant search would also pick up
    # runs of paragraphs nested deeper (e.g. inside table cells), folding
    # their text into this paragraph as well.
    for run in para_elem.findall("hp:run", namespace):
        # A run may hold several text elements, so collect all of them.
        for text_elem in run.findall("hp:t", namespace):
            # itertext() also yields text that follows inline child elements,
            # which ``.text`` alone would silently drop.
            pieces.append("".join(text_elem.itertext()))
    return "".join(pieces)


def parse_table(tbl_elem, namespace):
    """Parse a table element into an HwpxTable.

    Args:
        tbl_elem: The ``hp:tbl`` element to read.
        namespace: Prefix map that binds ``hp`` to the paragraph namespace.

    Returns:
        An HwpxTable with one row per ``hp:tr`` child and one cell per
        ``hp:tc`` child of that row. Each cell holds the text of all
        paragraphs found inside it, joined with newlines.
    """
    rows = []
    # Rows and cells are matched as direct children only. A descendant search
    # would also return the rows/cells of any table nested inside a cell and
    # mix them into this table's grid.
    for tr in tbl_elem.findall("hp:tr", namespace):
        row_data = []
        for tc in tr.findall("hp:tc", namespace):
            # Paragraphs are searched at any depth so that text inside
            # wrapper elements (and nested tables) stays in the owning cell.
            cell_text = [
                parse_paragraph(para, namespace)
                for para in tc.findall(".//hp:p", namespace)
            ]
            row_data.append("\n".join(cell_text))
        rows.append(row_data)
    return HwpxTable(rows)


def extract_content_from_xml(xml_path) -> List[Union[str, HwpxTable]]:
    """Extract paragraphs and tables from one XML file in document order.

    Args:
        xml_path: Path (or file object) handed to ``ET.parse``.

    Returns:
        A list whose items are either a non-empty paragraph string or an
        HwpxTable with at least one row. An empty list is returned when no
        known paragraph namespace matches, when nothing was extracted, or
        when parsing fails (the error is printed, not raised).
    """
    try:
        tree = ET.parse(xml_path)
        root = tree.getroot()

        # Candidate bindings for the ``hp`` prefix; the first one that
        # yields any content wins.
        namespaces = [
            {'hp': 'http://www.hancom.co.kr/hwpml/2011/paragraph'},
            {'hp': 'http://www.hancom.co.kr/hwpml/2010/main'},
            {'hp': 'http://www.hancom.co.kr/hwpml/2016/paragraph'}
        ]

        content_list = []

        for namespace in namespaces:
            # Skip namespaces that do not occur in this document at all.
            if root.find(".//hp:p", namespace) is None:
                continue

            # Explicit pre-order walk (document order). Unlike root.iter(),
            # it can stop descending at a table, so cell paragraphs are not
            # emitted a second time as standalone paragraphs.
            pending = [root]
            while pending:
                elem = pending.pop()
                tag = elem.tag
                if tag.endswith("}tbl"):  # Table
                    table = parse_table(elem, namespace)
                    if table.rows:
                        content_list.append(table)
                    continue  # cell content already belongs to the table
                if tag.endswith("}p"):  # Paragraph
                    text = parse_paragraph(elem, namespace)
                    if text:
                        content_list.append(text)
                # Push children reversed so they pop in document order.
                pending.extend(reversed(list(elem)))

            if content_list:
                return content_list

        return []
    except Exception as e:
        # Best-effort: a broken section must not abort the whole conversion.
        print(f"[Error] XML parsing error: {e}")
        return []


def extract_from_prvtext(hwpx_path):
    """
    Fallback extractor: read the preview text stored inside the archive.

    The first archive member whose name contains "PrvText.txt" is read and
    decoded as UTF-8, with undecodable bytes dropped.

    Args:
        hwpx_path: Path to the HWPX file

    Returns:
        The decoded preview text, "" when the archive has no such member,
        or an "[Error] ..." message string when the archive cannot be read.
    """
    try:
        with zipfile.ZipFile(hwpx_path, 'r') as archive:
            preview_member = next(
                (info for info in archive.infolist() if "PrvText.txt" in info.filename),
                None,
            )
            if preview_member is None:
                return ""
            with archive.open(preview_member) as handle:
                raw_bytes = handle.read()
            return raw_bytes.decode('utf-8', errors='ignore')
    except Exception as e:
        return f"[Error] Could not extract from PrvText: {e}"


def extract_content_from_hwpx(hwpx_path, output_format="txt"):
    """
    Extract content from HWPX file.
    """
    extracted_dir = "/tmp/hwpx_extracted"
    os.makedirs(extracted_dir, exist_ok=True)
    
    # Extract HWPX
    try:
        with zipfile.ZipFile(hwpx_path, 'r') as zip_ref:
            zip_ref.extractall(extracted_dir)
    except Exception as e:
        return f"[Error] Extraction failed: {e}"
    
    # Process XML files
    all_content = []
    contents_dir = os.path.join(extracted_dir, "Contents")
    
    if os.path.isdir(contents_dir):
        xml_files = [f for f in os.listdir(contents_dir) if f.endswith('.xml')]
        for file_name in sorted(xml_files):
            xml_path = os.path.join(contents_dir, file_name)
            section_content = extract_content_from_xml(xml_path)
            all_content.extend(section_content)
    
    # PrvText.txt fallback if XML extraction failed
    if not all_content:
        print("Warning: XML extraction incomplete. Using PrvText.txt fallback...")
        text = extract_from_prvtext(hwpx_path)
        if output_format == "html":
            return f"<pre>{text}</pre>"
        return text
    
    # Convert to requested format
    if output_format == "html":
        html = ["<!DOCTYPE html>", "<html>", "<body>"]
        for item in all_content:
            if isinstance(item, HwpxTable):
                html.append(item.to_html())
            else:
                html.append(f"<p>{item}</p>")
        html.append("</body></html>")
        return "\n".join(html)
        
    elif output_format == "md":
        md = []
        for item in all_content:
            if isinstance(item, HwpxTable):
                md.append(item.to_markdown())
            else:
                md.append(item)
        return "\n\n".join(md)
        
    else: # txt
        txt = []
        for item in all_content:
            if isinstance(item, HwpxTable):
                txt.append("[표]")
                txt.append(item.to_text())
            else:
                txt.append(item)
        return "\n\n".join(txt)


def convert_hwpx(input_path, output_path="/tmp/converted_text.txt", output_format="txt"):
    """
    Convert an HWPX file and write the result to a file.

    Progress, statistics and a preview of the first 1000 characters are
    printed to stdout.

    Args:
        input_path: Path to HWPX file (extension is matched case-insensitively)
        output_path: Path for the output file
        output_format: Format passed on to the extractor ("txt", "html", "md")

    Returns:
        bool: True if conversion succeeded, False otherwise
    """
    if not os.path.exists(input_path):
        print(f"❌ File not found: {input_path}")
        return False

    # Compare case-insensitively so "REPORT.HWPX" is accepted as well.
    if not input_path.lower().endswith('.hwpx'):
        print(f"❌ Not a HWPX file: {input_path}")
        return False

    print("🔧 Extracting text from XML sections...")

    # Extract text
    extracted_text = extract_content_from_hwpx(input_path, output_format)

    # The extraction helpers report failures in-band as "[Error] ..." strings
    # (wrapped in <pre> for the HTML fallback). Treat those as failures
    # instead of saving the message as if it were the converted document.
    # Trade-off: a document whose own text begins with "[Error]" is rejected.
    if extracted_text.startswith(("[Error]", "<pre>[Error]")):
        print(f"❌ {extracted_text}")
        return False

    # The stripped text is what gets written, so report on it as well.
    output_text = extracted_text.strip()

    # Check if extracted text is empty
    if not output_text:
        print("❌ No text could be extracted from the file")
        return False

    # Save to output location
    try:
        with open(output_path, "w", encoding="utf-8") as f:
            f.write(output_text)
    except Exception as e:
        print(f"❌ Failed to save output: {str(e)}")
        return False

    print(f"✅ Conversion complete! Output saved to: {output_path}")
    print(f"📊 Text length: {len(output_text):,} characters")
    print(f"📊 Lines: {output_text.count(chr(10)) + 1:,}")

    # Preview (first 1000 characters)
    preview_length = min(1000, len(output_text))
    print("\n" + "=" * 60)
    print("📖 Preview (first 1000 characters):")
    print("=" * 60)
    print(output_text[:preview_length])
    if len(output_text) > preview_length:
        print("\n... (truncated)")

    return True


def main():
    """Run the converter from the command line and exit with its status.

    Expected arguments: the input file, an optional output path (only when
    it directly follows the input file and does not start with "--"), and an
    optional ``--format`` flag followed by its value.
    """
    argv = sys.argv
    if len(argv) < 2:
        print("Usage: python -m scripts.hwpx.converter <file.hwpx> [output_path] [--format txt|html|md]")
        sys.exit(1)

    hwpx_file, remaining = argv[1], argv[2:]

    # An optional positional output path may come right after the input file.
    output_path = None
    if remaining and not remaining[0].startswith("--"):
        output_path, remaining = remaining[0], remaining[1:]

    # Pair every argument with its successor; the last "--format" wins.
    output_format = "txt"
    for flag, value in zip(remaining, remaining[1:]):
        if flag == "--format":
            output_format = value

    if not output_path:
        # Default output name follows the format; unknown formats use "txt".
        extension = {"html": "html", "md": "md"}.get(output_format, "txt")
        output_path = f"/tmp/converted.{extension}"

    parent_dir = os.path.dirname(output_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    if convert_hwpx(hwpx_file, output_path, output_format):
        sys.exit(0)
    sys.exit(1)


if __name__ == "__main__":
    main()
