"""
Evaluator Framework

Provides extensible evaluators for measuring skill output quality.
Supports multiple evaluation types:
- Binary (success/failure)
- Computed (derived from trace data)
- LLM-judged (quality assessment by LLM)
- Custom (user-defined functions)
"""

import os
import re
import subprocess
from abc import ABC, abstractmethod
from dataclasses import dataclass
from typing import Any, Callable, Optional

# Import from parent
import sys
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from core.claude_executor import ExecutionTrace


@dataclass
class EvaluationResult:
    """Outcome reported by a single evaluator run."""

    # Label of the metric that produced this result.
    metric_name: str
    # Quality score; evaluators in this module emit values from 0.0 to 1.0.
    score: float
    # Evaluator-specific supporting data.
    details: dict
    # Human-readable descriptions of the problems that were found.
    issues: list[str]


class BaseEvaluator(ABC):
    """Common interface implemented by every evaluator.

    Each evaluator carries a ``name`` (used as the metric label) and a
    ``weight`` (the relative weight applied when results are combined).
    """

    def __init__(self, name: str, weight: float = 1.0):
        self.name, self.weight = name, weight

    @abstractmethod
    def evaluate(
        self,
        trace: ExecutionTrace,
        test_case: Optional[dict] = None
    ) -> EvaluationResult:
        """
        Score a single execution trace.

        Args:
            trace: The execution trace to evaluate
            test_case: Optional test case describing the expected outputs

        Returns:
            EvaluationResult holding the score, details and issues
        """
        ...


class BinaryEvaluator(BaseEvaluator):
    """Pass/fail metric driven solely by ``trace.success``."""

    def evaluate(
        self,
        trace: ExecutionTrace,
        test_case: Optional[dict] = None
    ) -> EvaluationResult:
        """Return 1.0 for a successful trace and 0.0 otherwise.

        ``test_case`` is accepted for interface compatibility and is not
        used. A failed trace reports ``trace.errors`` as its issues.
        """
        succeeded = trace.success

        if succeeded:
            score, problems = 1.0, []
        else:
            score, problems = 0.0, trace.errors

        outcome = {
            "success": succeeded,
            "exit_code": trace.exit_code
        }

        return EvaluationResult(
            metric_name=self.name,
            score=score,
            details=outcome,
            issues=problems
        )


class FileExistsEvaluator(BaseEvaluator):
    """Evaluate if expected files were created.

    Each expected glob pattern must be matched by at least one created file;
    the score is the fraction of patterns that found a match.
    """

    def __init__(
        self,
        name: str = "file_exists",
        patterns: Optional[list[str]] = None,
        weight: float = 1.0
    ):
        """
        Args:
            name: Metric name reported in the result
            patterns: Glob patterns to look for when the test case does not
                declare its own; defaults to ``["*.*"]``
            weight: Relative weight of this evaluator
        """
        super().__init__(name, weight)
        self.patterns = patterns or ["*.*"]

    def evaluate(
        self,
        trace: ExecutionTrace,
        test_case: Optional[dict] = None
    ) -> EvaluationResult:
        """
        Check the files recorded in ``trace.files_created`` against patterns.

        Args:
            trace: The execution trace to evaluate
            test_case: Optional test case; entries of ``expected_outputs``
                whose ``type`` is ``"file"`` supply the patterns to check

        Returns:
            EvaluationResult whose score is matched patterns / total patterns
        """
        # The test case overrides the configured patterns only when it
        # actually declares file expectations; a test case without any must
        # not discard the patterns this evaluator was constructed with.
        patterns = self.patterns
        if test_case:
            # `or []` also covers an explicit null value for the key.
            expected = test_case.get("expected_outputs") or []
            case_patterns = [
                e.get("pattern", "*.*")
                for e in expected
                if e.get("type") == "file"
            ]
            if case_patterns:
                patterns = case_patterns

        if not patterns:
            patterns = ["*.*"]

        # Check created files against patterns
        created_files = [f.get("path", "") for f in trace.files_created]

        matched = 0
        issues = []

        for pattern in patterns:
            if any(self._matches_pattern(path, pattern) for path in created_files):
                matched += 1
            else:
                issues.append(f"Expected file matching '{pattern}' not found")

        # `patterns` is guaranteed non-empty by the fallback above.
        score = matched / len(patterns)

        return EvaluationResult(
            metric_name=self.name,
            score=score,
            details={
                "expected_patterns": patterns,
                "created_files": created_files,
                "matched": matched
            },
            issues=issues
        )

    def _matches_pattern(self, filename: str, pattern: str) -> bool:
        """Check if filename matches pattern (simple glob).

        Uses ``fnmatch.fnmatch``, so the path separator is not special
        (``*`` also spans directories) and case handling follows the
        platform's ``os.path.normcase``.
        """
        import fnmatch
        return fnmatch.fnmatch(filename, pattern)


class FileValidityEvaluator(BaseEvaluator):
    """Evaluate if created files are valid (not corrupt).

    Each created file is dispatched to a validator chosen by its lower-cased
    extension. Files without a registered validator, and files that cannot
    be found under ``work_dir``, are counted as valid because they cannot be
    checked.
    """

    def __init__(
        self,
        name: str = "file_validity",
        work_dir: str = "/tmp",
        weight: float = 1.0
    ):
        """
        Args:
            name: Metric name reported in the result
            work_dir: Directory that the created file paths are joined onto
            weight: Relative weight of this evaluator
        """
        super().__init__(name, weight)
        self.work_dir = work_dir

        # Validators for different file types, keyed by lower-case extension
        self.validators = {
            ".pptx": self._validate_pptx,
            ".docx": self._validate_docx,
            ".xlsx": self._validate_xlsx,
            ".pdf": self._validate_pdf,
            ".json": self._validate_json,
            ".py": self._validate_python,
            ".js": self._validate_javascript,
        }

    def evaluate(
        self,
        trace: ExecutionTrace,
        test_case: Optional[dict] = None
    ) -> EvaluationResult:
        """
        Validate every file listed in ``trace.files_created``.

        Args:
            trace: The execution trace to evaluate
            test_case: Unused; accepted for interface compatibility

        Returns:
            EvaluationResult whose score is valid files / total files, or a
            neutral 0.5 when the trace created no files
        """
        files = trace.files_created

        if not files:
            return EvaluationResult(
                metric_name=self.name,
                score=0.5,  # Neutral if no files
                details={"files_checked": 0},
                issues=["No files created"]
            )

        valid_count = 0
        issues = []

        for file_info in files:
            path = file_info.get("path", "")
            _, ext = os.path.splitext(path.lower())

            validator = self.validators.get(ext)
            full_path = os.path.join(self.work_dir, path)

            # No validator, or file not found on disk: can't validate,
            # assume OK.
            if not validator or not os.path.exists(full_path):
                valid_count += 1
                continue

            is_valid, error = validator(full_path)
            if is_valid:
                valid_count += 1
            else:
                issues.append(f"{path}: {error}")

        # `files` is non-empty here (handled by the early return above).
        score = valid_count / len(files)

        return EvaluationResult(
            metric_name=self.name,
            score=score,
            details={
                "files_checked": len(files),
                "valid_files": valid_count
            },
            issues=issues
        )

    def _validate_zip_member(
        self,
        path: str,
        member: str,
        missing_message: str
    ) -> tuple[bool, str]:
        """Check that ``path`` is a ZIP archive containing ``member``.

        Shared by the Office Open XML validators, which differ only in the
        archive member they require.

        Returns:
            ``(True, "")`` when valid, otherwise ``(False, reason)``
        """
        import zipfile
        try:
            if not zipfile.is_zipfile(path):
                return False, "Not a valid ZIP archive"

            with zipfile.ZipFile(path, 'r') as zf:
                if member not in zf.namelist():
                    return False, missing_message

            return True, ""
        except Exception as e:
            return False, str(e)

    def _validate_pptx(self, path: str) -> tuple[bool, str]:
        """Validate PPTX file."""
        return self._validate_zip_member(
            path, "ppt/presentation.xml", "Missing presentation.xml"
        )

    def _validate_docx(self, path: str) -> tuple[bool, str]:
        """Validate DOCX file."""
        return self._validate_zip_member(
            path, "word/document.xml", "Missing document.xml"
        )

    def _validate_xlsx(self, path: str) -> tuple[bool, str]:
        """Validate XLSX file."""
        return self._validate_zip_member(
            path, "xl/workbook.xml", "Missing workbook.xml"
        )

    def _validate_pdf(self, path: str) -> tuple[bool, str]:
        """Validate PDF file by checking for the ``%PDF`` header."""
        try:
            with open(path, 'rb') as f:
                header = f.read(4)
                if header != b'%PDF':
                    return False, "Invalid PDF header"
            return True, ""
        except Exception as e:
            return False, str(e)

    def _validate_json(self, path: str) -> tuple[bool, str]:
        """Validate JSON file by parsing it."""
        import json
        try:
            # Read bytes so the json module detects the encoding itself
            # instead of depending on the platform's default text encoding.
            with open(path, 'rb') as f:
                json.load(f)
            return True, ""
        except json.JSONDecodeError as e:
            return False, f"Invalid JSON: {e}"
        except Exception as e:
            return False, str(e)

    def _validate_python(self, path: str) -> tuple[bool, str]:
        """Validate Python file syntax by compiling it (never executed)."""
        try:
            # Read bytes so compile() honours the source's own encoding
            # declaration/BOM rather than the platform default encoding.
            with open(path, 'rb') as f:
                code = f.read()
            # dont_inherit keeps this module's compiler flags from leaking
            # into the checked file.
            compile(code, path, 'exec', dont_inherit=True)
            return True, ""
        except SyntaxError as e:
            return False, f"Syntax error: {e}"
        except Exception as e:
            return False, str(e)

    def _validate_javascript(self, path: str) -> tuple[bool, str]:
        """Validate JavaScript file (basic check via ``node --check``)."""
        try:
            result = subprocess.run(
                ["node", "--check", path],
                capture_output=True,
                text=True,
                timeout=10
            )
            if result.returncode != 0:
                # Keep the issue short; only the start of stderr is reported.
                return False, result.stderr[:100]
            return True, ""
        except FileNotFoundError:
            return True, ""  # node not available, skip
        except Exception as e:
            return False, str(e)


class ContentMatchEvaluator(BaseEvaluator):
    """Keyword check that the trace output reflects the quality criteria."""

    def __init__(
        self,
        name: str = "content_match",
        weight: float = 1.0
    ):
        super().__init__(name, weight)

    def evaluate(
        self,
        trace: ExecutionTrace,
        test_case: Optional[dict] = None
    ) -> EvaluationResult:
        """Score the fraction of ``quality_criteria`` found in the trace.

        A criterion counts as met when at least half of its words longer
        than three characters occur (as substrings) in the lower-cased trace
        text. With no test case or no criteria the score is 1.0.
        """
        # Nothing to compare against: treat as fully satisfied.
        if not test_case:
            return EvaluationResult(
                metric_name=self.name,
                score=1.0,
                details={},
                issues=[]
            )

        criteria = test_case.get("quality_criteria", [])
        if not criteria:
            return EvaluationResult(
                metric_name=self.name,
                score=1.0,
                details={"criteria_count": 0},
                issues=[]
            )

        haystack = self._build_searchable_text(trace)

        met = 0
        unmet = []

        for criterion in criteria:
            words = re.findall(r'\b\w+\b', criterion.lower())
            keywords = [w for w in words if len(w) > 3]

            # A criterion without any significant word is assumed met.
            if not keywords:
                met += 1
                continue

            hits = sum(1 for w in keywords if w in haystack)
            if hits >= len(keywords) * 0.5:
                met += 1
            else:
                unmet.append(f"Criterion not met: {criterion}")

        return EvaluationResult(
            metric_name=self.name,
            score=met / len(criteria),
            details={
                "criteria_count": len(criteria),
                "matched": met
            },
            issues=unmet
        )

    def _build_searchable_text(self, trace: ExecutionTrace) -> str:
        """Join stdout, reasoning and code blocks into one lower-case string."""
        stdout_text = trace.raw_stdout.lower()
        reasoning_text = " ".join(trace.reasoning).lower()
        code_text = " ".join(map(str, trace.code_blocks)).lower()
        return " ".join((stdout_text, reasoning_text, code_text))


class ErrorRateEvaluator(BaseEvaluator):
    """Score a trace by how many errors and warnings it recorded."""

    def evaluate(
        self,
        trace: ExecutionTrace,
        test_case: Optional[dict] = None
    ) -> EvaluationResult:
        """Map error and warning counts to a score.

        The base score is 1.0 / 0.7 / 0.5 for zero / one / two errors and
        0.3 for three or more. Each warning subtracts 0.05, capped at 0.2
        in total. ``test_case`` is not used.
        """
        error_list = trace.errors
        warning_list = trace.warnings
        n_errors = len(error_list)
        n_warnings = len(warning_list)

        # Base score by error count; anything past two errors shares 0.3.
        base_by_errors = {0: 1.0, 1: 0.7, 2: 0.5}
        base = base_by_errors.get(n_errors, 0.3)

        # Warnings slightly reduce the score, never below zero.
        penalty = min(n_warnings * 0.05, 0.2)
        score = max(0.0, base - penalty)

        summary = {
            "error_count": n_errors,
            "warning_count": n_warnings,
            "errors": error_list[:5],
            "warnings": warning_list[:5]
        }

        return EvaluationResult(
            metric_name=self.name,
            score=score,
            details=summary,
            issues=error_list
        )


class EfficiencyEvaluator(BaseEvaluator):
    """Score a trace by its execution time.

    Token usage is reported in the result details but does not influence
    the score.
    """

    def __init__(
        self,
        name: str = "efficiency",
        target_time: float = 30.0,  # Seconds at or below which the score is 1.0
        max_time: float = 120.0,     # Seconds above which the score is 0.0
        weight: float = 1.0
    ):
        super().__init__(name, weight)
        self.target_time, self.max_time = target_time, max_time

    def evaluate(
        self,
        trace: ExecutionTrace,
        test_case: Optional[dict] = None
    ) -> EvaluationResult:
        """Return 1.0 up to the target time, falling linearly to 0.0 at max.

        An issue is recorded whenever the execution time exceeds
        ``max_time``. ``test_case`` is not used.
        """
        elapsed = trace.execution_time
        target = self.target_time
        ceiling = self.max_time

        if elapsed <= target:
            score = 1.0
        elif elapsed <= ceiling:
            # Linear interpolation between target (1.0) and max (0.0).
            score = 1.0 - (elapsed - target) / (ceiling - target)
        else:
            score = 0.0

        # Checked independently of the scoring branch above.
        if elapsed > ceiling:
            problems = [f"Execution too slow: {elapsed:.1f}s > {ceiling}s"]
        else:
            problems = []

        timing = {
            "execution_time": elapsed,
            "target_time": target,
            "tokens_used": trace.tokens_used
        }

        return EvaluationResult(
            metric_name=self.name,
            score=score,
            details=timing,
            issues=problems
        )


class CompositeEvaluator(BaseEvaluator):
    """Combine multiple evaluators with weights.

    The composite score is the weighted mean of the member scores, using
    each member's ``weight`` attribute.
    """

    def __init__(
        self,
        name: str = "composite",
        evaluators: Optional[list[BaseEvaluator]] = None,
        weight: float = 1.0
    ):
        """
        Args:
            name: Metric name reported in the result
            evaluators: Initial member evaluators (may be extended later)
            weight: Relative weight of the composite itself
        """
        super().__init__(name, weight)
        self.evaluators = evaluators or []

    def add_evaluator(self, evaluator: BaseEvaluator):
        """Add an evaluator to the composite."""
        self.evaluators.append(evaluator)

    def evaluate(
        self,
        trace: ExecutionTrace,
        test_case: Optional[dict] = None
    ) -> EvaluationResult:
        """
        Run every member evaluator and aggregate the results.

        Args:
            trace: The execution trace to evaluate
            test_case: Optional test case forwarded to every member

        Returns:
            EvaluationResult with the weighted mean score, per-member
            details keyed by member name, and all member issues combined.
            With no members the score is 1.0; with a non-positive total
            weight the score is 0.0.
        """
        if not self.evaluators:
            return EvaluationResult(
                metric_name=self.name,
                score=1.0,
                details={},
                issues=[]
            )

        total_weight = sum(e.weight for e in self.evaluators)
        weighted_score = 0.0
        all_details = {}
        all_issues = []

        for evaluator in self.evaluators:
            result = evaluator.evaluate(trace, test_case)

            weighted_score += result.score * evaluator.weight

            # Members sharing a name would overwrite each other's entry;
            # later duplicates get a "#2", "#3", ... suffix so that no
            # result is dropped from the details.
            key = evaluator.name
            suffix = 2
            while key in all_details:
                key = f"{evaluator.name}#{suffix}"
                suffix += 1

            all_details[key] = {
                "score": result.score,
                "details": result.details
            }
            all_issues.extend(result.issues)

        # Keep the score a float even in the zero-weight fallback.
        final_score = weighted_score / total_weight if total_weight > 0 else 0.0

        return EvaluationResult(
            metric_name=self.name,
            score=final_score,
            details=all_details,
            issues=all_issues
        )


def create_default_evaluator() -> CompositeEvaluator:
    """Build the stock composite evaluator.

    It combines success, file existence, error rate and efficiency
    evaluators, each with a weight of 0.25.
    """
    members = [
        BinaryEvaluator(name="success", weight=0.25),
        FileExistsEvaluator(weight=0.25),
        ErrorRateEvaluator(name="error_rate", weight=0.25),
        EfficiencyEvaluator(weight=0.25),
    ]

    composite = CompositeEvaluator(name="default")
    for member in members:
        composite.add_evaluator(member)

    return composite


def create_evaluator_from_config(config: dict) -> BaseEvaluator:
    """Create an evaluator from configuration.

    Args:
        config: Mapping with the optional keys ``type`` (default
            ``"binary"``), ``name`` (defaults to the type) and ``weight``
            (default 1.0). Type-specific constructor options are forwarded
            when present: ``patterns`` for ``file_exists``, ``work_dir``
            for ``file_validity``, and ``target_time`` / ``max_time`` for
            ``efficiency``.

    Returns:
        The configured evaluator. An unrecognised ``type`` falls back to
        ``BinaryEvaluator``.
    """

    evaluator_types = {
        "binary": BinaryEvaluator,
        "file_exists": FileExistsEvaluator,
        "file_validity": FileValidityEvaluator,
        "content_match": ContentMatchEvaluator,
        "error_rate": ErrorRateEvaluator,
        "efficiency": EfficiencyEvaluator,
        "computed": ErrorRateEvaluator,  # Alias
    }

    # Constructor options beyond name/weight accepted by each type. Without
    # forwarding these, such settings in the config were silently ignored.
    extra_options = {
        "file_exists": ("patterns",),
        "file_validity": ("work_dir",),
        "efficiency": ("target_time", "max_time"),
    }

    eval_type = config.get("type", "binary")
    name = config.get("name", eval_type)
    weight = config.get("weight", 1.0)

    evaluator_class = evaluator_types.get(eval_type, BinaryEvaluator)

    # Forward only options that are actually set, so the constructor
    # defaults still apply to anything missing or null.
    extra_kwargs = {
        key: config[key]
        for key in extra_options.get(eval_type, ())
        if config.get(key) is not None
    }

    return evaluator_class(name=name, weight=weight, **extra_kwargs)
