#!/usr/bin/env python3
"""
Skill Optimizer - Main GEPA Implementation

Generic framework for optimizing any Claude skill using
GEPA (Genetic-Pareto) reflective text evolution.

Usage:
    # Initialize optimization
    python skill_optimizer.py init --skill-path ~/.claude/skills/my-skill
    
    # Run optimization
    python skill_optimizer.py optimize --config config.yaml
    
    # Evaluate a skill
    python skill_optimizer.py evaluate --skill-path PATH --test-cases tests.yaml
"""

import argparse
import json
import os
import random
import sys
import time
import yaml
from dataclasses import dataclass, field, asdict
from datetime import datetime
from pathlib import Path
from typing import Any, Callable, Optional

# Local imports
from skill_loader import Skill, SkillLoader, SkillWriter, load_skill
from claude_executor import (
    ClaudeCodeExecutor, 
    ExecutorConfig, 
    ExecutionTrace,
    create_executor
)


@dataclass
class TestCase:
    """One evaluation scenario: a prompt plus the outcomes expected from it."""
    id: str
    description: str
    prompt: str

    # What a successful run is expected to produce / satisfy
    expected_outputs: list[dict] = field(default_factory=list)
    quality_criteria: list[str] = field(default_factory=list)

    # Bookkeeping
    tags: list[str] = field(default_factory=list)
    complexity: str = "medium"  # one of: simple, medium, complex, edge_case
    weight: float = 1.0  # importance weight

    @classmethod
    def from_dict(cls, data: dict) -> "TestCase":
        """Build a test case from a mapping, dropping keys that are not fields."""
        known_fields = cls.__dataclass_fields__
        kwargs = {}
        for key, value in data.items():
            if key in known_fields:
                kwargs[key] = value
        return cls(**kwargs)


@dataclass
class Candidate:
    """A candidate skill version being optimized.

    The optimizer creates one instance for the unmodified seed skill and one
    for every mutation derived from an existing candidate.
    """
    # Identifier used in logs, checkpoints and the final report
    id: str
    # Generation counter value at the time the candidate was created (seed is 0)
    generation: int

    # The skill content
    skill: Skill

    # Modifications from original: human-readable descriptions, accumulated
    # along the lineage (each mutation appends to its parent's list)
    modifications: list[str] = field(default_factory=list)

    # Evaluation results: metric name -> score, and the traces of the most
    # recent evaluation batch
    scores: dict[str, float] = field(default_factory=dict)
    traces: list[ExecutionTrace] = field(default_factory=list)

    # Pareto ranking: the optimizer stores the number of candidates that
    # dominate this one, so 0 means non-dominated
    pareto_rank: int = 0

    # Lineage: id of the candidate this one was mutated from, and a label for
    # the kind of mutation that produced it
    parent_id: Optional[str] = None
    mutation_type: str = ""


@dataclass
class OptimizationConfig:
    """Configuration for the optimization process.

    Can be built directly, from a nested mapping (``from_dict``) or from a
    YAML file (``from_yaml``). The nested layout groups the fields into the
    sections ``skill``, ``optimization``, ``evaluation``, ``claude`` and
    ``output``; ``to_dict``/``to_yaml`` emit that same layout, so a saved
    configuration loads back unchanged.
    """

    # Skill settings
    skill_path: str = ""
    skill_name: str = ""

    # Components to optimize
    optimize_components: list[str] = field(default_factory=lambda: [
        "instructions",
        "examples",
        "workflows"
    ])

    # Optimization parameters
    max_iterations: int = 10
    max_evaluations: int = 100
    population_size: int = 5
    batch_size: int = 5

    # Model settings
    executor_mode: str = "cli"  # 'cli' or 'mock'
    model: str = "claude-sonnet-4-20250514"
    timeout: int = 300

    # Evaluation metrics
    metrics: list[dict] = field(default_factory=lambda: [
        {"name": "task_completion", "weight": 0.3, "type": "binary"},
        {"name": "output_quality", "weight": 0.3, "type": "llm_judge"},
        {"name": "error_rate", "weight": 0.2, "type": "computed"},
        {"name": "efficiency", "weight": 0.2, "type": "computed"}
    ])

    # Output settings
    output_dir: str = "./optimization_results"
    save_checkpoints: bool = True
    checkpoint_every: int = 2
    verbose: bool = True

    @classmethod
    def from_dict(cls, data: Optional[dict]) -> "OptimizationConfig":
        """Build a config from the nested mapping layout.

        Args:
            data: Mapping with optional ``skill``, ``optimization``,
                ``evaluation``, ``claude`` and ``output`` sections. ``None``
                (e.g. an empty YAML document) yields the default config.

        Returns:
            A config where every recognised key overrides its default; keys
            that are not dataclass fields are ignored.

        Raises:
            ValueError: If ``data`` is neither ``None`` nor a mapping.
        """
        if data is None:
            data = {}
        if not isinstance(data, dict):
            raise ValueError(
                f"Configuration must be a mapping, got {type(data).__name__}"
            )

        def section(name: str) -> dict:
            # A section written as a bare key ("skill:") parses to None.
            return data.get(name) or {}

        flat: dict = {}

        skill = section("skill")
        flat.update({f"skill_{k}": v for k, v in skill.items()})
        flat["skill_path"] = skill.get("path", "")
        # to_dict() stores optimize_components under skill.components.
        if skill.get("components") is not None:
            flat["optimize_components"] = skill["components"]

        flat.update(section("optimization"))

        # Only override the default metrics when metrics are actually given;
        # an empty metric list would make every overall score 0.
        metrics = section("evaluation").get("metrics")
        if metrics is not None:
            flat["metrics"] = metrics

        flat.update(section("claude"))
        flat.update(section("output"))

        return cls(**{k: v for k, v in flat.items() if k in cls.__dataclass_fields__})

    def to_dict(self) -> dict:
        """Return the config as the nested mapping accepted by ``from_dict``."""
        return {
            "skill": {
                "path": self.skill_path,
                "name": self.skill_name,
                "components": list(self.optimize_components)
            },
            "optimization": {
                "max_iterations": self.max_iterations,
                "max_evaluations": self.max_evaluations,
                "population_size": self.population_size,
                "batch_size": self.batch_size
            },
            "evaluation": {
                "metrics": list(self.metrics)
            },
            "claude": {
                "model": self.model,
                "timeout": self.timeout,
                "executor_mode": self.executor_mode
            },
            "output": {
                "output_dir": self.output_dir,
                "save_checkpoints": self.save_checkpoints,
                "checkpoint_every": self.checkpoint_every,
                "verbose": self.verbose
            }
        }

    @classmethod
    def from_yaml(cls, path: str) -> "OptimizationConfig":
        """Load a config from the YAML file at ``path`` (see ``from_dict``)."""
        with open(path, "r", encoding="utf-8") as f:
            data = yaml.safe_load(f)

        return cls.from_dict(data)

    def to_yaml(self, path: str):
        """Write the config to ``path`` as YAML in the nested section layout."""
        with open(path, "w", encoding="utf-8") as f:
            yaml.dump(self.to_dict(), f, default_flow_style=False)


@dataclass
class OptimizationResult:
    """Outcome of an optimization run, renderable as a Markdown report."""

    # Winner of the run
    best_candidate: Candidate

    # Every candidate that was evaluated
    all_candidates: list[Candidate]

    # Per-metric scores before/after and their difference
    initial_scores: dict[str, float]
    final_scores: dict[str, float]
    improvement: dict[str, float]

    # Run statistics
    total_iterations: int
    total_evaluations: int
    runtime_seconds: float

    def to_report(self) -> str:
        """Render the result as a Markdown document and return it as a string."""
        best = self.best_candidate

        summary = [
            "# Skill Optimization Report",
            f"\n**Generated**: {datetime.now().isoformat()}",
            f"\n## Summary",
            f"- Total iterations: {self.total_iterations}",
            f"- Total evaluations: {self.total_evaluations}",
            f"- Runtime: {self.runtime_seconds:.1f} seconds",
            f"\n## Score Improvement",
        ]

        # One line per metric present in the improvement mapping; metrics
        # missing from the initial/final mappings are shown as 0.
        score_lines = []
        for metric, improvement in self.improvement.items():
            initial = self.initial_scores.get(metric, 0)
            final = self.final_scores.get(metric, 0)
            score_lines.append(
                f"- **{metric}**: {initial:.2%} → {final:.2%} ({improvement:+.2%})"
            )

        winner = [
            f"\n## Best Candidate",
            f"- ID: {best.id}",
            f"- Generation: {best.generation}",
            f"- Pareto Rank: {best.pareto_rank}",
            f"\n### Modifications Made:",
        ]

        changes = [f"- {mod}" for mod in best.modifications]

        return "\n".join(summary + score_lines + winner + changes)


class SkillOptimizer:
    """
    Main optimizer class implementing GEPA for skill optimization.
    """
    
    def __init__(
        self,
        config: OptimizationConfig,
        test_cases: list[TestCase],
        evaluators: Optional[dict[str, Callable]] = None
    ):
        self.config = config
        self.test_cases = test_cases
        self.evaluators = evaluators or {}
        
        # Load skill
        self.loader = SkillLoader()
        self.original_skill = self.loader.load(config.skill_path)
        
        # Create executor
        self.executor = create_executor(
            mode=config.executor_mode,
            model=config.model,
            timeout=config.timeout,
            save_traces=True,
            traces_dir=os.path.join(config.output_dir, "traces")
        )
        
        # State
        self.candidates: list[Candidate] = []
        self.evaluation_count = 0
        self.generation = 0
        
        # Setup output
        os.makedirs(config.output_dir, exist_ok=True)
        os.makedirs(os.path.join(config.output_dir, "candidates"), exist_ok=True)
    
    def optimize(self) -> OptimizationResult:
        """
        Run the full GEPA optimization loop.
        """
        start_time = time.time()
        
        self._log("=" * 60)
        self._log(f"Starting Skill Optimization: {self.original_skill.name}")
        self._log("=" * 60)
        
        # Step 1: Create and evaluate seed candidate
        seed = self._create_seed_candidate()
        initial_scores = self._evaluate_candidate(seed)
        seed.scores = initial_scores
        self.candidates.append(seed)
        
        self._log(f"Seed scores: {self._format_scores(initial_scores)}")
        
        best_candidate = seed
        
        # Step 2: Main optimization loop
        while (self.generation < self.config.max_iterations and
               self.evaluation_count < self.config.max_evaluations):
            
            self.generation += 1
            self._log(f"\n--- Generation {self.generation} ---")
            
            # Select parents
            parents = self._select_parents()
            
            # Generate mutations
            new_candidates = []
            for parent in parents:
                mutations = self._generate_mutations(parent)
                new_candidates.extend(mutations)
            
            # Evaluate new candidates
            for candidate in new_candidates:
                if self.evaluation_count >= self.config.max_evaluations:
                    break
                    
                scores = self._evaluate_candidate(candidate)
                candidate.scores = scores
                self.candidates.append(candidate)
            
            # Compute Pareto ranks
            self._compute_pareto_ranks()
            
            # Update best
            new_best = min(self.candidates, key=lambda c: (c.pareto_rank, -c.scores.get("overall", 0)))
            if new_best.pareto_rank <= best_candidate.pareto_rank:
                if new_best.scores.get("overall", 0) > best_candidate.scores.get("overall", 0):
                    best_candidate = new_best
                    self._log(f"New best: {best_candidate.id} (overall: {best_candidate.scores.get('overall', 0):.2%})")
            
            # Checkpoint
            if self.config.save_checkpoints and self.generation % self.config.checkpoint_every == 0:
                self._save_checkpoint()
            
            # Check convergence
            if self._check_convergence():
                self._log("Optimization converged")
                break
        
        # Final evaluation
        final_scores = best_candidate.scores
        
        # Calculate improvement
        improvement = {
            metric: final_scores.get(metric, 0) - initial_scores.get(metric, 0)
            for metric in initial_scores
        }
        
        result = OptimizationResult(
            best_candidate=best_candidate,
            all_candidates=self.candidates,
            initial_scores=initial_scores,
            final_scores=final_scores,
            improvement=improvement,
            total_iterations=self.generation,
            total_evaluations=self.evaluation_count,
            runtime_seconds=time.time() - start_time
        )
        
        # Save results
        self._save_results(result)
        
        self._log("=" * 60)
        self._log("Optimization Complete!")
        self._log(f"Improvement: {self._format_scores(improvement)}")
        self._log("=" * 60)
        
        return result
    
    def _create_seed_candidate(self) -> Candidate:
        """Create the seed candidate from original skill."""
        return Candidate(
            id="seed_v0",
            generation=0,
            skill=self.original_skill,
            modifications=[]
        )
    
    def _evaluate_candidate(self, candidate: Candidate) -> dict[str, float]:
        """Evaluate a candidate on test cases."""
        
        self._log(f"Evaluating {candidate.id}...")
        
        # Write candidate skill to temp location
        temp_skill_dir = os.path.join(self.config.output_dir, "temp_skill")
        writer = SkillWriter()
        writer.write(candidate.skill, temp_skill_dir)
        
        # Sample test cases
        test_batch = random.sample(
            self.test_cases,
            min(self.config.batch_size, len(self.test_cases))
        )
        
        # Execute tests
        traces = []
        for test in test_batch:
            trace = self.executor.execute(
                prompt=test.prompt,
                task_id=test.id,
                skill_path=temp_skill_dir
            )
            traces.append(trace)
            self.evaluation_count += 1
        
        candidate.traces = traces
        
        # Calculate scores
        scores = self._calculate_scores(traces, test_batch)
        
        return scores
    
    def _calculate_scores(
        self,
        traces: list[ExecutionTrace],
        test_cases: list[TestCase]
    ) -> dict[str, float]:
        """Calculate scores from execution traces."""
        
        scores = {}
        
        # Task completion (binary)
        completion_scores = [1.0 if t.success else 0.0 for t in traces]
        scores["task_completion"] = sum(completion_scores) / len(completion_scores)
        
        # Error rate
        error_counts = [len(t.errors) for t in traces]
        max_errors = max(error_counts) if error_counts else 1
        scores["error_rate"] = 1.0 - (sum(error_counts) / (len(traces) * max(max_errors, 1)))
        
        # Efficiency (based on execution time)
        times = [t.execution_time for t in traces]
        avg_time = sum(times) / len(times) if times else 0
        # Normalize: assume 30s is "average", faster is better
        scores["efficiency"] = max(0, min(1, 1 - (avg_time - 30) / 60))
        
        # Output quality (if evaluator provided)
        if "output_quality" in self.evaluators:
            quality_scores = []
            for trace, test in zip(traces, test_cases):
                score = self.evaluators["output_quality"](trace, test)
                quality_scores.append(score)
            scores["output_quality"] = sum(quality_scores) / len(quality_scores)
        else:
            # Default: based on success and file creation
            file_scores = []
            for trace in traces:
                if trace.files_created:
                    file_scores.append(1.0)
                elif trace.success:
                    file_scores.append(0.7)
                else:
                    file_scores.append(0.0)
            scores["output_quality"] = sum(file_scores) / len(file_scores)
        
        # Calculate weighted overall score
        total_weight = 0
        overall = 0
        
        for metric_config in self.config.metrics:
            name = metric_config["name"]
            weight = metric_config.get("weight", 0.25)
            
            if name in scores:
                overall += scores[name] * weight
                total_weight += weight
        
        scores["overall"] = overall / total_weight if total_weight > 0 else 0
        
        return scores
    
    def _select_parents(self) -> list[Candidate]:
        """Select parent candidates for mutation."""
        
        # Sort by Pareto rank, then by overall score
        sorted_candidates = sorted(
            self.candidates,
            key=lambda c: (c.pareto_rank, -c.scores.get("overall", 0))
        )
        
        # Select top candidates
        num_parents = min(
            self.config.population_size // 2 + 1,
            len(sorted_candidates)
        )
        
        return sorted_candidates[:num_parents]
    
    def _generate_mutations(self, parent: Candidate) -> list[Candidate]:
        """Generate mutated candidates through reflection."""
        
        mutations = []
        
        # Analyze failures
        failure_traces = [t for t in parent.traces if not t.success]
        
        if not failure_traces:
            # If no failures, try random improvements
            return [self._random_mutation(parent)]
        
        # Generate reflection text
        reflection_text = self._build_reflection_prompt(parent, failure_traces)
        
        # Get improvement suggestions (in production, call LLM here)
        improvements = self._analyze_failures(failure_traces)
        
        # Apply improvements
        for component, suggestions in improvements.items():
            for suggestion in suggestions[:2]:  # Limit mutations per component
                mutated = self._apply_mutation(parent, component, suggestion)
                if mutated:
                    mutations.append(mutated)
        
        return mutations if mutations else [self._random_mutation(parent)]
    
    def _build_reflection_prompt(
        self,
        parent: Candidate,
        failure_traces: list[ExecutionTrace]
    ) -> str:
        """Build prompt for reflection LLM."""
        
        parts = [
            "# Skill Optimization Reflection",
            f"\n## Current Skill: {parent.skill.name}",
            f"\n### Instructions (excerpt)\n{parent.skill.instructions[:2000]}...",
            f"\n## Failed Executions ({len(failure_traces)} failures)",
        ]
        
        for trace in failure_traces[:5]:
            parts.append(trace.to_reflection_text())
            parts.append("---")
        
        parts.extend([
            "\n## Task",
            "Analyze why these executions failed and propose specific improvements.",
            "Focus on:",
            "1. Ambiguous instructions that led to wrong decisions",
            "2. Missing guidance for edge cases",
            "3. Unclear examples or workflows",
            "4. Error handling gaps",
            "\nProvide concrete text changes, not vague suggestions."
        ])
        
        return "\n".join(parts)
    
    def _analyze_failures(self, traces: list[ExecutionTrace]) -> dict[str, list[str]]:
        """Analyze failure traces and suggest improvements."""
        
        improvements = {}
        
        # Categorize errors
        error_categories = {
            "timeout": [],
            "file_not_found": [],
            "validation": [],
            "execution": [],
            "other": []
        }
        
        for trace in traces:
            for error in trace.errors:
                error_lower = error.lower()
                if "timeout" in error_lower:
                    error_categories["timeout"].append(error)
                elif "not found" in error_lower or "missing" in error_lower:
                    error_categories["file_not_found"].append(error)
                elif "invalid" in error_lower or "validation" in error_lower:
                    error_categories["validation"].append(error)
                elif "failed" in error_lower or "error" in error_lower:
                    error_categories["execution"].append(error)
                else:
                    error_categories["other"].append(error)
        
        # Generate improvements based on categories
        if error_categories["timeout"]:
            improvements["instructions"] = improvements.get("instructions", [])
            improvements["instructions"].append(
                "Add timeout handling guidance and chunking strategies for large tasks"
            )
        
        if error_categories["file_not_found"]:
            improvements["instructions"] = improvements.get("instructions", [])
            improvements["instructions"].append(
                "Add file existence validation before operations"
            )
        
        if error_categories["validation"]:
            improvements["workflows"] = improvements.get("workflows", [])
            improvements["workflows"].append(
                "Include validation steps after each major operation"
            )
        
        if error_categories["execution"]:
            improvements["examples"] = improvements.get("examples", [])
            improvements["examples"].append(
                "Add error handling examples for common failure modes"
            )
        
        return improvements
    
    def _apply_mutation(
        self,
        parent: Candidate,
        component: str,
        suggestion: str
    ) -> Optional[Candidate]:
        """Apply a mutation to create a new candidate."""
        
        mutation_id = f"gen{self.generation}_{component}_{len(self.candidates)}"
        
        # Clone parent skill
        import copy
        new_skill = copy.deepcopy(parent.skill)
        
        # Apply mutation based on component
        if component == "instructions":
            new_skill.instructions = self._insert_improvement(
                new_skill.instructions,
                suggestion,
                "## Best Practices"  # Insert before this section if exists
            )
        elif component == "examples":
            new_skill.instructions = self._insert_improvement(
                new_skill.instructions,
                f"\n**Example (GEPA-optimized)**: {suggestion}\n",
                "## Dependencies"
            )
        elif component == "workflows":
            new_skill.instructions = self._insert_improvement(
                new_skill.instructions,
                f"\n**Workflow Improvement**: {suggestion}\n",
                "## Code Style"
            )
        
        return Candidate(
            id=mutation_id,
            generation=self.generation,
            skill=new_skill,
            modifications=parent.modifications + [f"{component}: {suggestion}"],
            parent_id=parent.id,
            mutation_type=f"reflection_{component}"
        )
    
    def _insert_improvement(
        self,
        content: str,
        improvement: str,
        before_section: str
    ) -> str:
        """Insert improvement text into content."""
        
        if before_section in content:
            idx = content.find(before_section)
            return content[:idx] + improvement + "\n\n" + content[idx:]
        else:
            return content + "\n\n" + improvement
    
    def _random_mutation(self, parent: Candidate) -> Candidate:
        """Generate a random mutation."""
        
        mutation_id = f"gen{self.generation}_random_{len(self.candidates)}"
        
        random_improvements = [
            "Add explicit error handling for edge cases",
            "Include validation step after each operation",
            "Add timeout handling for long-running operations",
            "Specify output format requirements more clearly",
            "Add examples for common failure scenarios"
        ]
        
        improvement = random.choice(random_improvements)
        
        import copy
        new_skill = copy.deepcopy(parent.skill)
        new_skill.instructions += f"\n\n**Note**: {improvement}"
        
        return Candidate(
            id=mutation_id,
            generation=self.generation,
            skill=new_skill,
            modifications=parent.modifications + [f"random: {improvement}"],
            parent_id=parent.id,
            mutation_type="random"
        )
    
    def _compute_pareto_ranks(self):
        """Compute Pareto ranks for all candidates."""
        
        metrics = ["overall", "task_completion", "output_quality", "error_rate"]
        
        n = len(self.candidates)
        dominated_by = [0] * n
        
        for i in range(n):
            for j in range(n):
                if i != j:
                    if self._dominates(
                        self.candidates[j].scores,
                        self.candidates[i].scores,
                        metrics
                    ):
                        dominated_by[i] += 1
        
        for i, candidate in enumerate(self.candidates):
            candidate.pareto_rank = dominated_by[i]
    
    def _dominates(
        self,
        scores_a: dict,
        scores_b: dict,
        metrics: list[str]
    ) -> bool:
        """Check if scores_a Pareto-dominates scores_b."""
        
        better_in_one = False
        
        for metric in metrics:
            a = scores_a.get(metric, 0)
            b = scores_b.get(metric, 0)
            
            if a < b:
                return False
            if a > b:
                better_in_one = True
        
        return better_in_one
    
    def _check_convergence(self) -> bool:
        """Check if optimization has converged."""
        
        if len(self.candidates) < 5:
            return False
        
        recent = self.candidates[-5:]
        scores = [c.scores.get("overall", 0) for c in recent]
        
        if max(scores) - min(scores) < 0.01:
            return True
        
        return False
    
    def _save_checkpoint(self):
        """Save optimization checkpoint."""
        
        checkpoint = {
            "generation": self.generation,
            "evaluation_count": self.evaluation_count,
            "candidates": [
                {
                    "id": c.id,
                    "generation": c.generation,
                    "scores": c.scores,
                    "pareto_rank": c.pareto_rank,
                    "modifications": c.modifications
                }
                for c in self.candidates
            ]
        }
        
        path = os.path.join(
            self.config.output_dir,
            f"checkpoint_gen{self.generation}.json"
        )
        
        with open(path, "w") as f:
            json.dump(checkpoint, f, indent=2)
        
        self._log(f"Saved checkpoint: {path}")
    
    def _save_results(self, result: OptimizationResult):
        """Save final optimization results."""
        
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        
        # Save report
        report_path = os.path.join(self.config.output_dir, f"report_{timestamp}.md")
        with open(report_path, "w") as f:
            f.write(result.to_report())
        
        # Save optimized skill
        optimized_dir = os.path.join(self.config.output_dir, "optimized_skill")
        writer = SkillWriter()
        writer.write(result.best_candidate.skill, optimized_dir)
        
        # Save metrics history
        metrics_path = os.path.join(self.config.output_dir, f"metrics_{timestamp}.json")
        with open(metrics_path, "w") as f:
            json.dump({
                "initial": result.initial_scores,
                "final": result.final_scores,
                "improvement": result.improvement,
                "history": [
                    {"id": c.id, "gen": c.generation, "scores": c.scores}
                    for c in result.all_candidates
                ]
            }, f, indent=2)
        
        self._log(f"Results saved to {self.config.output_dir}")
    
    def _format_scores(self, scores: dict) -> str:
        """Format scores for logging."""
        return ", ".join(f"{k}: {v:.2%}" for k, v in scores.items())
    
    def _log(self, message: str):
        """Log message if verbose."""
        if self.config.verbose:
            print(message)


def load_test_cases(path: str) -> list[TestCase]:
    """Load test cases from YAML file.

    Args:
        path: YAML file whose top-level ``test_cases`` key holds a list of
            mappings accepted by ``TestCase.from_dict``.

    Returns:
        The parsed test cases; an empty list when the file is empty or the
        ``test_cases`` key is missing or null.
    """
    with open(path, "r", encoding="utf-8") as f:
        # safe_load() returns None for an empty document
        data = yaml.safe_load(f) or {}

    # A bare "test_cases:" key parses to None, hence the "or []"
    return [TestCase.from_dict(case_data) for case_data in data.get("test_cases") or []]


def init_optimization(skill_path: str, output_dir: str):
    """Scaffold an optimization workspace for the skill at ``skill_path``.

    Loads the skill, creates ``output_dir`` and writes two starter files into
    it: ``config.yaml`` (a default configuration pointing at the skill) and
    ``test_cases.yaml`` (a two-entry template to be filled in). Progress and
    next steps are printed to stdout.
    """
    print(f"Initializing optimization for: {skill_path}")

    # Load the skill and echo what was found
    skill = SkillLoader().load(skill_path)
    print(f"Loaded skill: {skill.name}")
    print(f"Description: {skill.description[:100]}...")

    os.makedirs(output_dir, exist_ok=True)

    # Default configuration for this skill
    config_file = os.path.join(output_dir, "config.yaml")
    OptimizationConfig(
        skill_path=skill_path,
        skill_name=skill.name,
        output_dir=output_dir
    ).to_yaml(config_file)

    # Template test cases for the user to edit
    basic_case = {
        "id": "test_001",
        "description": "Basic functionality test",
        "prompt": "TODO: Add your test prompt here",
        "expected_outputs": [
            {"type": "file", "pattern": "*.*"}
        ],
        "quality_criteria": [
            "Task completes successfully"
        ],
        "tags": ["basic"],
        "complexity": "simple"
    }
    edge_case = {
        "id": "test_002",
        "description": "Edge case test",
        "prompt": "TODO: Add edge case prompt",
        "tags": ["edge_case"],
        "complexity": "edge_case"
    }

    cases_file = os.path.join(output_dir, "test_cases.yaml")
    with open(cases_file, "w") as f:
        yaml.dump({"test_cases": [basic_case, edge_case]}, f, default_flow_style=False)

    summary = (
        "\nCreated:",
        f"  - {output_dir}/config.yaml",
        f"  - {output_dir}/test_cases.yaml",
        "\nNext steps:",
        "  1. Edit test_cases.yaml with your test cases",
        f"  2. Run: python skill_optimizer.py optimize --config {output_dir}/config.yaml",
    )
    for line in summary:
        print(line)


def _build_parser() -> argparse.ArgumentParser:
    """Create the command-line parser with the init/optimize/evaluate commands."""
    parser = argparse.ArgumentParser(
        description="GEPA Skill Optimizer - Optimize any Claude skill"
    )

    subparsers = parser.add_subparsers(dest="command", help="Command to run")

    # Init command
    init_parser = subparsers.add_parser("init", help="Initialize optimization")
    init_parser.add_argument("--skill-path", required=True, help="Path to skill")
    init_parser.add_argument("--output-dir", default="./optimization", help="Output directory")

    # Optimize command
    opt_parser = subparsers.add_parser("optimize", help="Run optimization")
    opt_parser.add_argument("--config", required=True, help="Config YAML file")
    opt_parser.add_argument("--test-cases", help="Test cases YAML (overrides config)")
    opt_parser.add_argument("--max-iterations", type=int, help="Max iterations")
    opt_parser.add_argument("--mock", action="store_true", help="Use mock executor")

    # Evaluate command
    eval_parser = subparsers.add_parser("evaluate", help="Evaluate a skill")
    eval_parser.add_argument("--skill-path", required=True, help="Path to skill")
    eval_parser.add_argument("--test-cases", required=True, help="Test cases YAML")

    return parser


def main():
    """Command-line entry point: dispatch to init, optimize or evaluate.

    Prints the help text when no command is given.
    """
    parser = _build_parser()
    args = parser.parse_args()

    if args.command == "init":
        init_optimization(args.skill_path, args.output_dir)

    elif args.command == "optimize":
        # Load config
        config = OptimizationConfig.from_yaml(args.config)

        # Compare against None so an explicit "--max-iterations 0"
        # (evaluate the seed only) is honoured rather than ignored.
        if args.max_iterations is not None:
            config.max_iterations = args.max_iterations
        if args.mock:
            config.executor_mode = "mock"

        # Load test cases; default to test_cases.yaml next to the config file
        test_cases_path = args.test_cases or os.path.join(
            os.path.dirname(args.config),
            "test_cases.yaml"
        )
        test_cases = load_test_cases(test_cases_path)

        print(f"Loaded {len(test_cases)} test cases")

        # Run optimization
        optimizer = SkillOptimizer(config, test_cases)
        result = optimizer.optimize()

        print(f"\nOptimization complete!")
        print(f"Best candidate: {result.best_candidate.id}")
        print(f"Improvement: {result.improvement}")

    elif args.command == "evaluate":
        # Simple evaluation: score the unmodified skill once
        config = OptimizationConfig(
            skill_path=args.skill_path,
            executor_mode="cli"
        )

        test_cases = load_test_cases(args.test_cases)

        optimizer = SkillOptimizer(config, test_cases)
        seed = optimizer._create_seed_candidate()
        scores = optimizer._evaluate_candidate(seed)

        print(f"\nEvaluation Results:")
        for metric, score in scores.items():
            print(f"  {metric}: {score:.2%}")

    else:
        parser.print_help()


# Run the command-line interface only when executed as a script, not on import.
if __name__ == "__main__":
    main()
