"""
SYNTHOS Web Search + Reasoning Engine
======================================

Search-augmented generation flow:
1. REASON  — decompose the query into search-worthy sub-questions
2. SEARCH  — fetch results from DuckDuckGo (no API key needed)
3. EXTRACT — pull relevant facts from search results
4. SYNTHESIZE — compose a grounded answer from the facts
"""

from __future__ import annotations

import re
import json
import time
import urllib.request
import urllib.parse
from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional, Tuple
from html.parser import HTMLParser


# ═══════════════════════════════════════════════════════════════════════════════
# Data structures
# ═══════════════════════════════════════════════════════════════════════════════

@dataclass
class SearchResult:
    """A single search result.

    Instances are also used as an in-band error marker: when a search fails,
    ``_search_ddg`` reports it as a result whose ``title`` is the literal
    ``"Search Error"``, whose ``url`` is empty and whose ``snippet`` carries
    the error text. Consumers in this module filter on that title.
    """
    title: str    # plain-text title of the hit (HTML already stripped)
    url: str      # target URL of the hit; empty string for error markers
    snippet: str  # plain-text excerpt shown for the hit, or the error text


@dataclass
class SearchPlan:
    """Pre-generation reasoning plan for a search query.

    Built by ``_plan_search`` before any network request is made and later
    consumed by ``_synthesize`` to decide how the answer is laid out.
    """
    original_query: str       # the user's query with surrounding whitespace removed
    intent: str               # intent label, e.g. "comparison", "how_to", "person", "general"
    sub_questions: List[str]  # search queries to execute, in order
    search_strategy: str      # human-readable sentence describing how results are combined


@dataclass
class SearchAnswer:
    """Final synthesized answer from search results.

    This is the value returned by ``search_and_reason``.
    """
    query: str                   # the query exactly as passed in by the caller
    answer: str                  # formatted answer text produced by ``_synthesize``
    sources: List[SearchResult]  # every result collected across all sub-queries
    plan: SearchPlan             # the plan that drove the searches
    elapsed_ms: float = 0.0      # duration of the whole pipeline in milliseconds


# ═══════════════════════════════════════════════════════════════════════════════
# HTML text extractor
# ═══════════════════════════════════════════════════════════════════════════════

class _TextExtractor(HTMLParser):
    """Strip HTML tags and extract plain text."""
    def __init__(self):
        super().__init__()
        self._parts: List[str] = []
        self._skip = False

    def handle_starttag(self, tag, attrs):
        if tag in ("script", "style", "noscript"):
            self._skip = True

    def handle_endtag(self, tag):
        if tag in ("script", "style", "noscript"):
            self._skip = False

    def handle_data(self, data):
        if not self._skip:
            self._parts.append(data)

    def get_text(self) -> str:
        return " ".join(self._parts)


def _strip_html(html: str) -> str:
    """Remove HTML tags from a string and return the stripped plain text.

    Args:
        html: An HTML fragment. Empty or ``None`` input yields ``""``.

    Returns:
        The text content with markup removed and surrounding whitespace
        stripped. If the parser raises, a regex-based tag removal is used
        as a best-effort fallback.
    """
    if not html:
        return ""

    extractor = _TextExtractor()
    try:
        extractor.feed(html)
        # HTMLParser may hold back trailing text that contains an
        # unterminated "&" (e.g. "AT&T") because it could be the start of a
        # character reference split across feed() calls. close() tells the
        # parser the input is complete so that buffered text is emitted.
        extractor.close()
        return extractor.get_text().strip()
    except Exception:
        # Best-effort fallback: drop anything that looks like a tag.
        return re.sub(r"<[^>]+>", "", html).strip()


# ═══════════════════════════════════════════════════════════════════════════════
# DuckDuckGo Search (no API key needed)
# ═══════════════════════════════════════════════════════════════════════════════

# Endpoint that `_search_ddg` POSTs the query form to; it returns an HTML
# results page which is then scraped with regular expressions.
_DDG_URL = "https://html.duckduckgo.com/html/"
# Request headers sent with every search. They mimic a desktop browser —
# presumably so the endpoint serves the regular results page; verify before
# changing the User-Agent.
_HEADERS = {
    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
    "Accept": "text/html,application/xhtml+xml",
    "Accept-Language": "en-US,en;q=0.9",
}


# Patterns for scraping the DuckDuckGo HTML results page, compiled once.
# Result titles are anchors with class="result__a"; snippets are elements
# with class="result__snippet".
_DDG_TITLE_RE = re.compile(
    r'class="result__a"[^>]*href="([^"]*)"[^>]*>(.*?)</a>', re.S
)
_DDG_SNIPPET_RE = re.compile(
    r'class="result__snippet"[^>]*>(.*?)</(?:a|td|div|span)', re.S
)
# DuckDuckGo wraps target URLs in a redirect link; the real URL is carried
# percent-encoded in the "uddg" query parameter.
_DDG_REDIRECT_RE = re.compile(r"uddg=([^&]+)")


def _parse_ddg_results(html: str, max_results: int) -> List[SearchResult]:
    """Extract up to *max_results* usable results from a results page."""
    parsed: List[SearchResult] = []
    title_matches = list(_DDG_TITLE_RE.finditer(html))

    for idx, title_match in enumerate(title_matches):
        if len(parsed) >= max_results:
            break

        # Look for this result's snippet only between its own title and the
        # next title. Pairing titles and snippets by list index would shift
        # every following pair as soon as one result has no snippet.
        if idx + 1 < len(title_matches):
            segment_end = title_matches[idx + 1].start()
        else:
            segment_end = len(html)
        snippet_match = _DDG_SNIPPET_RE.search(html, title_match.end(), segment_end)
        if snippet_match is None:
            continue

        url = title_match.group(1)
        redirect = _DDG_REDIRECT_RE.search(url)
        if redirect:
            url = urllib.parse.unquote(redirect.group(1))
        elif url.startswith("//"):
            # Protocol-relative link without a redirect target.
            url = "https:" + url

        title_text = _strip_html(title_match.group(2))
        snippet_text = _strip_html(snippet_match.group(1))
        if title_text and snippet_text:
            parsed.append(SearchResult(
                title=title_text[:200],
                url=url[:500],
                snippet=snippet_text[:500],
            ))

    return parsed


def _search_ddg(query: str, max_results: int = 5) -> List[SearchResult]:
    """
    Search DuckDuckGo and return parsed results.
    Uses the HTML endpoint (no API key required).

    Args:
        query: The search string to submit.
        max_results: Upper bound on the number of results returned. Entries
            with an empty title or snippet are skipped and do not count
            against this limit. Values <= 0 yield an empty list.

    Returns:
        A list of results. This function never raises: on any failure
        (network, decoding, parsing) it returns a single marker result
        titled ``"Search Error"`` with an empty URL and the error text in
        the snippet.
    """
    try:
        data = urllib.parse.urlencode({"q": query, "kl": "us-en"}).encode("utf-8")
        req = urllib.request.Request(_DDG_URL, data=data, headers=_HEADERS, method="POST")
        with urllib.request.urlopen(req, timeout=10) as resp:
            html = resp.read().decode("utf-8", errors="replace")
        return _parse_ddg_results(html, max_results)
    except Exception as e:
        # Deliberately broad: callers rely on the in-band error marker
        # instead of exception handling.
        return [SearchResult(
            title="Search Error",
            url="",
            snippet=f"Could not reach DuckDuckGo: {str(e)[:200]}",
        )]


# ═══════════════════════════════════════════════════════════════════════════════
# Reasoning Pre-Generation (Plan before searching)
# ═══════════════════════════════════════════════════════════════════════════════

def _plan_search(query: str) -> SearchPlan:
    """
    Decompose a query into a search plan BEFORE executing any searches.
    This is the 'reasoning' step — figure out what to search for.

    Args:
        query: The raw user query.

    Returns:
        A ``SearchPlan`` holding the stripped query, the detected intent
        label, up to three distinct non-empty sub-questions, and a sentence
        describing the combination strategy.
    """
    q = query.strip()

    # Ordered (intent, pattern) table; the first matching pattern wins.
    # Keywords are anchored on word boundaries so that e.g. "language" does
    # not match "age" (person) and "know" does not match "now"
    # (current_events).
    intent_patterns = (
        ("current_events",
         r"\b(?:latest|recent(?:ly)?|current(?:ly)?|today|now|2024|2025|2026|news)\b"),
        ("how_to", r"\b(?:how\s+to|tutorials?|guides?|steps|instructions)\b"),
        ("comparison",
         r"\b(?:compare[ds]?|comparing|comparison|vs|versus|differences?|better)\b"),
        ("person", r"\b(?:who\s+is|biography|born|died|aged?)\b"),
        ("definition", r"\b(?:what\s+is|define[ds]?|meanings?|definitions?)\b"),
        ("explanation", r"\b(?:why|reasons?|causes?|because)\b"),
        ("recommendation", r"\b(?:best|top|recommend\w*|reviews?)\b"),
        ("factual_number", r"\b(?:price[sd]?|costs?|how\s+much|salary|worth)\b"),
    )
    intent = "general"
    for name, pattern in intent_patterns:
        if re.search(pattern, q, re.I):
            intent = name
            break

    # Every plan starts with the full query; intent-specific variants follow.
    sub_questions: List[str] = [q]

    if intent == "comparison":
        # Search the full comparison plus each side on its own.
        parts = re.split(r"\s+(?:vs\.?|versus|or|compared\s+to)\s+", q, flags=re.I)
        if len(parts) >= 2:
            sub_questions.append(parts[0].strip())
            sub_questions.append(parts[1].strip())
        strategy = "Compare facts from individual searches, then synthesize differences."

    elif intent == "how_to":
        # Also search for common issues.
        sub_questions.append(f"{q} common mistakes")
        strategy = "Combine step-by-step instructions with common pitfalls."

    elif intent == "person":
        # Extract the person's name and add a biography search.
        name_match = re.search(r"(?:who\s+is|about)\s+(.+?)[\?\.]?\s*$", q, re.I)
        if name_match:
            sub_questions.append(f"{name_match.group(1)} biography")
        strategy = "Gather biographical facts and compose a summary."

    elif intent == "recommendation":
        # Bias towards up-to-date recommendations by appending the current
        # year rather than a fixed one that goes stale.
        sub_questions.append(f"{q} {time.localtime().tm_year}")
        strategy = "Aggregate recommendations and present the top options."

    elif intent == "explanation":
        # Simplify the question for a broader search.
        simplified = re.sub(r"(?i)^why\s+(is|are|do|does|did|was|were)\s+", "", q)
        if simplified != q:
            sub_questions.append(f"{simplified} reason explanation")
        strategy = "Find causal explanations and combine into a clear answer."

    else:
        # Add a keyword-only variant of the query.
        filler = {"what", "when", "where", "which", "that", "this", "from", "about", "with"}
        keywords = [w for w in q.split() if len(w) > 3 and w.lower() not in filler]
        if len(keywords) >= 2:
            sub_questions.append(" ".join(keywords[:4]))
        strategy = "Gather relevant facts and compose a direct answer."

    # Drop empty and repeated sub-questions (case-insensitively) so the same
    # search is never issued twice, then limit to 3 sub-questions.
    seen = set()
    distinct: List[str] = []
    for sub_question in sub_questions:
        key = sub_question.casefold()
        if sub_question and key not in seen:
            seen.add(key)
            distinct.append(sub_question)

    return SearchPlan(
        original_query=q,
        intent=intent,
        sub_questions=distinct[:3],
        search_strategy=strategy,
    )


# ═══════════════════════════════════════════════════════════════════════════════
# Synthesize Answer from Search Results
# ═══════════════════════════════════════════════════════════════════════════════

def _synthesize(plan: SearchPlan, all_results: List[SearchResult]) -> str:
    """
    Compose a grounded answer from search results.
    This is the 'generation' step — synthesize facts into prose.
    """
    if not all_results or (len(all_results) == 1 and all_results[0].title == "Search Error"):
        error_msg = all_results[0].snippet if all_results else "No results found."
        return (
            f"I tried to search for \"{plan.original_query}\" but couldn't reach the web.\n\n"
            f"Error: {error_msg}\n\n"
            f"This might be due to network restrictions. You can try:\n"
            f"  • Checking your internet connection\n"
            f"  • Searching directly at https://duckduckgo.com/?q={urllib.parse.quote(plan.original_query)}"
        )

    lines = []

    # Header with search strategy
    lines.append(f"**Search: \"{plan.original_query}\"**")
    lines.append(f"Strategy: {plan.search_strategy}")
    lines.append("")

    # Deduplicate snippets
    seen_snippets = set()
    unique_results = []
    for r in all_results:
        key = r.snippet[:80].lower()
        if key not in seen_snippets and r.title != "Search Error":
            seen_snippets.add(key)
            unique_results.append(r)

    # Synthesize based on intent
    if plan.intent == "comparison" and len(plan.sub_questions) >= 3:
        lines.append(f"Comparing **{plan.sub_questions[1]}** vs **{plan.sub_questions[2]}**:")
        lines.append("")
        for r in unique_results[:5]:
            lines.append(f"  • {r.snippet[:200]}")
        lines.append("")

    elif plan.intent == "how_to":
        lines.append("Here's what I found:")
        lines.append("")
        for i, r in enumerate(unique_results[:5], 1):
            lines.append(f"  {i}. **{r.title[:60]}**")
            lines.append(f"     {r.snippet[:200]}")
            lines.append("")

    elif plan.intent == "person":
        # Combine snippets about the person
        for r in unique_results[:3]:
            lines.append(f"{r.snippet[:300]}")
            lines.append("")

    else:
        # General: lead with best snippet, then supporting
        if unique_results:
            lines.append(unique_results[0].snippet[:400])
            lines.append("")
            if len(unique_results) > 1:
                lines.append("Additional info:")
                for r in unique_results[1:4]:
                    lines.append(f"  • {r.snippet[:200]}")
                lines.append("")

    # Sources
    source_results = [r for r in unique_results if r.url and r.title != "Search Error"]
    if source_results:
        lines.append("Sources:")
        for r in source_results[:5]:
            domain = re.search(r"https?://([^/]+)", r.url)
            domain_str = domain.group(1) if domain else r.url[:40]
            lines.append(f"  [{domain_str}] {r.title[:60]}")

    return "\n".join(lines)


# ═══════════════════════════════════════════════════════════════════════════════
# Public API: Search + Reason
# ═══════════════════════════════════════════════════════════════════════════════

def search_and_reason(query: str, progress_callback=None) -> SearchAnswer:
    """
    Run the complete search-augmented pipeline for one query.

    Stages, in order:
    1. PLAN       — build a ``SearchPlan`` for the query
    2. SEARCH     — run every sub-question against DuckDuckGo (3 hits each)
    3. SYNTHESIZE — turn the collected results into answer text
    4. LEARN      — pass qualifying snippets to the agent's ``learn``

    Args:
        query: The user's question.
        progress_callback: Optional callable that receives one status string
            per pipeline event; ignored when falsy.

    Returns:
        A ``SearchAnswer`` with the answer text, all collected results, the
        plan, and the elapsed time in milliseconds.
    """
    # Imported at call time rather than at module import time.
    from synthos.lm.agent import learn, knowledge_count

    started = time.perf_counter()

    def _emit(msg):
        if progress_callback:
            progress_callback(msg)

    # Stage 1: plan
    _emit(f"Planning search for: {query[:50]}...")
    plan = _plan_search(query)
    total = len(plan.sub_questions)
    _emit(f"Intent: {plan.intent} | {total} sub-queries")

    # Stage 2: search each sub-question in turn
    all_results: List[SearchResult] = []
    for position, sub_query in enumerate(plan.sub_questions, start=1):
        _emit(f"Searching [{position}/{total}]: {sub_query[:50]}...")
        all_results.extend(_search_ddg(sub_query, max_results=3))

    _emit(f"Got {len(all_results)} results, synthesizing answer...")

    # Stage 3: synthesize
    answer_text = _synthesize(plan, all_results)

    # Stage 4: learn from every real hit with a snippet longer than 30 chars
    before = knowledge_count()
    for result in all_results:
        if result.title == "Search Error" or not result.snippet or len(result.snippet) <= 30:
            continue
        # Topic key: lower-cased title without punctuation, first five words.
        title_words = re.sub(r"[^\w\s]", "", result.title.lower()).strip().split()
        topic_key = " ".join(title_words[:5])
        if len(topic_key) > 3:
            learn(topic_key, result.snippet)
    after = knowledge_count()

    new_facts = after - before
    if new_facts > 0:
        _emit(f"Learned {new_facts} new facts (knowledge store: {after} total)")

    elapsed = (time.perf_counter() - started) * 1000
    _emit(f"Done in {elapsed:.0f}ms")

    return SearchAnswer(
        query=query,
        answer=answer_text,
        sources=all_results,
        plan=plan,
        elapsed_ms=elapsed,
    )


def needs_web_search(prompt: str) -> bool:
    """Detect if a prompt would benefit from web search.

    Args:
        prompt: The user's prompt.

    Returns:
        ``True`` when the prompt contains a web-search trigger (explicit
        search request, recency words, who/what/how-much fact questions,
        weather, scores, ...). ``False`` when it is empty, when it looks
        like a local system command, or when no trigger is present.
    """
    text = prompt.strip()
    if not text:
        return False

    # Exclude system/command prompts that contain trigger words
    # coincidentally: the prompt must START with a command verb and mention
    # a system target. Both checks are anchored on word boundaries so that
    # e.g. "report" is not read as "port" and "listen" is not read as "list".
    starts_with_command = re.match(
        r"(?:show|check|list|display|view|run|execute|git|ping|find\s+files?)\b",
        text,
        re.I,
    )
    if starts_with_command and re.search(
        r"\b(?:director(?:y|ies)|dirs?|folders?|process(?:es)?|disks?|ports?"
        r"|env(?:ironment)?|files?|filesystem|system|git)\b",
        text,
        re.I,
    ):
        return False

    # Triggers are anchored at the start of a word so that "concurrent" does
    # not match "current" and "underscore" does not match "score"; suffixes
    # ("currently", "scores") are still accepted.
    return bool(re.search(
        r"\b(?:search\s+(?:for|about|the\s+web)|google|look\s+up|find\s+(?:out|info|information)"
        r"|latest|current|recent|news|today|2024|2025|2026"
        r"|who\s+(?:is|was|are|were)\s+\w+.*(?:born|president|CEO|founder|king|queen|minister)"
        r"|what\s+(?:is|are)\s+the\s+(?:latest|current|newest|best|top|price|cost)"
        r"|how\s+(?:much|many)\s+(?:does|do|is|are)\s+\w+.*(?:cost|earn|worth|weigh)"
        r"|weather|stock\s+price|score|results?\s+of"
        r"|what\s+happened|breaking\s+news)",
        text,
        re.I,
    ))


# ═══════════════════════════════════════════════════════════════════════════════
# Deep Research: 10+ query search for unknown topics
# ═══════════════════════════════════════════════════════════════════════════════

# Lower-case words that carry no topical meaning (articles, auxiliaries,
# prepositions, question words, request verbs). `_generate_research_queries`
# removes them from a question to obtain its core subject.
_RESEARCH_STOP = frozenset({
    "the", "a", "an", "is", "are", "was", "were", "be", "been", "being",
    "have", "has", "had", "do", "does", "did", "will", "would", "shall",
    "should", "may", "might", "must", "can", "could", "about", "above",
    "after", "again", "all", "also", "and", "any", "because", "but",
    "by", "for", "from", "how", "if", "in", "into", "it", "its",
    "just", "like", "more", "most", "not", "of", "on", "or", "other",
    "out", "over", "own", "same", "so", "some", "such", "than", "that",
    "then", "there", "these", "this", "those", "through", "to", "too",
    "under", "until", "up", "very", "what", "when", "where", "which",
    "while", "who", "whom", "why", "with", "you", "your", "me", "my",
    "tell", "explain", "describe", "show", "give", "please", "know",
    "define", "meaning",
})


def _generate_research_queries(question: str) -> List[str]:
    """
    Generate diverse search queries for deep research on an unknown topic.
    Each query targets a different facet of the topic based on the question type.

    Args:
        question: The user's question.

    Returns:
        Twelve queries for a non-empty question, or an empty list when the
        question is empty or whitespace only.
    """
    q = question.strip()
    if not q:
        return []
    q_lower = q.lower()

    # Extract core subject: words longer than two characters that are not
    # stop words; fall back to the start of the question if nothing is left.
    words = [w for w in re.findall(r"\w+", q_lower) if len(w) > 2 and w not in _RESEARCH_STOP]
    subject = " ".join(words) if words else q[:50]

    # Recency-oriented queries use the current year instead of a fixed one.
    year = time.localtime().tm_year

    def _starts_with(*leading_words: str) -> bool:
        # Match a whole leading word only, so "wholesale" is not a "who"
        # question and "however" is not a "how" question. Contractions such
        # as "what's" still match because "'" ends the word.
        return re.match(r"(?:%s)\b" % "|".join(leading_words), q, re.I) is not None

    # Detect question type and generate targeted queries
    if _starts_with("what", "define", "meaning"):
        queries = [
            f"what is {subject} definition",
            f"{subject} explained simply",
            f"{subject} meaning and overview",
            f"{subject} key characteristics and properties",
            f"{subject} types and categories",
            f"{subject} history and origin",
            f"{subject} how does it work",
            f"{subject} real world examples",
            f"{subject} applications and uses",
            f"{subject} advantages and disadvantages",
            f"{subject} vs alternatives comparison",
            f"{subject} latest developments {year}",
        ]

    elif _starts_with("who"):
        queries = [
            f"who is {subject}",
            f"{subject} biography",
            f"{subject} early life and background",
            f"{subject} career and achievements",
            f"{subject} notable accomplishments",
            f"{subject} awards and recognition",
            f"{subject} influence and impact",
            f"{subject} quotes and philosophy",
            f"{subject} controversies",
            f"{subject} legacy and contributions",
            f"{subject} personal life",
            f"{subject} latest news {year}",
        ]

    elif _starts_with("how"):
        queries = [
            f"{q}",
            f"{subject} step by step guide",
            f"{subject} process explained",
            f"{subject} how it works mechanism",
            f"{subject} tutorial for beginners",
            f"{subject} best practices",
            f"{subject} common mistakes to avoid",
            f"{subject} tools and requirements needed",
            f"{subject} tips and tricks",
            f"{subject} troubleshooting problems",
            f"{subject} advanced techniques",
            f"{subject} examples and demonstrations",
        ]

    elif _starts_with("when"):
        queries = [
            f"{q}",
            f"{subject} timeline history",
            f"{subject} origin date first",
            f"{subject} key dates and milestones",
            f"{subject} chronological history",
            f"{subject} major events timeline",
            f"{subject} when did it start",
            f"{subject} historical development",
            f"{subject} evolution over time",
            f"{subject} future predictions timeline",
            f"{subject} important years and dates",
            f"{subject} recent developments {year}",
        ]

    elif _starts_with("why"):
        queries = [
            f"{q}",
            f"why does {subject} happen reason",
            f"{subject} causes and explanations",
            f"{subject} root cause analysis",
            f"{subject} scientific explanation",
            f"{subject} purpose and motivation",
            f"{subject} contributing factors",
            f"{subject} evidence and research",
            f"{subject} common theories",
            f"{subject} consequences and effects",
            f"{subject} misconceptions debunked",
            f"{subject} expert opinions",
        ]

    elif _starts_with("where"):
        queries = [
            f"{q}",
            f"where is {subject} located",
            f"{subject} location and geography",
            f"{subject} map and directions",
            f"{subject} region and area",
            f"{subject} nearby attractions",
            f"{subject} population and demographics",
            f"{subject} climate and environment",
            f"{subject} history of the place",
            f"{subject} things to know about",
            f"{subject} travel guide",
            f"{subject} interesting facts",
        ]

    else:
        queries = [
            f"{subject} overview",
            f"what is {subject}",
            f"{subject} definition and meaning",
            f"{subject} how does it work",
            f"{subject} history and background",
            f"{subject} key facts",
            f"{subject} types and categories",
            f"{subject} examples",
            f"{subject} importance and significance",
            f"{subject} latest information {year}",
            f"{subject} pros and cons",
            f"{subject} future outlook",
        ]

    # Every template list above has 12 entries; the slice is only a cap.
    return queries[:12]


def deep_research(question: str, progress_callback=None,
                  max_queries: int = 12) -> Tuple[List[SearchResult], List[str]]:
    """
    Perform deep research on an unknown topic using N diverse search queries.

    Args:
        question: The user's question.
        progress_callback: Optional callable that receives status strings;
            ignored when falsy.
        max_queries: Upper bound on the number of queries to run. Values
            <= 0 run no query at all.

    Returns:
        ``(all_results, queries_used)`` where ``all_results`` holds the
        de-duplicated real hits and ``queries_used`` lists the queries that
        were actually sent, in order. If the run is aborted early because
        the network looks unreachable, queries that were never sent are not
        included.
    """
    # Imported at call time rather than at module import time.
    from synthos.lm.agent import learn, knowledge_count

    def _emit(msg):
        if progress_callback:
            progress_callback(msg)

    # Clamp the limit: a negative slice bound would cut from the end of the
    # list instead of producing an empty plan.
    queries = _generate_research_queries(question)[:max(0, max_queries)]
    _emit(f"Deep research: {len(queries)} queries for unknown topic")

    all_results: List[SearchResult] = []
    seen_snippets: set = set()
    queries_used: List[str] = []

    consecutive_failures = 0
    for i, sq in enumerate(queries):
        _emit(f"  [{i+1}/{len(queries)}] {sq[:55]}...")
        queries_used.append(sq)
        try:
            results = _search_ddg(sq, max_results=3)
        except Exception:
            results = []

        hits = [r for r in results if r.title != "Search Error"]
        if results and not hits:
            # Only failure markers came back: count it as a failed query and
            # give up after three in a row.
            consecutive_failures += 1
            if consecutive_failures >= 3:
                _emit("  [!] Network unreachable — skipping remaining queries")
                break
            continue
        consecutive_failures = 0

        # Keep the first hit for each distinct snippet prefix.
        for r in hits:
            key = r.snippet[:80].lower()
            if key not in seen_snippets:
                seen_snippets.add(key)
                all_results.append(r)

    _emit(f"Got {len(all_results)} unique results from {len(queries_used)} queries")

    # Learn everything we found
    before = knowledge_count()
    for r in all_results:
        if r.snippet and len(r.snippet) > 30:
            # Topic key: lower-cased title without punctuation, first 5 words.
            topic_key = re.sub(r"[^\w\s]", "", r.title.lower()).strip()
            topic_key = " ".join(topic_key.split()[:5])
            if topic_key and len(topic_key) > 3:
                learn(topic_key, r.snippet)
    after = knowledge_count()
    new_facts = after - before
    if new_facts > 0:
        _emit(f"Learned {new_facts} new facts (knowledge store: {after} total)")

    return all_results, queries_used
