# python-mcp/tools/research.py

from __future__ import annotations

from typing import Any

from core.tools.base import BaseTool, ToolContext
from core.tools.registry import registry
from core.events import bus


class ResearchTool(BaseTool):
    """
    High-level research orchestrator.

    Chains three other tools through ``registry.run``:
    - ``search``: fetch candidate results for the query
    - ``intelligent_search``: rank those results against the query
    - ``crawler``: fetch each of the top-ranked URLs
    and assembles the outcome into a structured report.
    """

    name = "research"
    description = "Autonomous web research pipeline"

    # Tunables that used to be inline magic numbers.
    DEFAULT_MAX_SOURCES = 3   # sources crawled when the payload gives no cap
    SEARCH_LIMIT = 10         # "limit" forwarded to the search tool
    KEYWORD_MIN_LENGTH = 7    # shortest word kept as a keyword (i.e. len > 6)
    KEYWORDS_PER_SOURCE = 5   # keywords listed per source in the hint

    # Characters trimmed from both ends of a word before it is considered.
    _STRIP_CHARS = ".,;:!?\"'()[]{}<>"

    # =========================
    # EXECUTE
    # =========================

    def execute(self, payload: dict[str, Any], ctx: ToolContext) -> dict[str, Any]:
        """
        Run the search -> rank -> crawl pipeline for one query.

        Payload keys:
            query: required string, forwarded to the search and ranking tools.
            depth: optional, defaults to 1. It is only logged; nothing in
                this method acts on it.
            max_sources: optional non-negative int capping how many ranked
                results are crawled. Missing or ``None`` falls back to
                ``DEFAULT_MAX_SOURCES``.

        Returns:
            ``{"query", "error"}`` when the search step yields no results,
            otherwise ``{"query", "sources_used", "sources", "summary_hint"}``
            where each source carries ``url``, ``title``, ``text`` and
            ``score``.

        Raises:
            ValueError: if ``query`` is not a string, or ``max_sources`` is
                not a non-negative integer.
        """
        query = payload.get("query")
        depth = payload.get("depth", 1)

        if not isinstance(query, str):
            raise ValueError("query must be string")

        # Validate before any tool runs: a bad cap should fail fast rather
        # than after the search and rank calls have already been made.
        max_sources = self._resolve_max_sources(payload.get("max_sources"))

        bus.log(
            "RESEARCH",
            "research_execute",
            "INFO",
            {
                "query": query,
                "depth": depth,
                "max_sources": max_sources,
            },
        )

        # Step 1: search
        search_results = registry.run(
            "search",
            {"action": "search", "query": query, "limit": self.SEARCH_LIMIT},
            ctx,
        )

        results = search_results.get("results") or []

        if not results:
            return {
                "query": query,
                "error": "No search results found",
            }

        # Step 2: intelligent ranking
        ranked = registry.run(
            "intelligent_search",
            {
                "action": "rank",
                "query": query,
                "results": results,
            },
            ctx,
        )

        # `or []` also covers a "ranked" key that is present but None.
        ranked_list = (ranked.get("ranked") or [])[:max_sources]

        # Step 3: crawl top sources
        pages = []

        for item in ranked_list:
            url = item.get("url")

            # A ranked entry without a URL cannot be crawled.
            if not url:
                continue

            page = registry.run(
                "crawler",
                {
                    "action": "fetch",
                    "url": url,
                },
                ctx,
            )

            pages.append({
                "url": url,
                "title": item.get("title"),
                # Normalise a missing or None text to "" so downstream
                # consumers can always treat it as a string.
                "text": page.get("text") or "",
                "score": item.get("score", 0),
            })

        # Step 4: synthesize structure
        return {
            "query": query,
            "sources_used": len(pages),
            "sources": pages,
            "summary_hint": self._build_hint(pages),
        }

    @classmethod
    def _resolve_max_sources(cls, value: Any) -> int:
        """
        Turn the raw ``max_sources`` payload value into a usable slice bound.

        ``None`` (key absent or explicitly null) selects the class default.
        Anything that is not a non-negative integer is rejected, because a
        negative bound would slice from the end of the ranked list instead
        of limiting its length.
        """
        if value is None:
            return cls.DEFAULT_MAX_SOURCES

        if not isinstance(value, int) or value < 0:
            raise ValueError("max_sources must be a non-negative integer")

        return value

    # =========================
    # SIMPLE SYNTHESIS HELPER
    # =========================

    def _build_hint(self, pages: list[dict[str, Any]]) -> str:
        """
        Lightweight heuristic summary hint.

        This is NOT a full LLM summary — just structure guidance. Produces a
        header line followed by one ``- <url>: <keywords>`` line per page, or
        ``"No data available."`` when ``pages`` is empty.
        """

        if not pages:
            return "No data available."

        lines = [
            f"- {page.get('url')}: {', '.join(self._extract_keywords(page.get('text')))}"
            for page in pages
        ]

        return "Key extracted themes per source:\n" + "\n".join(lines)

    def _extract_keywords(self, text: Any) -> list[str]:
        """
        Pick the first few distinct long words from ``text``.

        Crude keyword extraction (lightweight, no deps): split on whitespace,
        trim surrounding punctuation, keep words of at least
        ``KEYWORD_MIN_LENGTH`` characters, drop case-insensitive repeats, and
        stop once ``KEYWORDS_PER_SOURCE`` words are collected. Non-string
        input yields an empty list.
        """
        if not isinstance(text, str):
            return []

        seen = set()
        keywords = []

        for raw in text.split():
            word = raw.strip(self._STRIP_CHARS)

            if len(word) < self.KEYWORD_MIN_LENGTH:
                continue

            # Compare case-insensitively but keep the first spelling seen.
            key = word.casefold()
            if key in seen:
                continue

            seen.add(key)
            keywords.append(word)

            if len(keywords) == self.KEYWORDS_PER_SOURCE:
                break

        return keywords


# =========================
# REGISTER
# =========================

# Import-time side effect: build one ResearchTool instance and pass it to the
# shared registry. Presumably this is what makes the tool reachable by its
# `name` ("research") through registry.run — confirm against
# core.tools.registry.
registry.register(ResearchTool())