Source code for oats.tool.webfetch

"""
WebFetch tool for fetching and processing web content.

Provides :class:`WebFetchTool` which fetches content from URLs and converts
HTML to simplified plain text/markdown. Content is truncated if it exceeds
the maximum length.
"""

from __future__ import annotations

import asyncio
import re
from typing import Any
from urllib.parse import urlparse
import httpx
from oats.tool.registry import Tool, ToolContext, ToolResult
from oats.log import cl

log = cl('tool.webfetch')


class WebFetchTool(Tool):
    """Fetch content from a URL and convert it to plain text/markdown.

    Validates the URL, fetches the content via HTTP, and converts HTML to
    simplified text. Content is truncated if it exceeds the maximum length
    (``MAX_CONTENT_LENGTH`` characters, roughly 100KB).

    Example::

        webfetch url="https://docs.python.org/3/library/asyncio.html"
    """

    # Maximum number of characters of (converted) content that is returned.
    MAX_CONTENT_LENGTH = 100000
    # Timeout passed to the httpx client, in seconds.
    TIMEOUT = 30
    # Content types whose body is run through the HTML -> text conversion.
    _HTML_CONTENT_TYPES = ("text/html", "application/xhtml+xml")

    @property
    def name(self) -> str:
        """Return the identifier of this tool."""
        return "webfetch"

    @property
    def description(self) -> str:
        """Return the human-readable description of this tool."""
        return """Fetch content from a URL and process it.

Use this to:
- Retrieve documentation from URLs
- Fetch API responses
- Get content from web pages

The content is converted to plain text/markdown for easier processing."""

    @property
    def parameters(self) -> dict[str, Any]:
        """Return the JSON-schema description of the accepted arguments."""
        return {
            "type": "object",
            "properties": {
                "url": {
                    "type": "string",
                    "description": "The URL to fetch content from",
                },
                "prompt": {
                    "type": "string",
                    "description": "Optional prompt to describe what information to extract",
                },
            },
            "required": ["url"],
        }

    async def execute(self, args: dict[str, Any], ctx: ToolContext) -> ToolResult:
        """Fetch content from a URL and convert it to plain text/markdown.

        Validates the URL, fetches the content via HTTP, and converts HTML to
        simplified text. Content is truncated if it exceeds the maximum
        length. A URL without a scheme gets ``https://`` prepended and an
        ``http://`` URL is upgraded to ``https://``.

        Args:
            args: Must contain ``url`` (str). May contain ``prompt`` (str)
                for describing what information to extract.
            ctx: The tool execution context (not used by this tool).

        Returns:
            A :class:`ToolResult` with the fetched content and metadata, or
            with ``error`` set when the URL is missing/invalid, the request
            times out, the server answers with an error status, or the
            fetch fails for any other reason.
        """
        url = args.get("url", "")
        prompt = args.get("prompt", "")

        # Surrounding whitespace is never part of a URL; removing it also
        # turns a whitespace-only value into the "no URL" case.
        if isinstance(url, str):
            url = url.strip()

        if not url:
            return ToolResult(
                title="WebFetch",
                output="",
                error="No URL provided",
            )

        # Validate and normalize the URL
        try:
            parsed = urlparse(url)
            if not parsed.scheme:
                url = f"https://{url}"
            elif parsed.scheme == "http":
                # urlparse lower-cases the scheme, so the prefix has to be
                # matched case-insensitively, and only at the very start so
                # that a URL embedded in the query string is left alone.
                url = re.sub(r"^http://", "https://", url, count=1, flags=re.IGNORECASE)
            # Re-parse the normalized URL: a schemeless input has no netloc
            # until the scheme has been prepended, and the netloc is what the
            # result title shows.
            parsed = urlparse(url)
        except Exception as e:
            return ToolResult(
                title="WebFetch",
                output="",
                error=f"Invalid URL: {e}",
            )

        try:
            async with httpx.AsyncClient(
                timeout=self.TIMEOUT,
                follow_redirects=True,
                headers={
                    "User-Agent": "Mozilla/5.0 (compatible; OpenCode/1.0)",
                    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
                },
            ) as client:
                response = await client.get(url)
                response.raise_for_status()

                content_type = response.headers.get("content-type", "")
                content = response.text

                # Convert HTML to simplified text. Media types are
                # case-insensitive, so compare against a lower-cased copy.
                media_type = content_type.lower()
                if any(kind in media_type for kind in self._HTML_CONTENT_TYPES):
                    content = self._html_to_text(content)

                # Truncate if too long
                if len(content) > self.MAX_CONTENT_LENGTH:
                    content = content[: self.MAX_CONTENT_LENGTH]
                    content += "\n\n[Content truncated]"

                output = f"URL: {url}\n\n{content}"
                if prompt:
                    output = f"Prompt: {prompt}\n\n{output}"

                return ToolResult(
                    title=f"WebFetch: {parsed.netloc}",
                    output=output,
                    metadata={
                        "url": url,
                        "content_type": content_type,
                        "content_length": len(content),
                        "status_code": response.status_code,
                    },
                )

        except httpx.TimeoutException:
            return ToolResult(
                title="WebFetch",
                output="",
                error=f"Request timed out after {self.TIMEOUT} seconds",
            )
        except httpx.HTTPStatusError as e:
            return ToolResult(
                title="WebFetch",
                output="",
                error=f"HTTP error {e.response.status_code}: {e.response.reason_phrase}",
            )
        except Exception as e:
            # Boundary of the tool: report any other failure as a tool error
            # instead of letting it propagate to the caller.
            return ToolResult(
                title="WebFetch",
                output="",
                error=f"Failed to fetch URL: {e}",
            )

    def _html_to_text(self, html: str) -> str:
        """Convert HTML to plain text with basic markdown formatting.

        Strips script/style elements and comments, converts headers, links,
        bold, italic, code, pre blocks, list items, paragraphs/divs and line
        breaks to markdown equivalents, removes the remaining HTML tags and
        decodes HTML entities.

        This is a regex-based best-effort conversion, not an HTML parser.

        Args:
            html: The raw HTML content.

        Returns:
            The text content with basic markdown formatting, stripped of
            leading and trailing whitespace.
        """
        # The ``html`` parameter shadows the stdlib module of the same name,
        # so the one function needed from it is imported under an alias.
        from html import unescape as _unescape

        flags = re.DOTALL | re.IGNORECASE

        # Every tag pattern below ends the tag name with ``\b`` so that e.g.
        # ``<b`` does not match ``<body>``, ``<i`` does not match ``<img>``
        # and ``<li`` does not match ``<link>``.

        # Remove script and style elements
        html = re.sub(r"<script\b[^>]*>.*?</script\s*>", "", html, flags=flags)
        html = re.sub(r"<style\b[^>]*>.*?</style\s*>", "", html, flags=flags)

        # Remove comments; the generic tag stripping further down stops at
        # the first ">" and would leave the rest of such a comment behind.
        html = re.sub(r"<!--.*?-->", "", html, flags=re.DOTALL)

        # Convert headers to markdown
        for level in range(1, 7):
            html = re.sub(
                rf"<h{level}\b[^>]*>(.*?)</h{level}\s*>",
                rf"\n{'#' * level} \1\n",
                html,
                flags=flags,
            )

        # Convert links to markdown
        html = re.sub(
            r'<a\s[^>]*href=["\']([^"\']*)["\'][^>]*>(.*?)</a>',
            r"[\2](\1)",
            html,
            flags=flags,
        )

        # Convert bold/strong
        html = re.sub(r"<(b|strong)\b[^>]*>(.*?)</\1>", r"**\2**", html, flags=flags)

        # Convert italic/em
        html = re.sub(r"<(i|em)\b[^>]*>(.*?)</\1>", r"*\2*", html, flags=flags)

        # Convert code
        html = re.sub(r"<code\b[^>]*>(.*?)</code>", r"`\1`", html, flags=flags)

        # Convert pre blocks
        html = re.sub(r"<pre\b[^>]*>(.*?)</pre>", r"\n```\n\1\n```\n", html, flags=flags)

        # Convert lists
        html = re.sub(r"<li\b[^>]*>(.*?)</li>", r"\n- \1", html, flags=flags)

        # Convert paragraphs and divs to newlines
        html = re.sub(r"<(p|div)\b[^>]*>", "\n", html, flags=re.IGNORECASE)
        html = re.sub(r"</(p|div)>", "\n", html, flags=re.IGNORECASE)

        # Convert br to newline
        html = re.sub(r"<br\b[^>]*>", "\n", html, flags=re.IGNORECASE)

        # Remove remaining HTML tags
        html = re.sub(r"<[^>]+>", "", html)

        # Decode entities in a single pass, after the tags are gone so that
        # an escaped "&lt;b&gt;" survives as literal text. A single pass also
        # keeps "&amp;lt;" as "&lt;" instead of decoding it twice.
        # Non-breaking spaces become ordinary spaces.
        html = _unescape(html).replace("\xa0", " ")

        # Clean up whitespace
        html = re.sub(r"\n\s*\n\s*\n", "\n\n", html)
        html = re.sub(r"[ \t]+", " ", html)

        return html.strip()