Skip to content

Instantly share code, notes, and snippets.

@katagaki
Created January 15, 2025 04:46
Show Gist options
  • Save katagaki/360b276e840432a29590dcd50be506cc to your computer and use it in GitHub Desktop.
Web browser tool for LangChain
import re
from asyncio import run as run_async
from typing import Optional, Type
from bs4 import BeautifulSoup
from langchain_core.callbacks.manager import AsyncCallbackManagerForToolRun
from langchain_core.tools import BaseTool
from playwright.async_api import BrowserContext, Page, async_playwright
from pydantic import BaseModel, Field
# Desktop-Chrome user-agent string presented to sites when fetching pages.
user_agent: str = (
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
    "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/133.0.0.0 Safari/537.36"
)

# Structural / non-text tags stripped from the parsed page before the
# visible text is extracted.
element_tags_to_remove: list[str] = [
    "head",
    "header",
    "footer",
    "nav",
    "sidebar",
    "menu",
    "img",
    "object",
    "svg",
    "iframe",
]

# Update uBlock Origin Lite: https://github.com/uBlockOrigin/uBOL-home/releases
# Place extracted uBlock Origin Lite in ./chromium/ubolite
# Argument schema for WebBrowserTool; the Field description is what
# LangChain exposes for this argument when the tool is bound to a model.
class WebBrowserInput(BaseModel):
    url: str = Field(description="URL of the website")
class WebBrowserTool(BaseTool):
    """LangChain tool that loads a URL in headless Chromium (with uBlock
    Origin Lite) and returns the page's visible text with boilerplate
    elements stripped and whitespace collapsed."""

    name: str = "WebBrowser"
    description: str = "Gets the contents of a website"
    args_schema: Type[BaseModel] = WebBrowserInput
    return_direct: bool = False

    def _run(
        self,
        url: str,
        run_manager: Optional[AsyncCallbackManagerForToolRun] = None
    ) -> str:
        """Synchronous entry point: drives the async implementation.

        NOTE(review): asyncio.run() raises RuntimeError when invoked from a
        thread that already has a running event loop (e.g. inside an async
        LangChain pipeline) — confirm callers are synchronous.
        """
        return run_async(self._arun(url, run_manager))

    async def _arun(
        self,
        url: str,
        run_manager: Optional[AsyncCallbackManagerForToolRun] = None
    ) -> str:
        """Fetch `url`, strip non-content elements, and return the page text.

        Args:
            url: Address of the page to load.
            run_manager: Optional LangChain callback manager (unused here).

        Returns:
            The page's visible text with blank lines and repeated spaces
            collapsed.

        Raises:
            RuntimeError: if no HTML could be fetched for the page.
        """
        page_html: str | None = None
        async with async_playwright() as p:
            browser_context: BrowserContext = await p.chromium.launch_persistent_context(
                # NOTE(review): profile lives under "./.chromium" while the
                # extension paths use "./chromium" (no leading dot) — confirm
                # this mismatch is intentional.
                user_data_dir="./.chromium/profile",
                args=[
                    "--password-store=basic",
                    "--disable-extensions-except=./chromium/ubolite",
                    "--load-extension=./chromium/ubolite"
                ],
                headless=True,
                user_agent=user_agent
            )
            try:
                # A persistent context normally opens with one blank page;
                # create one if it did not (avoids IndexError on pages[0]).
                page: Page = (
                    browser_context.pages[0]
                    if browser_context.pages
                    else await browser_context.new_page()
                )
                await page.goto(url)
                await page.wait_for_load_state("load")
                page_html = await page.content()
            finally:
                # Always release the browser, even when navigation fails.
                await browser_context.close()

        if not page_html:
            raise RuntimeError("Failed to fetch page HTML.")

        soup: BeautifulSoup = BeautifulSoup(page_html, "html.parser")
        for element_tag in element_tags_to_remove:
            # find_all (not find) so EVERY matching element is removed; the
            # original dropped only the first <img>, <iframe>, etc. per tag.
            for element in soup.find_all(element_tag):
                element.decompose()

        page_text = soup.get_text().strip()
        # Collapse runs of blank lines and repeated spaces.
        page_text = re.sub(r"\n+", "\n", page_text)
        page_text = re.sub(r" {2,}", " ", page_text)
        return page_text
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment