Created
January 15, 2025 04:46
-
-
Save katagaki/360b276e840432a29590dcd50be506cc to your computer and use it in GitHub Desktop.
Web browser tool for LangChain
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import logging
import re
from asyncio import run as run_async
from typing import Optional, Type

from bs4 import BeautifulSoup
from langchain_core.callbacks.manager import (
    AsyncCallbackManagerForToolRun,
    CallbackManagerForToolRun,
)
from langchain_core.tools import BaseTool
from playwright.async_api import BrowserContext, Page, async_playwright
from pydantic import BaseModel, Field
# User agent string handed to the Chromium browser context (desktop Chrome on macOS).
user_agent: str = (
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/133.0.0.0 Safari/537.36"
)

# Tag names removed from the fetched HTML before its text is extracted.
# NOTE(review): "sidebar" is not a standard HTML element; "aside" may have
# been intended — confirm before changing.
element_tags_to_remove: list[str] = [
    "head",
    "header",
    "footer",
    "nav",
    "sidebar",
    "menu",
    "img",
    "object",
    "svg",
    "iframe",
]
# To update uBlock Origin Lite, download the latest release from
# https://github.com/uBlockOrigin/uBOL-home/releases
# and place the extracted extension in ./chromium/ubolite
class WebBrowserInput(BaseModel):
    # Argument schema used by WebBrowserTool: one required string field.
    # Documented with comments rather than a docstring so the schema that
    # pydantic generates for this model stays exactly as before.
    url: str = Field(
        description="URL of the website",
    )
class WebBrowserTool(BaseTool):
    """LangChain tool that loads a URL in headless Chromium and returns its text.

    The page is opened with Playwright in a persistent Chromium context that is
    started with the extension found in ``./chromium/ubolite``. The rendered
    HTML is then stripped of every tag listed in ``element_tags_to_remove`` and
    reduced to whitespace-normalised plain text.
    """

    name: str = "WebBrowser"
    description: str = "Gets the contents of a website"
    args_schema: Type[BaseModel] = WebBrowserInput
    return_direct: bool = False

    def _run(
        self,
        url: str,
        run_manager: Optional[CallbackManagerForToolRun] = None,
    ) -> str:
        """Fetch *url* synchronously by driving :meth:`_arun` to completion.

        Args:
            url: Address of the page to load.
            run_manager: Callback manager supplied by LangChain; unused.

        Returns:
            The cleaned text content of the page.

        Raises:
            RuntimeError: If the page HTML could not be fetched, or (from
                ``asyncio.run``) if this is called while an event loop is
                already running in the current thread.
        """
        # _arun does not use its run_manager, so the synchronous manager is
        # deliberately not forwarded into the async-typed parameter.
        return run_async(self._arun(url))

    async def _arun(
        self,
        url: str,
        run_manager: Optional[AsyncCallbackManagerForToolRun] = None,
    ) -> str:
        """Fetch *url* with Playwright and return the page's cleaned text.

        Args:
            url: Address of the page to load.
            run_manager: Callback manager supplied by LangChain; unused.

        Returns:
            The cleaned text content of the page.

        Raises:
            RuntimeError: If the browser returned no HTML for the page.
        """
        page_html: str | None = None
        async with async_playwright() as p:
            browser_context: BrowserContext = await p.chromium.launch_persistent_context(
                user_data_dir="./.chromium/profile",
                args=[
                    "--password-store=basic",
                    "--disable-extensions-except=./chromium/ubolite",
                    "--load-extension=./chromium/ubolite",
                ],
                # NOTE(review): whether Chromium loads extensions in headless
                # mode depends on the Playwright/Chromium version — confirm
                # the extension is actually active.
                headless=True,
                user_agent=user_agent,
            )
            try:
                # Reuse the context's initial page when there is one; otherwise
                # open a page instead of failing with IndexError.
                open_pages = browser_context.pages
                page: Page = (
                    open_pages[0] if open_pages else await browser_context.new_page()
                )
                await page.goto(url)
                await page.wait_for_load_state("load")
                page_html = await page.content()
            finally:
                # Close the context even when navigation raises, so the
                # profile directory and browser process are released.
                await browser_context.close()

        if not page_html:
            raise RuntimeError("Failed to fetch page HTML.")
        return self._html_to_text(page_html)

    @staticmethod
    def _html_to_text(page_html: str) -> str:
        """Remove unwanted tags from *page_html* and return normalised text."""
        soup: BeautifulSoup = BeautifulSoup(page_html, "html.parser")
        try:
            # Remove EVERY matching element, not just the first of each tag.
            # Re-querying after each removal means an element nested inside an
            # already-removed one is never touched again.
            while (element := soup.find(element_tags_to_remove)) is not None:
                element.decompose()
        except Exception:
            # Tag stripping is best-effort: keep whatever text is available,
            # but record the failure instead of hiding it.
            logging.getLogger(__name__).warning(
                "Failed to strip unwanted elements from page HTML.", exc_info=True
            )

        page_text = soup.get_text().strip()
        # Collapse runs of newlines, then runs of spaces.
        page_text = re.sub(r"\n+", "\n", page_text)
        return re.sub(r" {2,}", " ", page_text)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment