Last active
February 8, 2025 04:09
-
-
Save rengler33/f8b9d3f26a518c08a414f6f86109863c to your computer and use it in GitHub Desktop.
How to Capture Network Traffic When Scraping with Selenium & Python
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# See rkengler.com for the related blog post:
# https://www.rkengler.com/how-to-capture-network-traffic-when-scraping-with-selenium-and-python/
import json | |
import pprint | |
from selenium import webdriver | |
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities | |
# Work on a copy: DesiredCapabilities.CHROME is a class-level dict shared by
# every user of selenium in this process, so mutating it in place would leak
# the logging preference into any other driver created later.
capabilities = DesiredCapabilities.CHROME.copy()
# capabilities["loggingPrefs"] = {"performance": "ALL"}  # chromedriver < ~75
capabilities["goog:loggingPrefs"] = {"performance": "ALL"}  # chromedriver 75+

# Launch Chrome with performance logging enabled via the capabilities above.
# NOTE(review): newer Selenium 4 releases replace the positional driver path
# and `desired_capabilities` with Service/Options objects -- confirm which
# Selenium version this script is pinned to before upgrading.
driver = webdriver.Chrome(
    r"chromedriver.exe",
    desired_capabilities=capabilities,
)
def process_browser_logs_for_network_events(logs):
    """
    Yield the network-related events found in browser performance log entries.

    Each entry in ``logs`` must be a mapping whose "message" value is a JSON
    string; the decoded JSON must carry the event under a top-level "message"
    key, and that event must have a "method" string.

    Only events whose method starts with "Network.response",
    "Network.request", or "Network.webSocket" are yielded, since we're
    interested in the network events specifically. All other events are
    skipped.

    :param logs: iterable of log entries, e.g. ``driver.get_log("performance")``
    :return: generator of decoded event dicts, in their original order
    :raises KeyError: if an entry lacks "message" or an event lacks "method"
    :raises json.JSONDecodeError: if an entry's "message" is not valid JSON
    """
    network_prefixes = (
        "Network.response",
        "Network.request",
        "Network.webSocket",
    )
    for entry in logs:
        log = json.loads(entry["message"])["message"]
        # str.startswith accepts a tuple, so one call covers every prefix and
        # enforces the documented "starts with" contract (a substring test
        # would also accept a prefix that appears mid-string).
        if log["method"].startswith(network_prefixes):
            yield log
# Load the page and pull the performance log, then always shut the browser
# down -- even if navigation or log retrieval fails -- so no Chrome or
# chromedriver process is left running after the script exits.
try:
    driver.get("https://www.rkengler.com")
    logs = driver.get_log("performance")
finally:
    driver.quit()

# `logs` has already been fetched, so the lazy filter below is safe to consume
# after the browser is closed.
events = process_browser_logs_for_network_events(logs)

# Explicit UTF-8 keeps the dump from failing on platforms whose default
# encoding cannot represent non-ASCII characters found in URLs or headers.
with open("log_entries.txt", "wt", encoding="utf-8") as out:
    for event in events:
        pprint.pprint(event, stream=out)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Does this bring back the actual network response? I'm seeing headers, status codes, and other metadata, but no actual response data 🥲