Skip to content

Instantly share code, notes, and snippets.

@bangonkali
Created January 15, 2025 08:39
Show Gist options
  • Save bangonkali/2812badce23845c68444365706a3ab49 to your computer and use it in GitHub Desktop.
Save bangonkali/2812badce23845c68444365706a3ab49 to your computer and use it in GitHub Desktop.
Simple Python script to download a web novel chapter by chapter. It may stop working in the future if web novel sites put more automated security measures in place.
import json
import os
import time
from datetime import datetime
from random import randrange
from urllib.parse import urlparse

from playwright.sync_api import TimeoutError as PlaywrightTimeoutError
from playwright.sync_api import sync_playwright
# Folder that receives the per-chapter files, the combined book file and the log.
output_directory = './.local/some-novel'
# Upper bound passed to the Playwright waits, in milliseconds (4 minutes).
timeout = 4 * 60 * 1_000
def log(message):
    """Print a timestamped message and append it to the run log.

    The same line goes to stdout and to ``0-log.txt`` inside
    ``output_directory``.

    Args:
        message: Text to record; the current local time is put in front of it.
    """
    now = datetime.now()
    line = f'{now} {message}'
    print(line)
    # Create the folder on demand so the very first call cannot fail with
    # FileNotFoundError on a fresh checkout.
    os.makedirs(output_directory, exist_ok=True)
    # Explicit UTF-8: the platform default encoding (e.g. cp1252 on Windows)
    # cannot represent every character that may appear in a message.
    with open(f'{output_directory}/0-log.txt', "a", encoding="utf-8") as f:
        f.write(f'{line}\n')
def get_last_part_of_url(url: str) -> str:
    """Return the final path segment of ``url``.

    Only the path component is considered, so query string and fragment are
    ignored, and trailing slashes are dropped first:
    ``https://host/novel/ch-1/`` gives ``'ch-1'``.

    Args:
        url: The URL to inspect.

    Returns:
        The last path segment, or an empty string when the URL has no path
        (or its path is only ``/``).
    """
    path = urlparse(url).path
    # rstrip first so a trailing "/" does not produce an empty last segment.
    return path.rstrip('/').split('/')[-1]
def _save_chapter(page, book):
    """Scrape the chapter currently shown in ``page`` and write it to disk.

    Waits for ``.reading-content``, then writes ``<name>.json`` (text, URL and
    heading) and ``<name>.html`` (full page markup), where ``<name>`` is the
    last path segment of the page URL. The chapter is also appended to
    ``book['chapters']`` and ``0-book.json`` is rewritten with the whole book,
    so an interrupted run keeps everything downloaded so far.

    Args:
        page: Playwright page positioned on a chapter.
        book: Dict with a ``'chapters'`` list that is extended in place.
    """
    page.wait_for_selector(".reading-content", timeout=timeout)
    content = page.locator(".reading-content")
    if content.count() == 0:
        return

    chapter_heading = page.locator("#chapter-heading")
    data = {
        'content': content.first.inner_text(),
        'url': page.url,
        # Only read the heading when the element exists; calling
        # text_content() on a missing element would block and then raise.
        'heading': (
            chapter_heading.first.text_content()
            if chapter_heading.count() > 0
            else None
        ),
    }
    book['chapters'].append(data)

    file_name = get_last_part_of_url(page.url)
    with open(f'{output_directory}/{file_name}.json', 'w', encoding="utf-8") as f:
        json.dump(data, f, ensure_ascii=False, indent=2)
    with open(f'{output_directory}/{file_name}.html', 'w', encoding="utf-8") as f:
        f.write(f'{page.content()}\n')
    with open(f'{output_directory}/0-book.json', 'w', encoding="utf-8") as f:
        json.dump(book, f, ensure_ascii=False, indent=2)


def _go_to_next_chapter(page):
    """Click the "next page" link if there is one.

    Args:
        page: Playwright page positioned on a chapter.

    Returns:
        True when the browser moved on to another URL, False when the page
        has no ``a.next_page`` link.
    """
    try:
        page.wait_for_selector("a.next_page", timeout=timeout)
    except PlaywrightTimeoutError:
        # No next link appeared: fall through to the count() check below
        # instead of aborting the whole run with a timeout error.
        pass

    next_page_button = page.locator("a.next_page")
    if next_page_button.count() == 0:
        log(f'page {page.url} download completed. no more pages')
        return False

    log(f'page {page.url} download completed. going to next page')
    current_url = page.url
    next_page_button.first.click()
    # Make sure the navigation has actually committed; otherwise the next
    # iteration could find the old page's content and save it a second time.
    page.wait_for_url(
        lambda url: url != current_url,
        wait_until="commit",
        timeout=timeout,
    )
    return True


# The chapter files are opened in 'w' mode, which does not create folders.
os.makedirs(output_directory, exist_ok=True)

with sync_playwright() as p:
    userDataDir = "C:/Users/USER/AppData/Local/Google/Chrome/User Data"
    browser = p.chromium.launch_persistent_context(
        user_data_dir=userDataDir,
        headless=False,
        channel="chrome",
        args=["--start-maximized"],
        no_viewport=True,
    )
    try:
        # Reuse the last open tab, or open one if the context has none.
        page = browser.pages[-1] if browser.pages else browser.new_page()
        page.goto("https://www.webnovelsite.com/novel/some-novel/1-chapter-starting-chapter/")
        book = {
            'chapters': []
        }
        while True:
            log(f'page {page.url} downloading...')
            _save_chapter(page, book)
            if not _go_to_next_chapter(page):
                break
            # Optional politeness delay between chapters (currently disabled):
            # delay = randrange(1, 2)
            # log(f'delaying {delay} seconds...')
            # time.sleep(delay)
    finally:
        # Close the context even when a chapter fails to load.
        browser.close()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment