Created
January 15, 2025 08:39
-
-
Save bangonkali/2812badce23845c68444365706a3ab49 to your computer and use it in GitHub Desktop.
Simple Python script to download a web novel. It may stop working in the future if web novel sites put more automated security in place.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import json
import os
import time
from datetime import datetime
from random import randrange
from urllib.parse import urlparse

from playwright.sync_api import TimeoutError as PlaywrightTimeoutError
from playwright.sync_api import sync_playwright
# Directory that receives the per-chapter .json/.html files, the aggregate
# 0-book.json and the 0-log.txt run log.
output_directory = './.local/some-novel'
# Timeout handed to the Playwright waits below, in milliseconds (4 minutes).
timeout = 1_000 * 60 * 4
def log(message) -> None:
    """Print a timestamped message and append it to the run log.

    The line is written to stdout and appended to ``0-log.txt`` inside
    ``output_directory``.

    Args:
        message: Text to record; it is formatted into the line with an
            f-string.
    """
    now = datetime.now()
    line = f'{now} {message}'
    print(line)
    # The first log call happens before anything else writes to the output
    # directory, so make sure it exists instead of failing on open().
    os.makedirs(output_directory, exist_ok=True)
    # Explicit UTF-8: logged URLs may contain non-ASCII characters, and the
    # platform default encoding is not guaranteed to handle them.
    with open(f'{output_directory}/0-log.txt', "a", encoding="utf-8") as f:
        f.write(f'{line}\n')
def get_last_part_of_url(url: str) -> str:
    """Return the final path segment of *url*.

    Only the path component is considered, so query strings and fragments
    are ignored. Trailing slashes are dropped before splitting.

    Args:
        url: The URL to inspect.

    Returns:
        The last non-empty path segment, or an empty string when the URL
        has no path (or only ``/``).
    """
    path = urlparse(url).path
    # Strip trailing slashes first so ".../chapter-1/" yields "chapter-1"
    # rather than an empty final segment.
    segments = path.rstrip('/').split('/')
    return segments[-1]
# Main scraping loop: open the first chapter in the user's Chrome profile,
# save each chapter, then follow the "next page" link until none is left.
with sync_playwright() as p:
    # Every output file below is written into this directory; create it up
    # front so the first open() cannot fail on a missing path.
    os.makedirs(output_directory, exist_ok=True)

    userDataDir = "C:/Users/USER/AppData/Local/Google/Chrome/User Data"
    browser = p.chromium.launch_persistent_context(
        user_data_dir=userDataDir,
        headless=False,
        channel="chrome",
        args=["--start-maximized"],
        no_viewport=True,
    )
    try:
        page = browser.pages[-1]
        page.goto("https://www.webnovelsite.com/novel/some-novel/1-chapter-starting-chapter/")

        # Accumulates every chapter downloaded in this run; rewritten to
        # 0-book.json after each chapter so a crash loses nothing.
        book = {
            'chapters': []
        }

        while True:
            log(f'page {page.url} downloading...')
            page.wait_for_selector(".reading-content", timeout=timeout)
            content = page.locator(".reading-content")
            chapter_heading = page.locator("#chapter-heading")

            if content.count() > 0:
                # A missing heading element would otherwise make
                # text_content() wait and then raise; store None instead.
                heading = (
                    chapter_heading.first.text_content()
                    if chapter_heading.count() > 0
                    else None
                )
                data = {
                    'content': content.first.inner_text(),
                    'url': page.url,
                    'heading': heading,
                }
                book['chapters'].append(data)

                file_name = get_last_part_of_url(page.url)
                with open(f'{output_directory}/{file_name}.json', 'w', encoding="utf-8") as f:
                    json.dump(data, f, ensure_ascii=False, indent=2)
                with open(f'{output_directory}/{file_name}.html', 'w', encoding="utf-8") as f:
                    f.write(f'{page.content()}\n')
                with open(f'{output_directory}/0-book.json', 'w', encoding="utf-8") as f:
                    json.dump(book, f, ensure_ascii=False, indent=2)

            # wait_for_selector raises on timeout rather than returning, so
            # the last chapter (which has no next link) must be detected by
            # catching the timeout; a plain count() check afterwards would
            # never be reached.
            try:
                page.wait_for_selector("a.next_page", timeout=timeout)
            except PlaywrightTimeoutError:
                log(f'page {page.url} download completed. no more pages')
                break

            log(f'page {page.url} download completed. going to next page')
            previous_url = page.url
            page.locator("a.next_page").first.click()
            # Wait until the URL has actually changed; otherwise the next
            # iteration can match the old page's .reading-content and save
            # the same chapter twice.
            page.wait_for_url(
                lambda url: url != previous_url,
                timeout=timeout,
                wait_until="domcontentloaded",
            )

            # Optional politeness delay between chapters (disabled):
            # delay = randrange(1, 2)
            # log(f'delaying {delay} seconds...')
            # time.sleep(delay)
    finally:
        # Close the persistent context explicitly, even when the loop
        # exits with an error.
        browser.close()
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
playwright
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment