Created
February 10, 2025 09:32
-
-
Save kylehowells/e4eabe7d6c59b1d451f0a7ad91f18268 to your computer and use it in GitHub Desktop.
Download and convert to json for further processing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Get one language: | |
# https://b.jw-cdn.org/apis/pub-media/GETPUBMEDIALINKS?booknum=0&output=json&pub=nwt&fileformat=jwpub&alllangs=0&langwritten=S | |
# Get all languages: | |
# https://b.jw-cdn.org/apis/pub-media/GETPUBMEDIALINKS?booknum=0&output=json&pub=nwt&fileformat=epub&alllangs=1&langwritten=E | |
import requests | |
def get_media_links(language_code: str, format: str, document: str) -> dict:
    """
    Queries the GETPUBMEDIALINKS endpoint for one publication in one language.

    Args:
        language_code: str - Value sent as the `langwritten` query parameter (e.g. 'E', 'S')
        format: str - Value sent as the `fileformat` query parameter (e.g. 'epub', 'jwpub')
        document: str - Value sent as the `pub` query parameter (e.g. 'nwt')
    Returns:
        dict - The decoded JSON body of the response
    Raises:
        requests.HTTPError - If the server answers with an error status
    """
    # Let requests build and escape the query string instead of interpolating
    # raw values into the URL.
    response = requests.get(
        'https://b.jw-cdn.org/apis/pub-media/GETPUBMEDIALINKS',
        params={
            'booknum': 0,
            'output': 'json',
            'pub': document,
            'fileformat': format,
            'alllangs': 0,
            'langwritten': language_code,
        },
        timeout=30,  # never hang forever on a stalled connection
    )
    # Fail here with a clear HTTP error rather than later on a missing key.
    response.raise_for_status()
    return response.json()
# MARK: - Download | |
def download(langCode: str|None=None, docID: str|None=None):
    """
    Looks up a publication's EPUB link and saves the file in the working directory.

    Args:
        langCode: str | None - Language code to request; 'E' is used when empty or None
        docID: str | None - Publication id to request; 'nwt' is used when empty or None
    Raises:
        requests.HTTPError - If the file download answers with an error status
        KeyError / IndexError - If the link listing lacks the expected entries
    """
    language_code = langCode or 'E'
    # 'jwpub' is the other format the endpoint is queried with in the notes above.
    file_format = 'epub'
    document = docID or 'nwt'

    media_links = get_media_links(language_code, file_format, document)
    print(media_links)

    publication_name = media_links['pubName']
    print("Publication Name: ", publication_name)

    # The listing is keyed by language code, then by upper-cased format name.
    file_url = media_links['files'][language_code][file_format.upper()][0]['file']['url']
    print("File URL: ", file_url)

    # Use the last path segment of the URL as the local file name.
    file_name = file_url.split('/')[-1]
    print(f"Downloading: {file_name}")

    response = requests.get(file_url, timeout=120)
    # Without this check an HTML error page would be written out as the book.
    response.raise_for_status()
    with open(file_name, 'wb') as f:
        f.write(response.content)
    print(f"Downloaded: {file_name}")
# MARK: - Main | |
if __name__ == '__main__':
    import sys

    # Optional positional arguments: <language code> <publication id>.
    cli_args = sys.argv[1:]
    langCode = cli_args[0] if len(cli_args) >= 1 else None
    docID = cli_args[1] if len(cli_args) >= 2 else None

    download(langCode=langCode, docID=docID)
# MARK: - Notes | |
"""" | |
{ | |
"pubName": "La Biblia. Traducción del Nuevo Mundo (revisión del 2019)", | |
"parentPubName": "La Biblia. Traducción del Nuevo Mundo (revisión del 2019)", | |
"booknum": 0, | |
"pub": "nwt", | |
"issue": "", | |
"formattedDate": "", | |
"fileformat": [ | |
"JWPUB" | |
], | |
"track": null, | |
"specialty": "", | |
"pubImage": { | |
"url": "", | |
"modifiedDatetime": "", | |
"checksum": null | |
}, | |
"languages": { | |
"S": { | |
"name": "español", | |
"direction": "ltr", | |
"locale": "es", | |
"script": "ROMAN" | |
} | |
}, | |
"files": { | |
"S": { | |
"JWPUB": [ | |
{ | |
"title": "La Biblia. Traducción del Nuevo Mundo (revisión del 2019)", | |
"file": { | |
"url": "https://cfp2.jw-cdn.org/a/03c55e4/2/o/nwt_S.jwpub", | |
"stream": "https://jw.org", | |
"modifiedDatetime": "2024-08-15 14:38:30", | |
"checksum": "31bef50c135d9940e97ebc47fb99cc44" | |
}, | |
"filesize": 37413926, | |
"trackImage": { | |
"url": "", | |
"modifiedDatetime": "", | |
"checksum": null | |
}, | |
"markers": null, | |
"label": "0p", | |
"track": 0, | |
"hasTrack": false, | |
"pub": "nwt", | |
"docid": 0, | |
"booknum": 0, | |
"mimetype": "application/octet-stream", | |
"edition": "", | |
"editionDescr": "Normal", | |
"format": "", | |
"formatDescr": "Normal", | |
"specialty": "", | |
"specialtyDescr": "", | |
"subtitled": false, | |
"frameWidth": 0, | |
"frameHeight": 0, | |
"frameRate": 0, | |
"duration": 0, | |
"bitRate": 0 | |
} | |
] | |
} | |
} | |
} | |
""" |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import json
import os
import re
from dataclasses import asdict, dataclass, field
from pathlib import Path

import ebooklib
from bs4 import BeautifulSoup, Tag
from bs4.element import NavigableString, Tag
from ebooklib import epub
# MARK: - Helper Function | |
def read_item(item: epub.EpubItem) -> BeautifulSoup:
    """
    Parses one EPUB item into a BeautifulSoup tree.
    The item's name is printed first as a progress trace.
    Args:
        item: epub.EpubItem - The item whose content should be parsed
    Returns:
        BeautifulSoup - The content parsed with the 'html.parser' backend
    """
    print(item.get_name())
    return BeautifulSoup(item.get_content(), 'html.parser')
# MARK: - Extract Book into HTML Files | |
def extract_book(book_path: str) -> None:
    """
    Dumps every document item of an EPUB into a directory named after the file.
    The directory is created next to the current working directory using the
    EPUB's base name (without extension); each document keeps its internal path.
    Args:
        book_path: str - Path to the EPUB file to process
    """
    # Output directory is the book's file name without its extension
    target_dir = Path(Path(book_path).stem)
    target_dir.mkdir(exist_ok=True)

    for entry in epub.read_epub(book_path).get_items():
        # Only document items are written out
        if entry.get_type() != ebooklib.ITEM_DOCUMENT:
            continue

        print('==================================')
        file_name = entry.get_name()
        print('NAME : ', file_name)
        print('----------------------------------')
        raw_bytes = entry.get_content()

        # Item names may contain sub-directories, so create them on demand
        destination = target_dir / file_name
        destination.parent.mkdir(parents=True, exist_ok=True)
        destination.write_bytes(raw_bytes)
        print('==================================')
# MARK: - Extract Verse Info | |
@dataclass
class BibleVerse:
    """
    A single verse.

    Attributes:
        chapter: number of the chapter the verse belongs to
        verse: number of the verse within that chapter
        text: the text of the verse
    """
    chapter: int
    verse: int
    text: str
@dataclass
class BibleParagraph:
    """
    A run of verses that form one paragraph.

    A verse is not necessarily a whole sentence, so verses are grouped into
    the paragraphs they appear in.

    Attributes:
        verses: the verses that make up the paragraph
        text: the text of the whole paragraph
    """
    verses: list[BibleVerse]
    text: str
@dataclass
class BibleChapter:
    """
    One chapter, available both as grouped paragraphs and as a flat verse list.

    Every field has a default, so `BibleChapter()` yields an empty chapter that
    can be filled in afterwards, and fields may also be passed to the
    constructor. Each instance gets its own fresh lists.
    """
    # The chapter number
    number: int = 0
    # The grouped paragraphs
    paragraphs: list[BibleParagraph] = field(default_factory=list)
    # The individual verses
    verses: list[BibleVerse] = field(default_factory=list)
    # The text whole of the chapter
    text: str = ""
@dataclass
class BibleBook:
    """
    One book of the Bible.

    Both fields have defaults, so `BibleBook()` yields an empty book that can be
    filled in afterwards, and fields may also be passed to the constructor.
    Each instance gets its own fresh chapter list.
    """
    # The name of the book of the Bible
    name: str = ""
    # The chapters of the book of the Bible
    chapters: list[BibleChapter] = field(default_factory=list)
@dataclass
class Bible:
    """
    A whole translation: its name plus its books.

    Both fields have defaults, so `Bible()` yields an empty container that can
    be filled in afterwards, and fields may also be passed to the constructor.
    Each instance gets its own fresh book list.
    """
    # The translation name
    name: str = ""
    # The books of the Bible
    books: list[BibleBook] = field(default_factory=list)
class BibleEncoder(json.JSONEncoder):
    """
    JSON encoder that turns the Bible dataclasses into plain dictionaries.
    """

    def default(self, obj):
        # asdict() recurses, so nested paragraphs and verses are converted too.
        convertible = (Bible, BibleBook, BibleChapter)
        if not isinstance(obj, convertible):
            # Anything else gets the base class behaviour.
            return super().default(obj)
        return asdict(obj)
# MARK: - Extract Bible | |
def extract_verses(book_file_name: str) -> Bible:
    """
    Parses an EPUB Bible into a Bible object and writes it out as JSON.

    The JSON file is written to the current working directory, named after the
    EPUB's base name with a `.json` extension. Chapters that cannot be parsed
    are reported and skipped instead of aborting the whole run.

    Args:
        book_file_name: str - Path of the EPUB file to process
    Returns:
        Bible - The parsed Bible object
    """
    book = epub.read_epub(book_file_name)

    bible = Bible()
    bible.name = get_title(book)
    bible.books = []

    for book_name, book_link in list_toc(book):
        print(f"{book_name} -> {book_link}")

        # A missing navigation file is treated as "no chapters".
        chapter_links = extract_chapter_links(book_link, book) or []

        bible_book = BibleBook()
        bible_book.name = book_name
        bible_book.chapters = []

        for chapter_num, chapter_link in chapter_links:
            chapter = extract_chapter_verses(chapter_link, book)
            if chapter is None:
                # The parser returns None for a missing file or unknown markup.
                print(f"Skipping chapter {chapter_num} of {book_name}: could not parse {chapter_link}")
                continue

            print(f"Found {len(chapter.paragraphs)} paragraphs")
            print(f"Found {len(chapter.verses)} verses")
            print(f"Found {sum(len(paragraph.verses) for paragraph in chapter.paragraphs)} p.verses")
            print(f"Found {len(chapter.text)} text")

            # The parser leaves the number unset; it comes from the link label.
            chapter.number = int(chapter_num)
            bible_book.chapters.append(chapter)

        bible.books.append(bible_book)

    print(f"Found {len(bible.books)} books")

    # Write bible to json file using the custom encoder
    basename = Path(book_file_name).stem
    with open(f"{basename}.json", "w", encoding="utf-8") as f:
        json.dump(bible, f, cls=BibleEncoder, indent=2)

    return bible
def get_title(book: epub.EpubBook) -> str:
    """
    Returns the first Dublin Core title stored in the EPUB metadata.
    """
    title_entries = book.get_metadata('DC', 'title')
    # Each entry is indexable; position 0 holds the title text.
    first_entry = title_entries[0]
    return first_entry[0]
def list_toc(book: epub.EpubBook) -> list[tuple[str, str]]:
    """
    Lists the table-of-contents entries that point at per-book chapter lists.

    Every document item whose name starts with 'toc' is scanned for a
    `<nav epub:type="toc">` list; only entries whose link starts with
    'biblechapternav' are returned.

    Args:
        book: epub.EpubBook - The EPUB book object to read the TOC from
    Returns:
        list[tuple[str, str]] - List of tuples containing the (title, link) of each matching TOC entry
    """
    toc_items = []

    # Get the table of contents
    for item in book.get_items():
        if item.get_type() != ebooklib.ITEM_DOCUMENT:
            continue
        if not item.get_name().startswith('toc'):
            continue

        soup = read_item(item)

        # Find the TOC navigation list; either lookup may come back empty.
        nav = soup.find('nav', attrs={'epub:type': 'toc'})
        nav_list = nav.find('ol') if nav is not None else None
        if nav_list is None:
            print(f"No TOC navigation list found in {item.get_name()}")
            continue

        # Extract all links and their text
        for link in nav_list.find_all('a'):
            href = link.get('href')
            if not href:
                # An anchor without a target cannot be followed.
                continue
            toc_items.append((link.get_text(strip=True), href))

    # Get just the links to Bible Book Chapter Lists TOCs
    book_tocs = []
    for name, link in toc_items:
        if link.startswith('biblechapternav'):
            book_tocs.append((name, link))
            print(f"{name} -> {link}")

    return book_tocs
def extract_chapter_links(book_file_name: str, book: epub.EpubBook) -> list[tuple[str, str]]:
    """
    Extracts chapter links from the bible chapter navigation file.

    Args:
        book_file_name: str - Name of the navigation file to process (e.g. biblechapternav1.xhtml)
        book: epub.EpubBook - The epub book object containing the file
    Returns:
        list[tuple[str, str]] - List of tuples containing the (chapter number, link) of each chapter;
        empty when the navigation file is not part of the book
    """
    print(f"Extracting chapter verses from {book_file_name}")

    item = book.get_item_with_href(book_file_name)
    if item is None:
        print(f"Could not find item with href {book_file_name}")
        # Always hand back a list so callers can iterate the result directly.
        return []

    soup = read_item(item)

    # Get the book name from the h2 heading (used for the progress trace only)
    book_heading = soup.find('h2', class_='w_navigation w_bibleChapter')
    if book_heading:
        book_name = book_heading.get_text(strip=True)
        print(f"Processing book: {book_name}")

    # Find all chapter links in the table
    chapters = []
    for link in soup.find_all('a'):
        # Only process links inside table cells
        if link.parent.name != 'td':
            continue
        href = link.get('href')
        chapter_num = link.get_text(strip=True)
        chapters.append((chapter_num, href))
        print(f"Chapter {chapter_num}: {href}")

    return chapters
def join_parts(parts: list[str]) -> str:
    """
    Concatenates text fragments after normalising each one with clean_text.

    Where a fragment ends with a space and the following one starts with a
    space, the whitespace on both sides of that seam is reduced to a single
    space. All other fragments are joined unchanged.
    """
    pieces: list[str] = []
    for raw_part in parts:
        # clean_text only swaps characters, so one pass per fragment suffices;
        # already-cleaned fragments would pass through it unchanged.
        piece = clean_text(raw_part)
        if pieces and pieces[-1].endswith(' ') and piece.startswith(' '):
            pieces[-1] = pieces[-1].rstrip()
            piece = ' ' + piece.lstrip()
        pieces.append(piece)
    return ''.join(pieces)
def extract_chapter_verses(chapter_link: str, book: epub.EpubBook) -> BibleChapter | None:
    """
    Extracts verses from a chapter file in the EPUB book.

    Only `<p class="sb">` elements are read. Each one is walked child by child:
    marker spans update the current chapter/verse, text nodes and `<em>` text
    are collected as verse text, and known decoration (verse numbers, footnote
    references, page numbers) is skipped.

    Args:
        chapter_link: str - Link to the chapter file to process
        book: epub.EpubBook - The EPUB book object
    Returns:
        BibleChapter | None - The parsed chapter with paragraphs, verses and text
        filled in (its number is not set here), or None when the file is not in
        the book or a paragraph contains a child this parser does not recognise
    """
    item = book.get_item_with_href(chapter_link)
    if item is None:
        print(f"Could not find item with href {chapter_link}")
        return None

    soup = read_item(item)
    print(f"Processing chapter: {chapter_link}")

    # Get all p tags with class sb
    verse_paragraph_elements = soup.find_all('p', class_='sb')
    print(f"Found {len(verse_paragraph_elements)} verse paragraph elements")

    # Example of a chapter-opening paragraph:
    # <p id="p2" data-pid="2" class="p2 sb"><span id="pos48"/><span id="chapter1"/><span id="chapter1_verse1"/><span class="w_ch"><strong>1</strong> </span>In the beginning God created the heavens and the earth.</p>
    #
    # Example of a paragraph holding several verses:
    # <p id="p7" data-pid="7" class="p7 sb">
    # <span id="pos1609"/>
    # <span id="chapter1_verse14"/><strong><sup>14</sup></strong> Then God said: ...<span id="footnotesource5"/><a epub:type="noteref" href="#footnote5">*</a> ...
    # <span id="chapter1_verse15"/><strong><sup>15</sup></strong> They will serve as luminaries ...
    # </p>

    # One entry per paragraph; each entry lists (chapter, verse, text) with the
    # numbers still in the string form they were cut out of the span ids in.
    verse_paragraphs: list[list[tuple[str, str, str]]] = []

    for verse_paragraph in verse_paragraph_elements:
        verses_in_paragraph: list[tuple[str, str, str]] = []
        single_verse_sections: list[str] = []  # Text fragments of the verse being read
        # NOTE(review): these restart for every paragraph, so a paragraph with
        # no verse marker of its own is recorded as chapter 0 / verse 0 --
        # confirm whether continuation paragraphs should inherit instead.
        current_chapter: str = '0'
        current_verse: str = '0'

        for child in verse_paragraph.children:
            if child.name == 'span' and child.get('id', '').startswith('pos'):
                continue

            if child.name == 'span' and child.get('id', '').startswith('chapter') and '_verse' not in child.get('id', ''):
                # Bare chapter marker such as <span id="chapter1"/>. Take the
                # chapter from the id and leave the verse alone: the span's own
                # text is empty and must not replace the current verse number.
                chapter_label = child.get('id', '')[len('chapter'):]
                if chapter_label.isdigit():
                    current_chapter = chapter_label
                continue

            if child.name == 'span' and child.get('id', '').startswith('chapter') and '_verse' in child.get('id', ''):
                # A new verse marker closes the verse collected so far
                if single_verse_sections:
                    combined_verse_text = join_parts(single_verse_sections)
                    verses_in_paragraph.append((current_chapter, current_verse, combined_verse_text))
                    single_verse_sections = []

                # Ids look like "chapter1_verse14"
                chapter_part, verse_part = child.get('id', '').split('_')[:2]
                current_chapter = chapter_part[7:]
                current_verse = verse_part[5:]
                continue

            if child.name == 'span' and 'w_ch' in child.get('class', []):
                # Visible chapter number at the start of a chapter
                continue

            if child.name is None and child.string is not None:
                # Plain text node: part of the current verse
                single_verse_sections.append(child.string)
                continue

            if child.name == 'strong' and isinstance(child, Tag) and len(child.contents) > 0 and isinstance(child.contents[0], Tag) and child.contents[0].name == 'sup':
                # Verse number written as <strong><sup>14</sup></strong>
                continue

            if child.name == 'strong' and len(child.contents) > 0 and child.contents[0].name == 'span':
                # Verse number wrapped in a classed span:
                # <strong><span class="..."><sup>14</sup></span></strong>
                # NOTE(review): the original condition ended in a bare
                # `and 'altsize'`, which is always true; it presumably meant to
                # require that class on the span. Behaviour is kept (any
                # non-empty class is accepted) -- confirm before tightening.
                span_child = child.contents[0]
                if span_child.get('class', []) and len(span_child.contents) > 0 and span_child.contents[0].name == 'sup':
                    continue

            if child.name == 'span' and child.get('id', '').startswith('footnotesource'):
                continue

            if child.name == 'a' and child.get('epub:type', '') == 'noteref':
                continue

            if child.name == 'span' and "pageNum" in child.get('class', []):
                continue

            if child.name == 'em' and child.string is not None:
                # Emphasised text belongs to the verse
                single_verse_sections.append(child.string)
                continue

            if child.name == 'strong' and child.string is not None:
                # A bold repeat of the current verse number is decoration
                if child.string.strip() == current_verse:
                    continue

            # Anything else is markup this parser does not know; give up on
            # the chapter rather than silently dropping or mangling text.
            print(f"ERROR: Unknown child: {child}")
            return None

        # Add the last verse to the list
        combined_verse_text = join_parts(single_verse_sections)
        verses_in_paragraph.append((current_chapter, current_verse, combined_verse_text))

        # Add the paragraph to the list
        verse_paragraphs.append(verses_in_paragraph)

    print(f"Found {len(verse_paragraphs)} paragraphs, containing {sum([len(paragraph) for paragraph in verse_paragraphs])} verses")

    chapter = BibleChapter()
    for raw_verses in verse_paragraphs:
        paragraph_text = join_parts([verse_text for _, _, verse_text in raw_verses])
        parsed_verses = [
            BibleVerse(int(chapter_label), int(verse_label), verse_text)
            for chapter_label, verse_label, verse_text in raw_verses
        ]
        # Paragraph and chapter text drop the extra marks; verse text keeps them.
        chapter.paragraphs.append(BibleParagraph(parsed_verses, clean_text_extra(paragraph_text.strip())))
        chapter.verses.extend(parsed_verses)

    chapter.text = clean_text_extra('\n'.join(paragraph.text.strip() for paragraph in chapter.paragraphs))

    return chapter
# Single-pass character map used by clean_text.
_CLEAN_TEXT_TABLE = str.maketrans({
    "\u00a0": " ",   # no-break space -> plain space
    "\u201d": '"',   # right double quotation mark -> straight double quote
    "\u201c": '"',   # left double quotation mark -> straight double quote
    "\u2019": "'",   # right single quotation mark -> straight single quote
    "\u2018": "'",   # left single quotation mark -> straight single quote
    "\u02b9": "'",   # modifier letter prime -> straight single quote
})


def clean_text(text: str) -> str:
    """
    Replaces typographic characters with their plain ASCII counterparts.

    Non-breaking spaces become ordinary spaces, curly double quotes become
    `"`, and curly single quotes and the modifier prime become `'`.
    Whitespace is otherwise left untouched: nothing is trimmed or collapsed.
    """
    return text.translate(_CLEAN_TEXT_TABLE)
def clean_text_extra(text: str) -> str:
    """Strip marks that are unwanted in summary text but are kept in the verses themselves."""
    # Drop every middle dot (U+00B7) that appears inside words.
    middle_dot = "\u00b7"
    return "".join(text.split(middle_dot))
# MARK: - Main | |
if __name__ == '__main__':
    import sys

    # Optional positional argument: path of the EPUB to convert.
    default_file = "nwt_E.epub"
    file_name = sys.argv[1] if len(sys.argv) > 1 else None
    if not file_name:
        print(f"Using default file: {default_file}")
    file = file_name or default_file

    # The handlers wrap the call that actually opens and parses the file, so
    # a missing or unreadable EPUB gives a short message and exit status 1.
    try:
        # extract_book(file)  # alternative: dump the raw documents instead
        extract_verses(file)
    except FileNotFoundError:
        print(f"Error: Could not find file '{file}'")
        sys.exit(1)
    except Exception as e:
        print(f"Error processing file: {str(e)}")
        sys.exit(1)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment