Skip to content

Instantly share code, notes, and snippets.

@kylehowells
Created February 10, 2025 09:32
Show Gist options
  • Save kylehowells/e4eabe7d6c59b1d451f0a7ad91f18268 to your computer and use it in GitHub Desktop.
Save kylehowells/e4eabe7d6c59b1d451f0a7ad91f18268 to your computer and use it in GitHub Desktop.
Download and convert to json for further processing
# Get one language:
# https://b.jw-cdn.org/apis/pub-media/GETPUBMEDIALINKS?booknum=0&output=json&pub=nwt&fileformat=jwpub&alllangs=0&langwritten=S
# Get all languages:
# https://b.jw-cdn.org/apis/pub-media/GETPUBMEDIALINKS?booknum=0&output=json&pub=nwt&fileformat=epub&alllangs=1&langwritten=E
import requests
def get_media_links(language_code: str, format: str, document: str) -> dict:
    """
    Query the JW CDN publication-media API for download links.

    Args:
        language_code: Language code, e.g. 'E' (English) or 'S' (Spanish).
        format: File format, e.g. 'epub' or 'jwpub'.
        document: Publication identifier, e.g. 'nwt'.

    Returns:
        dict - The decoded JSON response describing available files.

    Raises:
        requests.HTTPError: If the API responds with an error status.
    """
    # Let requests build/encode the query string instead of interpolating it.
    response = requests.get(
        'https://b.jw-cdn.org/apis/pub-media/GETPUBMEDIALINKS',
        params={
            'booknum': 0,
            'output': 'json',
            'pub': document,
            'fileformat': format,
            'alllangs': 0,
            'langwritten': language_code,
        },
        timeout=30,  # never hang forever on a dead connection
    )
    response.raise_for_status()
    return response.json()
# MARK: - Download
def download(langCode: str | None = None, docID: str | None = None):
    """
    Download a publication file from the JW CDN to the current directory.

    Args:
        langCode: Language code (defaults to 'E', English).
        docID: Publication identifier (defaults to 'nwt').

    Raises:
        requests.HTTPError: If either API call or the file download fails.
        KeyError: If the API response does not contain the expected keys.
    """
    language_code = langCode or 'E'
    # file_format = 'jwpub'  # alternative format
    file_format = 'epub'
    document = docID or 'nwt'

    media_links = get_media_links(language_code, file_format, document)
    print(media_links)

    publication_name = media_links['pubName']
    print("Publication Name: ", publication_name)

    file_url = media_links['files'][language_code][file_format.upper()][0]['file']['url']
    print("File URL: ", file_url)

    file_name = file_url.split('/')[-1]
    print(f"Downloading: {file_name}")

    # Stream to disk in chunks so a ~40 MB epub isn't held fully in memory.
    with requests.get(file_url, stream=True, timeout=60) as response:
        response.raise_for_status()
        with open(file_name, 'wb') as f:
            for chunk in response.iter_content(chunk_size=1 << 16):
                f.write(chunk)
    print(f"Downloaded: {file_name}")
# MARK: - Main
if __name__ == '__main__':
    import sys

    # Optional positional arguments: [language-code] [publication-id]
    args = sys.argv[1:]
    lang_code = args[0] if args else None
    doc_id = args[1] if len(args) > 1 else None
    download(langCode=lang_code, docID=doc_id)
# MARK: - Notes
"""
{
"pubName": "La Biblia. Traducción del Nuevo Mundo (revisión del 2019)",
"parentPubName": "La Biblia. Traducción del Nuevo Mundo (revisión del 2019)",
"booknum": 0,
"pub": "nwt",
"issue": "",
"formattedDate": "",
"fileformat": [
"JWPUB"
],
"track": null,
"specialty": "",
"pubImage": {
"url": "",
"modifiedDatetime": "",
"checksum": null
},
"languages": {
"S": {
"name": "español",
"direction": "ltr",
"locale": "es",
"script": "ROMAN"
}
},
"files": {
"S": {
"JWPUB": [
{
"title": "La Biblia. Traducción del Nuevo Mundo (revisión del 2019)",
"file": {
"url": "https://cfp2.jw-cdn.org/a/03c55e4/2/o/nwt_S.jwpub",
"stream": "https://jw.org",
"modifiedDatetime": "2024-08-15 14:38:30",
"checksum": "31bef50c135d9940e97ebc47fb99cc44"
},
"filesize": 37413926,
"trackImage": {
"url": "",
"modifiedDatetime": "",
"checksum": null
},
"markers": null,
"label": "0p",
"track": 0,
"hasTrack": false,
"pub": "nwt",
"docid": 0,
"booknum": 0,
"mimetype": "application/octet-stream",
"edition": "",
"editionDescr": "Normal",
"format": "",
"formatDescr": "Normal",
"specialty": "",
"specialtyDescr": "",
"subtitled": false,
"frameWidth": 0,
"frameHeight": 0,
"frameRate": 0,
"duration": 0,
"bitRate": 0
}
]
}
}
}
"""
import json
import os
import re
from dataclasses import asdict, dataclass, field, is_dataclass
from pathlib import Path

import ebooklib
from bs4 import BeautifulSoup, Tag
from bs4.element import NavigableString, Tag
from ebooklib import epub
# MARK: - Helper Function
def read_item(item: epub.EpubItem) -> BeautifulSoup:
    """Parse an EPUB item's raw content into a BeautifulSoup tree."""
    print(item.get_name())
    raw = item.get_content()
    return BeautifulSoup(raw, 'html.parser')
# MARK: - Extract Book into HTML Files
def extract_book(book_path: str) -> None:
    """
    Extracts content from an EPUB book and writes each document to disk.

    Every ITEM_DOCUMENT inside the EPUB is written, under its internal
    name, into a directory named after the EPUB file's basename.

    Args:
        book_path: str - Path to the EPUB file to process
    """
    # Convert book path to Path object for proper handling
    path_obj = Path(book_path)
    basename = path_obj.stem

    # Create output directory if it doesn't exist
    output_dir = Path(basename)
    output_dir.mkdir(exist_ok=True)

    book = epub.read_epub(book_path)
    for item in book.get_items():
        if item.get_type() == ebooklib.ITEM_DOCUMENT:
            print('==================================')
            file_name = item.get_name()
            print('NAME : ', file_name)
            print('----------------------------------')

            content = item.get_content()

            # Write content to file in the output directory; internal names
            # may contain subdirectories, so create parents as needed.
            output_path = output_dir / file_name
            output_path.parent.mkdir(parents=True, exist_ok=True)
            with open(output_path, 'wb') as f:
                f.write(content)
    print('==================================')
# MARK: - Extract Verse Info
@dataclass
class BibleVerse:
    """A single Bible verse located by chapter and verse number."""
    # The chapter number
    chapter: int
    # The verse number
    verse: int
    # The text of the verse
    text: str
@dataclass
class BibleParagraph:
    """A full paragraph of text and the verses that compose it.

    Verses don't necessarily span a whole sentence — they can be a
    fragment of one — so they are grouped into full paragraphs here.
    """
    # The verses contained in this paragraph
    verses: list[BibleVerse]
    # The joined text of the paragraph
    text: str
@dataclass
class BibleChapter:
    """One chapter: its number, paragraph groupings, verses, and full text.

    Uses dataclass field defaults instead of a handwritten __init__ (which
    would suppress the generated one), so ``BibleChapter()`` still builds an
    empty chapter and keyword construction now works too.
    """
    # The chapter number (0 until assigned by the caller)
    number: int = 0
    # The grouped paragraphs
    paragraphs: list[BibleParagraph] = field(default_factory=list)
    # The individual verses
    verses: list[BibleVerse] = field(default_factory=list)
    # The whole text of the chapter
    text: str = ""
@dataclass
class BibleBook:
    """A single book of the Bible (e.g. Genesis) and its chapters.

    Field defaults replace the handwritten __init__ so the generated
    dataclass constructor works; ``BibleBook()`` behaves as before.
    """
    # The name of the book of the Bible
    name: str = ""
    # The chapters of the book of the Bible
    chapters: list[BibleChapter] = field(default_factory=list)
@dataclass
class Bible:
    """A whole Bible translation: its name and its books.

    Field defaults replace the handwritten __init__ so the generated
    dataclass constructor works; ``Bible()`` behaves as before.
    """
    # The translation name
    name: str = ""
    # The books of the Bible
    books: list[BibleBook] = field(default_factory=list)
class BibleEncoder(json.JSONEncoder):
    """
    Custom JSON encoder that serializes any dataclass instance
    (Bible, BibleBook, BibleChapter, BibleParagraph, BibleVerse, ...)
    via dataclasses.asdict, which recurses into nested dataclasses.
    """
    def default(self, obj):
        # is_dataclass is also True for dataclass *types*; only convert instances.
        if is_dataclass(obj) and not isinstance(obj, type):
            return asdict(obj)
        return super().default(obj)
# MARK: - Extract Bible
def extract_verses(book_file_name: str) -> Bible:
    """
    Parse an EPUB Bible into a Bible object and dump it as JSON.

    Walks the table of contents to find every book, follows each book's
    chapter-navigation page, extracts the verses of every chapter, and
    finally writes the whole structure to "<basename>.json".

    Args:
        book_file_name: str - Path of the EPUB file to process

    Returns:
        Bible - The parsed Bible object
    """
    book = epub.read_epub(book_file_name)

    bible = Bible()
    bible.name = get_title(book)
    bible.books = []

    toc_items = list_toc(book)
    for book_name, book_link in toc_items:
        print(f"{book_name} -> {book_link}")
        chapter_links = extract_chapter_links(book_link, book)

        bible_book = BibleBook()
        bible_book.name = book_name
        bible_book.chapters = []

        # extract_chapter_links may yield nothing for a missing nav file.
        for chapter_num, chapter_link in (chapter_links or []):
            chapter = extract_chapter_verses(chapter_link, book)
            # extract_chapter_verses returns None for a missing chapter file
            # or unrecognized markup; skip instead of crashing on attribute
            # access of None.
            if chapter is None:
                print(f"WARNING: skipping chapter {chapter_num} ({chapter_link})")
                continue
            print(f"Found {len(chapter.paragraphs)} paragraphs")
            print(f"Found {len(chapter.verses)} verses")
            print(f"Found {sum(len(paragraph.verses) for paragraph in chapter.paragraphs)} p.verses")
            print(f"Found {len(chapter.text)} text")
            chapter.number = int(chapter_num)
            bible_book.chapters.append(chapter)

        bible.books.append(bible_book)

    print(f"Found {len(bible.books)} books")

    # Write bible to a json file named after the EPUB's basename,
    # using the custom encoder to serialize the dataclasses.
    basename = Path(book_file_name).stem
    with open(f"{basename}.json", "w") as f:
        json.dump(bible, f, cls=BibleEncoder, indent=2)

    return bible
def get_title(book: epub.EpubBook) -> str:
    """Return the publication title from the book's Dublin Core metadata."""
    titles = book.get_metadata('DC', 'title')
    first_title, _attributes = titles[0]
    return first_title
def list_toc(book: epub.EpubBook) -> list[tuple[str, str]]:
    """
    Collect table-of-contents entries and keep the Bible-book ones.

    Scans every document whose internal name starts with 'toc', reads the
    links from its <nav epub:type="toc"> list, and returns only entries
    whose href points at a 'biblechapternav...' chapter-list page.

    Args:
        book: epub.EpubBook - The EPUB book to inspect

    Returns:
        list[tuple[str, str]] - (title, link) for each Bible book's TOC entry
    """
    entries: list[tuple[str, str]] = []

    # Gather every (title, href) pair from the TOC navigation document(s).
    for item in book.get_items():
        if item.get_type() != ebooklib.ITEM_DOCUMENT:
            continue
        if not item.get_name().startswith('toc'):
            continue
        soup = read_item(item)
        nav_list = soup.find('nav', attrs={'epub:type': 'toc'}).find('ol')
        for anchor in nav_list.find_all('a'):
            entries.append((anchor.get_text(strip=True), anchor.get('href')))

    # Keep just the links to the Bible book chapter-list TOCs.
    book_tocs = []
    for title, href in entries:
        if href.startswith('biblechapternav'):
            book_tocs.append((title, href))
            print(f"{title} -> {href}")
    return book_tocs
def extract_chapter_links(book_file_name: str, book: epub.EpubBook) -> list[tuple[str, str]]:
    """
    Extracts chapter links from the bible chapter navigation file.

    Args:
        book_file_name: str - Name of the navigation file to process (e.g. biblechapternav1.xhtml)
        book: epub.EpubBook - The epub book object containing the file

    Returns:
        list[tuple[str, str]] - (chapter number, link) for each chapter;
        empty when the navigation file cannot be found.
    """
    print(f"Extracting chapter verses from {book_file_name}")
    item = book.get_item_with_href(book_file_name)
    if item is None:
        # Return an empty list (not None) so callers can always iterate
        # the result, matching the declared return type.
        print(f"Could not find item with href {book_file_name}")
        return []

    soup = read_item(item)

    # Log the book name from the h2 heading, if present.
    book_heading = soup.find('h2', class_='w_navigation w_bibleChapter')
    if book_heading:
        book_name = book_heading.get_text(strip=True)
        print(f"Processing book: {book_name}")

    # Only links inside table cells are chapter links.
    chapters = []
    for link in soup.find_all('a'):
        if link.parent.name == 'td':
            href = link.get('href')
            chapter_num = link.get_text(strip=True)
            chapters.append((chapter_num, href))
            print(f"Chapter {chapter_num}: {href}")
    return chapters
def join_parts(parts: list[str]) -> str:
    """
    Join verse-text fragments, collapsing the doubled space at each seam.

    Each part is normalized with clean_text once (clean_text is a pure
    character-replacement and therefore idempotent, so the original's
    re-cleaning of already-cleaned accumulated parts was redundant).
    If the accumulated text ends with a space and the next part also
    starts with one, the seam is collapsed to a single space.

    Args:
        parts: list[str] - The raw text fragments to join

    Returns:
        str - The cleaned, concatenated text
    """
    result: list[str] = []
    for part in parts:
        part = clean_text(part)
        if result and result[-1].endswith(' ') and part.startswith(' '):
            # Doubled space at the boundary: keep exactly one.
            result[-1] = result[-1].rstrip()
            result.append(' ' + part.lstrip())
        else:
            result.append(part)
    return ''.join(result)
def extract_chapter_verses(chapter_link: str, book: epub.EpubBook) -> BibleChapter | None:
    """
    Extracts verses from a chapter file in the EPUB book.

    Walks every <p class="sb"> paragraph, splitting its children into
    verses at each <span id="chapterN_verseM"> marker, then groups the
    verses back into BibleParagraph / BibleChapter objects.

    Args:
        chapter_link: str - Link to the chapter file to process
        book: epub.EpubBook - The EPUB book object

    Returns:
        BibleChapter | None - The parsed chapter, or None when the file is
        missing or an unrecognized child element is encountered.
    """
    item = book.get_item_with_href(chapter_link)
    if item is None:
        print(f"Could not find item with href {chapter_link}")
        return None
    soup = read_item(item)
    print(f"Processing chapter: {chapter_link}")

    # Get all p tags with class sb — each holds one paragraph of verse text.
    verse_paragraph_elements = soup.find_all('p', class_='sb')
    print(f"Found {len(verse_paragraph_elements)} verse paragraph elements")

    # Example single-verse paragraph:
    # <p id="p2" data-pid="2" class="p2 sb"><span id="pos48"/><span id="chapter1"/><span id="chapter1_verse1"/><span class="w_ch"><strong>1</strong> </span>In the beginning God created the heavens and the earth.</p>&#13;

    # List of all the paragraph's (lists of verses) in the chapter
    verse_paragraphs: list[list[str]] = []

    # Example multi-verse paragraph:
    # <p id="p7" data-pid="7" class="p7 sb">
    # <span id="pos1609"/>
    # <span id="chapter1_verse14"/><strong><sup>14</sup></strong> Then God said: "Let there be luminaries<span id="footnotesource5"/><a epub:type="noteref" href="#footnote5">*</a> in the expanse ...
    # <span id="chapter1_verse15"/><strong><sup>15</sup></strong> They will serve as luminaries ...
    # </p>&#13;
    for verse_paragraph in verse_paragraph_elements:
        verses_in_paragraph: list[tuple[int, int, str]] = []  # (chapter, verse, text) per finished verse
        single_verse_sections: list[str] = []  # Text fragments making up the current verse
        current_chapter: int = 0
        current_verse: int = 0
        for child in verse_paragraph.children:
            # Position anchors carry no text.
            if child.name == 'span' and child.get('id', '').startswith('pos'):
                continue
            # Chapter-only anchor, e.g. <span id="chapter1"/>.
            if child.name == 'span' and child.get('id', '').startswith('chapter') and not '_verse' in child.get('id', ''):
                # NOTE(review): this assigns the span's text to current_verse,
                # not current_chapter — looks like a typo, though the span is
                # normally empty so the value is ''. Confirm before changing.
                current_verse = child.get_text(strip=True)
                continue
            # Verse anchor, e.g. <span id="chapter1_verse14"/>: flush the
            # previous verse and start collecting the next one.
            if child.name == 'span' and child.get('id', '').startswith('chapter') and '_verse' in child.get('id', ''):
                child_id = child.get('id', '')
                # Found the end of the previous verse
                if len(single_verse_sections) > 0:
                    combined_verse_text = join_parts(single_verse_sections)
                    verses_in_paragraph.append((current_chapter, current_verse, combined_verse_text))
                    single_verse_sections = []
                # Slice numbers out of "chapter<N>_verse<M>" (kept as strings
                # here; converted to int when BibleVerse objects are built below).
                current_chapter = child_id.split('_')[0][7:]
                current_verse = child_id.split('_')[1][5:]
                continue
            # Large chapter-number marker (<span class="w_ch">...</span>).
            if child.name == 'span' and 'w_ch' in child.get('class', []):
                continue
            # Bare text node: part of the current verse.
            if child.name is None and child.string is not None:
                text = child.string
                single_verse_sections.append(text)
                continue
            # <strong><sup>N</sup></strong> verse-number marker — skip.
            if child.name == 'strong' and isinstance(child, Tag) and len(child.contents) > 0 and isinstance(child.contents[0], Tag) and child.contents[0].name == 'sup':
                continue
            # <strong><span ...><sup>N</sup></span></strong> variant.
            # NOTE(review): the trailing `and 'altsize'` is a constant truthy
            # string, not a class membership test — probably intended as
            # `'altsize' in child.contents[0].get('class', [])`. Confirm.
            if child.name == 'strong' and len(child.contents) > 0 and child.contents[0].name == 'span' and 'altsize':
                span_child = child.contents[0]
                if span_child.get('class', []) and len(span_child.contents) > 0 and span_child.contents[0].name == 'sup':
                    continue
            # Footnote anchors and noteref links carry no verse text.
            if child.name == 'span' and child.get('id', '').startswith('footnotesource'):
                continue
            if child.name == 'a' and child.get('epub:type', '') == 'noteref':
                continue
            # Page-number markers.
            if child.name == 'span' and "pageNum" in child.get('class', []):
                continue
            # Emphasized text is part of the verse.
            if child.name == 'em' and child.string is not None:
                single_verse_sections.append(child.string)
                continue
            # A bare <strong> holding just the current verse number — skip.
            if child.name == 'strong' and child.string is not None:
                strong_text = child.string.strip()
                if strong_text == current_verse:
                    continue
            # Anything unrecognized aborts the whole chapter.
            print(f"ERROR: Unknown child: {child}")
            return None
        # Add the last verse to the list
        combined_verse_text = join_parts(single_verse_sections)
        verses_in_paragraph.append((current_chapter, current_verse, combined_verse_text))
        # Add the paragraph to the list
        verse_paragraphs.append(verses_in_paragraph)

    print(f"Found {len(verse_paragraphs)} paragraphs, containing {sum([len(paragraph) for paragraph in verse_paragraphs])} verses")

    # Build the BibleChapter: paragraphs first, then the chapter-wide text.
    chapter = BibleChapter()
    for verses in verse_paragraphs:
        text = join_parts([verse[2] for verse in verses])
        verses = [BibleVerse(int(verse[0]), int(verse[1]), verse[2]) for verse in verses]
        paragraph = BibleParagraph(verses, clean_text_extra(text.strip()))
        chapter.paragraphs.append(paragraph)
        chapter.verses.extend(verses)
    chapter.text = clean_text_extra('\n'.join([paragraph.text.strip() for paragraph in chapter.paragraphs]))
    return chapter
# Translation table: normalize typographic characters to plain ASCII in one pass.
_CLEAN_TABLE = str.maketrans({
    '\u00a0': ' ',  # non-breaking space -> regular space
    '\u201c': '"',  # left double quotation mark
    '\u201d': '"',  # right double quotation mark
    '\u2018': "'",  # left single quotation mark
    '\u2019': "'",  # right single quotation mark
    '\u02b9': "'",  # modifier letter prime (used as a transliteration mark)
})


def clean_text(text: str) -> str:
    """
    Normalize typographic Unicode characters to ASCII equivalents.

    Non-breaking spaces become regular spaces; curly quotes and the
    modifier-letter prime become straight quotes. No whitespace is
    trimmed or collapsed (the previous docstring claimed otherwise).

    Args:
        text: str - The text to normalize

    Returns:
        str - The normalized text
    """
    return text.translate(_CLEAN_TABLE)
def clean_text_extra(text: str) -> str:
    """Strip markers that belong only in per-verse text from summary text.

    Removes U+00B7 (middle dot), the in-word pronunciation separator;
    individual verses keep it.
    """
    # text = text.replace(u"\u2014", '-') # remove em dash
    return text.replace('\u00b7', '')
# MARK: - Main
if __name__ == '__main__':
    import sys

    # Optional positional argument: path to the EPUB file.
    default_file = "nwt_E.epub"
    # The removed try/except was dead code: the guarded conditional below can
    # raise neither IndexError nor FileNotFoundError. A missing or unreadable
    # file surfaces inside extract_verses instead.
    file_name = sys.argv[1] if len(sys.argv) > 1 else default_file

    # extract_book(file_name)  # optional: dump the raw XHTML documents
    extract_verses(file_name)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment