kylehowells · February 10, 2025 09:32
diff --git a/download_nwt.py b/download_nwt.py

 # Get one language:
 # https://b.jw-cdn.org/apis/pub-media/GETPUBMEDIALINKS?booknum=0&output=json&pub=nwt&fileformat=jwpub&alllangs=0&langwritten=S

 # Get all languages:
 # https://b.jw-cdn.org/apis/pub-media/GETPUBMEDIALINKS?booknum=0&output=json&pub=nwt&fileformat=epub&alllangs=1&langwritten=E


 import requests


 def get_media_links(language_code: str, format: str, document: str) -> dict:
    """
    Fetch the publication metadata from the GETPUBMEDIALINKS endpoint.

    Args:
        language_code: str - Value sent as the `langwritten` query parameter (e.g. 'E', 'S')
        format: str - Value sent as the `fileformat` query parameter (e.g. 'epub', 'jwpub')
        document: str - Value sent as the `pub` query parameter (e.g. 'nwt')
    Returns:
        dict - The decoded JSON body of the response
    Raises:
        requests.HTTPError - If the server answers with a 4xx/5xx status
    """
    # Let requests build and URL-encode the query string instead of
    # interpolating caller-supplied values straight into the URL.
    params = {
        'booknum': 0,
        'output': 'json',
        'pub': document,
        'fileformat': format,
        'alllangs': 0,
        'langwritten': language_code,
    }
    # A timeout keeps the script from hanging forever on a stalled connection.
    response = requests.get(
        'https://b.jw-cdn.org/apis/pub-media/GETPUBMEDIALINKS',
        params=params,
        timeout=30,
    )
    # Fail loudly on an error status rather than parsing an error page as JSON.
    response.raise_for_status()
    return response.json()

 # MARK: - Download

 def download(langCode: str|None=None, docID: str|None=None):
    """
    Download one publication file into the current working directory.

    The file keeps the name of the last path segment of its download URL.

    Args:
        langCode: str|None - Language code to request; 'E' is used when omitted
        docID: str|None - Publication identifier; 'nwt' is used when omitted
    Raises:
        requests.HTTPError - If the file request returns an error status
        KeyError, IndexError - If the metadata lacks the expected file entry
    """
    language_code = langCode or 'E'
    # The API examples also show 'jwpub'; this script fetches the EPUB edition.
    file_format = 'epub'
    document = docID or 'nwt'
    media_links = get_media_links(language_code, file_format, document)
    print(media_links)
    publication_name = media_links['pubName']
    print("Publication Name: ", publication_name)
    # The response nests files by language code, then by upper-case format name.
    file_url = media_links['files'][language_code][file_format.upper()][0]['file']['url']
    print("File URL: ", file_url)
    file_name = file_url.split('/')[-1]
    print(f"Downloading: {file_name}")
    # Stream the body to disk in chunks so the whole file is never held in
    # memory, and check the status first so an error page is not saved as the file.
    with requests.get(file_url, stream=True, timeout=60) as response:
        response.raise_for_status()
        with open(file_name, 'wb') as f:
            for chunk in response.iter_content(chunk_size=1024 * 1024):
                f.write(chunk)
    print(f"Downloaded: {file_name}")


 # MARK: - Main

 if __name__ == '__main__':
    import sys

    # Optional positional arguments: <language code> <publication id>.
    # A missing argument stays None so download() applies its own default.
    langCode = sys.argv[1] if len(sys.argv) > 1 else None
    docID = sys.argv[2] if len(sys.argv) > 2 else None

    download(langCode=langCode, docID=docID)


 # MARK: - Notes

 """
 {
  "pubName": "La Biblia. Traducción del Nuevo Mundo (revisión del 2019)",
  "parentPubName": "La Biblia. Traducción del Nuevo Mundo (revisión del 2019)",
  "booknum": 0,
  "pub": "nwt",
  "issue": "",
  "formattedDate": "",
  "fileformat": [
    "JWPUB"
  ],
  "track": null,
  "specialty": "",
  "pubImage": {
    "url": "",
    "modifiedDatetime": "",
    "checksum": null
  },
  "languages": {
    "S": {
      "name": "español",
      "direction": "ltr",
      "locale": "es",
      "script": "ROMAN"
    }
  },
  "files": {
    "S": {
      "JWPUB": [
        {
          "title": "La Biblia. Traducción del Nuevo Mundo (revisión del 2019)",
          "file": {
            "url": "https://cfp2.jw-cdn.org/a/03c55e4/2/o/nwt_S.jwpub",
            "stream": "https://jw.org",
            "modifiedDatetime": "2024-08-15 14:38:30",
            "checksum": "31bef50c135d9940e97ebc47fb99cc44"
          },
          "filesize": 37413926,
          "trackImage": {
            "url": "",
            "modifiedDatetime": "",
            "checksum": null
          },
          "markers": null,
          "label": "0p",
          "track": 0,
          "hasTrack": false,
          "pub": "nwt",
          "docid": 0,
          "booknum": 0,
          "mimetype": "application/octet-stream",
          "edition": "",
          "editionDescr": "Normal",
          "format": "",
          "formatDescr": "Normal",
          "specialty": "",
          "specialtyDescr": "",
          "subtitled": false,
          "frameWidth": 0,
          "frameHeight": 0,
          "frameRate": 0,
          "duration": 0,
          "bitRate": 0
        }
      ]
    }
  }
 }
 """
diff --git a/epub_to_json.py b/epub_to_json.py
 import json
 import os
 import re
 from dataclasses import asdict, dataclass, field, is_dataclass
 from pathlib import Path

 import ebooklib
 from bs4 import BeautifulSoup, Tag
 from bs4.element import Tag, NavigableString
 from ebooklib import epub


 # MARK: - Helper Function

 def read_item(item: epub.EpubItem) -> BeautifulSoup:
    """
    Parse an EPUB item's content into a BeautifulSoup tree.

    The item's name is printed first as a progress trace.

    Args:
        item: epub.EpubItem - The EPUB item to read
    Returns:
        BeautifulSoup - The item's content parsed with the 'html.parser' backend
    """
    print(item.get_name())
    return BeautifulSoup(item.get_content(), 'html.parser')


 # MARK: - Extract Book into HTML Files

 def extract_book(book_path: str) -> None:
    """
    Dump every document item of an EPUB into a directory named after the file.

    The directory is created in the current working directory and is named
    with the EPUB's file stem; each item keeps its internal path beneath it.

    Args:
        book_path: str - Path to the EPUB file to unpack
    """
    banner = '=================================='

    # Output directory: "<stem of the EPUB file>/", created if missing
    target_root = Path(Path(book_path).stem)
    target_root.mkdir(exist_ok=True)

    book = epub.read_epub(book_path)

    for entry in book.get_items():
        # Only document items are written out
        if entry.get_type() != ebooklib.ITEM_DOCUMENT:
            continue

        print(banner)
        entry_name = entry.get_name()
        print('NAME : ', entry_name)
        print('----------------------------------')
        payload = entry.get_content()

        # Item names may contain sub-directories, so create them on demand
        destination = target_root / entry_name
        destination.parent.mkdir(parents=True, exist_ok=True)
        destination.write_bytes(payload)

        print(banner)


 # MARK: - Extract Verse Info

 @dataclass
 class BibleVerse:
    """A single verse, addressed by its chapter and verse number."""
    chapter: int  # chapter number the verse belongs to
    verse: int  # verse number
    text: str  # text of the verse

 @dataclass
 class BibleParagraph:
    """
    A run of verses that is read as one paragraph.

    A verse is not necessarily a whole sentence, so verses are grouped
    into paragraphs to keep complete sentences together.
    """
    verses: list[BibleVerse]  # the verses that make up the paragraph
    text: str  # text of the paragraph

 @dataclass
 class BibleChapter:
    """
    One chapter, stored both as grouped paragraphs and as a flat verse list.

    Every field has a default, so `BibleChapter()` yields an empty chapter
    that can be filled in afterwards; fields may also be passed to the
    generated constructor.
    """
    # The chapter number
    number: int = 0
    # The grouped paragraphs (default_factory gives each instance its own list)
    paragraphs: list[BibleParagraph] = field(default_factory=list)
    # The individual verses
    verses: list[BibleVerse] = field(default_factory=list)
    # The whole text of the chapter
    text: str = ""

 @dataclass
 class BibleBook:
    """
    One book of the Bible and its chapters.

    Both fields have defaults, so `BibleBook()` yields an empty book; fields
    may also be passed to the generated constructor.
    """
    # The name of the book of the Bible
    name: str = ""
    # The chapters of the book (default_factory gives each instance its own list)
    chapters: list[BibleChapter] = field(default_factory=list)

 @dataclass
 class Bible:
    """
    A whole translation: its name plus the list of books.

    Both fields have defaults, so `Bible()` yields an empty container; fields
    may also be passed to the generated constructor.
    """
    # The translation name
    name: str = ""
    # The books of the Bible (default_factory gives each instance its own list)
    books: list[BibleBook] = field(default_factory=list)

 class BibleEncoder(json.JSONEncoder):
    """
    JSON encoder that serialises dataclass instances as plain dictionaries.

    Any dataclass instance is converted with `dataclasses.asdict`, which
    recurses into nested dataclasses, lists and dicts, so every model class
    in this file can be dumped directly, not only the top-level containers.
    """
    def default(self, obj):
        # is_dataclass() is also true for dataclass *types*; asdict() only
        # accepts instances, so classes are left to the base implementation.
        if is_dataclass(obj) and not isinstance(obj, type):
            return asdict(obj)
        # Raises TypeError for anything that is still not serialisable.
        return super().default(obj)


 # MARK: - Extract Bible

 def extract_verses(book_file_name: str) -> Bible:
    """
    Parse an EPUB Bible into a `Bible` object and save it as JSON.

    The JSON file is written to the current working directory and is named
    after the EPUB's file stem (e.g. nwt_E.epub -> nwt_E.json).

    Args:
        book_file_name: str - Path of the EPUB file to process
    Returns:
        Bible - The parsed Bible object
    """
    book = epub.read_epub(book_file_name)

    bible = Bible()
    bible.name = get_title(book)
    bible.books = []

    for book_name, book_link in list_toc(book):
        print(f"{book_name} -> {book_link}")
        # extract_chapter_links() can return None when the navigation file is
        # missing; treat that as "no chapters" instead of failing to iterate.
        chapter_links = extract_chapter_links(book_link, book) or []
        bible_book = BibleBook()
        bible_book.name = book_name
        bible_book.chapters = []

        for chapter_num, chapter_link in chapter_links:
            chapter = extract_chapter_verses(chapter_link, book)
            if chapter is None:
                # extract_chapter_verses() has already printed why it gave up;
                # skip the chapter rather than crash on the attribute access below.
                print(f"Skipping chapter {chapter_num} of {book_name}: could not be parsed")
                continue
            print(f"Found {len(chapter.paragraphs)} paragraphs")
            print(f"Found {len(chapter.verses)} verses")
            print(f"Found {sum([len(paragraph.verses) for paragraph in chapter.paragraphs])} p.verses")
            print(f"Found {len(chapter.text)} text")
            chapter.number = int(chapter_num)
            bible_book.chapters.append(chapter)

        bible.books.append(bible_book)

    print(f"Found {len(bible.books)} books")

    # Write bible to json file using the custom encoder
    basename = Path(book_file_name).stem
    with open(f"{basename}.json", "w") as f:
        json.dump(bible, f, cls=BibleEncoder, indent=2)

    return bible


 def get_title(book: epub.EpubBook) -> str:
    """
    Return the book's title from its metadata.

    Args:
        book: epub.EpubBook - The EPUB book object
    Returns:
        str - The value of the first 'title' entry in the 'DC' namespace
    """
    title_entries = book.get_metadata('DC', 'title')
    first_entry = title_entries[0]
    # Only the first element of the entry is returned.
    return first_entry[0]


 def list_toc(book: epub.EpubBook) -> list[tuple[str, str]]:
    """
    List the table-of-contents entries that point at per-book chapter lists.

    Every document item whose name starts with 'toc' is scanned for a
    <nav epub:type="toc"> list; only links whose href starts with
    'biblechapternav' are returned.

    Args:
        book: epub.EpubBook - The EPUB book object to inspect
    Returns:
        list[tuple[str, str]] - List of tuples containing the (title, link) of each matching TOC entry
    """
    toc_items = []

    # Get the table of contents
    for item in book.get_items():
        if item.get_type() != ebooklib.ITEM_DOCUMENT:
            continue
        if not item.get_name().startswith('toc'):
            continue

        soup = read_item(item)
        # Find the TOC navigation list. Either lookup returns None when the
        # markup is absent, so skip such a document instead of crashing.
        nav = soup.find('nav', attrs={'epub:type': 'toc'})
        nav_list = nav.find('ol') if nav is not None else None
        if nav_list is None:
            continue

        # Extract all links and their text
        for link in nav_list.find_all('a'):
            href = link.get('href')
            text = link.get_text(strip=True)
            toc_items.append((text, href))

    # Get just the links to Bible Book Chapter Lists TOCs
    book_tocs = []
    for name, link in toc_items:
        # An <a> without an href yields None, which cannot be a chapter-list link.
        if link and link.startswith('biblechapternav'):
            book_tocs.append((name, link))
            print(f"{name} -> {link}")

    return book_tocs


 def extract_chapter_links(book_file_name: str, book: epub.EpubBook) -> list[tuple[str, str]]:
    """
    Extracts chapter links from the bible chapter navigation file.

    Args:
        book_file_name: str - Name of the navigation file to process (e.g. biblechapternav1.xhtml)
        book: epub.EpubBook - The epub book object containing the file
    Returns:
        list[tuple[str, str]] - List of tuples containing the (chapter number, link) of each chapter;
        empty when the navigation file is not part of the book
    """
    print(f"Extracting chapter verses from {book_file_name}")
    item = book.get_item_with_href(book_file_name)
    if item is None:
        print(f"Could not find item with href {book_file_name}")
        # Return an empty list (not None) so callers can always iterate the result.
        return []

    soup = read_item(item)

    # The h2 heading is only used for a progress message
    book_heading = soup.find('h2', class_='w_navigation w_bibleChapter')
    if book_heading:
        book_name = book_heading.get_text(strip=True)
        print(f"Processing book: {book_name}")

    chapters = []
    for link in soup.find_all('a'):
        # Only links that sit directly inside a table cell are chapter links
        if link.parent.name != 'td':
            continue
        href = link.get('href')
        chapter_num = link.get_text(strip=True)
        chapters.append((chapter_num, href))
        print(f"Chapter {chapter_num}: {href}")

    return chapters

 def join_parts(parts: list[str]) -> str:
    """
    Concatenate text fragments, collapsing the space at each seam.

    Each fragment is normalised with clean_text() once. When the text joined
    so far ends with a space and the next fragment begins with one, the
    whitespace on both sides of the seam is stripped and replaced by a single
    space; otherwise the fragment is appended unchanged.

    Args:
        parts: list[str] - The text fragments, in reading order
    Returns:
        str - The joined text ('' for an empty list)
    """
    result: list[str] = []
    for part in parts:
        part = clean_text(part)
        # Entries already in `result` were cleaned when they were added, so
        # they are not passed through clean_text() again.
        if result and result[-1].endswith(' ') and part.startswith(' '):
            result[-1] = result[-1].rstrip()
            part = ' ' + part.lstrip()
        result.append(part)
    return ''.join(result)

 def extract_chapter_verses(chapter_link: str, book: epub.EpubBook) -> BibleChapter | None:
    """
    Parses one chapter file of the EPUB into a BibleChapter.

    Every <p class="sb"> element is treated as one paragraph. Its direct
    children are walked in order: text nodes and <em> text are collected,
    known marker elements (verse numbers, footnote references, page numbers,
    'pos' spans) are skipped, and each <span id="chapterN_verseM"> starts a
    new verse.

    Args:
        chapter_link: str - Link to the chapter file to process
        book: epub.EpubBook - The EPUB book object

    Returns:
        BibleChapter | None - The chapter with `paragraphs`, `verses` and `text`
        filled in (`number` is not set here), or None when the item cannot be
        found or a paragraph contains a child this parser does not recognise
    """
    item = book.get_item_with_href(chapter_link)
    if item is None:
        print(f"Could not find item with href {chapter_link}")
        return None

    soup = read_item(item)
    print(f"Processing chapter: {chapter_link}")

    # Get all p tags with class sb
    verse_paragraph_elements = soup.find_all('p', class_='sb')
    print(f"Found {len(verse_paragraph_elements)} verse paragraph elements")
    # Sample of a first-verse paragraph:
    # <p id="p2" data-pid="2" class="p2 sb"><span id="pos48"/><span id="chapter1"/><span id="chapter1_verse1"/><span class="w_ch"><strong>1</strong> </span>In the beginning God created the heavens and the earth.</p>

    # One entry per paragraph; each entry is that paragraph's list of
    # (chapter, verse, text) tuples.
    # NOTE(review): the annotation says list[list[str]] but tuples are stored.
    verse_paragraphs: list[list[str]] = []

    # Sample of a multi-verse paragraph:
    # <p id="p7" data-pid="7" class="p7 sb">
    # <span id="pos1609"/>
    # <span id="chapter1_verse14"/><strong><sup>14</sup></strong> Then God said: “Let there be luminaries<span id="footnotesource5"/><a epub:type="noteref" href="#footnote5">*</a> in the expanse of the heavens ...
    # <span id="chapter1_verse15"/><strong><sup>15</sup></strong> They will serve as luminaries in the expanse of the heavens to shine upon the earth.” And it was so.
    # <span id="chapter1_verse17"/><strong><sup>17</sup></strong> Thus God put them in the expanse of the heavens to shine upon the earth <span id="chapter1_verse18"/><strong><sup>18</sup></strong> and to dominate by day and by night ...
    # </p>

    for verse_paragraph in verse_paragraph_elements:
        verses_in_paragraph: list[tuple[int, int, str]] = [] # List of verses in the paragraph (chapter, verse, text)
        single_verse_sections: list[str] = [] # List of sections of text which make up a single verse
        # Both counters restart at 0 for every paragraph, so text that comes
        # before the first verse marker of a paragraph is recorded as (0, 0).
        current_chapter: int = 0
        current_verse: int = 0

        for child in verse_paragraph.children:
            # Spans whose id starts with 'pos' carry no verse text
            if child.name == 'span' and child.get('id', '').startswith('pos'):
                continue
            # Span with an id like 'chapter1' (no '_verse' part).
            # NOTE(review): this stores the span's text in current_verse, which
            # is '' for an empty marker span; possibly current_chapter was
            # meant to be set from the id instead - confirm.
            if child.name == 'span' and child.get('id', '').startswith('chapter') and not '_verse' in child.get('id', ''):
                current_verse = child.get_text(strip=True)
                continue
            # Span with an id like 'chapter1_verse14': a new verse starts here
            if child.name == 'span' and child.get('id', '').startswith('chapter') and '_verse' in child.get('id', ''):
                child_id = child.get('id', '')
                # Flush the text collected so far as the previous verse
                if len(single_verse_sections) > 0:
                    combined_verse_text = join_parts(single_verse_sections)
                    verses_in_paragraph.append((current_chapter, current_verse, combined_verse_text))
                    single_verse_sections = []
                # Drop the 'chapter' (7 chars) and 'verse' (5 chars) prefixes;
                # the numbers stay strings until int() is applied further down.
                current_chapter = child_id.split('_')[0][7:]
                current_verse = child_id.split('_')[1][5:]
                continue
            # Span with class 'w_ch' (the chapter-number marker in the first sample above)
            if child.name == 'span' and 'w_ch' in child.get('class', []):
                continue
            # Plain text node: part of the current verse
            if child.name is None and child.string is not None:
                text = child.string
                single_verse_sections.append(text)
                continue
            # <strong><sup>N</sup></strong> verse number
            if child.name == 'strong' and isinstance(child, Tag) and len(child.contents) > 0 and isinstance(child.contents[0], Tag) and child.contents[0].name == 'sup':
                continue
            # <strong><span class="..."><sup>N</sup></span></strong> verse number.
            # NOTE(review): the trailing `and 'altsize'` is a non-empty string
            # literal, so it is always truthy and tests nothing; presumably a
            # check for an 'altsize' class was intended - confirm.
            if child.name == 'strong' and len(child.contents) > 0 and child.contents[0].name == 'span' and 'altsize':
                span_child = child.contents[0]
                if span_child.get('class', []) and len(span_child.contents) > 0 and span_child.contents[0].name == 'sup':
                    continue
            # Footnote anchor and footnote reference link
            if child.name == 'span' and child.get('id', '').startswith('footnotesource'):
                continue
            if child.name == 'a' and child.get('epub:type', '') == 'noteref':
                continue
            # Span with class 'pageNum'
            if child.name == 'span' and "pageNum" in child.get('class', []):
                continue
            # Emphasised text is kept as part of the verse (the markup is dropped)
            if child.name == 'em' and child.string is not None:
                single_verse_sections.append(child.string)
                continue
            # A <strong> whose text equals the current verse number is skipped;
            # any other <strong> falls through to the error below.
            if child.name == 'strong' and child.string is not None:
                strong_text = child.string.strip()
                if strong_text == current_verse:
                    continue
            # Anything not matched above aborts the whole chapter
            print(f"ERROR: Unknown child: {child}")
            return None

        # Add the last verse to the list (appended even when no text was collected)
        combined_verse_text = join_parts(single_verse_sections)
        verses_in_paragraph.append((current_chapter, current_verse, combined_verse_text))
        # Add the paragraph to the list
        verse_paragraphs.append(verses_in_paragraph)

    print(f"Found {len(verse_paragraphs)} paragraphs, containing {sum([len(paragraph) for paragraph in verse_paragraphs])} verses")

    chapter = BibleChapter()

    for verses in verse_paragraphs:
        # Paragraph text is the verse texts joined in order
        text = join_parts([verse[2] for verse in verses])
        # int() converts the string numbers taken from the span ids
        verses = [BibleVerse(int(verse[0]), int(verse[1]), verse[2]) for verse in verses]
        paragraph = BibleParagraph(verses, clean_text_extra(text.strip()))
        chapter.paragraphs.append(paragraph)
        chapter.verses.extend(verses)

    # Chapter text: one paragraph per line
    chapter.text = clean_text_extra('\n'.join([paragraph.text.strip() for paragraph in chapter.paragraphs]))
    return chapter

 # Single-character substitutions applied by clean_text().
 _CLEAN_TEXT_TABLE = str.maketrans({
    '\u00a0': ' ',  # NO-BREAK SPACE -> plain space
    '\u201d': '"',  # RIGHT DOUBLE QUOTATION MARK -> straight double quote
    '\u201c': '"',  # LEFT DOUBLE QUOTATION MARK -> straight double quote
    '\u2019': "'",  # RIGHT SINGLE QUOTATION MARK -> straight single quote
    '\u2018': "'",  # LEFT SINGLE QUOTATION MARK -> straight single quote
    '\u02b9': "'",  # MODIFIER LETTER PRIME -> straight single quote
 })


 def clean_text(text: str) -> str:
    """
    Normalises typographic characters to their plain ASCII equivalents.

    Non-breaking spaces become ordinary spaces and curly quotes (plus the
    modifier-letter prime) become straight quotes. Nothing is trimmed or
    collapsed: all other characters, including whitespace, are kept as is.
    """
    return text.translate(_CLEAN_TEXT_TABLE)

 def clean_text_extra(text: str) -> str:
    """
    Strip the middle-dot characters (U+00B7) out of summary text.

    This is applied to paragraph and chapter text only; the individual
    verse texts keep the dots.
    """
    middle_dot = '\u00b7'
    # Splitting on the dot and re-joining drops every occurrence of it.
    return ''.join(text.split(middle_dot))

 # MARK: - Main

 if __name__ == '__main__':
    import sys

    default_file = "nwt_E.epub"

    # Optional positional argument: path of the EPUB file to convert
    if len(sys.argv) > 1:
        file_name = sys.argv[1]
    else:
        file_name = default_file
        print(f"Using default file: {default_file}")

    # extract_book(file_name)
    # The conversion itself is what can fail, so it is the call that is guarded.
    # Presumably a missing path surfaces here as FileNotFoundError - confirm
    # against epub.read_epub. Other errors propagate with their traceback.
    try:
        extract_verses(file_name)
    except FileNotFoundError:
        print(f"Error: Could not find file '{file_name}'")
        sys.exit(1)

	# Get one language:
	# https://b.jw-cdn.org/apis/pub-media/GETPUBMEDIALINKS?booknum=0&output=json&pub=nwt&fileformat=jwpub&alllangs=0&langwritten=S

	# Get all languages:
	# https://b.jw-cdn.org/apis/pub-media/GETPUBMEDIALINKS?booknum=0&output=json&pub=nwt&fileformat=epub&alllangs=1&langwritten=E


	import requests


	def get_media_links(language_code: str, format: str, document: str) -> dict:
	    """
	    Fetch the publication metadata from the GETPUBMEDIALINKS endpoint.

	    Args:
	        language_code: str - Value sent as the `langwritten` query parameter (e.g. 'E', 'S')
	        format: str - Value sent as the `fileformat` query parameter (e.g. 'epub', 'jwpub')
	        document: str - Value sent as the `pub` query parameter (e.g. 'nwt')
	    Returns:
	        dict - The decoded JSON body of the response
	    Raises:
	        requests.HTTPError - If the server answers with a 4xx/5xx status
	    """
	    # Let requests build and URL-encode the query string instead of
	    # interpolating caller-supplied values straight into the URL.
	    params = {
	        'booknum': 0,
	        'output': 'json',
	        'pub': document,
	        'fileformat': format,
	        'alllangs': 0,
	        'langwritten': language_code,
	    }
	    # A timeout keeps the script from hanging forever on a stalled connection.
	    response = requests.get(
	        'https://b.jw-cdn.org/apis/pub-media/GETPUBMEDIALINKS',
	        params=params,
	        timeout=30,
	    )
	    # Fail loudly on an error status rather than parsing an error page as JSON.
	    response.raise_for_status()
	    return response.json()

	# MARK: - Download

	def download(langCode: str|None=None, docID: str|None=None):
	    """
	    Download one publication file into the current working directory.

	    The file keeps the name of the last path segment of its download URL.

	    Args:
	        langCode: str|None - Language code to request; 'E' is used when omitted
	        docID: str|None - Publication identifier; 'nwt' is used when omitted
	    Raises:
	        requests.HTTPError - If the file request returns an error status
	        KeyError, IndexError - If the metadata lacks the expected file entry
	    """
	    language_code = langCode or 'E'
	    # The API examples also show 'jwpub'; this script fetches the EPUB edition.
	    file_format = 'epub'
	    document = docID or 'nwt'
	    media_links = get_media_links(language_code, file_format, document)
	    print(media_links)
	    publication_name = media_links['pubName']
	    print("Publication Name: ", publication_name)
	    # The response nests files by language code, then by upper-case format name.
	    file_url = media_links['files'][language_code][file_format.upper()][0]['file']['url']
	    print("File URL: ", file_url)
	    file_name = file_url.split('/')[-1]
	    print(f"Downloading: {file_name}")
	    # Stream the body to disk in chunks so the whole file is never held in
	    # memory, and check the status first so an error page is not saved as the file.
	    with requests.get(file_url, stream=True, timeout=60) as response:
	        response.raise_for_status()
	        with open(file_name, 'wb') as f:
	            for chunk in response.iter_content(chunk_size=1024 * 1024):
	                f.write(chunk)
	    print(f"Downloaded: {file_name}")


	# MARK: - Main

	if __name__ == '__main__':
	    import sys

	    # Optional positional arguments: <language code> <publication id>.
	    # A missing argument stays None so download() applies its own default.
	    langCode = sys.argv[1] if len(sys.argv) > 1 else None
	    docID = sys.argv[2] if len(sys.argv) > 2 else None

	    download(langCode=langCode, docID=docID)


	# MARK: - Notes

	"""
	{
	"pubName": "La Biblia. Traducción del Nuevo Mundo (revisión del 2019)",
	"parentPubName": "La Biblia. Traducción del Nuevo Mundo (revisión del 2019)",
	"booknum": 0,
	"pub": "nwt",
	"issue": "",
	"formattedDate": "",
	"fileformat": [
	"JWPUB"
	],
	"track": null,
	"specialty": "",
	"pubImage": {
	"url": "",
	"modifiedDatetime": "",
	"checksum": null
	},
	"languages": {
	"S": {
	"name": "español",
	"direction": "ltr",
	"locale": "es",
	"script": "ROMAN"
	}
	},
	"files": {
	"S": {
	"JWPUB": [
	{
	"title": "La Biblia. Traducción del Nuevo Mundo (revisión del 2019)",
	"file": {
	"url": "https://cfp2.jw-cdn.org/a/03c55e4/2/o/nwt_S.jwpub",
	"stream": "https://jw.org",
	"modifiedDatetime": "2024-08-15 14:38:30",
	"checksum": "31bef50c135d9940e97ebc47fb99cc44"
	},
	"filesize": 37413926,
	"trackImage": {
	"url": "",
	"modifiedDatetime": "",
	"checksum": null
	},
	"markers": null,
	"label": "0p",
	"track": 0,
	"hasTrack": false,
	"pub": "nwt",
	"docid": 0,
	"booknum": 0,
	"mimetype": "application/octet-stream",
	"edition": "",
	"editionDescr": "Normal",
	"format": "",
	"formatDescr": "Normal",
	"specialty": "",
	"specialtyDescr": "",
	"subtitled": false,
	"frameWidth": 0,
	"frameHeight": 0,
	"frameRate": 0,
	"duration": 0,
	"bitRate": 0
	}
	]
	}
	}
	}
	"""
	import json
	import ebooklib
	from ebooklib import epub
	from pathlib import Path
	import os
	from bs4 import BeautifulSoup, Tag
	from bs4.element import Tag, NavigableString
	import re
	from dataclasses import dataclass, asdict


	# MARK: - Helper Function

	def read_item(item: epub.EpubItem) -> BeautifulSoup:
	    """
	    Parse an EPUB item's content into a BeautifulSoup tree.

	    The item's name is printed first as a progress trace.

	    Args:
	        item: epub.EpubItem - The EPUB item to read
	    Returns:
	        BeautifulSoup - The item's content parsed with the 'html.parser' backend
	    """
	    print(item.get_name())
	    return BeautifulSoup(item.get_content(), 'html.parser')


	# MARK: - Extract Book into HTML Files

	def extract_book(book_path: str) -> None:
	    """
	    Dump every document item of an EPUB into a directory named after the file.

	    The directory is created in the current working directory and is named
	    with the EPUB's file stem; each item keeps its internal path beneath it.

	    Args:
	        book_path: str - Path to the EPUB file to unpack
	    """
	    banner = '=================================='

	    # Output directory: "<stem of the EPUB file>/", created if missing
	    target_root = Path(Path(book_path).stem)
	    target_root.mkdir(exist_ok=True)

	    book = epub.read_epub(book_path)

	    for entry in book.get_items():
	        # Only document items are written out
	        if entry.get_type() != ebooklib.ITEM_DOCUMENT:
	            continue

	        print(banner)
	        entry_name = entry.get_name()
	        print('NAME : ', entry_name)
	        print('----------------------------------')
	        payload = entry.get_content()

	        # Item names may contain sub-directories, so create them on demand
	        destination = target_root / entry_name
	        destination.parent.mkdir(parents=True, exist_ok=True)
	        destination.write_bytes(payload)

	        print(banner)


	# MARK: - Extract Verse Info

	@dataclass
	class BibleVerse:
	    """A single verse, addressed by its chapter and verse number."""
	    chapter: int  # chapter number the verse belongs to
	    verse: int  # verse number
	    text: str  # text of the verse

	@dataclass
	class BibleParagraph:
	    """
	    A run of verses that is read as one paragraph.

	    A verse is not necessarily a whole sentence, so verses are grouped
	    into paragraphs to keep complete sentences together.
	    """
	    verses: list[BibleVerse]  # the verses that make up the paragraph
	    text: str  # text of the paragraph

	@dataclass
	class BibleChapter:
	    """
	    One chapter, stored both as grouped paragraphs and as a flat verse list.

	    Every field has a default, so `BibleChapter()` yields an empty chapter
	    that can be filled in afterwards; fields may also be passed to the
	    generated constructor.
	    """
	    # The chapter number
	    number: int = 0
	    # The grouped paragraphs (default_factory gives each instance its own list)
	    paragraphs: list[BibleParagraph] = field(default_factory=list)
	    # The individual verses
	    verses: list[BibleVerse] = field(default_factory=list)
	    # The whole text of the chapter
	    text: str = ""

	@dataclass
	class BibleBook:
	    """
	    One book of the Bible and its chapters.

	    Both fields have defaults, so `BibleBook()` yields an empty book; fields
	    may also be passed to the generated constructor.
	    """
	    # The name of the book of the Bible
	    name: str = ""
	    # The chapters of the book (default_factory gives each instance its own list)
	    chapters: list[BibleChapter] = field(default_factory=list)

	@dataclass
	class Bible:
	    """
	    A whole translation: its name plus the list of books.

	    Both fields have defaults, so `Bible()` yields an empty container; fields
	    may also be passed to the generated constructor.
	    """
	    # The translation name
	    name: str = ""
	    # The books of the Bible (default_factory gives each instance its own list)
	    books: list[BibleBook] = field(default_factory=list)

	class BibleEncoder(json.JSONEncoder):
	    """
	    JSON encoder that serialises dataclass instances as plain dictionaries.

	    Any dataclass instance is converted with `dataclasses.asdict`, which
	    recurses into nested dataclasses, lists and dicts, so every model class
	    in this file can be dumped directly, not only the top-level containers.
	    """
	    def default(self, obj):
	        # is_dataclass() is also true for dataclass *types*; asdict() only
	        # accepts instances, so classes are left to the base implementation.
	        if is_dataclass(obj) and not isinstance(obj, type):
	            return asdict(obj)
	        # Raises TypeError for anything that is still not serialisable.
	        return super().default(obj)


	# MARK: - Extract Bible

	def extract_verses(book_file_name: str) -> Bible:
	    """
	    Parse an EPUB Bible into a `Bible` object and save it as JSON.

	    The JSON file is written to the current working directory and is named
	    after the EPUB's file stem (e.g. nwt_E.epub -> nwt_E.json).

	    Args:
	        book_file_name: str - Path of the EPUB file to process
	    Returns:
	        Bible - The parsed Bible object
	    """
	    book = epub.read_epub(book_file_name)

	    bible = Bible()
	    bible.name = get_title(book)
	    bible.books = []

	    for book_name, book_link in list_toc(book):
	        print(f"{book_name} -> {book_link}")
	        # extract_chapter_links() can return None when the navigation file is
	        # missing; treat that as "no chapters" instead of failing to iterate.
	        chapter_links = extract_chapter_links(book_link, book) or []
	        bible_book = BibleBook()
	        bible_book.name = book_name
	        bible_book.chapters = []

	        for chapter_num, chapter_link in chapter_links:
	            chapter = extract_chapter_verses(chapter_link, book)
	            if chapter is None:
	                # extract_chapter_verses() has already printed why it gave up;
	                # skip the chapter rather than crash on the attribute access below.
	                print(f"Skipping chapter {chapter_num} of {book_name}: could not be parsed")
	                continue
	            print(f"Found {len(chapter.paragraphs)} paragraphs")
	            print(f"Found {len(chapter.verses)} verses")
	            print(f"Found {sum([len(paragraph.verses) for paragraph in chapter.paragraphs])} p.verses")
	            print(f"Found {len(chapter.text)} text")
	            chapter.number = int(chapter_num)
	            bible_book.chapters.append(chapter)

	        bible.books.append(bible_book)

	    print(f"Found {len(bible.books)} books")

	    # Write bible to json file using the custom encoder
	    basename = Path(book_file_name).stem
	    with open(f"{basename}.json", "w") as f:
	        json.dump(bible, f, cls=BibleEncoder, indent=2)

	    return bible


	def get_title(book: epub.EpubBook) -> str:
	    """
	    Return the book's title from its metadata.

	    Args:
	        book: epub.EpubBook - The EPUB book object
	    Returns:
	        str - The value of the first 'title' entry in the 'DC' namespace
	    """
	    title_entries = book.get_metadata('DC', 'title')
	    first_entry = title_entries[0]
	    # Only the first element of the entry is returned.
	    return first_entry[0]


	def list_toc(book: epub.EpubBook) -> list[tuple[str, str]]:
	    """
	    List the table-of-contents entries that point at per-book chapter lists.

	    Every document item whose name starts with 'toc' is scanned for a
	    <nav epub:type="toc"> list; only links whose href starts with
	    'biblechapternav' are returned.

	    Args:
	        book: epub.EpubBook - The EPUB book object to inspect
	    Returns:
	        list[tuple[str, str]] - List of tuples containing the (title, link) of each matching TOC entry
	    """
	    toc_items = []

	    # Get the table of contents
	    for item in book.get_items():
	        if item.get_type() != ebooklib.ITEM_DOCUMENT:
	            continue
	        if not item.get_name().startswith('toc'):
	            continue

	        soup = read_item(item)
	        # Find the TOC navigation list. Either lookup returns None when the
	        # markup is absent, so skip such a document instead of crashing.
	        nav = soup.find('nav', attrs={'epub:type': 'toc'})
	        nav_list = nav.find('ol') if nav is not None else None
	        if nav_list is None:
	            continue

	        # Extract all links and their text
	        for link in nav_list.find_all('a'):
	            href = link.get('href')
	            text = link.get_text(strip=True)
	            toc_items.append((text, href))

	    # Get just the links to Bible Book Chapter Lists TOCs
	    book_tocs = []
	    for name, link in toc_items:
	        # An <a> without an href yields None, which cannot be a chapter-list link.
	        if link and link.startswith('biblechapternav'):
	            book_tocs.append((name, link))
	            print(f"{name} -> {link}")

	    return book_tocs


	def extract_chapter_links(book_file_name: str, book: epub.EpubBook) -> list[tuple[str, str]]:
	    """
	    Extracts chapter links from the bible chapter navigation file.

	    Args:
	        book_file_name: str - Name of the navigation file to process (e.g. biblechapternav1.xhtml)
	        book: epub.EpubBook - The epub book object containing the file
	    Returns:
	        list[tuple[str, str]] - List of tuples containing the (chapter number, link) of each chapter;
	        empty when the navigation file is not part of the book
	    """
	    print(f"Extracting chapter verses from {book_file_name}")
	    item = book.get_item_with_href(book_file_name)
	    if item is None:
	        print(f"Could not find item with href {book_file_name}")
	        # Return an empty list (not None) so callers can always iterate the result.
	        return []

	    soup = read_item(item)

	    # The h2 heading is only used for a progress message
	    book_heading = soup.find('h2', class_='w_navigation w_bibleChapter')
	    if book_heading:
	        book_name = book_heading.get_text(strip=True)
	        print(f"Processing book: {book_name}")

	    chapters = []
	    for link in soup.find_all('a'):
	        # Only links that sit directly inside a table cell are chapter links
	        if link.parent.name != 'td':
	            continue
	        href = link.get('href')
	        chapter_num = link.get_text(strip=True)
	        chapters.append((chapter_num, href))
	        print(f"Chapter {chapter_num}: {href}")

	    return chapters

	def join_parts(parts: list[str]) -> str:
	    """
	    Concatenate text fragments, collapsing the space at each seam.

	    Each fragment is normalised with clean_text() once. When the text joined
	    so far ends with a space and the next fragment begins with one, the
	    whitespace on both sides of the seam is stripped and replaced by a single
	    space; otherwise the fragment is appended unchanged.

	    Args:
	        parts: list[str] - The text fragments, in reading order
	    Returns:
	        str - The joined text ('' for an empty list)
	    """
	    result: list[str] = []
	    for part in parts:
	        part = clean_text(part)
	        # Entries already in `result` were cleaned when they were added, so
	        # they are not passed through clean_text() again.
	        if result and result[-1].endswith(' ') and part.startswith(' '):
	            result[-1] = result[-1].rstrip()
	            part = ' ' + part.lstrip()
	        result.append(part)
	    return ''.join(result)

	def extract_chapter_verses(chapter_link: str, book: epub.EpubBook) -> BibleChapter | None:
		"""
		Parses one chapter file of the EPUB book into a BibleChapter.

		The chapter document is looked up by href, every <p> element with class "sb"
		is split into its verses, and the results are assembled into a BibleChapter
		(its paragraphs, its flat verse list and its combined text).

		Args:
			chapter_link: str - Link to the chapter file to process
			book: epub.EpubBook - The EPUB book object

		Returns:
			BibleChapter | None - The parsed chapter, or None when no item with that
			href exists in the book or a paragraph contains an unrecognised element
		"""
		item = book.get_item_with_href(chapter_link)
		if item is None:
			print(f"Could not find item with href {chapter_link}")
			return None

		soup = read_item(item)
		print(f"Processing chapter: {chapter_link}")

		# Get all p tags with class sb
		verse_paragraph_elements = soup.find_all('p', class_='sb')
		print(f"Found {len(verse_paragraph_elements)} verse paragraph elements")

		# One entry per paragraph; each entry holds that paragraph's (chapter, verse, text) tuples
		verse_paragraphs: list[list[tuple[int | str, int | str, str]]] = []
		for verse_paragraph in verse_paragraph_elements:
			verses_in_paragraph = _parse_verse_paragraph(verse_paragraph)
			if verses_in_paragraph is None:
				# The helper already reported the unknown markup; abandon the whole chapter
				return None
			verse_paragraphs.append(verses_in_paragraph)

		verse_count = sum(len(paragraph) for paragraph in verse_paragraphs)
		print(f"Found {len(verse_paragraphs)} paragraphs, containing {verse_count} verses")

		chapter = BibleChapter()

		for paragraph_verses in verse_paragraphs:
			text = join_parts([verse_text for _, _, verse_text in paragraph_verses])
			verses = [
				BibleVerse(int(chapter_num), int(verse_num), verse_text)
				for chapter_num, verse_num, verse_text in paragraph_verses
			]
			paragraph = BibleParagraph(verses, clean_text_extra(text.strip()))
			chapter.paragraphs.append(paragraph)
			chapter.verses.extend(verses)

		chapter.text = clean_text_extra('\n'.join(paragraph.text.strip() for paragraph in chapter.paragraphs))
		return chapter


	def _parse_verse_paragraph(verse_paragraph: Tag) -> list[tuple[int | str, int | str, str]] | None:
		"""
		Splits one verse paragraph (<p class="sb">) into its individual verses.

		Only the paragraph's direct children are inspected. Text nodes and <em>
		elements contribute verse text; position/footnote/page-number markers and
		the visible verse numbers are skipped; a <span id="chapterN_verseM"/> anchor
		closes the verse collected so far and starts the next one.

		Args:
			verse_paragraph: Tag - The <p> element to split

		Returns:
			list[tuple] | None - (chapter, verse, text) tuples in document order, or
			None if a child is found that none of the rules below recognise.
			Chapter and verse are the digit strings taken from the anchor ids, or the
			int 0 when no anchor has been seen yet in this paragraph.
		"""
		# Sample markup, first paragraph of a chapter:
		# <p id="p2" data-pid="2" class="p2 sb"><span id="pos48"/><span id="chapter1"/><span id="chapter1_verse1"/><span class="w_ch"><strong>1</strong> </span>In the beginning God created the heavens and the earth.</p>
		#
		# Sample markup, paragraph holding several verses:
		# <p id="p7" data-pid="7" class="p7 sb">
		# <span id="pos1609"/>
		# <span id="chapter1_verse14"/><strong><sup>14</sup></strong> Then God said: “Let there be luminaries<span id="footnotesource5"/><a epub:type="noteref" href="#footnote5">*</a> in the expanse of the heavens to make a division between the day and the night, and they will serve as signs for seasons and for days and years.
		# <span id="chapter1_verse15"/><strong><sup>15</sup></strong> They will serve as luminaries in the expanse of the heavens to shine upon the earth.” And it was so.
		# <span id="chapter1_verse16"/><strong><sup>16</sup></strong> And God went on to make the two great luminaries, the greater luminary for dominating the day and the lesser luminary for dominating the night, and also the stars.
		# <span id="chapter1_verse17"/><strong><sup>17</sup></strong> Thus God put them in the expanse of the heavens to shine upon the earth <span id="chapter1_verse18"/><strong><sup>18</sup></strong> and to dominate by day and by night and to make a division between the light and the darkness. Then God saw that it was good.
		# <span id="chapter1_verse19"/><strong><sup>19</sup></strong> And there was evening and there was morning, a fourth day.
		# </p>

		verses_in_paragraph: list[tuple[int | str, int | str, str]] = []
		single_verse_sections: list[str] = []  # Pieces of text which make up the verse being collected
		# NOTE(review): both start at 0 for every paragraph, so text that appears before
		# the first verse anchor of a paragraph is recorded as chapter 0 / verse 0.
		current_chapter: int | str = 0
		current_verse: int | str = 0

		for child in verse_paragraph.children:
			# Plain text node: part of the current verse
			if child.name is None and child.string is not None:
				single_verse_sections.append(child.string)
				continue

			if child.name == 'span':
				child_id = child.get('id', '')
				child_classes = child.get('class', [])

				if child_id.startswith('chapter'):
					if '_verse' in child_id:
						# Verse anchor "chapterN_verseM": the previous verse ends here
						if single_verse_sections:
							combined_verse_text = join_parts(single_verse_sections)
							verses_in_paragraph.append((current_chapter, current_verse, combined_verse_text))
							single_verse_sections = []
						chapter_part, verse_part = child_id.split('_')[:2]
						current_chapter = chapter_part[len('chapter'):]
						current_verse = verse_part[len('verse'):]
					else:
						# Chapter anchor "chapterN": take the chapter number from the id.
						# (The anchor is an empty element, so its text carries no number.)
						current_chapter = child_id[len('chapter'):]
					continue

				# Position markers, footnote anchors, the large chapter-number marker
				# and page numbers carry no verse text
				if (child_id.startswith(('pos', 'footnotesource'))
						or 'w_ch' in child_classes
						or 'pageNum' in child_classes):
					continue

			if child.name == 'strong':
				first = child.contents[0] if child.contents else None
				if first is not None:
					# Verse number written as <strong><sup>14</sup></strong>
					if first.name == 'sup':
						continue
					# Verse number whose <sup> is wrapped in a span carrying the "altsize" class
					if (first.name == 'span'
							and 'altsize' in first.get('class', [])
							and first.contents
							and first.contents[0].name == 'sup'):
						continue
				# Bare verse number, e.g. <strong>14</strong>, matching the current verse
				if child.string is not None and child.string.strip() == current_verse:
					continue

			# Footnote reference link
			if child.name == 'a' and child.get('epub:type', '') == 'noteref':
				continue

			# Emphasised text is part of the verse
			if child.name == 'em' and child.string is not None:
				single_verse_sections.append(child.string)
				continue

			print(f"ERROR: Unknown child: {child}")
			return None

		# Add the last verse to the list
		combined_verse_text = join_parts(single_verse_sections)
		verses_in_paragraph.append((current_chapter, current_verse, combined_verse_text))
		return verses_in_paragraph

	# Translation table for clean_text(): typographic characters -> plain ASCII equivalents
	_CLEAN_TEXT_TABLE = str.maketrans({
		"\u00a0": ' ',  # NO-BREAK SPACE -> space
		"\u201d": '"',  # RIGHT DOUBLE QUOTATION MARK -> double quote
		"\u201c": '"',  # LEFT DOUBLE QUOTATION MARK -> double quote
		"\u2019": "'",  # RIGHT SINGLE QUOTATION MARK -> apostrophe
		"\u2018": "'",  # LEFT SINGLE QUOTATION MARK -> apostrophe
		"\u02b9": "'",  # MODIFIER LETTER PRIME -> apostrophe
	})


	def clean_text(text: str) -> str:
		"""
		Normalises typographic characters in the text to their ASCII equivalents.

		Non-breaking spaces become ordinary spaces, curly double quotes become '"',
		and curly single quotes and the modifier prime become "'". Whitespace is
		otherwise left untouched: nothing is stripped or collapsed.

		Args:
			text: str - The text to normalise

		Returns:
			str - The text with the characters above replaced
		"""
		# One pass over the string instead of six chained replace() calls
		return text.translate(_CLEAN_TEXT_TABLE)

	def clean_text_extra(text: str) -> str:
		"""
		Strips every middle dot (U+00B7) from the text.

		Meant for the summary text only; the verses themselves keep these characters.
		Em dashes (U+2014) are not touched: that replacement is disabled.
		"""
		middle_dot = u"\u00b7"
		# Splitting on the dot and gluing the pieces back together drops every occurrence
		return ''.join(text.split(middle_dot))

	# MARK: - Main

	if __name__ == '__main__':
		import sys

		default_file = "nwt_E.epub"

		# The first command-line argument, when given and non-empty, names the EPUB to process
		file_name = sys.argv[1] if len(sys.argv) > 1 else None
		file = file_name or default_file
		if not file_name:
			print(f"Using default file: {default_file}")

		# The error handlers guard the processing call itself; reading sys.argv
		# behind a length check cannot raise.
		try:
			# extract_book(file)
			extract_verses(file)
		except FileNotFoundError:
			print(f"Error: Could not find file '{file}'")
			sys.exit(1)
		except Exception as e:
			print(f"Error processing file: {str(e)}")
			sys.exit(1)