Skip to content

Instantly share code, notes, and snippets.

@Pymmdrza
Created October 2, 2024 17:05
Show Gist options
  • Save Pymmdrza/b401e4d681c2eb4ed7d051738c0b7e23 to your computer and use it in GitHub Desktop.
Save Pymmdrza/b401e4d681c2eb4ed7d051738c0b7e23 to your computer and use it in GitHub Desktop.
Insert ID attributes into H1, H2, H3, H4, and H5 tags of an HTML file, with Python.
from bs4 import BeautifulSoup
import sys
import os
def generate_id_from_text(text):
    """
    Generate an HTML id (slug) from header text.

    The text is lower-cased, "&" is replaced with "and", every run of
    whitespace and/or hyphens is collapsed into a single hyphen, and any
    other character that is not alphanumeric or an underscore is removed.

    Args:
        text: The visible text of a header.

    Returns:
        The slug, without leading or trailing hyphens. The result is an
        empty string when *text* has no alphanumeric or underscore
        characters.
    """
    lowered = text.lower().replace("&", "and")
    kept = []
    for char in lowered:
        if char.isalnum() or char == "_":
            kept.append(char)
        elif char.isspace() or char == "-":
            # Whitespace of any kind (spaces, tabs, newlines) and hyphens are
            # word separators; an id must never contain whitespace itself.
            kept.append(" ")
        # Anything else (punctuation, symbols) is dropped.
    # split() discards leading/trailing separators and collapses repeats.
    return "-".join("".join(kept).split())
def create_table_of_contents(soup, headers):
    """
    Build a flat table of contents for the given headers.

    Args:
        soup: The BeautifulSoup document used to create the new tags.
        headers: Header tags to link to; each one must already carry an
            'id' attribute.

    Returns:
        A <ul> tag holding one <li><a href="#header-id">Header Text</a></li>
        entry per header, in the order the headers were given.
    """
    listing = soup.new_tag("ul")
    for heading in headers:
        # The anchor points at the heading's id and shows the heading's text.
        anchor = soup.new_tag("a", href="#{}".format(heading["id"]))
        anchor.string = heading.get_text()
        entry = soup.new_tag("li")
        entry.append(anchor)
        listing.append(entry)
    return listing
def _make_unique_id(candidate, taken):
    """Return *candidate* (or 'section' if it is empty), suffixed with -2, -3, ... until it is not in *taken*."""
    base = candidate or "section"
    unique = base
    counter = 2
    while unique in taken:
        unique = f"{base}-{counter}"
        counter += 1
    return unique


def process_html_file(file_path):
    """
    Add ids to the headers of an HTML file and prepend a Table of Contents.

    Every h1-h5 header without an id receives one generated from its text.
    Generated ids are made unique within the document: if an id is already
    in use (by an earlier header or by any other element), a numeric suffix
    is appended, and a header whose text yields an empty id falls back to
    'section'. The result is written next to the input file with a
    '_new_edition' suffix before the extension; the input is not modified.

    Args:
        file_path: Path of the HTML file to process (read as UTF-8).
    """
    # Read the content of the HTML file
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()

    # Parse the HTML content using BeautifulSoup
    soup = BeautifulSoup(content, 'html.parser')

    # Find all header tags (h1 to h5)
    headers = soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5'])

    # Ids already present anywhere in the document must not be reused,
    # otherwise TOC links would point at the wrong element.
    taken_ids = {tag['id'] for tag in soup.find_all(id=True)}

    for header in headers:
        if not header.get('id'):  # Missing or empty id
            new_id = _make_unique_id(
                generate_id_from_text(header.get_text()), taken_ids
            )
            header['id'] = new_id
            taken_ids.add(new_id)

    # Create the Table of Contents
    toc = create_table_of_contents(soup, headers)

    # Insert the TOC at the beginning of <body>, or at the beginning of the
    # document when there is no <body>
    if soup.body:
        soup.body.insert(0, toc)
    else:
        soup.insert(0, toc)

    # Generate the new filename with '_new_edition' suffix
    base_name, ext = os.path.splitext(file_path)
    new_file_path = f"{base_name}_new_edition{ext}"

    # Save the modified content to a new file
    with open(new_file_path, 'w', encoding='utf-8') as file:
        file.write(str(soup))

    print(f"Successfully processed and saved the modified file as: {new_file_path}")
# Script entry point: expects the HTML file path as the first CLI argument.
if __name__ == "__main__":
    cli_args = sys.argv[1:]
    if not cli_args:
        # No path supplied: show usage
        print("Usage: python id_inserter.py <HTML_FILE_PATH>")
    elif not os.path.isfile(cli_args[0]):
        print(f"Error: File '{cli_args[0]}' not found.")
    else:
        process_html_file(cli_args[0])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment