Last active
April 8, 2025 04:48
-
-
Save andreagrandi/0a7bf6e217d6561b00b6a5de6211ddaa to your computer and use it in GitHub Desktop.
Python script to migrate posts from Pelican to Hugo
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Migration script from Pelican to Hugo | |
import os, re, shutil | |
from pathlib import Path | |
# Folder scanned for the source Markdown posts; also used as the prefix when
# resolving "{static}" image paths to files on disk.
INPUT_FOLDER = "content"
# Root folder under which the converted posts are written, one sub-folder per
# year.
OUTPUT_FOLDER = "content-hugo"
def sort_key(path):
    """Return the number a post's file name starts with, for numeric sorting.

    Only the leading run of ASCII digits is parsed, so names such as
    ``3.md`` or ``7_draft.md`` (which ``is_article`` accepts because they
    start with a digit) sort correctly instead of failing in ``int()``.

    Args:
        path: Path to a post file; only its base name is inspected.

    Returns:
        The leading number of the base name as an ``int``.

    Raises:
        ValueError: If the base name does not start with a digit.
    """
    basename = os.path.basename(path)
    leading_digits = re.match(r"[0-9]+", basename)
    if leading_digits is None:
        raise ValueError(f"no leading number in file name: {basename!r}")
    # Convert to int so that "10-..." sorts after "2-...".
    return int(leading_digits.group())
def get_posts_filenames(folder):
    """Collect the article files found anywhere below ``folder``.

    A file qualifies when its path ends in ``.md`` and ``is_article``
    accepts it.

    Args:
        folder: Directory to walk recursively.

    Returns:
        The qualifying paths, ordered by ``sort_key``.
    """
    posts = []
    for directory, _, names in os.walk(folder):
        for name in names:
            candidate = os.path.join(directory, name)
            if not candidate.endswith(".md"):
                continue
            if not is_article(candidate):
                continue
            posts.append(candidate)
    posts.sort(key=sort_key)
    return posts
def read_file_content(file_name):
    """Return the full text of ``file_name``.

    The file is decoded as UTF-8 explicitly, so the result does not depend
    on the locale encoding of the machine running the script.

    Args:
        file_name: Path of the file to read.

    Returns:
        The file's content as a single string.
    """
    with open(file_name, "r", encoding="utf-8") as file:
        return file.read()
def metadata_to_yaml(metadata):
    """Convert a ``Key: value`` metadata header into YAML front matter.

    Each line is split at its first colon and the key is lower-cased. Lines
    without a colon are ignored. Keys are mapped as follows:

    * ``date``     -> kept, truncated to the part before the first space
    * ``status``   -> ``draft: false`` when "published", else ``draft: true``
    * ``tags``     -> YAML list built from the comma-separated value
    * ``category`` -> ``categories`` YAML list
    * ``summary``  -> ``description`` (double-quoted)
    * ``author``   -> dropped
    * anything else is copied as a double-quoted string

    Args:
        metadata: The header text, one ``Key: value`` entry per line.

    Returns:
        The front matter block, delimited by ``---`` lines, with no trailing
        newline.
    """

    def quoted(text):
        # Escape backslashes and quotes so the value stays a valid YAML
        # double-quoted scalar even when it contains '"'.
        escaped = text.replace("\\", "\\\\").replace('"', '\\"')
        return f'"{escaped}"'

    def list_entries(name, text):
        # Accept "a, b" as well as "a,b"; ignore empty items.
        items = [item.strip() for item in text.split(",")]
        items = [item for item in items if item]
        if not items:
            return []
        return [f"{name}: "] + [f"- {item}" for item in items]

    front_matter = ["---"]
    for line in metadata.split("\n"):
        # partition() never raises, unlike unpacking split(": ", 1) on a
        # line such as "Slug:foo" that has a colon but no following space.
        raw_key, separator, raw_value = line.partition(":")
        key = raw_key.strip().lower()
        if not separator or not key:
            continue
        # Trimming also removes a stray "\r" left by CRLF line endings.
        value = raw_value.strip()

        if key == "date":
            # Keep only the date, dropping the time part.
            date_only = value.split(" ")[0]
            front_matter.append(f"{key}: {date_only}")
        elif key == "status":
            draft = "false" if value.lower() == "published" else "true"
            front_matter.append(f"draft: {draft}")
        elif key == "tags":
            front_matter.extend(list_entries("tags", value))
        elif key == "category":
            front_matter.extend(list_entries("categories", value))
        elif key == "summary":
            front_matter.append(f"description: {quoted(value)}")
        elif key == "author":
            # The author entry is intentionally not carried over.
            continue
        else:
            front_matter.append(f"{key}: {quoted(value)}")

    front_matter.append("---")
    return "\n".join(front_matter)
def replace_indented_blocks(text):
    """Rewrite indented Markdown code blocks as fenced (```) code blocks.

    A block starts at a line indented by four spaces. When that line is a
    ``:::lang`` header it becomes the opening fence with the language
    appended; otherwise a plain opening fence is inserted. Lines inside the
    block lose one level of indentation. The block ends at the first
    non-blank line that is not indented, or at the end of the text.

    Args:
        text: Markdown body of a post.

    Returns:
        The converted text. Every line, including the last, is terminated
        with a newline.
    """
    # Markdown indented code blocks are introduced by a four-space prefix.
    indent = "    "
    header = indent + ":::"
    fence = "```"

    def close_block(lines):
        # Blank lines that trail the code belong after the fence, not in it.
        while lines and lines[-1].strip() == "":
            lines.pop()
        lines.append(fence)

    converted = []
    in_code_block = False
    for line in text.split("\n"):
        is_blank = line.strip() == ""
        is_indented = line.startswith(indent)

        if in_code_block and not is_blank and not is_indented:
            # First regular line after the code: close the fence on its own
            # line and separate it from the following text.
            close_block(converted)
            converted.append("")
            in_code_block = False

        if line.startswith(header):
            if in_code_block:
                # A new header inside an open block starts a separate block.
                close_block(converted)
                converted.append("")
            in_code_block = True
            line = fence + line[len(header):]
        elif is_indented and (in_code_block or not is_blank):
            if not in_code_block:
                # Block without a ":::" header still needs an opening fence.
                converted.append(fence)
                in_code_block = True
            line = line[len(indent):]

        converted.append(line)

    if in_code_block:
        # The text ended while still inside a code block.
        close_block(converted)

    return "".join(f"{line}\n" for line in converted)
def parse_year_from_metadata(metadata):
    """Return the year of the first ``date:`` line in ``metadata``.

    Args:
        metadata: Front matter text, one entry per line.

    Returns:
        The text before the first ``-`` of the date value, or an empty
        string when no line starts with ``date:``.
    """
    date_values = (
        entry.split("date:")[1].strip()
        for entry in metadata.split("\n")
        if entry.startswith("date:")
    )
    first_date = next(date_values, None)
    if first_date is None:
        return ""
    return first_date.split("-")[0]
def parse_slug_from_metadata(metadata):
    """Return the slug from the first ``slug:`` line in ``metadata``.

    Args:
        metadata: Front matter text, one entry per line.

    Returns:
        The slug value with surrounding whitespace and all double quotes
        removed, or an empty string when no line starts with ``slug:``.
    """
    for entry in metadata.split("\n"):
        if not entry.startswith("slug:"):
            continue
        raw_slug = entry.split("slug:")[1]
        return raw_slug.strip().replace('"', "")
    return ""
def is_article(path):
    """Tell whether ``path`` names an article file.

    Args:
        path: Path to inspect; only the base name without extension is used.

    Returns:
        ``True`` when that name starts with an ASCII digit, else ``False``.
    """
    stem = os.path.splitext(os.path.basename(path))[0]
    # An empty stem yields "" here, which is not a member of the digit set.
    return stem[:1] in frozenset("0123456789")
def get_article_id(path):
    """Return the next article ID for the folder ``path``.

    The ID is the number of entries currently in the folder plus one.

    Args:
        path: Folder that holds the already converted articles.

    Returns:
        ``len(os.listdir(path)) + 1``, or ``1`` when the folder does not
        exist yet (previously ``None`` was returned implicitly, which ended
        up in folder names as the text "None").

    Raises:
        NotADirectoryError: If ``path`` exists but is not a directory.
    """
    try:
        existing_entries = os.listdir(path)
    except FileNotFoundError:
        # Nothing has been written there yet, so the first ID is free.
        return 1
    return len(existing_entries) + 1
def find_images(text):
    """List the ``{static}`` image paths referenced in ``text``.

    Matches references such as
    ``({static}/images/2017/10/keybase_identity.png)``.

    Args:
        text: Post content to scan.

    Returns:
        The matched paths in order of appearance, each starting at
        ``/images/`` (the ``{static}`` prefix is not included).
    """
    static_image = re.compile(r"\{static\}(/images/\d{4}/\d{2}/[^)\s\"]+\.\w+)")
    return [hit.group(1) for hit in static_image.finditer(text)]
def get_filename(image_path):
    """Return the final path component (the file name) of ``image_path``."""
    _, file_name = os.path.split(image_path)
    return file_name
def copy_images_to_article_folder(images, article_path, article_id, slug):
    """Copy the referenced images next to the converted article.

    Each image path is resolved relative to ``INPUT_FOLDER`` and copied into
    ``{article_path}/{article_id}-{slug}/`` under its own file name. Images
    whose source file does not exist are skipped.

    Args:
        images: Image paths as returned by ``find_images``.
        article_path: Folder that contains the article folder.
        article_id: Numeric prefix of the article folder.
        slug: Slug part of the article folder name.
    """
    for image in images:
        source = f"{INPUT_FOLDER}{image}"
        if not os.path.exists(source):
            continue
        image_name = os.path.basename(source)
        shutil.copy(source, f"{article_path}/{article_id}-{slug}/{image_name}")
def replace_paths_with_filenames(text):
    """Replace ``{static}/images/YYYY/MM/name.ext`` references by ``name.ext``.

    The pattern is the same one ``find_images`` uses, so exactly the images
    that get copied are the ones whose references are rewritten. A reference
    ends at the first ``)``, whitespace or double quote; this keeps a match
    from running across an HTML attribute or several lines and swallowing
    the text in between.

    Args:
        text: Post content containing ``{static}`` image references.

    Returns:
        The text with every matched reference reduced to its file name.
    """
    pattern = r"\{static\}(/images/\d{4}/\d{2}/[^)\s\"]+\.\w+)"

    def replacement(match):
        # Group 1 is the path without the "{static}" prefix.
        return os.path.basename(match.group(1))

    return re.sub(pattern, replacement, text)
def process_files(files):
    """Convert each article in ``files`` and write it to the output tree.

    For every path accepted by ``is_article`` the file is split into its
    metadata header and body at the first blank line. The header becomes
    YAML front matter, indented code blocks in the body become fenced
    blocks, referenced images are copied beside the article and their
    references are shortened to bare file names. The result is written to
    ``{OUTPUT_FOLDER}/{year}/{id}-{slug}/index.md``.

    Args:
        files: Paths of the source posts, in the order they should be
            numbered.
    """
    for file in files:
        if not is_article(file):
            continue

        file_content = read_file_content(file)
        # partition() copes with a post that has no blank line after the
        # header (the body is then empty) instead of failing to unpack.
        metadata, _, content = file_content.partition("\n\n")

        yaml_metadata = metadata_to_yaml(metadata)
        year = parse_year_from_metadata(yaml_metadata)
        slug = parse_slug_from_metadata(yaml_metadata)

        # The year folder must exist before its entries can be counted.
        article_path = f"{OUTPUT_FOLDER}/{year}"
        Path(article_path).mkdir(parents=True, exist_ok=True)
        article_id = get_article_id(article_path)

        article_folder = f"{article_path}/{article_id}-{slug}"
        Path(article_folder).mkdir(parents=True, exist_ok=True)

        content = replace_indented_blocks(content)

        # Images are collected before their references are rewritten,
        # because the rewrite removes the paths needed to locate them.
        images = find_images(content)
        copy_images_to_article_folder(images, article_path, article_id, slug)
        content = replace_paths_with_filenames(content)

        final_content = f"{yaml_metadata}\n\n{content}"
        # Write UTF-8 explicitly so the output does not depend on the locale.
        with open(f"{article_folder}/index.md", "w", encoding="utf-8") as output:
            output.write(final_content)
if __name__ == "__main__":
    # Convert every article found under the input folder.
    process_files(get_posts_filenames(INPUT_FOLDER))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment