Last active
July 22, 2025 04:43
-
-
Save AngeloR/cce5451ab00183e7dfeef5cc31ffefbe to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import feedparser | |
import os | |
import re | |
import frontmatter | |
import sys | |
from markdownify import markdownify as md | |
import html | |
from datetime import datetime | |
import requests | |
from urllib.parse import urlparse | |
# Feed to mirror and the directory the generated Markdown posts are written to.
RSS_URL = "https://medium.com/feed/@xangelo"
OUTPUT_DIR = "content/posts/medium"

# Create the output directory up front so a first run (no directory yet) does
# not fail when listing it here or when writing posts into it later.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Slugs of posts already on disk: file names with the ".md" suffix removed.
EXISTING_SLUGS = {f[:-3] for f in os.listdir(OUTPUT_DIR) if f.endswith(".md")}


def parse_force_flag(argv):
    """Return whether a forced rebuild was requested on the command line.

    Args:
        argv: Argument list in ``sys.argv`` layout (script name first).

    Returns:
        ``False`` unless the first argument is ``--force``. ``--force true``
        and a bare ``--force`` give ``True``; ``--force <anything else>``
        gives ``False``.
    """
    if len(argv) > 1 and argv[1] == "--force":
        # A bare "--force" used to raise IndexError; treat it as "true".
        return len(argv) == 2 or argv[2] == "true"
    return False


# default to false, but read from --force flag if it's set
FORCE_REBUILD = parse_force_flag(sys.argv)
def slugify(title):
    """Turn a post title into a file-name-safe slug.

    The title is lower-cased, each run of whitespace becomes a single hyphen,
    every character that is neither a word character nor a hyphen is removed,
    and leading/trailing hyphens are trimmed.
    """
    hyphenated = re.sub(r"\s+", "-", title.lower())
    word_chars_only = re.sub(r"[^\w-]", "", hyphenated)
    return word_chars_only.strip("-")
def resolve_medium_media_links(content, resolve_url=None):
    """
    Find Medium media links in the content and resolve them to direct GitHub Gist embeds if applicable.

    Medium media links look like: <https://medium.com/media/c7634bd7099d8b4a3c68e75789d29869/href>

    Args:
        content: Markdown text that may contain Medium media links.
        resolve_url: Optional callable taking the Medium URL and returning the
            URL it finally redirects to. Defaults to an HTTP HEAD request that
            follows redirects (10 second timeout).

    Returns:
        The content with every media link that redirects to gist.github.com
        replaced by a ``<script>`` embed tag. Links that resolve elsewhere, or
        whose lookup raises ``requests.RequestException``, are left unchanged.
    """
    # Pattern to match Medium media links
    medium_media_pattern = r'<(https://medium\.com/media/[a-f0-9]+/href)>'

    def follow_redirects(url):
        # HEAD is sufficient: only the post-redirect URL is needed, not the body.
        return requests.head(url, allow_redirects=True, timeout=10).url

    fetch_final_url = resolve_url if resolve_url is not None else follow_redirects

    def replace_link(match):
        medium_url = match.group(1)
        print(f"Resolving Medium media link: {medium_url}")
        try:
            # Follow the Medium media link to see where it redirects
            final_url = fetch_final_url(medium_url)
        except requests.RequestException as e:
            print(f"Failed to resolve Medium media link {medium_url}: {e}")
            return match.group(0)  # Return original on error

        # Check if the final URL is a GitHub Gist (and actually names one:
        # the bare site root has no gist path to embed).
        parsed_url = urlparse(final_url)
        gist_path = parsed_url.path.rstrip("/")
        if parsed_url.netloc != 'gist.github.com' or not gist_path:
            print(f"Medium media link does not resolve to GitHub Gist: {medium_url} -> {final_url}")
            return match.group(0)  # Return original if not a gist

        print(f"Resolved Medium media link: {medium_url} -> {final_url}")
        # The embed script lives at "<gist path>.js". Insert the suffix after
        # the path so a query string such as "?file=x.py" stays valid, and
        # drop any fragment, which has no meaning for a script URL.
        script_url = parsed_url._replace(path=gist_path + ".js", fragment="").geturl()
        return f"<script src=\"{script_url}\"></script>"

    return re.sub(medium_media_pattern, replace_link, content)
# Medium ends each feed item with a tracking image that registers a view when
# loaded. NOTE(review): the URL shape (medium.com/_/stat?...) and its rendering
# as an empty-alt Markdown image are assumptions - confirm against a live feed.
MEDIUM_STAT_IMAGE_RE = re.compile(r'!\[[^\]]*\]\(https://medium\.com/_/stat\?[^)]*\)')

# Inline Markdown/HTML markup removed from the summary. Group 2 captures link
# text so "[text](url)" collapses to "text"; for every other alternative the
# group is unmatched and the replacement is the empty string.
MARKDOWN_INLINE_RE = re.compile(r'(\*\*|\*|__|_|`|~~|<[^>]+>|\[([^\]]+)\]\([^)]+\))')


def strip_medium_stat_images(text):
    """Remove Medium stat-tracking images so mirrored posts do not count as views on Medium."""
    return MEDIUM_STAT_IMAGE_RE.sub("", text)


def split_summary(markdown_text):
    """Split converted post Markdown into a plain-text summary and the body.

    Args:
        markdown_text: Full Markdown of a post.

    Returns:
        A ``(summary, remaining_content)`` tuple. The summary is the first
        line with a leading blockquote marker and inline markup removed; the
        remaining content is everything after that line, without leading
        blank lines.
    """
    lines = markdown_text.strip().split('\n')
    summary = lines[0].strip()
    remaining = '\n'.join(lines[1:]).lstrip('\n')
    # Only strip blockquote markers at the start of the line; removing every
    # ">" would corrupt text such as "a > b" and break the tag stripping below.
    summary = re.sub(r'^(?:>\s*)+', '', summary)
    # remove all other markdown formatting from the summary
    summary = MARKDOWN_INLINE_RE.sub(r'\2', summary).strip()
    return summary, remaining


def main():
    """Fetch the Medium feed and write one Markdown file per new (or forced) post."""
    feed = feedparser.parse(RSS_URL)
    print(f"Parsing {RSS_URL}, {len(feed.entries)} posts, force rebuild: {FORCE_REBUILD}")
    for entry in feed.entries:
        slug = slugify(entry.title)
        if not slug:
            # A title with no word characters would otherwise be saved as ".md".
            print(f"Skipping post with unusable title: {entry.title!r}")
            continue
        if slug in EXISTING_SLUGS and not FORCE_REBUILD:
            continue

        # Prefer the full content; fall back to the summary when it is absent.
        content_html = entry.get("content", [{}])[0].get("value", "") or entry.get("summary", "")
        markdown_content = md(html.unescape(content_html))
        formatted_content = resolve_medium_media_links(markdown_content)
        # Strip the tracking image before splitting so it can leak into
        # neither the summary nor the body.
        formatted_content = strip_medium_stat_images(formatted_content)
        # Extract the first line as the summary and strip it from content
        extracted_summary, remaining_content = split_summary(formatted_content)

        post = frontmatter.Post(remaining_content)
        post["title"] = entry.title
        post["summary"] = extracted_summary
        post["date"] = entry.published
        post["slug"] = slug
        post["draft"] = False
        post["medium_link"] = entry.link
        # add a line to the bottom of the post to indicate that it's from Medium
        post.content += "\n\n---\n\nThis was originally published on Medium - " + entry.link

        output_path = os.path.join(OUTPUT_DIR, f"{slug}.md")
        with open(output_path, "w", encoding="utf-8") as f:
            f.write(frontmatter.dumps(post))
        print(f"Saved: {output_path}")


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment