Last active
May 31, 2025 14:18
-
-
Save appel/a6accfab384f80cb12c9c20a1075e942 to your computer and use it in GitHub Desktop.
Python script which converts Pocket's csv export file to html.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# This script converts Pocket's CSV export file to HTML,
# allowing you to import it into a wider range of apps (like Grimoire and Linkding).
# This is a Deno-to-Python port, produced with Gemini.
#
# Note that I did not write this; all credit goes to the original author:
# https://github.com/enjikaka/pocket-to-bookmark
#
# From that repo: "Mozilla is killing Pocket and you get your data export
# as an CSV file. I'll use Linkding instead and it supports the
# Netscape Bookmark File Format. This Deno script converts your CSV export
# to a Netscape Bookmark File Format-compatible HTML-file, that you can import
# to Linkding or your browser."
#
# Usage:
#   $ python3 pocket-to-bookmark.py part_000001.csv pocket-export.html
import sys | |
import csv | |
import html # For a more standard HTML escaping, though a custom one is used for fidelity | |
import time | |
from pathlib import Path | |
# --- Helper Functions --- | |
def escape_html_custom(text: str) -> str:
    """
    Escape text for embedding in HTML content or a double-quoted attribute.

    Custom escaping kept to match the Deno script's behavior: only the
    characters &, ", < and > are replaced. Single quotes are left untouched.

    Args:
        text: Value to escape. Non-string values are converted with str().

    Returns:
        The escaped string.
    """
    # "&" must be handled first; otherwise the ampersands introduced by the
    # other entities would be escaped a second time.
    return (
        str(text)
        .replace("&", "&amp;")
        .replace('"', "&quot;")
        .replace("<", "&lt;")
        .replace(">", "&gt;")
    )
def generate_bookmark_html(bookmarks: list) -> str:
    """Render bookmarks as a Netscape bookmark HTML document.

    Each item in ``bookmarks`` is indexed by the keys 'timeAdded', 'url' and
    'title'. All entries are placed in a single "Pocket" folder, in the order
    given. The returned text has no trailing newline.
    """
    preamble = [
        "<!DOCTYPE NETSCAPE-Bookmark-file-1>",
        '<META HTTP-EQUIV="Content-Type" CONTENT="text/html; charset=UTF-8">',
        "<TITLE>Pocket Export</TITLE>",
        "<H1>Pocket Export</H1>",
        "<DL><p>",
        " <DT><H3>Pocket</H3>",
        " <DL><p>",
    ]
    closing = [" </DL><p>", "</DL><p>"]

    def render_entry(item) -> str:
        # timeAdded is expected in Unix seconds; int() drops any fraction.
        stamp = int(item['timeAdded'])
        href = escape_html_custom(item['url'])
        label = escape_html_custom(item['title'])
        return f' <DT><A HREF="{href}" ADD_DATE="{stamp}">{label}</A>'

    return "\n".join([*preamble, *map(render_entry, bookmarks), *closing])
# --- Main Script Logic --- | |
def _parse_time_added(raw: str) -> float:
    """Convert a ``time_added`` cell to Unix seconds, defaulting to now.

    Mirrors the Deno expression ``parseInt(x) || Date.now() / 1000``: empty,
    non-numeric and zero values all fall back to the current time. Fractional
    values are truncated.
    """
    try:
        seconds = int(float(raw))
    except (ValueError, OverflowError):
        # ValueError: empty / non-numeric / NaN. OverflowError: "inf".
        return time.time()
    return float(seconds) if seconds else time.time()


def _read_pocket_rows(input_path: Path) -> list:
    """Return the CSV rows of ``input_path`` (header first), skipping blank lines.

    The whole file is streamed through ``csv.reader`` so that quoted fields
    may contain commas and line breaks, and the header row is parsed with the
    same rules as the data rows. ``utf-8-sig`` drops a leading BOM so it does
    not end up glued to the first header name.
    """
    with input_path.open(encoding="utf-8-sig", newline="") as handle:
        return [
            row
            for row in csv.reader(handle)
            # Drop empty lines and lines holding only whitespace.
            if len(row) > 1 or (row and row[0].strip())
        ]


def _row_to_bookmark(headers: list, fields: list) -> dict:
    """Build one bookmark dict from a data row, keyed by the header names."""
    # Short rows are padded with "" (Deno: fields[i] || ""); extra cells
    # beyond the header count are ignored by zip().
    padded = fields + [""] * (len(headers) - len(fields))
    entry = {key: value.strip() for key, value in zip(headers, padded)}
    url = entry.get('url', '')
    return {
        'title': entry.get('title', '') or url,  # fall back to the URL
        'url': url,
        'timeAdded': _parse_time_added(entry.get('time_added', '')),
        'status': entry.get('status', ''),
    }


def main():
    """Convert a Pocket CSV export into a Netscape bookmark HTML file.

    Reads ``sys.argv[1]`` (input CSV) and writes ``sys.argv[2]`` (output
    HTML). Only rows whose ``status`` column equals "unread" are exported,
    sorted by ascending ``time_added``. An empty input produces an HTML file
    with no bookmarks.

    Exits with status 1 on wrong usage, a missing input file, a read/parse
    failure, or a write failure; messages go to stderr.
    """
    if len(sys.argv) != 3:
        print(
            "Usage: python pocket_to_bookmarks.py input.csv output.html",
            file=sys.stderr,
        )
        sys.exit(1)

    input_path_str, output_path_str = sys.argv[1], sys.argv[2]
    input_path = Path(input_path_str)
    output_path = Path(output_path_str)

    if not input_path.is_file():
        print(f"Error: Input file not found at {input_path}", file=sys.stderr)
        sys.exit(1)

    try:
        rows = _read_pocket_rows(input_path)
    except FileNotFoundError:
        # The file can still vanish between the is_file() check and the read.
        print(f"Error: Input file not found at {input_path_str}", file=sys.stderr)
        sys.exit(1)
    except (OSError, UnicodeError, csv.Error) as e:
        print(f"An error occurred during CSV processing: {e}", file=sys.stderr)
        sys.exit(1)

    if rows:
        headers = [header.strip() for header in rows[0]]
        all_bookmarks = [_row_to_bookmark(headers, fields) for fields in rows[1:]]
    else:
        print("Warning: CSV file is empty or contains only whitespace.", file=sys.stderr)
        # Fall through and generate an empty HTML file, like the Deno script.
        all_bookmarks = []

    # Deno: allBookmarks.filter(b => b.status === "unread")
    #                   .sort((a, b) => a.timeAdded - b.timeAdded);
    filtered_bookmarks = sorted(
        (b for b in all_bookmarks if b['status'] == "unread"),
        key=lambda b: b['timeAdded'],
    )

    html_output = generate_bookmark_html(filtered_bookmarks)
    try:
        output_path.write_text(html_output, encoding='utf-8')
    except Exception as e:
        print(f"Error writing HTML to {output_path_str}: {e}", file=sys.stderr)
        sys.exit(1)

    # Replicates the Deno script's message, which reports the count of *all
    # parsed* bookmarks rather than the count of filtered (unread) ones.
    print(f"✅ Exporterade {len(all_bookmarks)} unread-bokmärken → {output_path_str}")
# Run the conversion only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment