Skip to content

Instantly share code, notes, and snippets.

@waveform80
Created April 14, 2025 11:56
Show Gist options
  • Save waveform80/80c48f59bb5dbd0bd9e3ebadcd6233a2 to your computer and use it in GitHub Desktop.
Save waveform80/80c48f59bb5dbd0bd9e3ebadcd6233a2 to your computer and use it in GitHub Desktop.

An extremely hacky, quickly thrown together script to extract various revisions of a specific Discourse post on discourse.ubuntu.com. Uses the "markdown diff" to extract the "current" revision of each one and dumps each to an individual markdown file.

Does Discourse actually have an API for this? I couldn't find it...

#!/usr/bin/python3
import sys
import json
from pathlib import Path
from urllib.request import urlopen
from html.parser import HTMLParser
class TableParser(HTMLParser):
    """Collect the text of one column of an HTML table.

    Feed markup with ``feed()``; afterwards ``content`` holds the
    concatenated text of every ``<td>`` in the requested column, across all
    rows of the most recently opened top-level ``<table>``.

    Only text that sits inside a ``<td>`` is collected. Text between cells,
    between rows, or outside the table is ignored, so stray whitespace or
    trailing markup cannot leak into the result.
    """

    def __init__(self, column):
        """Create a parser that extracts *column* (1-based) from each row."""
        super().__init__()
        self.extract_column = column
        # Number of <td> tags seen so far in the current row
        self.current_col = 0
        # Where we are in the markup: 'top' (outside any table), 'table',
        # 'tr' (in a row, between cells) or 'td' (inside a cell)
        self.state = 'top'
        self.content = ''

    def handle_starttag(self, tag, attrs):
        """Advance the table/row/cell state machine on an opening tag."""
        if tag == 'table' and self.state == 'top':
            # A new top-level table discards anything collected earlier
            self.content = ''
            self.state = 'table'
        elif tag == 'tr' and self.state == 'table':
            self.state = 'tr'
            self.current_col = 0
        elif tag == 'td' and self.state in ('tr', 'td'):
            # A <td> seen while still inside a cell implicitly closes the
            # previous one (HTML permits omitting </td>)
            self.current_col += 1
            self.state = 'td'

    def handle_endtag(self, tag):
        """Unwind the table/row/cell state machine on a closing tag."""
        if tag == 'td' and self.state == 'td':
            self.state = 'tr'
        elif tag == 'tr' and self.state in ('tr', 'td'):
            # </tr> also ends a cell whose </td> was omitted
            self.state = 'table'
        elif tag == 'table' and self.state == 'table':
            self.state = 'top'

    def handle_data(self, data):
        """Append *data* to ``content`` when inside the wanted column's cell."""
        # Tags nested inside the cell (e.g. <ins>, <del>) do not change the
        # state, so their text is still collected
        if self.state == 'td' and self.current_col == self.extract_column:
            self.content += data
def main(post_id=120902, revisions=range(96, 107)):
    """Download revisions of a Discourse post and save each as markdown.

    For every revision number in *revisions*, fetch
    ``https://discourse.ubuntu.com/posts/<post_id>/revisions/<rev>.json``,
    take the ``side_by_side_markdown`` diff table from its ``body_changes``
    entry, extract the table's second column and write that text to
    ``revision<rev>.md`` in the current working directory.

    The defaults reproduce the post and revision range this script was
    originally written for.

    Returns None, so ``sys.exit(main())`` exits with status 0. Network, JSON
    and file errors are not caught and propagate to the caller.
    """
    for rev in revisions:
        url = f'https://discourse.ubuntu.com/posts/{post_id}/revisions/{rev}.json'
        with urlopen(url) as fp:
            data = json.load(fp)
        changes = data['body_changes']['side_by_side_markdown']
        # Column 2 is presumably the "after" side of the side-by-side diff,
        # i.e. the post text as of this revision -- TODO confirm against
        # Discourse's diff output
        parser = TableParser(column=2)
        parser.feed(changes)
        # Write UTF-8 explicitly; the default is locale-dependent and can
        # fail on non-ASCII post content
        Path(f'revision{rev}.md').write_text(parser.content, encoding='utf-8')
# Run only when executed as a script (not on import); whatever main()
# returns becomes the process exit status.
if __name__ == '__main__':
    raise SystemExit(main())
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment