Last active
May 7, 2023 18:07
-
-
Save palevell/d366a21b4477bc18c1a56c218cd7cc93 to your computer and use it in GitHub Desktop.
Parses message.content of OpenAI chat responses
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# markdown_parser.py - Sunday, May 7, 2023
# Via ChatGPT
import argparse
import os
import re
import subprocess
import sys

from bs4 import BeautifulSoup
# Fenced code block: an optional language tag (letters, digits and the
# punctuation found in tags such as "c++", "c#" or "objective-c"), optional
# trailing blanks, then the body up to the next line that begins with a fence.
# Accepting the wider tag alphabet matters: if an opening fence fails to
# match, its *closing* fence is picked up as an opener and every following
# block is mis-parsed.
_FENCE_RE = re.compile(r"```([A-Za-z0-9_+#-]+)?[ \t]*\n([\s\S]+?)\n```")


def _linked_filename(tags, attr, suffix):
    """Return the basename of the first usable reference among *tags*.

    A reference is usable when, after dropping any query string or fragment,
    its basename ends with *suffix* (compared case-insensitively). Local
    references are preferred over remote URLs, because a remote URL (CDN
    stylesheet, hosted library) normally does not correspond to a code block
    in the Markdown file. Returns None when no tag carries a usable reference.
    """
    remote_fallback = None
    for tag in tags:
        ref = tag.get(attr)
        if not ref:
            # Attribute missing or empty; nothing to derive a name from.
            continue
        name = os.path.basename(re.split(r"[?#]", ref, maxsplit=1)[0])
        if len(name) <= len(suffix) or not name.lower().endswith(suffix):
            continue
        if "://" in ref or ref.startswith("//"):
            remote_fallback = remote_fallback or name
        else:
            return name
    return remote_fallback


def _save_text(filename, text):
    """Write *text* to *filename* as UTF-8, replacing any existing file."""
    with open(filename, "w", encoding="utf-8") as f:
        f.write(text)


def parse_markdown_file(md_file_path):
    """Extract the fenced code blocks of a Markdown file into separate files.

    Files are written to the current working directory; existing files with
    the same name are overwritten.

    * The first HTML block is saved as ``index.html``; later HTML blocks are
      saved as ``<block index>.html``. Each HTML block is scanned for a linked
      stylesheet and an external script, and those names are used for CSS and
      JavaScript blocks (defaults: ``style.css`` and ``script.js``).
    * Any other block is saved as ``<block index>.<language>``, or with the
      ``.unknown`` extension when the fence carries no language tag.
    * Finally, the ``file`` utility is run on every ``.unknown`` file saved by
      this call and its verdict is printed. Failure to run ``file`` is
      reported but is not fatal.

    Args:
        md_file_path: Path to the Markdown file, read as UTF-8.

    Raises:
        OSError: If the Markdown file cannot be read or an output file cannot
            be written.
        UnicodeDecodeError: If the Markdown file is not valid UTF-8.
    """
    with open(md_file_path, "r", encoding="utf-8") as f:
        md_content = f.read()

    css_filename = None
    js_filename = None
    html_saved = False
    unknown_files = []  # only the ".unknown" files written by this call

    for i, (lang, code) in enumerate(_FENCE_RE.findall(md_content)):
        lang = lang.lower()  # an absent tag is captured as ""
        if lang in ("html", "htm"):
            soup = BeautifulSoup(code, "html.parser")
            # Keep a previously discovered name when this block links nothing
            # usable.
            css_filename = (
                _linked_filename(
                    soup.find_all("link", {"rel": "stylesheet"}), "href", ".css"
                )
                or css_filename
            )
            js_filename = (
                _linked_filename(
                    soup.find_all("script", {"src": True}), "src", ".js"
                )
                or js_filename
            )
            # "First HTML block", not "first block of the file".
            filename = f"{i}.html" if html_saved else "index.html"
            html_saved = True
            _save_text(filename, code)
            print(f"Saved HTML code block to {filename}")
        elif lang == "css":
            css_filename = css_filename or "style.css"
            _save_text(css_filename, code)
            print(f"Saved CSS code block to {css_filename}")
        elif lang in ("js", "javascript"):
            js_filename = js_filename or "script.js"
            _save_text(js_filename, code)
            print(f"Saved JavaScript code block to {js_filename}")
        else:
            extension = lang or "unknown"
            filename = f"{i}.{extension}"
            _save_text(filename, code)
            print(f"Saved code block to {filename}")
            if extension == "unknown":
                unknown_files.append(filename)

    for filename in unknown_files:
        try:
            output = subprocess.check_output(["file", filename])
        except (OSError, subprocess.CalledProcessError) as e:
            # Best effort: "file" may be missing (e.g. on Windows) or fail.
            print(f"Error while determining file type for {filename}: {e}")
        else:
            print(f"{filename}: {output.decode(errors='replace').strip()}")
if __name__ == "__main__":
    # Command-line entry point: one positional argument naming the Markdown
    # file whose fenced code blocks should be extracted.
    cli = argparse.ArgumentParser(
        description="Parse Markdown file and extract code blocks"
    )
    cli.add_argument("markdown_file", help="Path to Markdown file to parse")
    parse_markdown_file(cli.parse_args().markdown_file)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
I need a Python script that can parse a file that contains Markdown. The Markdown file contains code blocks that are delimited by three back-ticks (i.e. "```"). The opening delimiter may contain the name of the language immediately after the back-ticks. In this case, the entire code block should be saved with a filename, using the language as the file extension. For example, a "```html" delimiter indicates the filename should have a ".html" extension. The first HTML file encountered can be saved as "index.html". The HTML code block may contain an href to a CSS file (e.g. "style.css" or "styles.css"). This name should be used when CSS code blocks are encountered. The HTML code block may also contain a src reference to a JavaScript file (e.g. "script.js" or "scripts.js"). This name should be used when JavaScript code blocks are encountered. Use the BeautifulSoup library to extract the filenames of linked CSS and/or JavaScript. If the opening delimiter lacks the name of the language, use your own best judgment in determining the language. If the language of the code block cannot be determined, the filename can use a ".unknown" extension. When finished processing the Markdown file, the script should check for files it saved with a ".unknown" extension and use the "file" utility to determine the contents of the file, and show the results.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment