Last active
June 16, 2025 19:12
-
-
Save moble/5f42f374df7bbd850e11503c47cb8d37 to your computer and use it in GitHub Desktop.
Download and extract named characters from the Wolfram Language documentation and generate a dictionary mapping those names to their Unicode representations
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
This script downloads and extracts named characters from the Wolfram
Language documentation and generates a dictionary mapping those names
to their Unicode representations.  The named characters are available
at
https://reference.wolfram.com/language/guide/ListingOfNamedCharacters.html
Not all named characters have a direct Unicode equivalent, in which
case the name itself is used as the value.  Some characters — "Null"
and "RawEscape" — do not fit nicely in a Python script and are
skipped.
""" | |
import sys | |
import unicodedata | |
from bs4 import BeautifulSoup | |
import requests | |
# This is a mapping of named characters for which the webpage doesn't
# actually include a representation of the character.  Commented-out
# entries do not have a good Unicode equivalent.
named_to_unicode = {
    # No good Unicode equivalent: AlignmentMarker, AutoLeftMatch,
    # AutoOperand, AutoRightMatch, AutoSpace
    "DiscretionaryHyphen": "\u00AD",  # SOFT HYPHEN
    "DiscretionaryLineSeparator": "\u2028",  # LINE SEPARATOR
    "DiscretionaryParagraphSeparator": "\u2029",  # PARAGRAPH SEPARATOR
    # No good Unicode equivalent: ImplicitPlus, IndentingNewLine
    "InvisibleApplication": "\u2061",  # FUNCTION APPLICATION
    "InvisibleComma": "\u2063",  # INVISIBLE SEPARATOR
    # No good Unicode equivalent: InvisiblePostfixScriptBase,
    # InvisiblePrefixScriptBase
    "InvisibleSpace": "\u200B",  # ZERO WIDTH SPACE
    "InvisibleTimes": "\u2062",  # INVISIBLE TIMES
    # No good Unicode equivalent: LetterSpace
    "LineSeparator": "\u2028",  # LINE SEPARATOR
    "MediumSpace": "\u2005",  # FOUR-PER-EM SPACE
    "NewLine": "\n",
    "ParagraphSeparator": "\u2029",  # PARAGRAPH SEPARATOR
    # No good Unicode equivalent: RawReturn
    "ThickSpace": "\u2004",  # THREE-PER-EM SPACE
    "ThinSpace": "\u2009",  # THIN SPACE
    "VeryThinSpace": "\u200A",  # HAIR SPACE
    # No good Unicode equivalent: NegativeThinSpace, NegativeMediumSpace,
    # NegativeThickSpace, NegativeVeryThinSpace
    # NOTE(review): NoBreak and NonBreakingSpace share U+00A0 -- confirm this
    # is intended (U+2060 WORD JOINER may be a closer match for NoBreak).
    "NoBreak": "\u00A0",  # NO-BREAK SPACE
    "NonBreakingSpace": "\u00A0",  # NO-BREAK SPACE
    "Null": "\u0000",
}
# Combining dot below; appended to a base letter to render a "Formal" character.
_COMBINING_DOT_BELOW = "\u0323"

# Ordered rewrite rules for styled-letter names:
#   (Wolfram name prefix, Unicode character-name prefix, suffix added on success)
# Longer Wolfram prefixes come before their shorter counterparts, and each
# "MATHEMATICAL ..." name is tried before the plain fallback name.
_STYLED_LETTER_RULES = (
    ("FormalScriptCapital", "MATHEMATICAL SCRIPT CAPITAL ", _COMBINING_DOT_BELOW),
    ("FormalScriptCapital", "SCRIPT CAPITAL ", _COMBINING_DOT_BELOW),
    ("ScriptCapital", "MATHEMATICAL SCRIPT CAPITAL ", ""),
    ("FormalScript", "MATHEMATICAL SCRIPT SMALL ", _COMBINING_DOT_BELOW),
    ("FormalScript", "SCRIPT SMALL ", _COMBINING_DOT_BELOW),
    ("Script", "MATHEMATICAL SCRIPT SMALL ", ""),
    ("GothicCapital", "MATHEMATICAL FRAKTUR CAPITAL ", ""),
    ("Gothic", "MATHEMATICAL FRAKTUR SMALL ", ""),
    ("DoubleStruckCapital", "MATHEMATICAL DOUBLE-STRUCK CAPITAL ", ""),
    ("DoubleStruckCapital", "DOUBLE-STRUCK CAPITAL ", ""),
    ("DoubleStruck", "MATHEMATICAL DOUBLE-STRUCK SMALL ", ""),
)

# Characters a name may contain without triggering the "non-alphanumeric"
# warning.  Digits are included so the check matches its own message.
_ASCII_ALPHANUMERIC = (
    "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"
)


def _styled_letter(code):
    """Return the Unicode character for a styled-letter name, else `code`.

    Walks `_STYLED_LETTER_RULES` in order.  For each rule whose Wolfram
    prefix matches, the text after the prefix is appended to the rule's
    Unicode name prefix and looked up with `unicodedata.lookup`; the first
    successful lookup wins.  If no rule matches, or every lookup fails,
    `code` is returned unchanged.
    """
    for wolfram_prefix, unicode_prefix, suffix in _STYLED_LETTER_RULES:
        if not code.startswith(wolfram_prefix):
            continue
        # Use everything after the prefix (not just the last character) so a
        # multi-character remainder makes the lookup fail rather than match.
        remainder = code[len(wolfram_prefix):]
        try:
            return unicodedata.lookup(unicode_prefix + remainder) + suffix
        except KeyError:
            continue
    return code


def _unwrap_icn(icn_text):
    r"""Return NAME from text of the form ``\[NAME]``, or None otherwise."""
    if icn_text.startswith(r"\[") and icn_text.endswith("]"):
        return icn_text[2:-1]
    return None


def extract():
    """Scrape the Wolfram named-character listing into a name -> text mapping.

    Downloads the listing page, walks every ``<p class="singleFunction">``
    element, and for each one derives a character name and the text used to
    represent it.  Each pair is printed to stdout as one ``"name":"code",``
    line between an opening ``{`` and a closing ``}``; skipped elements and
    warnings about unusual names go to stderr.

    The value for a name is, in order of preference: the single character
    shown on the page; the entry in `named_to_unicode`; a character found by
    `_styled_letter`; or the name itself.  Names starting with "Formal" that
    resolve to a single character get a combining dot below appended.
    "Null" and "RawEscape" are skipped on the main path.

    Returns:
        dict: name -> code, holding exactly the values that were printed
        (so a lone backslash or double quote is stored in escaped form).

    Raises:
        requests.HTTPError: if the page request returns an error status.
    """
    d = {}
    # A timeout keeps the script from hanging forever on a stalled connection.
    resp = requests.get(
        "https://reference.wolfram.com/language/guide/ListingOfNamedCharacters.html",
        timeout=30,
    )
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, "html.parser")
    print("{")
    for p in soup.find_all("p", class_="singleFunction"):
        sc = p.find("span", class_="special-character")
        if sc is not None:
            # The name is the span's CSS class other than the generic markers.
            name = next(
                (
                    c
                    for c in sc["class"]
                    if c not in ("special-character", "formalcharacter")
                ),
                None,
            )
            if name is None:
                # No usable class name; skip instead of failing further down.
                print(f"Skipping: {p}", file=sys.stderr)
                continue
            code = sc.get_text(strip=True)
        else:
            guide = p.find("span", class_="GuideCharacterImage")
            icn = p.find("span", class_="ICN")
            icn_text = None if icn is None else icn.get_text(strip=True)
            if guide is None:
                # No rendered character at all: take the name from the ICN
                # span and map it through the hand-written table.
                name = None if icn_text is None else _unwrap_icn(icn_text)
                if name is None:
                    print(f"Skipping: {p}", file=sys.stderr)
                    continue
                code = named_to_unicode.get(name, name)
                print(f'"{name}":"{code}",')
                d[name] = code
                continue
            a = guide.find("a")
            if a is None or icn_text is None:
                print(f"Skipping: {p}", file=sys.stderr)
                continue
            code = a.get_text(strip=True)
            unwrapped = _unwrap_icn(icn_text)
            name = icn_text if unwrapped is None else unwrapped

        if name in ("Null", "RawEscape"):
            continue  # Don't bother with these, as they screw up the output
        if len(code) != 1:
            code = named_to_unicode.get(name, name)
        elif unicodedata.category(code) == "Co":
            # Private-use character: fall back to the name.
            code = name
        code = _styled_letter(code)
        if name.startswith("Formal") and len(code) == 1:
            code = f"{code}{_COMBINING_DOT_BELOW}"
        # Escape the two characters that would break the quoted output line.
        if code == "\\":
            code = "\\\\"
        if code == '"':
            code = '\\"'
        print(f'"{name}":"{code}",')
        d[name] = code
        if not name.isascii() or not name.isprintable():
            print(
                f"Warning: {name} is not printable ASCII: {code!r}",
                file=sys.stderr,
            )
        # Warn if any character in `name` is outside [a-zA-Z0-9].
        if any(c not in _ASCII_ALPHANUMERIC for c in name):
            print(
                f"Warning: {name} contains non-alphanumeric characters",
                file=sys.stderr,
            )
    print("}")
    return d
if __name__ == "__main__":
    # Run the scrape only when executed as a script.  The generated mapping
    # is printed to stdout; skip and warning messages go to stderr.
    extract()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment