Skip to content

Instantly share code, notes, and snippets.

@moble
Last active June 16, 2025 19:12
Show Gist options
  • Save moble/5f42f374df7bbd850e11503c47cb8d37 to your computer and use it in GitHub Desktop.
Save moble/5f42f374df7bbd850e11503c47cb8d37 to your computer and use it in GitHub Desktop.
Download and extract named characters from the Wolfram Language documentation and generate a dictionary mapping those names to their Unicode representations
"""
This script downloads and extracts named characters from the Wolfram
Language documentation and generates a dictionary mapping those names
to their Unicode representations. The named characters are available
at
https://reference.wolfram.com/language/guide/ListingOfNamedCharacters.html
When a named character has no direct Unicode equivalent, the name
itself is used as the value. Some characters — "Null" and
"RawEscape" — do not fit nicely in a Python script and are
skipped.
"""
import sys
import unicodedata
from bs4 import BeautifulSoup
import requests
# Fallback table for named characters whose entry on the web page does
# not include a usable representation of the character itself.  Names
# that appear only as comments have no good Unicode equivalent and are
# deliberately left out of the mapping.
named_to_unicode = {
    # AlignmentMarker
    # AutoLeftMatch
    # AutoOperand
    # AutoRightMatch
    # AutoSpace
    "DiscretionaryHyphen": "\N{SOFT HYPHEN}",
    "DiscretionaryLineSeparator": "\N{LINE SEPARATOR}",
    "DiscretionaryParagraphSeparator": "\N{PARAGRAPH SEPARATOR}",
    # ImplicitPlus
    # IndentingNewLine
    "InvisibleApplication": "\N{FUNCTION APPLICATION}",
    "InvisibleComma": "\N{INVISIBLE SEPARATOR}",
    # InvisiblePostfixScriptBase
    # InvisiblePrefixScriptBase
    "InvisibleSpace": "\N{ZERO WIDTH SPACE}",
    "InvisibleTimes": "\N{INVISIBLE TIMES}",
    # LetterSpace
    "LineSeparator": "\N{LINE SEPARATOR}",
    "MediumSpace": "\N{FOUR-PER-EM SPACE}",
    "NewLine": "\n",
    "ParagraphSeparator": "\N{PARAGRAPH SEPARATOR}",
    # RawReturn
    "ThickSpace": "\N{THREE-PER-EM SPACE}",
    "ThinSpace": "\N{THIN SPACE}",
    "VeryThinSpace": "\N{HAIR SPACE}",
    # NegativeThinSpace
    # NegativeMediumSpace
    # NegativeThickSpace
    # NegativeVeryThinSpace
    "NoBreak": "\N{NO-BREAK SPACE}",
    "NonBreakingSpace": "\N{NO-BREAK SPACE}",
    "Null": "\x00",
}
# Names that are dropped on the main extraction path.
_SKIPPED_NAMES = frozenset({"Null", "RawEscape"})

# CSS classes on a "special-character" span that are not the character name.
_NON_NAME_CLASSES = ("special-character", "formalcharacter")

# COMBINING DOT BELOW, appended to mark the "Formal" variant of a letter.
_DOT_BELOW = "\u0323"

# (Wolfram name prefix, Unicode name prefixes tried in order, suffix).
# Longer Wolfram prefixes come before the shorter prefixes they contain, so
# "ScriptCapitalA" is tried as a capital before it is tried as a small letter.
# The names without "MATHEMATICAL" cover letters that Unicode encodes in the
# Letterlike Symbols block instead (e.g. SCRIPT CAPITAL B).
_STYLED_LETTERS = (
    (
        "FormalScriptCapital",
        ("MATHEMATICAL SCRIPT CAPITAL ", "SCRIPT CAPITAL "),
        _DOT_BELOW,
    ),
    (
        "ScriptCapital",
        ("MATHEMATICAL SCRIPT CAPITAL ", "SCRIPT CAPITAL "),
        "",
    ),
    (
        "FormalScript",
        ("MATHEMATICAL SCRIPT SMALL ", "SCRIPT SMALL "),
        _DOT_BELOW,
    ),
    (
        "Script",
        ("MATHEMATICAL SCRIPT SMALL ", "SCRIPT SMALL "),
        "",
    ),
    (
        "GothicCapital",
        ("MATHEMATICAL FRAKTUR CAPITAL ", "BLACK-LETTER CAPITAL "),
        "",
    ),
    (
        "Gothic",
        ("MATHEMATICAL FRAKTUR SMALL ",),
        "",
    ),
    (
        "DoubleStruckCapital",
        ("MATHEMATICAL DOUBLE-STRUCK CAPITAL ", "DOUBLE-STRUCK CAPITAL "),
        "",
    ),
    (
        "DoubleStruck",
        ("MATHEMATICAL DOUBLE-STRUCK SMALL ",),
        "",
    ),
)


def _bracketed_name(icn_text):
    r"""Return ``Name`` if *icn_text* has the form ``\[Name]``, else None."""
    if icn_text.startswith(r"\[") and icn_text.endswith("]"):
        return icn_text[2:-1]
    return None


def _styled_letter(code):
    """Replace a styled-letter name such as ``ScriptCapitalA`` by its character.

    *code* is returned unchanged when it matches no known prefix or when no
    Unicode character with the corresponding name exists.
    """
    for prefix, unicode_prefixes, suffix in _STYLED_LETTERS:
        if not code.startswith(prefix):
            continue
        # Use the whole remainder rather than just the last character, so a
        # multi-character remainder fails the lookup instead of silently
        # matching the wrong letter.
        letter = code[len(prefix):]
        for unicode_prefix in unicode_prefixes:
            try:
                return unicodedata.lookup(unicode_prefix + letter) + suffix
            except KeyError:
                continue
    return code


def _quoted(text):
    """Return *text* as a double-quoted Python string literal.

    Backslashes, double quotes and control characters are escaped; every
    other character is written as-is.
    """
    pieces = []
    for ch in text:
        if ch in ('\\', '"'):
            pieces.append("\\" + ch)
        elif unicodedata.category(ch) == "Cc":
            # A raw control character (e.g. a newline or NUL) would break the
            # printed dictionary literal.
            pieces.append(ch.encode("unicode_escape").decode("ascii"))
        else:
            pieces.append(ch)
    return '"' + "".join(pieces) + '"'


def _report_skip(paragraph):
    """Tell the user on stderr that *paragraph* could not be interpreted."""
    print(f"Skipping: {paragraph}", file=sys.stderr)


def extract():
    """Download the listing of named characters and map names to Unicode.

    The mapping is printed to stdout as a Python dictionary literal, one
    ``"name":"value",`` entry per line, and is also returned.  Diagnostics
    about skipped paragraphs and unusual names go to stderr.

    Returns:
        A dict mapping each character name to its Unicode string, or to the
        name itself when no Unicode equivalent was found.  The values are the
        unescaped strings; escaping is applied only to the printed output.

    Raises:
        Whatever ``requests`` raises for a failed or timed-out download,
        including the error from ``raise_for_status()`` on a bad HTTP status.
    """
    d = {}
    resp = requests.get(
        "https://reference.wolfram.com/language/guide/ListingOfNamedCharacters.html",
        timeout=30,  # do not hang forever on an unresponsive server
    )
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, "html.parser")
    print("{")
    for p in soup.find_all("p", class_="singleFunction"):
        sc = p.find("span", class_="special-character")
        if sc is not None:
            # The character name is whichever CSS class is not a generic one.
            name = next(
                (c for c in sc["class"] if c not in _NON_NAME_CLASSES), None
            )
            code = sc.get_text(strip=True)
        else:
            icn = p.find("span", class_="ICN")
            icn_text = icn.get_text(strip=True) if icn is not None else ""
            guide = p.find("span", class_="GuideCharacterImage")
            if guide is None:
                # No representation of the character at all: take the name
                # from the ICN span and look it up in the fallback table.
                name = _bracketed_name(icn_text)
                if not name:
                    _report_skip(p)
                    continue
                code = named_to_unicode.get(name, name)
                print(f"{_quoted(name)}:{_quoted(code)},")
                d[name] = code
                continue
            a = guide.find("a")
            if a is None or icn is None:
                _report_skip(p)
                continue
            code = a.get_text(strip=True)
            name = _bracketed_name(icn_text)
            if name is None:
                name = icn_text
        if not name:
            # Without a name there is no dictionary key to emit.
            _report_skip(p)
            continue
        if name in _SKIPPED_NAMES:
            continue
        if len(code) != 1:
            code = named_to_unicode.get(name, name)
        elif unicodedata.category(code) == "Co":
            # Private-use character: fall back to the name.
            code = name
        code = _styled_letter(code)
        if name.startswith("Formal") and len(code) == 1:
            code = f"{code}{_DOT_BELOW}"
        print(f"{_quoted(name)}:{_quoted(code)},")
        d[name] = code
        if not name.isascii() or not name.isprintable():
            print(
                f"Warning: {name} is not printable ASCII: {code!r}",
                file=sys.stderr,
            )
        # Warn when any character of `name` is outside [a-zA-Z0-9].
        if not (name.isascii() and name.isalnum()):
            print(
                f"Warning: {name} contains non-alphanumeric characters",
                file=sys.stderr,
            )
    print("}")
    return d
# Run the extraction only when this file is executed as a script, not when
# it is imported.  The returned dictionary is discarded here; extract()
# prints the mapping itself.
if __name__ == "__main__":
    extract()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment