moble · June 16, 2025 19:12
diff --git a/named_characters.py b/named_characters.py
 """

 This script downloads and extracts named characters from the Wolfram
 Language documentation and generates a dictionary mapping those names
 to their Unicode representations.  The named characters are available
 at

    https://reference.wolfram.com/language/guide/ListingOfNamedCharacters.html

 Not all named characters have a direct Unicode equivalent, in which
 case the name itself is used as the value. Some characters — "Null"
 and "RawEscape" — do not fit nicely in a Python script and are
 skipped.

 """


 import sys
 import unicodedata
 from bs4 import BeautifulSoup
 import requests

 # Fallback table for named characters whose entry on the web page does
 # not include a usable rendering of the character itself.  Names that
 # are commented out have no good Unicode equivalent.  Values are spelled
 # with Unicode character names where one exists so that the otherwise
 # invisible characters are self-describing.
 named_to_unicode = {
    # AlignmentMarker
    # AutoLeftMatch
    # AutoOperand
    # AutoRightMatch
    # AutoSpace
    "DiscretionaryHyphen": "\N{SOFT HYPHEN}",
    "DiscretionaryLineSeparator": "\N{LINE SEPARATOR}",
    "DiscretionaryParagraphSeparator": "\N{PARAGRAPH SEPARATOR}",
    # ImplicitPlus
    # IndentingNewLine
    "InvisibleApplication": "\N{FUNCTION APPLICATION}",
    "InvisibleComma": "\N{INVISIBLE SEPARATOR}",
    # InvisiblePostfixScriptBase
    # InvisiblePrefixScriptBase
    "InvisibleSpace": "\N{ZERO WIDTH SPACE}",
    "InvisibleTimes": "\N{INVISIBLE TIMES}",
    # LetterSpace
    "LineSeparator": "\N{LINE SEPARATOR}",
    "MediumSpace": "\N{FOUR-PER-EM SPACE}",
    "NewLine": "\n",
    "ParagraphSeparator": "\N{PARAGRAPH SEPARATOR}",
    # RawReturn
    "ThickSpace": "\N{THREE-PER-EM SPACE}",
    "ThinSpace": "\N{THIN SPACE}",
    "VeryThinSpace": "\N{HAIR SPACE}",
    # NegativeThinSpace
    # NegativeMediumSpace
    # NegativeThickSpace
    # NegativeVeryThinSpace
    "NoBreak": "\N{NO-BREAK SPACE}",
    "NonBreakingSpace": "\N{NO-BREAK SPACE}",
    # U+0000 is a control character with no formal Unicode name.
    "Null": "\u0000",
 }

 # Ordered rewrite rules for styled-letter names, written as
 # (name prefix, Unicode-name prefix, suffix).  A longer prefix is listed
 # before the shorter prefix it contains, and the "Formal" variants append
 # U+0323 (COMBINING DOT BELOW) to the character that was looked up.
 _STYLED_LETTER_RULES = (
    ("FormalScriptCapital", "MATHEMATICAL SCRIPT CAPITAL ", "\u0323"),
    ("FormalScriptCapital", "SCRIPT CAPITAL ", "\u0323"),
    ("ScriptCapital", "MATHEMATICAL SCRIPT CAPITAL ", ""),
    ("FormalScript", "MATHEMATICAL SCRIPT SMALL ", "\u0323"),
    ("FormalScript", "SCRIPT SMALL ", "\u0323"),
    ("Script", "MATHEMATICAL SCRIPT SMALL ", ""),
    ("GothicCapital", "MATHEMATICAL FRAKTUR CAPITAL ", ""),
    ("Gothic", "MATHEMATICAL FRAKTUR SMALL ", ""),
    ("DoubleStruckCapital", "MATHEMATICAL DOUBLE-STRUCK CAPITAL ", ""),
    ("DoubleStruckCapital", "DOUBLE-STRUCK CAPITAL ", ""),
    ("DoubleStruck", "MATHEMATICAL DOUBLE-STRUCK SMALL ", ""),
 )


 def _styled_letter(code):
    """Return the Unicode character for a styled-letter name, if one exists.

    Each rule whose prefix matches ``code`` is tried in order; the first
    successful ``unicodedata.lookup`` wins.  ``code`` is returned
    unchanged when no rule matches or every lookup fails.
    """
    for prefix, unicode_prefix, suffix in _STYLED_LETTER_RULES:
        if not code.startswith(prefix):
            continue
        try:
            # Look up everything after the prefix rather than just the
            # last character, so a remainder longer than one letter
            # fails the lookup instead of silently matching.
            return unicodedata.lookup(unicode_prefix + code[len(prefix):]) + suffix
        except KeyError:
            continue
    return code


 def extract():
    """Download the Wolfram named-character listing and print it as a dict.

    Every ``<p class="singleFunction">`` element of the page is examined.
    The character name and its rendering are taken from, in order of
    preference, a ``special-character`` span, a ``GuideCharacterImage``
    span paired with an ``ICN`` span, or an ``ICN`` span alone.  Entries
    are printed to stdout as ``"name":"value",`` lines between ``{`` and
    ``}``; skipped paragraphs and warnings go to stderr.

    Returns:
        dict: Maps each name to its value exactly as printed, i.e. a lone
        backslash or double quote is stored in its escaped form.

    Raises:
        requests.RequestException: If the download fails, times out, or
            the server answers with an error status.
    """
    d = {}
    # Without a timeout a stalled connection would block forever.
    resp = requests.get(
        "https://reference.wolfram.com/language/guide/ListingOfNamedCharacters.html",
        timeout=30,
    )
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, "html.parser")
    print("{")
    for p in soup.find_all("p", class_="singleFunction"):
        sc = p.find("span", class_="special-character")
        if sc is not None:
            # The name is the one CSS class that is not a generic marker.
            name = next(
                (c for c in sc["class"] if c not in ("special-character", "formalcharacter")),
                None,
            )
            if name is None:
                # No identifying class: nothing sensible to emit, and the
                # string operations below would fail on None.
                print(f"Skipping: {p}", file=sys.stderr)
                continue
            code = sc.get_text(strip=True)
        else:
            guide = p.find("span", class_="GuideCharacterImage")
            icn = p.find("span", class_="ICN")
            icn_text = None if icn is None else icn.get_text(strip=True)
            # True when the ICN text has the form \[Name].
            bracketed = (
                icn_text is not None
                and icn_text.startswith(r"\[")
                and icn_text.endswith("]")
            )
            if guide is None:
                if bracketed:
                    # Only the name is available; map it through the
                    # fallback table and emit it as-is.
                    name = icn_text[2:-1]
                    code = named_to_unicode.get(name, name)
                    print(f'"{name}":"{code}",')
                    d[name] = code
                else:
                    print(f"Skipping: {p}", file=sys.stderr)
                continue
            a = guide.find("a")
            if a is None or icn is None:
                print(f"Skipping: {p}", file=sys.stderr)
                continue
            code = a.get_text(strip=True)
            name = icn_text[2:-1] if bracketed else icn_text

        if name in ("Null", "RawEscape"):
            continue  # Don't bother with these, as they screw up the output
        if len(code) != 1:
            code = named_to_unicode.get(name, name)
        elif unicodedata.category(code) == "Co":  # This is a private use character
            code = name

        code = _styled_letter(code)
        if name.startswith("Formal") and len(code) == 1:
            code = f"{code}\u0323"

        # Escape the two characters that would terminate or corrupt the
        # quoted value.
        # NOTE(review): other values that cannot appear raw inside a quoted
        # string (e.g. the "\n" that named_to_unicode maps "NewLine" to)
        # are still emitted unescaped -- confirm how the output is consumed.
        if code == "\\":
            code = "\\\\"
        elif code == '"':
            code = '\\"'

        print(f'"{name}":"{code}",')
        d[name] = code

        if not name.isascii() or not name.isprintable():
            print(f"Warning: {name} is not printable ASCII: {code!r}", file=sys.stderr)

        # Warn if any character of `name` is outside [a-zA-Z0-9].
        if any(not (c.isascii() and c.isalnum()) for c in name):
            print(f"Warning: {name} contains non-alphanumeric characters", file=sys.stderr)

    print("}")
    return d

 # Script entry point: run the scraper only when this file is executed
 # directly, not when it is imported.  The dictionary returned by
 # extract() is discarded here.
 if __name__ == "__main__":
    extract()
	"""

	This script downloads and extracts named characters from the Wolfram
	Language documentation and generates a dictionary mapping those names
	to their Unicode representations. The named characters are available
	at

	https://reference.wolfram.com/language/guide/ListingOfNamedCharacters.html

	Not all named characters have a direct Unicode equivalent, in which
	case the name itself is used as the value. Some characters — "Null"
	and "RawEscape" — do not fit nicely in a Python script and are
	skipped.

	"""


	import sys
	import unicodedata
	from bs4 import BeautifulSoup
	import requests

	# Fallback table for named characters whose entry on the web page does
	# not include a usable rendering of the character itself.  Names that
	# are commented out have no good Unicode equivalent.  Values are spelled
	# with Unicode character names where one exists so that the otherwise
	# invisible characters are self-describing.
	named_to_unicode = {
	    # AlignmentMarker
	    # AutoLeftMatch
	    # AutoOperand
	    # AutoRightMatch
	    # AutoSpace
	    "DiscretionaryHyphen": "\N{SOFT HYPHEN}",
	    "DiscretionaryLineSeparator": "\N{LINE SEPARATOR}",
	    "DiscretionaryParagraphSeparator": "\N{PARAGRAPH SEPARATOR}",
	    # ImplicitPlus
	    # IndentingNewLine
	    "InvisibleApplication": "\N{FUNCTION APPLICATION}",
	    "InvisibleComma": "\N{INVISIBLE SEPARATOR}",
	    # InvisiblePostfixScriptBase
	    # InvisiblePrefixScriptBase
	    "InvisibleSpace": "\N{ZERO WIDTH SPACE}",
	    "InvisibleTimes": "\N{INVISIBLE TIMES}",
	    # LetterSpace
	    "LineSeparator": "\N{LINE SEPARATOR}",
	    "MediumSpace": "\N{FOUR-PER-EM SPACE}",
	    "NewLine": "\n",
	    "ParagraphSeparator": "\N{PARAGRAPH SEPARATOR}",
	    # RawReturn
	    "ThickSpace": "\N{THREE-PER-EM SPACE}",
	    "ThinSpace": "\N{THIN SPACE}",
	    "VeryThinSpace": "\N{HAIR SPACE}",
	    # NegativeThinSpace
	    # NegativeMediumSpace
	    # NegativeThickSpace
	    # NegativeVeryThinSpace
	    "NoBreak": "\N{NO-BREAK SPACE}",
	    "NonBreakingSpace": "\N{NO-BREAK SPACE}",
	    # U+0000 is a control character with no formal Unicode name.
	    "Null": "\u0000",
	}

	# Ordered rewrite rules for styled-letter names, written as
	# (name prefix, Unicode-name prefix, suffix).  A longer prefix is listed
	# before the shorter prefix it contains, and the "Formal" variants append
	# U+0323 (COMBINING DOT BELOW) to the character that was looked up.
	_STYLED_LETTER_RULES = (
	    ("FormalScriptCapital", "MATHEMATICAL SCRIPT CAPITAL ", "\u0323"),
	    ("FormalScriptCapital", "SCRIPT CAPITAL ", "\u0323"),
	    ("ScriptCapital", "MATHEMATICAL SCRIPT CAPITAL ", ""),
	    ("FormalScript", "MATHEMATICAL SCRIPT SMALL ", "\u0323"),
	    ("FormalScript", "SCRIPT SMALL ", "\u0323"),
	    ("Script", "MATHEMATICAL SCRIPT SMALL ", ""),
	    ("GothicCapital", "MATHEMATICAL FRAKTUR CAPITAL ", ""),
	    ("Gothic", "MATHEMATICAL FRAKTUR SMALL ", ""),
	    ("DoubleStruckCapital", "MATHEMATICAL DOUBLE-STRUCK CAPITAL ", ""),
	    ("DoubleStruckCapital", "DOUBLE-STRUCK CAPITAL ", ""),
	    ("DoubleStruck", "MATHEMATICAL DOUBLE-STRUCK SMALL ", ""),
	)


	def _styled_letter(code):
	    """Return the Unicode character for a styled-letter name, if one exists.

	    Each rule whose prefix matches ``code`` is tried in order; the first
	    successful ``unicodedata.lookup`` wins.  ``code`` is returned
	    unchanged when no rule matches or every lookup fails.
	    """
	    for prefix, unicode_prefix, suffix in _STYLED_LETTER_RULES:
	        if not code.startswith(prefix):
	            continue
	        try:
	            # Look up everything after the prefix rather than just the
	            # last character, so a remainder longer than one letter
	            # fails the lookup instead of silently matching.
	            return unicodedata.lookup(unicode_prefix + code[len(prefix):]) + suffix
	        except KeyError:
	            continue
	    return code


	def extract():
	    """Download the Wolfram named-character listing and print it as a dict.

	    Every ``<p class="singleFunction">`` element of the page is examined.
	    The character name and its rendering are taken from, in order of
	    preference, a ``special-character`` span, a ``GuideCharacterImage``
	    span paired with an ``ICN`` span, or an ``ICN`` span alone.  Entries
	    are printed to stdout as ``"name":"value",`` lines between ``{`` and
	    ``}``; skipped paragraphs and warnings go to stderr.

	    Returns:
	        dict: Maps each name to its value exactly as printed, i.e. a lone
	        backslash or double quote is stored in its escaped form.

	    Raises:
	        requests.RequestException: If the download fails, times out, or
	            the server answers with an error status.
	    """
	    d = {}
	    # Without a timeout a stalled connection would block forever.
	    resp = requests.get(
	        "https://reference.wolfram.com/language/guide/ListingOfNamedCharacters.html",
	        timeout=30,
	    )
	    resp.raise_for_status()
	    soup = BeautifulSoup(resp.text, "html.parser")
	    print("{")
	    for p in soup.find_all("p", class_="singleFunction"):
	        sc = p.find("span", class_="special-character")
	        if sc is not None:
	            # The name is the one CSS class that is not a generic marker.
	            name = next(
	                (c for c in sc["class"] if c not in ("special-character", "formalcharacter")),
	                None,
	            )
	            if name is None:
	                # No identifying class: nothing sensible to emit, and the
	                # string operations below would fail on None.
	                print(f"Skipping: {p}", file=sys.stderr)
	                continue
	            code = sc.get_text(strip=True)
	        else:
	            guide = p.find("span", class_="GuideCharacterImage")
	            icn = p.find("span", class_="ICN")
	            icn_text = None if icn is None else icn.get_text(strip=True)
	            # True when the ICN text has the form \[Name].
	            bracketed = (
	                icn_text is not None
	                and icn_text.startswith(r"\[")
	                and icn_text.endswith("]")
	            )
	            if guide is None:
	                if bracketed:
	                    # Only the name is available; map it through the
	                    # fallback table and emit it as-is.
	                    name = icn_text[2:-1]
	                    code = named_to_unicode.get(name, name)
	                    print(f'"{name}":"{code}",')
	                    d[name] = code
	                else:
	                    print(f"Skipping: {p}", file=sys.stderr)
	                continue
	            a = guide.find("a")
	            if a is None or icn is None:
	                print(f"Skipping: {p}", file=sys.stderr)
	                continue
	            code = a.get_text(strip=True)
	            name = icn_text[2:-1] if bracketed else icn_text

	        if name in ("Null", "RawEscape"):
	            continue  # Don't bother with these, as they screw up the output
	        if len(code) != 1:
	            code = named_to_unicode.get(name, name)
	        elif unicodedata.category(code) == "Co":  # This is a private use character
	            code = name

	        code = _styled_letter(code)
	        if name.startswith("Formal") and len(code) == 1:
	            code = f"{code}\u0323"

	        # Escape the two characters that would terminate or corrupt the
	        # quoted value.
	        # NOTE(review): other values that cannot appear raw inside a quoted
	        # string (e.g. the "\n" that named_to_unicode maps "NewLine" to)
	        # are still emitted unescaped -- confirm how the output is consumed.
	        if code == "\\":
	            code = "\\\\"
	        elif code == '"':
	            code = '\\"'

	        print(f'"{name}":"{code}",')
	        d[name] = code

	        if not name.isascii() or not name.isprintable():
	            print(f"Warning: {name} is not printable ASCII: {code!r}", file=sys.stderr)

	        # Warn if any character of `name` is outside [a-zA-Z0-9].
	        if any(not (c.isascii() and c.isalnum()) for c in name):
	            print(f"Warning: {name} contains non-alphanumeric characters", file=sys.stderr)

	    print("}")
	    return d

	# Script entry point: run the scraper only when this file is executed
	# directly, not when it is imported.  The dictionary returned by
	# extract() is discarded here.
	if __name__ == "__main__":
	    extract()