Created
August 5, 2025 04:17
-
-
Save dmyersturnbull/c9575f16317d8c52cf9b93209e921ecf to your computer and use it in GitHub Desktop.
Various regex for JSON key names. Blacklist "problem" Unicode code points (control, surrogate, private use, etc.).
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# SPDX-FileCopyrightText: Copyright 2020-2024, Contributors
# SPDX-PackageHomePage: https://gist.github.com/dmyersturnbull
# SPDX-License-Identifier: Apache-2.0
"""Collection of regex strings that JSON keys can be matched against.

All patterns are maximally high-performance (use the fewest CPU cycles and the least memory).
Includes `kebab-case`, `snake_case`, `camelCase`, `PascalCase`, etc.
Also has a blacklist of "problem" Unicode code points, which we define as
control characters, surrogate characters, private use area (PUA) code points, and non-characters.

There are two very general patterns, which use this blacklist:

- One or more Unicode characters (among all 17 planes, 0 through 16), except those blacklisted.
- One or more characters from the Basic Multilingual Plane (BMP; Plane 0), except those blacklisted.
"""
import re
from dataclasses import dataclass
from typing import ClassVar, Final

# Alias for a compiled pattern that operates on `str` (rather than `bytes`).
Regex = re.Pattern[str]
# Character-class fragments (meant to be joined and placed inside `[...]` or `[^...]`)
# that together cover the "problem" code points: controls, surrogates,
# private-use areas, and non-characters.
_problem_utf_ranges: Final = [
    r"\x00-\x1F",  # C0 control
    r"\x7F-\x9F",  # DEL + C1 control
    r"\uD800-\uDFFF",  # surrogate
    r"\uE000-\uF8FF",  # BMP Private Use Area (PUA)
    r"\uFDD0-\uFDEF",  # BMP non-char (first range)
    r"\uFFFE-\uFFFF",  # BMP non-char (second range)
    r"\U000F0000-\U000FFFFD",  # Plane-15 PUA
    r"\U00100000-\U0010FFFD",  # Plane-16 PUA
    # Non-chars of the supplementary planes: the last two code points of each
    # of Planes 1 through 16, i.e. `\U0001FFFE-\U0001FFFF` ... `\U0010FFFE-\U0010FFFF`.
    *(rf"\U{plane:04X}FFFE-\U{plane:04X}FFFF" for plane in range(1, 17)),
]
# Character-class fragments (meant to be joined and placed inside `[...]`) listing
# the BMP code points that remain once the BMP "problem" ranges are removed.
_safe_bmp_ranges: Final = [
    r"\u0020-\u007E",  # printable ASCII
    r"\u00A0-\uD7FF",  # after the last C1 control (009F) up to just before the first surrogate (D800)
    r"\uF900-\uFDCF",  # after the BMP PUA (ends F8FF) up to just before the first non-char (FDD0)
    r"\uFDF0-\uFFFD",  # after the first non-char range (ends FDEF) up to just before FFFE
]
@dataclass(frozen=True, slots=True, eq=False)
class KeyPatterns:
    """Standard and recommended key patterns.

    Note that all patterns only match non-empty strings.
    Except for `sane` and `bmp`, the patterns carry no anchors of their own,
    so use `fullmatch` (as in the examples) to validate a whole key.
    The patterns use possessive quantifiers (`++`, `*+`), which the `re` module
    supports from Python 3.11 onward.

    Examples:
        >>> KeyPatterns.sane.fullmatch("-😀-")
        <re.Match object; span=(0, 3), match='-😀-'>
        >>> KeyPatterns.sane.fullmatch("\\u0001")
        >>> KeyPatterns.bmp.fullmatch("-αβγ-")
        <re.Match object; span=(0, 5), match='-αβγ-'>
        >>> KeyPatterns.bmp.fullmatch("😀")
        >>> KeyPatterns.mixed.fullmatch("abC_D-efg")
        <re.Match object; span=(0, 9), match='abC_D-efg'>
        >>> KeyPatterns.mixed.fullmatch("-abC")
        >>> KeyPatterns.mixed.fullmatch("a--b")
        >>> KeyPatterns.snake.fullmatch("snake_1")
        <re.Match object; span=(0, 7), match='snake_1'>
        >>> KeyPatterns.snake.fullmatch("WRONG_CASE")
        >>> KeyPatterns.snake.fullmatch("wrong__case")
        >>> KeyPatterns.snake.fullmatch("_")
        >>> KeyPatterns.scream.fullmatch("SCREAM_CASE_1")
        <re.Match object; span=(0, 13), match='SCREAM_CASE_1'>
        >>> KeyPatterns.scream.fullmatch("wrong_case")
        >>> KeyPatterns.scream.fullmatch("WRONG__CASE")
        >>> KeyPatterns.scream.fullmatch("_")
        >>> KeyPatterns.kebab.fullmatch("kebab-case-1")
        <re.Match object; span=(0, 12), match='kebab-case-1'>
        >>> KeyPatterns.kebab.fullmatch("-")
        >>> KeyPatterns.kebab.fullmatch("a--b")
        >>> KeyPatterns.train.fullmatch("Content-Encoding")
        <re.Match object; span=(0, 16), match='Content-Encoding'>
        >>> KeyPatterns.train.fullmatch("Wrong-HEADER")
        >>> KeyPatterns.camel.fullmatch("workAtCIA")
        <re.Match object; span=(0, 9), match='workAtCIA'>
        >>> KeyPatterns.camel.fullmatch("WrongCase")
        >>> KeyPatterns.pascal.fullmatch("AttendMIT1")
        <re.Match object; span=(0, 10), match='AttendMIT1'>
        >>> KeyPatterns.pascal.fullmatch("wrongCase")
        >>> KeyPatterns.ecma_id.fullmatch("_aBcD_123")
        <re.Match object; span=(0, 9), match='_aBcD_123'>
        >>> KeyPatterns.ecma_id.fullmatch("1abc")

    Attributes:
        sane: Unicode except "problem" code points (control, surrogate, private use, and non-char).
        bmp: Basic Multilingual Plane (BMP) except problem code points.
        mixed: e.g. `abC_D-efg`. (No consecutive, starting, or ending punctuation.)
        snake: e.g. `snake_1`. (No consecutive, starting, or ending `_`.)
        scream: e.g. `SCREAM_CASE_1`. Uppercase variant of `snake`. Also called constant case.
        kebab: e.g. `kebab-case-1`. (No consecutive, starting, or ending `-`.)
        train: e.g. `Content-Encoding`. Variant of `kebab` but with each first letter capitalized.
        camel: e.g. `camelCase1`. (Note that `iAmAKid` is valid.)
        pascal: e.g. `PascalCase1`. (Note that `IAmAKid` is valid.)
        ecma_id: e.g. `_aBcD_123`; starts with letter or `_`. Can be unquoted in JSONPath and jq.
    """

    # `sane` and `bmp` end with `\Z` rather than `$`: `$` also matches just before a
    # trailing newline, which would let `match`/`search` accept a key ending in "\n"
    # even though "\n" is a blacklisted control character.
    sane: ClassVar[Regex] = re.compile(rf"^[^{''.join(_problem_utf_ranges)}]++\Z")
    bmp: ClassVar[Regex] = re.compile(rf"^[{''.join(_safe_bmp_ranges)}]++\Z")

    # Separator-delimited styles: one or more alphanumeric runs joined by single separators.
    mixed: ClassVar[Regex] = re.compile(r"[A-Za-z0-9]++(?:[_-][A-Za-z0-9]++)*+")
    snake: ClassVar[Regex] = re.compile(r"[a-z0-9]++(?:_[a-z0-9]++)*+")
    scream: ClassVar[Regex] = re.compile(r"[A-Z0-9]++(?:_[A-Z0-9]++)*+")
    kebab: ClassVar[Regex] = re.compile(r"[a-z0-9]++(?:-[a-z0-9]++)*+")
    train: ClassVar[Regex] = re.compile(r"[A-Z0-9][a-z0-9]*+(?:-[A-Z0-9][a-z0-9]*+)*+")

    # Case-delimited styles: `camel` must start with a lowercase letter or digit,
    # `pascal` with an uppercase letter or digit.
    camel: ClassVar[Regex] = re.compile(r"(?:[a-z0-9]++[A-Z]*+)++")
    pascal: ClassVar[Regex] = re.compile(r"(?:[A-Z0-9]++[a-z]*+)++")

    # NOTE(review): for `str` patterns `\w` is Unicode-aware, so non-ASCII word
    # characters are accepted after the first character; confirm that is intended.
    ecma_id: ClassVar[Regex] = re.compile(r"[A-Za-z_]\w*")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment