Skip to content

Instantly share code, notes, and snippets.

@dmyersturnbull
Created August 5, 2025 04:17
Show Gist options
  • Save dmyersturnbull/c9575f16317d8c52cf9b93209e921ecf to your computer and use it in GitHub Desktop.
Save dmyersturnbull/c9575f16317d8c52cf9b93209e921ecf to your computer and use it in GitHub Desktop.
Various regex for JSON key names. Blacklist "problem" Unicode code points (control, surrogate, private use, etc.).
# SPDX-FileCopyrightText: Copyright 2020-2024, Contributors
# SPDX-PackageHomePage: https://gist.github.com/dmyersturnbull
# SPDX-License-Identifier: Apache-2.0
"""Collection of regex strings that JSON keys can be matched against.
All patterns are maximally high-performance (use the fewest CPU cycles and the least memory).
Includes `kebab-case`, `snake_case`, `camelCase`, `PascalCase`, etc.
Also has a blacklist of "problem" Unicode code points, which we define as
control characters, surrogate characters, private use area (PUA) code points, and non-characters.
There are two very general patterns, which use this blacklist:
- One or more Unicode characters (among all 17 planes, Planes 0-16), except those blacklisted.
- One or more characters from the Basic Multilingual Plane (BMP; Plane 0), except those blacklisted.
"""
import re
from dataclasses import dataclass
from typing import ClassVar, Final
# Alias for a compiled regular expression that operates on `str` (not `bytes`).
Regex = re.Pattern[str]

# Character-class ranges, written in `re` escape syntax, that cover every "problem" code point.
# They are meant to be concatenated inside a single bracket expression.
_problem_utf_ranges: Final = [
    r"\x00-\x1F",  # C0 controls
    r"\x7F-\x9F",  # DEL followed by the C1 controls
    r"\uD800-\uDFFF",  # surrogates
    r"\uE000-\uF8FF",  # Private Use Area (PUA) inside the BMP
    r"\uFDD0-\uFDEF",  # BMP non-characters, first range
    r"\uFFFE-\uFFFF",  # BMP non-characters, second range
    r"\U000F0000-\U000FFFFD",  # PUA occupying Plane 15
    r"\U00100000-\U0010FFFD",  # PUA occupying Plane 16
    # The final two code points of each of Planes 1 through 16 are non-characters.
    *[rf"\U{plane:04X}FFFE-\U{plane:04X}FFFF" for plane in range(1, 17)],
]

# Character-class ranges for the Basic Multilingual Plane with the problem code points left out.
_safe_bmp_ranges: Final = [
    r"\u0020-\u007E",  # printable ASCII
    r"\u00A0-\uD7FF",  # after the last C1 control (U+009F), up to just before the surrogates (U+D800)
    r"\uF900-\uFDCF",  # after the BMP PUA, up to just before the first non-character range
    r"\uFDF0-\uFFFD",  # between the first and the second non-character ranges
]
@dataclass(frozen=True, slots=True, eq=False)
class KeyPatterns:
    r"""Standard and recommended key patterns.

    All patterns only match non-empty strings.
    `sane` and `bmp` are anchored at both ends of the string.
    The remaining patterns carry no anchors, so validate a key with `fullmatch`
    (as in the examples below) rather than with `match` or `search`.

    Examples:
        >>> KeyPatterns.sane.fullmatch("-😀-")
        <re.Match object; span=(0, 3), match='-😀-'>
        >>> KeyPatterns.sane.fullmatch("\u0001")
        >>> KeyPatterns.sane.match("key\n")
        >>> KeyPatterns.bmp.fullmatch("-αβγ-")
        <re.Match object; span=(0, 5), match='-αβγ-'>
        >>> KeyPatterns.bmp.fullmatch("😀")
        >>> KeyPatterns.bmp.match("key\n")
        >>> KeyPatterns.mixed.fullmatch("abC_D-efg")
        <re.Match object; span=(0, 9), match='abC_D-efg'>
        >>> KeyPatterns.mixed.fullmatch("-abC")
        >>> KeyPatterns.mixed.fullmatch("a--b")
        >>> KeyPatterns.snake.fullmatch("snake_1")
        <re.Match object; span=(0, 7), match='snake_1'>
        >>> KeyPatterns.snake.fullmatch("WRONG_CASE")
        >>> KeyPatterns.snake.fullmatch("wrong__case")
        >>> KeyPatterns.snake.fullmatch("_")
        >>> KeyPatterns.scream.fullmatch("SCREAM_CASE_1")
        <re.Match object; span=(0, 13), match='SCREAM_CASE_1'>
        >>> KeyPatterns.scream.fullmatch("wrong_case")
        >>> KeyPatterns.scream.fullmatch("WRONG__CASE")
        >>> KeyPatterns.scream.fullmatch("_")
        >>> KeyPatterns.kebab.fullmatch("kebab-case-1")
        <re.Match object; span=(0, 12), match='kebab-case-1'>
        >>> KeyPatterns.kebab.fullmatch("-")
        >>> KeyPatterns.kebab.fullmatch("a--b")
        >>> KeyPatterns.train.fullmatch("Content-Encoding")
        <re.Match object; span=(0, 16), match='Content-Encoding'>
        >>> KeyPatterns.train.fullmatch("Wrong-HEADER")
        >>> KeyPatterns.camel.fullmatch("workAtCIA")
        <re.Match object; span=(0, 9), match='workAtCIA'>
        >>> KeyPatterns.camel.fullmatch("WrongCase")
        >>> KeyPatterns.pascal.fullmatch("AttendMIT1")
        <re.Match object; span=(0, 10), match='AttendMIT1'>
        >>> KeyPatterns.pascal.fullmatch("wrongCase")
        >>> KeyPatterns.ecma_id.fullmatch("_aBcD_123")
        <re.Match object; span=(0, 9), match='_aBcD_123'>
        >>> KeyPatterns.ecma_id.fullmatch("1abc")
        >>> KeyPatterns.ecma_id.fullmatch("aé")

    Attributes:
        sane: Unicode except "problem" code points (control, surrogate, private use, and non-char).
        bmp: Basic Multilingual Plane (BMP) except problem code points.
        mixed: e.g. `abC_D-efg`. (No consecutive, starting, or ending punctuation.)
        snake: e.g. `snake_1`. (No consecutive, starting, or ending `_`.)
        scream: e.g. `SCREAM_CASE_1`. Uppercase variant of `snake`. Also called constant case.
        kebab: e.g. `kebab-case-1`. (No consecutive, starting, or ending `-`.)
        train: e.g. `Content-Encoding`. Variant of `kebab` but with each first letter capitalized.
        camel: e.g. `camelCase1`. (Note that `iAmAKid` is valid.)
        pascal: e.g. `PascalCase1`. (Note that `IAmAKid` is valid.)
        ecma_id: e.g. `_aBcD_123`; starts with an ASCII letter or `_`, followed by ASCII letters,
            ASCII digits, or `_`. Can be unquoted in JSONPath and jq.
    """

    # End with `\Z`, not `$`: `$` also matches just before a trailing newline, which would let
    # `match`/`search` accept a key such as "key\n" even though the newline is a blacklisted control.
    sane: ClassVar[Regex] = re.compile(rf"^[^{''.join(_problem_utf_ranges)}]++\Z")
    bmp: ClassVar[Regex] = re.compile(rf"^[{''.join(_safe_bmp_ranges)}]++\Z")
    mixed: ClassVar[Regex] = re.compile(r"[A-Za-z0-9]++(?:[_-][A-Za-z0-9]++)*+")
    snake: ClassVar[Regex] = re.compile(r"[a-z0-9]++(?:_[a-z0-9]++)*+")
    scream: ClassVar[Regex] = re.compile(r"[A-Z0-9]++(?:_[A-Z0-9]++)*+")
    kebab: ClassVar[Regex] = re.compile(r"[a-z0-9]++(?:-[a-z0-9]++)*+")
    train: ClassVar[Regex] = re.compile(r"[A-Z0-9][a-z0-9]*+(?:-[A-Z0-9][a-z0-9]*+)*+")
    camel: ClassVar[Regex] = re.compile(r"(?:[a-z0-9]++[A-Z]*+)++")
    pascal: ClassVar[Regex] = re.compile(r"(?:[A-Z0-9]++[a-z]*+)++")
    # Explicit ASCII class instead of `\w`: for `str` patterns `\w` is Unicode-aware and would admit
    # non-ASCII letters and digits, which jq does not accept in an unquoted `.name` lookup.
    ecma_id: ClassVar[Regex] = re.compile(r"[A-Za-z_][A-Za-z0-9_]*+")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment