dmyersturnbull · August 5, 2025 04:17
diff --git a/json_key_patterns.py b/json_key_patterns.py
 # SPDX-FileCopyrightText: Copyright 2020-2024, Contributors
 # SPDX-PackageHomePage: https://gist.github.com/dmyersturnbull
 # SPDX-License-Identifier: Apache-2.0
 """Collection of regex strings that JSON keys can be matched against.

 All patterns are maximally high-performance (use the fewest CPU cycles and the least memory).
 Includes `kebab-case`, `snake_case`, `camelCase`, `PascalCase`, etc.
 Also has a blacklist of "problem" Unicode code points, which we define as
 control characters, surrogate characters, private use area (PUA) code points, and non-characters.
 There are two very general patterns, which use this blacklist:
 - One or more Unicode characters (among all 17 planes, Plane 0 through Plane 16), except those blacklisted.
 - One or more characters from the Basic Multilingual Plane (BMP; Plane 0), except those blacklisted.
 """

 import re
 from dataclasses import dataclass
 from typing import ClassVar, Final

 # Alias for a compiled pattern that operates on `str` (as opposed to `bytes`).
 Regex = re.Pattern[str]

 # Character-class fragments (`lo-hi`, written with `re` escapes) for the
 # "problem" code points. They are concatenated inside a negated class.
 _problem_utf_ranges: Final = [
     r"\x00-\x1F",  # C0 controls
     r"\x7F-\x9F",  # DEL followed by the C1 controls
     r"\uD800-\uDFFF",  # surrogates
     r"\uE000-\uF8FF",  # Private Use Area (PUA) inside the BMP
     r"\uFDD0-\uFDEF",  # first block of BMP non-characters
     r"\uFFFE-\uFFFF",  # last two code points of the BMP (non-characters)
     r"\U000F0000-\U000FFFFD",  # PUA covering Plane 15
     r"\U00100000-\U0010FFFD",  # PUA covering Plane 16
     # The last two code points (xxFFFE and xxFFFF) of each of Planes 1 through 16
     # are non-characters; the plane number is rendered as four uppercase hex digits.
     *(rf"\U{plane:04X}FFFE-\U{plane:04X}FFFF" for plane in range(1, 17)),
 ]

 # Character-class fragments for the BMP code points that remain once the
 # problem ranges above are removed. They are concatenated inside a positive class.
 _safe_bmp_ranges: Final = [
     r"\u0020-\u007E",  # printable ASCII
     r"\u00A0-\uD7FF",  # after the C1 controls (ends at 009F), before the surrogates (start at D800)
     r"\uF900-\uFDCF",  # after the BMP PUA, before the first non-character block
     r"\uFDF0-\uFFFD",  # after the first non-character block, before FFFE/FFFF
 ]


 @dataclass(frozen=True, slots=True, eq=False)
 class KeyPatterns:
     r"""Standard and recommended key patterns.

     The class has no instance fields: every pattern is a precompiled `ClassVar`,
     so it is read straight off the class (`KeyPatterns.snake`).

     Note that all patterns only match non-empty strings.
     The case-style patterns carry no anchors; validate a whole key with `fullmatch`.
     `sane` and `bmp` are anchored with `^` and `\Z`, so they cover the entire string
     even under `match` or `search` (a trailing newline is not tolerated).

     The possessive quantifiers (`++`, `*+`) require Python 3.11 or newer.

     Examples:
         >>> KeyPatterns.sane.fullmatch("-😀-")
         <re.Match object; span=(0, 3), match='-😀-'>
         >>> KeyPatterns.sane.fullmatch("\u0001")
         >>> KeyPatterns.sane.match("abc\n")

         >>> KeyPatterns.bmp.fullmatch("-αβγ-")
         <re.Match object; span=(0, 5), match='-αβγ-'>
         >>> KeyPatterns.bmp.fullmatch("😀")
         >>> KeyPatterns.bmp.match("abc\n")

         >>> KeyPatterns.mixed.fullmatch("abC_D-efg")
         <re.Match object; span=(0, 9), match='abC_D-efg'>
         >>> KeyPatterns.mixed.fullmatch("-abC")
         >>> KeyPatterns.mixed.fullmatch("a--b")

         >>> KeyPatterns.snake.fullmatch("snake_1")
         <re.Match object; span=(0, 7), match='snake_1'>
         >>> KeyPatterns.snake.fullmatch("WRONG_CASE")
         >>> KeyPatterns.snake.fullmatch("wrong__case")
         >>> KeyPatterns.snake.fullmatch("_")

         >>> KeyPatterns.scream.fullmatch("SCREAM_CASE_1")
         <re.Match object; span=(0, 13), match='SCREAM_CASE_1'>
         >>> KeyPatterns.scream.fullmatch("wrong_case")
         >>> KeyPatterns.scream.fullmatch("WRONG__CASE")
         >>> KeyPatterns.scream.fullmatch("_")

         >>> KeyPatterns.kebab.fullmatch("kebab-case-1")
         <re.Match object; span=(0, 12), match='kebab-case-1'>
         >>> KeyPatterns.kebab.fullmatch("-")
         >>> KeyPatterns.kebab.fullmatch("a--b")

         >>> KeyPatterns.train.fullmatch("Content-Encoding")
         <re.Match object; span=(0, 16), match='Content-Encoding'>
         >>> KeyPatterns.train.fullmatch("X-Frame-Options")
         <re.Match object; span=(0, 15), match='X-Frame-Options'>
         >>> KeyPatterns.train.fullmatch("Wrong-HEADER")

         >>> KeyPatterns.camel.fullmatch("workAtCIA")
         <re.Match object; span=(0, 9), match='workAtCIA'>
         >>> KeyPatterns.camel.fullmatch("WrongCase")

         >>> KeyPatterns.pascal.fullmatch("AttendMIT1")
         <re.Match object; span=(0, 10), match='AttendMIT1'>
         >>> KeyPatterns.pascal.fullmatch("wrongCase")

         >>> KeyPatterns.ecma_id.fullmatch("_aBcD_123")
         <re.Match object; span=(0, 9), match='_aBcD_123'>
         >>> KeyPatterns.ecma_id.fullmatch("1abc")
         >>> KeyPatterns.ecma_id.fullmatch("café")

     Attributes:
         sane: Unicode except "problem" code points (control, surrogate, private use, and non-char).
         bmp: Basic Multilingual Plane (BMP) except problem code points.
         mixed: e.g. `abC_D-efg`. (No consecutive, starting, or ending punctuation.)
         snake: e.g. `snake_1`. (No consecutive, starting, or ending `_`.)
         scream: e.g. `SCREAM_CASE_1`. Uppercase variant of `snake`. Also called constant case.
         kebab: e.g. `kebab-case-1`. (No consecutive, starting, or ending `-`.)
         train: e.g. `Content-Encoding`. Variant of `kebab` but with each first letter capitalized.
             A word may be a single character (`X-Frame-Options`).
         camel: e.g. `camelCase1`. (Note that `iAmAKid` is valid.)
         pascal: e.g. `PascalCase1`. (Note that `IAmAKid` is valid.)
         ecma_id: e.g. `_aBcD_123`; ASCII letters, digits, and `_` only; starts with letter or `_`.
             Can be unquoted in JSONPath and jq.
     """

     # `\Z` rather than `$`: `$` also matches just before a trailing "\n", which would let
     # `match`/`search` accept a key that ends in a newline (itself a C0 control character).
     sane: ClassVar[Regex] = re.compile(rf"^[^{''.join(_problem_utf_ranges)}]++\Z")
     bmp: ClassVar[Regex] = re.compile(rf"^[{''.join(_safe_bmp_ranges)}]++\Z")
     # Alphanumeric runs joined by exactly one `_` or `-`.
     mixed: ClassVar[Regex] = re.compile(r"[A-Za-z0-9]++(?:[_-][A-Za-z0-9]++)*+")
     snake: ClassVar[Regex] = re.compile(r"[a-z0-9]++(?:_[a-z0-9]++)*+")
     scream: ClassVar[Regex] = re.compile(r"[A-Z0-9]++(?:_[A-Z0-9]++)*+")
     kebab: ClassVar[Regex] = re.compile(r"[a-z0-9]++(?:-[a-z0-9]++)*+")
     # Each word: one uppercase letter or digit, then zero or more lowercase letters/digits.
     train: ClassVar[Regex] = re.compile(r"[A-Z0-9][a-z0-9]*+(?:-[A-Z0-9][a-z0-9]*+)*+")
     # Repeated (lowercase/digit run, optional uppercase run); must start lowercase or digit.
     camel: ClassVar[Regex] = re.compile(r"(?:[a-z0-9]++[A-Z]*+)++")
     # Repeated (uppercase/digit run, optional lowercase run); must start uppercase or digit.
     pascal: ClassVar[Regex] = re.compile(r"(?:[A-Z0-9]++[a-z]*+)++")
     # An explicit ASCII class instead of `\w`: on `str` patterns `\w` also matches non-ASCII
     # letters and digits, which jq does not accept in an unquoted key.
     ecma_id: ClassVar[Regex] = re.compile(r"[A-Za-z_][A-Za-z0-9_]*+")
	# SPDX-FileCopyrightText: Copyright 2020-2024, Contributors
	# SPDX-PackageHomePage: https://gist.github.com/dmyersturnbull
	# SPDX-License-Identifier: Apache-2.0
	"""Collection of regex strings that JSON keys can be matched against.

	All patterns are maximally high-performance (use the fewest CPU cycles and the least memory).
	Includes `kebab-case`, `snake_case`, `camelCase`, `PascalCase`, etc.
	Also has a blacklist of "problem" Unicode code points, which we define as
	control characters, surrogate characters, private use area (PUA) code points, and non-characters.
	There are two very general patterns, which use this blacklist:
	- One or more Unicode characters (among all 17 planes, Plane 0 through Plane 16), except those blacklisted.
	- One or more characters from the Basic Multilingual Plane (BMP; Plane 0), except those blacklisted.
	"""

	import re
	from dataclasses import dataclass
	from typing import ClassVar, Final

	# Alias for a compiled pattern that operates on `str` (as opposed to `bytes`).
	Regex = re.Pattern[str]

	# Character-class fragments (`lo-hi`, written with `re` escapes) for the
	# "problem" code points. They are concatenated inside a negated class.
	_problem_utf_ranges: Final = [
	    r"\x00-\x1F",  # C0 controls
	    r"\x7F-\x9F",  # DEL followed by the C1 controls
	    r"\uD800-\uDFFF",  # surrogates
	    r"\uE000-\uF8FF",  # Private Use Area (PUA) inside the BMP
	    r"\uFDD0-\uFDEF",  # first block of BMP non-characters
	    r"\uFFFE-\uFFFF",  # last two code points of the BMP (non-characters)
	    r"\U000F0000-\U000FFFFD",  # PUA covering Plane 15
	    r"\U00100000-\U0010FFFD",  # PUA covering Plane 16
	    # The last two code points (xxFFFE and xxFFFF) of each of Planes 1 through 16
	    # are non-characters; the plane number is rendered as four uppercase hex digits.
	    *(rf"\U{plane:04X}FFFE-\U{plane:04X}FFFF" for plane in range(1, 17)),
	]

	# Character-class fragments for the BMP code points that remain once the
	# problem ranges above are removed. They are concatenated inside a positive class.
	_safe_bmp_ranges: Final = [
	    r"\u0020-\u007E",  # printable ASCII
	    r"\u00A0-\uD7FF",  # after the C1 controls (ends at 009F), before the surrogates (start at D800)
	    r"\uF900-\uFDCF",  # after the BMP PUA, before the first non-character block
	    r"\uFDF0-\uFFFD",  # after the first non-character block, before FFFE/FFFF
	]


	@dataclass(frozen=True, slots=True, eq=False)
	class KeyPatterns:
	    r"""Standard and recommended key patterns.

	    The class has no instance fields: every pattern is a precompiled `ClassVar`,
	    so it is read straight off the class (`KeyPatterns.snake`).

	    Note that all patterns only match non-empty strings.
	    The case-style patterns carry no anchors; validate a whole key with `fullmatch`.
	    `sane` and `bmp` are anchored with `^` and `\Z`, so they cover the entire string
	    even under `match` or `search` (a trailing newline is not tolerated).

	    The possessive quantifiers (`++`, `*+`) require Python 3.11 or newer.

	    Examples:
	        >>> KeyPatterns.sane.fullmatch("-😀-")
	        <re.Match object; span=(0, 3), match='-😀-'>
	        >>> KeyPatterns.sane.fullmatch("\u0001")
	        >>> KeyPatterns.sane.match("abc\n")

	        >>> KeyPatterns.bmp.fullmatch("-αβγ-")
	        <re.Match object; span=(0, 5), match='-αβγ-'>
	        >>> KeyPatterns.bmp.fullmatch("😀")
	        >>> KeyPatterns.bmp.match("abc\n")

	        >>> KeyPatterns.mixed.fullmatch("abC_D-efg")
	        <re.Match object; span=(0, 9), match='abC_D-efg'>
	        >>> KeyPatterns.mixed.fullmatch("-abC")
	        >>> KeyPatterns.mixed.fullmatch("a--b")

	        >>> KeyPatterns.snake.fullmatch("snake_1")
	        <re.Match object; span=(0, 7), match='snake_1'>
	        >>> KeyPatterns.snake.fullmatch("WRONG_CASE")
	        >>> KeyPatterns.snake.fullmatch("wrong__case")
	        >>> KeyPatterns.snake.fullmatch("_")

	        >>> KeyPatterns.scream.fullmatch("SCREAM_CASE_1")
	        <re.Match object; span=(0, 13), match='SCREAM_CASE_1'>
	        >>> KeyPatterns.scream.fullmatch("wrong_case")
	        >>> KeyPatterns.scream.fullmatch("WRONG__CASE")
	        >>> KeyPatterns.scream.fullmatch("_")

	        >>> KeyPatterns.kebab.fullmatch("kebab-case-1")
	        <re.Match object; span=(0, 12), match='kebab-case-1'>
	        >>> KeyPatterns.kebab.fullmatch("-")
	        >>> KeyPatterns.kebab.fullmatch("a--b")

	        >>> KeyPatterns.train.fullmatch("Content-Encoding")
	        <re.Match object; span=(0, 16), match='Content-Encoding'>
	        >>> KeyPatterns.train.fullmatch("Wrong-HEADER")

	        >>> KeyPatterns.camel.fullmatch("workAtCIA")
	        <re.Match object; span=(0, 9), match='workAtCIA'>
	        >>> KeyPatterns.camel.fullmatch("WrongCase")

	        >>> KeyPatterns.pascal.fullmatch("AttendMIT1")
	        <re.Match object; span=(0, 10), match='AttendMIT1'>
	        >>> KeyPatterns.pascal.fullmatch("wrongCase")

	        >>> KeyPatterns.ecma_id.fullmatch("_aBcD_123")
	        <re.Match object; span=(0, 9), match='_aBcD_123'>
	        >>> KeyPatterns.ecma_id.fullmatch("1abc")
	        >>> KeyPatterns.ecma_id.fullmatch("café")

	    Attributes:
	        sane: Unicode except "problem" code points (control, surrogate, private use, and non-char).
	        bmp: Basic Multilingual Plane (BMP) except problem code points.
	        mixed: e.g. `abC_D-efg`. (No consecutive, starting, or ending punctuation.)
	        snake: e.g. `snake_1`. (No consecutive, starting, or ending `_`.)
	        scream: e.g. `SCREAM_CASE_1`. Uppercase variant of `snake`. Also called constant case.
	        kebab: e.g. `kebab-case-1`. (No consecutive, starting, or ending `-`.)
	        train: e.g. `Content-Encoding`. Variant of `kebab` but with each first letter capitalized.
	            Every word needs at least two characters.
	        camel: e.g. `camelCase1`. (Note that `iAmAKid` is valid.)
	        pascal: e.g. `PascalCase1`. (Note that `IAmAKid` is valid.)
	        ecma_id: e.g. `_aBcD_123`; ASCII letters, digits, and `_` only; starts with letter or `_`.
	            Can be unquoted in JSONPath and jq.
	    """

	    # `\Z` rather than `$`: `$` also matches just before a trailing "\n", which would let
	    # `match`/`search` accept a key that ends in a newline (itself a C0 control character).
	    sane: ClassVar[Regex] = re.compile(rf"^[^{''.join(_problem_utf_ranges)}]++\Z")
	    bmp: ClassVar[Regex] = re.compile(rf"^[{''.join(_safe_bmp_ranges)}]++\Z")
	    # Alphanumeric runs joined by exactly one `_` or `-`.
	    mixed: ClassVar[Regex] = re.compile(r"[A-Za-z0-9]++(?:[_-][A-Za-z0-9]++)*+")
	    snake: ClassVar[Regex] = re.compile(r"[a-z0-9]++(?:_[a-z0-9]++)*+")
	    scream: ClassVar[Regex] = re.compile(r"[A-Z0-9]++(?:_[A-Z0-9]++)*+")
	    kebab: ClassVar[Regex] = re.compile(r"[a-z0-9]++(?:-[a-z0-9]++)*+")
	    # Each word: one uppercase letter or digit, then one or more lowercase letters/digits.
	    # Possessive `++` matches the same strings as the former `+` (the run can only be
	    # followed by `-` or the end of the pattern) but never backtracks.
	    # NOTE(review): single-character words such as the `X` in `X-Frame-Options` are
	    # rejected by this pattern -- confirm that is intended.
	    train: ClassVar[Regex] = re.compile(r"[A-Z0-9][a-z0-9]++(?:-[A-Z0-9][a-z0-9]++)*+")
	    # Repeated (lowercase/digit run, optional uppercase run); must start lowercase or digit.
	    camel: ClassVar[Regex] = re.compile(r"(?:[a-z0-9]++[A-Z]*+)++")
	    # Repeated (uppercase/digit run, optional lowercase run); must start uppercase or digit.
	    pascal: ClassVar[Regex] = re.compile(r"(?:[A-Z0-9]++[a-z]*+)++")
	    # An explicit ASCII class instead of `\w`: on `str` patterns `\w` also matches non-ASCII
	    # letters and digits, which jq does not accept in an unquoted key.
	    ecma_id: ClassVar[Regex] = re.compile(r"[A-Za-z_][A-Za-z0-9_]*+")