Last active
March 11, 2025 21:53
-
-
Save rendello/d37552507a389656e248f3255a618127 to your computer and use it in GitHub Desktop.
Unicode codepoints that expand or contract when case is changed in UTF-8. Good for testing parsers. Includes the data `utf8_case_data.rs` and the script to generate it, `generate_utf8.py`.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* | |
Copyright (c) 2024 Rendello | |
Permission to use, copy, modify, and/or distribute this software for any | |
purpose with or without fee is hereby granted. | |
THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH | |
REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY | |
AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, | |
INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM | |
LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR | |
OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR | |
PERFORMANCE OF THIS SOFTWARE. | |
*/ | |
// ========================================================================== | |
//! Unicode codepoints that expand or contract when case is changed in UTF-8. | |
// ========================================================================== | |
pub const LOWERCASING_CONTRACTS: [&str; 22] = [ | |
"ẞ", /* ß (3->2), -1 bytes */ | |
"Ω", /* ω (3->2), -1 bytes */ | |
"Å", /* å (3->2), -1 bytes */ | |
"Ɫ", /* ɫ (3->2), -1 bytes */ | |
"Ɽ", /* ɽ (3->2), -1 bytes */ | |
"Ɑ", /* ɑ (3->2), -1 bytes */ | |
"Ɱ", /* ɱ (3->2), -1 bytes */ | |
"Ɐ", /* ɐ (3->2), -1 bytes */ | |
"Ɒ", /* ɒ (3->2), -1 bytes */ | |
"Ȿ", /* ȿ (3->2), -1 bytes */ | |
"Ɀ", /* ɀ (3->2), -1 bytes */ | |
"Ɥ", /* ɥ (3->2), -1 bytes */ | |
"Ɦ", /* ɦ (3->2), -1 bytes */ | |
"Ɜ", /* ɜ (3->2), -1 bytes */ | |
"Ɡ", /* ɡ (3->2), -1 bytes */ | |
"Ɬ", /* ɬ (3->2), -1 bytes */ | |
"Ɪ", /* ɪ (3->2), -1 bytes */ | |
"Ʞ", /* ʞ (3->2), -1 bytes */ | |
"Ʇ", /* ʇ (3->2), -1 bytes */ | |
"Ʝ", /* ʝ (3->2), -1 bytes */ | |
"Ʂ", /* ʂ (3->2), -1 bytes */ | |
"K", /* k (3->1), -2 bytes */ | |
]; | |
pub const LOWERCASING_EXPANDS: [&str; 2] = [ | |
"Ⱥ", /* ⱥ (2->3), +1 bytes */ | |
"Ⱦ", /* ⱦ (2->3), +1 bytes */ | |
]; | |
pub const LOWERCASING_EXPANDS_MULTI_CHAR: [&str; 1] = [ | |
"İ", /* i̇ (2->3), +1 bytes, +1 chars */ | |
]; | |
pub const UPPERCASING_CONTRACTS: [&str; 13] = [ | |
"ı", /* I (2->1), -1 bytes */ | |
"ſ", /* S (2->1), -1 bytes */ | |
"ᲀ", /* В (3->2), -1 bytes */ | |
"ᲁ", /* Д (3->2), -1 bytes */ | |
"ᲂ", /* О (3->2), -1 bytes */ | |
"ᲃ", /* С (3->2), -1 bytes */ | |
"ᲄ", /* Т (3->2), -1 bytes */ | |
"ᲅ", /* Т (3->2), -1 bytes */ | |
"ᲆ", /* Ъ (3->2), -1 bytes */ | |
"ᲇ", /* Ѣ (3->2), -1 bytes */ | |
"ι", /* Ι (3->2), -1 bytes */ | |
"ⱥ", /* Ⱥ (3->2), -1 bytes */ | |
"ⱦ", /* Ⱦ (3->2), -1 bytes */ | |
]; | |
pub const UPPERCASING_CONTRACTS_MULTI_CHAR: [&str; 5] = [ | |
"ff", /* FF (3->2), -1 bytes, +1 chars */ | |
"fi", /* FI (3->2), -1 bytes, +1 chars */ | |
"fl", /* FL (3->2), -1 bytes, +1 chars */ | |
"ſt", /* ST (3->2), -1 bytes, +1 chars */ | |
"st", /* ST (3->2), -1 bytes, +1 chars */ | |
]; | |
pub const UPPERCASING_EXPANDS: [&str; 18] = [ | |
"ȿ", /* Ȿ (2->3), +1 bytes */ | |
"ɀ", /* Ɀ (2->3), +1 bytes */ | |
"ɐ", /* Ɐ (2->3), +1 bytes */ | |
"ɑ", /* Ɑ (2->3), +1 bytes */ | |
"ɒ", /* Ɒ (2->3), +1 bytes */ | |
"ɜ", /* Ɜ (2->3), +1 bytes */ | |
"ɡ", /* Ɡ (2->3), +1 bytes */ | |
"ɥ", /* Ɥ (2->3), +1 bytes */ | |
"ɦ", /* Ɦ (2->3), +1 bytes */ | |
"ɪ", /* Ɪ (2->3), +1 bytes */ | |
"ɫ", /* Ɫ (2->3), +1 bytes */ | |
"ɬ", /* Ɬ (2->3), +1 bytes */ | |
"ɱ", /* Ɱ (2->3), +1 bytes */ | |
"ɽ", /* Ɽ (2->3), +1 bytes */ | |
"ʂ", /* Ʂ (2->3), +1 bytes */ | |
"ʇ", /* Ʇ (2->3), +1 bytes */ | |
"ʝ", /* Ʝ (2->3), +1 bytes */ | |
"ʞ", /* Ʞ (2->3), +1 bytes */ | |
]; | |
pub const UPPERCASING_EXPANDS_MULTI_CHAR: [&str; 89] = [ | |
"ΐ", /* Ϊ́ (2->6), +4 bytes, +2 chars */ | |
"ΰ", /* Ϋ́ (2->6), +4 bytes, +2 chars */ | |
"ὒ", /* Υ̓̀ (3->6), +3 bytes, +2 chars */ | |
"ὔ", /* Υ̓́ (3->6), +3 bytes, +2 chars */ | |
"ὖ", /* Υ̓͂ (3->6), +3 bytes, +2 chars */ | |
"ᾷ", /* Α͂Ι (3->6), +3 bytes, +2 chars */ | |
"ῇ", /* Η͂Ι (3->6), +3 bytes, +2 chars */ | |
"ῒ", /* Ϊ̀ (3->6), +3 bytes, +2 chars */ | |
"ΐ", /* Ϊ́ (3->6), +3 bytes, +2 chars */ | |
"ῗ", /* Ϊ͂ (3->6), +3 bytes, +2 chars */ | |
"ῢ", /* Ϋ̀ (3->6), +3 bytes, +2 chars */ | |
"ΰ", /* Ϋ́ (3->6), +3 bytes, +2 chars */ | |
"ῧ", /* Ϋ͂ (3->6), +3 bytes, +2 chars */ | |
"ῷ", /* Ω͂Ι (3->6), +3 bytes, +2 chars */ | |
"և", /* ԵՒ (2->4), +2 bytes, +1 chars */ | |
"ᾀ", /* ἈΙ (3->5), +2 bytes, +1 chars */ | |
"ᾁ", /* ἉΙ (3->5), +2 bytes, +1 chars */ | |
"ᾂ", /* ἊΙ (3->5), +2 bytes, +1 chars */ | |
"ᾃ", /* ἋΙ (3->5), +2 bytes, +1 chars */ | |
"ᾄ", /* ἌΙ (3->5), +2 bytes, +1 chars */ | |
"ᾅ", /* ἍΙ (3->5), +2 bytes, +1 chars */ | |
"ᾆ", /* ἎΙ (3->5), +2 bytes, +1 chars */ | |
"ᾇ", /* ἏΙ (3->5), +2 bytes, +1 chars */ | |
"ᾈ", /* ἈΙ (3->5), +2 bytes, +1 chars */ | |
"ᾉ", /* ἉΙ (3->5), +2 bytes, +1 chars */ | |
"ᾊ", /* ἊΙ (3->5), +2 bytes, +1 chars */ | |
"ᾋ", /* ἋΙ (3->5), +2 bytes, +1 chars */ | |
"ᾌ", /* ἌΙ (3->5), +2 bytes, +1 chars */ | |
"ᾍ", /* ἍΙ (3->5), +2 bytes, +1 chars */ | |
"ᾎ", /* ἎΙ (3->5), +2 bytes, +1 chars */ | |
"ᾏ", /* ἏΙ (3->5), +2 bytes, +1 chars */ | |
"ᾐ", /* ἨΙ (3->5), +2 bytes, +1 chars */ | |
"ᾑ", /* ἩΙ (3->5), +2 bytes, +1 chars */ | |
"ᾒ", /* ἪΙ (3->5), +2 bytes, +1 chars */ | |
"ᾓ", /* ἫΙ (3->5), +2 bytes, +1 chars */ | |
"ᾔ", /* ἬΙ (3->5), +2 bytes, +1 chars */ | |
"ᾕ", /* ἭΙ (3->5), +2 bytes, +1 chars */ | |
"ᾖ", /* ἮΙ (3->5), +2 bytes, +1 chars */ | |
"ᾗ", /* ἯΙ (3->5), +2 bytes, +1 chars */ | |
"ᾘ", /* ἨΙ (3->5), +2 bytes, +1 chars */ | |
"ᾙ", /* ἩΙ (3->5), +2 bytes, +1 chars */ | |
"ᾚ", /* ἪΙ (3->5), +2 bytes, +1 chars */ | |
"ᾛ", /* ἫΙ (3->5), +2 bytes, +1 chars */ | |
"ᾜ", /* ἬΙ (3->5), +2 bytes, +1 chars */ | |
"ᾝ", /* ἭΙ (3->5), +2 bytes, +1 chars */ | |
"ᾞ", /* ἮΙ (3->5), +2 bytes, +1 chars */ | |
"ᾟ", /* ἯΙ (3->5), +2 bytes, +1 chars */ | |
"ᾠ", /* ὨΙ (3->5), +2 bytes, +1 chars */ | |
"ᾡ", /* ὩΙ (3->5), +2 bytes, +1 chars */ | |
"ᾢ", /* ὪΙ (3->5), +2 bytes, +1 chars */ | |
"ᾣ", /* ὫΙ (3->5), +2 bytes, +1 chars */ | |
"ᾤ", /* ὬΙ (3->5), +2 bytes, +1 chars */ | |
"ᾥ", /* ὭΙ (3->5), +2 bytes, +1 chars */ | |
"ᾦ", /* ὮΙ (3->5), +2 bytes, +1 chars */ | |
"ᾧ", /* ὯΙ (3->5), +2 bytes, +1 chars */ | |
"ᾨ", /* ὨΙ (3->5), +2 bytes, +1 chars */ | |
"ᾩ", /* ὩΙ (3->5), +2 bytes, +1 chars */ | |
"ᾪ", /* ὪΙ (3->5), +2 bytes, +1 chars */ | |
"ᾫ", /* ὫΙ (3->5), +2 bytes, +1 chars */ | |
"ᾬ", /* ὬΙ (3->5), +2 bytes, +1 chars */ | |
"ᾭ", /* ὭΙ (3->5), +2 bytes, +1 chars */ | |
"ᾮ", /* ὮΙ (3->5), +2 bytes, +1 chars */ | |
"ᾯ", /* ὯΙ (3->5), +2 bytes, +1 chars */ | |
"ᾲ", /* ᾺΙ (3->5), +2 bytes, +1 chars */ | |
"ῂ", /* ῊΙ (3->5), +2 bytes, +1 chars */ | |
"ῲ", /* ῺΙ (3->5), +2 bytes, +1 chars */ | |
"ʼn", /* ʼN (2->3), +1 bytes, +1 chars */ | |
"ǰ", /* J̌ (2->3), +1 bytes, +1 chars */ | |
"ὐ", /* Υ̓ (3->4), +1 bytes, +1 chars */ | |
"ᾳ", /* ΑΙ (3->4), +1 bytes, +1 chars */ | |
"ᾴ", /* ΆΙ (3->4), +1 bytes, +1 chars */ | |
"ᾶ", /* Α͂ (3->4), +1 bytes, +1 chars */ | |
"ᾼ", /* ΑΙ (3->4), +1 bytes, +1 chars */ | |
"ῃ", /* ΗΙ (3->4), +1 bytes, +1 chars */ | |
"ῄ", /* ΉΙ (3->4), +1 bytes, +1 chars */ | |
"ῆ", /* Η͂ (3->4), +1 bytes, +1 chars */ | |
"ῌ", /* ΗΙ (3->4), +1 bytes, +1 chars */ | |
"ῖ", /* Ι͂ (3->4), +1 bytes, +1 chars */ | |
"ῤ", /* Ρ̓ (3->4), +1 bytes, +1 chars */ | |
"ῦ", /* Υ͂ (3->4), +1 bytes, +1 chars */ | |
"ῳ", /* ΩΙ (3->4), +1 bytes, +1 chars */ | |
"ῴ", /* ΏΙ (3->4), +1 bytes, +1 chars */ | |
"ῶ", /* Ω͂ (3->4), +1 bytes, +1 chars */ | |
"ῼ", /* ΩΙ (3->4), +1 bytes, +1 chars */ | |
"ﬓ", /* ՄՆ (3->4), +1 bytes, +1 chars */ | |
"ﬔ", /* ՄԵ (3->4), +1 bytes, +1 chars */ | |
"ﬕ", /* ՄԻ (3->4), +1 bytes, +1 chars */ | |
"ﬖ", /* ՎՆ (3->4), +1 bytes, +1 chars */ | |
"ﬗ", /* ՄԽ (3->4), +1 bytes, +1 chars */ | |
]; |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
Copyright (c) 2024 Rendello | |
Permission to use, copy, modify, and/or distribute this software for any | |
purpose with or without fee is hereby granted. | |
THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH | |
REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY | |
AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, | |
INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM | |
LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR | |
OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR | |
PERFORMANCE OF THIS SOFTWARE. | |
""" | |
import sys | |
from dataclasses import dataclass | |
from typing import List, Dict | |
@dataclass
class Entry:
    """One case mapping whose UTF-8 encoded length differs from its source.

    Field meanings below describe how `create_entry_map` populates an entry.
    """
    # Source character.
    a: str
    # Result of upper- or lowercasing `a`.
    b: str
    # UTF-8 byte length of `a`.
    a_len: int
    # UTF-8 byte length of `b`.
    b_len: int
    # Byte-length change: `b_len - a_len`.
    delta: int
    # `len(a)`.
    a_char_count: int
    # `len(b)`.
    b_char_count: int
    # Character-count change: `b_char_count - a_char_count`.
    delta_char_count: int
def sort_entries(l: List[Entry]) -> List[Entry]:
    """Return a new list of entries in output order; the input is not modified.

    Entries are ranked by, in order of priority:
      1. character-count delta, largest first;
      2. byte-length delta, largest first;
      3. the source character `a`, ascending.
    """
    def ranking(entry):
        # Negating the deltas gives descending order under an ascending sort.
        return (-entry.delta_char_count, -entry.delta, entry.a)

    ordered = list(l)
    ordered.sort(key=ranking)
    return ordered
def create_entry_map() -> Dict[str, List[Entry]]:
    """Collect every code point whose UTF-8 length changes under case mapping.

    Walks all code points from 0 through `sys.maxunicode`. For each one, the
    UTF-8 byte length of the character is compared with the byte length of its
    `str.upper()` and `str.lower()` results; mappings that keep the same byte
    length are ignored.

    Returns:
        A dict from category name to the entries in that category, in the
        order they were encountered. A category name is built by joining with
        underscores: `uppercasing` or `lowercasing`, then `expands` or
        `contracts`, then `multi_char` when the mapped string is longer than
        one character.
    """
    entry_map: Dict[str, List[Entry]] = {}
    for i in range(sys.maxunicode + 1):
        a = chr(i)
        for case, b in (('uppercasing', a.upper()), ('lowercasing', a.lower())):
            try:
                a_len = len(a.encode("utf8"))
                b_len = len(b.encode("utf8"))
            except UnicodeEncodeError:
                # Code points that cannot be encoded as UTF-8 (lone
                # surrogates) are skipped.
                continue
            if a_len == b_len:
                continue

            a_char_count = len(a)
            b_char_count = len(b)

            # The lengths differ here, so the mapping either expands or contracts.
            attributes = [case, 'expands' if a_len < b_len else 'contracts']
            if b_char_count > 1:
                attributes.append('multi_char')

            entry = Entry(
                a=a,
                b=b,
                a_len=a_len,
                b_len=b_len,
                delta=b_len - a_len,
                a_char_count=a_char_count,
                b_char_count=b_char_count,
                delta_char_count=b_char_count - a_char_count,
            )
            entry_map.setdefault("_".join(attributes), []).append(entry)
    return entry_map
def entry_map_to_string(entry_map: Dict[str, List[Entry]]) -> str:
    """Render the entry map as Rust source text.

    Emits a fixed header comment, then one `pub const NAME: [&str; N]` array
    per category. Categories are written in sorted key order with the key
    upper-cased as the constant name, and each category's entries are ordered
    by `sort_entries`. Every row carries a comment with the mapped string, the
    byte lengths before and after, the byte delta, and the character delta
    when it is non-zero.

    Returns:
        The generated text with leading and trailing whitespace stripped.
    """
    header = (
        '// =======================================================================\n'
        '//! Automatically generated using `task generate-utf8-case-data`.\n//!\n'
        '//! Unicode characters that behave oddly when the case is changed, for use\n'
        '//! with property tests.\n'
        '// =======================================================================\n\n'
    )
    chunks = [header]
    for name in sorted(entry_map):
        rows = sort_entries(entry_map[name])
        chunks.append(f'pub const {name.upper()}: [&str; {len(rows)}] = [\n')
        for row in rows:
            if row.delta_char_count != 0:
                char_note = f", {row.delta_char_count:+} chars"
            else:
                char_note = ""
            chunks.append(
                f' "{row.a}",\t/* {row.b}\t({row.a_len}->{row.b_len}), {row.delta:+} bytes{char_note} */\n'
            )
        chunks.append("];\n\n")
    return "".join(chunks).strip()
def generate_utf8_case_data():
    """Build the case-mapping entry map and return it rendered as Rust source text."""
    entry_map = create_entry_map()
    rendered = entry_map_to_string(entry_map)
    return rendered
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
@rept0id If you look at the included
generate_utf8.py
file, it's creating the whole list. Same with the "Unicode Roundtrip" Gist I linked. It's all automatic anyway, so changing the language generator would just be changing the output string format. The main issue would be structuring the repo: should it be just the generators and have the "outputs" be "releases"? Or should the outputs live beside the generators? I feel like having two languages in the same repo might not be useful, but at the same time I might like to use this test code for both Python and Rust. Perhaps the best solution would be to have the generator files, and have them generate the files in the repo itself so they're easily visible, with a caveat in comment form saying they're auto-generated (this is what my current project does). Then, I could potentially use the GitHub Releases feature to build libraries for Python, Rust, etc. That way property-based testing generators (a different kind of generator, basically a type containing random values) could be bundled in.
I don't know 😆
Feel free to remix this code yourself too, the licence is in the files and is basically "do anything".