Last active
January 25, 2017 14:17
-
-
Save Kimtaro/93380e70c84a00735689d34117294357 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/perl | |
## Depending on which Perl version you're running you'll get different Unicode semantics | |
# Perl 5.18 (OSX default) - Unicode 6.2 | |
# Perl 5.24 (Latest, available through homebrew) - Unicode 8.0 | |
# Perl 5.25 (Bleeding edge, available through perlbrew) - Unicode 9.0 | |
## perlbrew allows you to easily install bleeding edge Perl: | |
# sudo /usr/bin/cpan App::perlbrew | |
# perlbrew install 5.25.9; perlbrew use 5.25.9 | |
use warnings; | |
use strict; | |
use v5.18; # Allow say | |
use English; # English variable names for readability | |
use utf8; # Tell perl that our file is utf8 | |
use Unicode::Normalize; # So we can normalize the string | |
binmode(STDOUT, ":utf8"); # Make sure STDOUT is utf8 | |
# What a "character" is depends entirely on the perceptions of the end user, so we have to make some assumptions in the code below.
# One thing this implementation does not deal with is the locale of the end user. For example, a Danish user might want AE to be counted as equal to Ä.
# Report whether every user-perceived character in a string occurs only once.
#
# Takes the string to inspect as its single argument.
# Returns 1 when no character repeats (an empty string therefore yields 1)
# and 0 as soon as the first repeated character is found.
sub is_consisting_of_only_unique_characters {
    my ($string) = @_;

    # Compose before comparing: \X treats a pre-composed character and its
    # decomposed spelling as different clusters, so both spellings have to be
    # folded into one form first. NFC keeps ligatures distinct from the
    # letters they contain; counting a ligature as its component letters
    # would need NFKC instead.
    my $composed = NFC($string);

    # One key per distinct cluster met so far, so the hash never grows beyond
    # the number of distinct clusters in the input.
    my %seen;

    # What a human considers a character is an "extended grapheme cluster" in
    # Unicode terms; \X matches exactly one of those per iteration.
    while ($composed =~ /(\X)/g) {
        # Post-increment yields the previous count: false on first sight,
        # true on any later sight of the same cluster.
        return 0 if $seen{$1}++;
    }

    return 1;
}
# Demo inputs. Sequences that depend on invisible joiners or on compatibility
# characters are spelled with \x{...} escapes so that they survive copy/paste
# and re-encoding of this file.

# Three pre-composed letters followed by a decomposed A-diaeresis
# (A + combining diaeresis), which NFC folds into the second letter.
my $string1 = "ÅÄÖ\x{41}\x{308}";

# Two emoji ZWJ sequences: man + heart + kiss mark + man, then the same
# pattern with women. The U+200D joiners are what glue each sequence together.
my $string2 = "\x{1F468}\x{200D}\x{2764}\x{FE0F}\x{200D}\x{1F48B}\x{200D}\x{1F468}"
            . "\x{1F469}\x{200D}\x{2764}\x{FE0F}\x{200D}\x{1F48B}\x{200D}\x{1F469}";

# A regional-indicator flag followed by another flag emoji.
my $string3 = "🇸🇪🏳️🌈";

# U+FB00 (LATIN SMALL LIGATURE FF), a space, and a plain "f". Under NFC the
# ligature stays distinct from the letter; two literal f's would not.
my $string4 = "\x{FB00} f";

# Print one verdict line per input, in the same "<string>\t <verdict>" format
# for all of them.
for my $string ($string1, $string2, $string3, $string4) {
    my $verdict = is_consisting_of_only_unique_characters($string)
        ? "is unique"
        : "is not unique";
    say "$string\t $verdict";
}
## Perl 5.24 and earlier:
# ÅÄÖÄ is not unique
# 👨❤️💋👨👩❤️💋👩 is not unique
# 🇸🇪🏳️🌈 is unique
# ﬀ f is unique
## Perl 5.25 and later:
# ÅÄÖÄ is not unique
# 👨❤️💋👨👩❤️💋👩 is unique
# 🇸🇪🏳️🌈 is unique
# ﬀ f is unique
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment