Last active
May 7, 2019 10:32
-
-
Save dtonhofer/29c8d561c911cc93052f2bb2181ee75e to your computer and use it in GitHub Desktop.
Comparing behaviour of Perl Data::Dumper when using "Pure Perl" and "XS" mode for non-iso-8859-1 codepoints
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/perl | |
# === | |
# Testing what Perl's Data::Dumper does with "high" characters e.g. | |
# | |
# å -> iso-8859-1 : 0xE5 | |
# Unicode UTF-16 : 0x00E5 | |
# Unicode UTF-8 : 0xC3A5 | |
# | |
# See also: | |
# | |
# https://stackoverflow.com/questions/50489062/how-to-display-readable-utf-8-strings-with-datadumper | |
# | |
# Note that the "UTF-8" pragma is on. | |
# The character 'å' is encoded in UTF-8 *in this program file* and | |
# the pragma tells Perl that this is so! | |
# | |
# We also tell Perl that STDERR and STDOUT and the test files to write | |
# are/shall-be UTF-8 encoded. | |
# | |
# We find: | |
# ======== | |
# | |
# The string: | |
# | |
# 'Nuuk (Godthåb)' | |
# | |
# - is written by Data::Dumper, pure Perl implementation, as UTF-8 (not ISO-8559-1) | |
# but for higher characters, e.g. "Ч" (Cyrillic Che), the implementation switches | |
# to ASCII-based escaping (perl string escaping) of UTF-16: "\x{427}" (Unicode 0x427) | |
# | |
# - is written by Data::Dumper, XS implementation, as ASCII-based escaping of | |
# ISO-8859-1 (not UTF-8): "\x{e5}". For higher characters, the implementation | |
# switches to ASCII-based escaping of UTF-16. | |
# The implementation seems to eagerly escape anything beyong 7 bit. | |
# | |
# In both cases, reading back as UTF-8 works! | |
# === | |
use strict; | |
use warnings; | |
use utf8; # Meaning "This lexical scope (i.e. file) contains utf8" | |
use File::Temp qw(tempfile tempdir); | |
use File::Spec::Functions qw(catfile); | |
use Data::Dumper; | |
# --- | |
# To print accented chars correctly to STDERR/STDOUT, supposed to be in UTF-8. | |
# https://perldoc.perl.org/perlunifaq.html | |
# --- | |
binmode STDERR, ':encoding(UTF-8)'; | |
binmode STDOUT, ':encoding(UTF-8)'; | |
# --- | |
# Just call dat main! | |
# --- | |
_main(); | |
# === | |
# 1) Define data | |
# 2) Create temporary directory | |
# 3) For Perl and XS implementation of Data::Dumper: | |
# 1) Dump data to file in said temporary directory | |
# 2) Read file back and eval it | |
# 3) Compare original data and data resulting from eval | |
# === | |
sub _main { | |
my $data = { EGKK => "London Gatwick" | |
,BGAA => "Aasiaat (Egedesminde)" | |
,BGSF => "Kangerlussuaq (Søndre Strømfjord)" | |
,BGGH => "Nuuk (Godthåb)" | |
,USHQ => "Белоя́рский" | |
,USCC => "Челя́бинск" | |
,TFFR => "Aéroport Guadeloupe - Pôle Caraïbes" | |
,BKPR => "Aeroporti Ndërkombëtar i Prishtinës 'Adem Jashari'" | |
}; | |
determineUtf8Flags($data); | |
my $outdir = makeTmpDir(); | |
# $usePerl = 1 --> use pure Perl implementation | |
# $usePerl = 0 --> use XS implementation | |
my $names = { 1 => 'pure_perl', 0 => 'xs' }; | |
for my $usePerl ( qw( 0 1 ) ) { | |
my $fqfn = makeFullyQualifiedFilename($outdir,$$names{$usePerl}); | |
{ | |
open(my $fh,">:encoding(UTF-8)", $fqfn) || die "Could not open file '$fqfn' for writing: $!"; | |
$$data{used} = "Data::Dumper, $$names{$usePerl}"; | |
$$data{file} = $fqfn; | |
print $fh Data::Dumper->new([$data])->Useperl($usePerl)->Purity(1)->Sortkeys(1)->Dump; | |
close $fh || die "Could not close file '$fqfn' after writing: $!" | |
} | |
my $reData = slurpAndEval($fqfn,"data"); | |
determineUtf8Flags($reData); | |
for my $key (sort keys %$data) { | |
next if ($key eq 'used' || $key eq 'data' || $key =~ /utf8/); | |
my $orig = $$data{$key}; | |
die "No key '$key' in data extracted from '$fqfn'" unless exists $$reData{$key}; | |
my $reValue = $$reData{$key}; | |
if ($reValue ne $orig) { | |
print STDERR "Key '$key': Previously '$orig', afterwards '$reValue'\n" | |
} | |
else { | |
print STDERR "Key '$key': No change\n" | |
} | |
} | |
} | |
print STDERR "Running a 'diff --side-by-side'!\n"; | |
system ("diff", "--side-by-side", makeFullyQualifiedFilename($outdir, $$names{0}), makeFullyQualifiedFilename($outdir, $$names{1})); | |
} | |
sub makeTmpDir { | |
my $outdir = tempdir("test_XXXX", DIR => '/tmp') || die "Could not create temporary directory: $!"; | |
print STDERR "Output goes to files in directory '$outdir' (this directory will not be automatically removed later!)\n"; | |
return $outdir | |
} | |
sub makeFullyQualifiedFilename { | |
my($dir,$impl) = @_; | |
return catfile($dir,"$impl.dump") | |
} | |
sub slurpAndEval { | |
my($fn,$name) = @_; | |
my $txt; | |
{ | |
open(my $fh, '<:encoding(UTF-8)', $fn) or die "Could not open file '$fn' for reading: $!"; | |
# https://perlmaven.com/slurp | |
# - undefine the record terminator to NOT break apart input! | |
# - make sure this is a local variable so as not to stress anyone else | |
local $/ = undef; | |
$txt = <$fh>; | |
close $fh; | |
# redefine the record terminator to be '\n' (n.b. this must be a string, not a character!!!) | |
$/ = "\n"; | |
} | |
# Danger Will Robinson!! We are using EVAL, so the data better be gud (i.e. not include a call to rm -rf for example)! | |
# Assume the text to eval assigns $VAR1 | |
# >>> | |
my $VAR1; | |
eval($txt); | |
# <<< | |
die "Error in eval of $name content from file '$fn': $@" unless $VAR1; | |
my $len = scalar (keys %$VAR1); | |
print STDERR "Read '$name' content from file '$fn' ($len elements found in undumped hash)\n"; | |
return $VAR1 | |
} | |
sub determineUtf8Flags { | |
my($data) = @_; | |
for my $key (sort keys %$data) { | |
next if ($key eq 'used' || $key eq 'data' || $key =~ /utf8/); | |
my $str = $$data{$key}; | |
my $val; | |
if (utf8::is_utf8($str)) { $val = 'yes' } else { $val = 'no' } | |
$$data{"${key}_utf8"} = $val | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
test script updated a bit