dtonhofer · May 7, 2019 10:32 · dtonhofer · May 7, 2019
diff --git a/test_dumper.pl b/test_dumper.pl
 #!/usr/bin/perl 

 # ===
 # Testing what Perl's Data::Dumper does with "high" characters e.g.
 #
 # å -> iso-8859-1     : 0xE5
 #      Unicode UTF-16 : 0x00E5
 #      Unicode UTF-8  : 0xC3A5
 #
 # See also:
 # 
 # https://stackoverflow.com/questions/50489062/how-to-display-readable-utf-8-strings-with-datadumper
 #
 # Note that the "UTF-8" pragma is on.
 # The character 'å' is encoded in UTF-8 *in this program file* and
 # the pragma tells Perl that this is so!
 #
 # We also tell Perl that STDERR and STDOUT and the test files to write
 # are/shall-be UTF-8 encoded.
 #
 # We find:
 # ========
 #
 # The string:
 #
 # 'Nuuk (Godthåb)'
 #
 # - is written by Data::Dumper, pure Perl implementation, as UTF-8 (not ISO-8559-1)
 #   but for higher characters, e.g. "Ч" (Cyrillic Che), the implementation switches 
 #   to ASCII-based escaping (perl string escaping) of UTF-16: "\x{427}" (Unicode 0x427)
 # 
 # - is written by Data::Dumper, XS implementation, as ASCII-based escaping of 
 #   ISO-8859-1 (not UTF-8): "\x{e5}". For higher characters, the implementation
 #   switches to ASCII-based escaping of UTF-16.
 #   The implementation seems to eagerly escape anything beyong 7 bit.
 #
 # In both cases, reading back as UTF-8 works!
 # ===

 use strict;
 use warnings;
 use utf8;  # Meaning "This lexical scope (i.e. file) contains utf8"

 use File::Temp qw(tempfile tempdir);
 use File::Spec::Functions qw(catfile);
 use Data::Dumper;

 # ---
 # To print accented chars correctly to STDERR/STDOUT, supposed to be in UTF-8.
 # https://perldoc.perl.org/perlunifaq.html
 # ---

 binmode STDERR, ':encoding(UTF-8)';
 binmode STDOUT, ':encoding(UTF-8)';

 # ---
 # Just call dat main!
 # ---

 _main();

 # ===
 # 1) Define data
 # 2) Create temporary directory
 # 3) For Perl and XS implementation of Data::Dumper:
 #    1) Dump data to file in said temporary directory
 #    2) Read file back and eval it
 #    3) Compare original data and data resulting from eval
 # ===

 sub _main {

   my $data = {  EGKK => "London Gatwick"
                ,BGAA => "Aasiaat (Egedesminde)"
                ,BGSF => "Kangerlussuaq (Søndre Strømfjord)"
                ,BGGH => "Nuuk (Godthåb)"
                ,USHQ => "Белоя́рский"
                ,USCC => "Челя́бинск"
                ,TFFR => "Aéroport Guadeloupe - Pôle Caraïbes"
                ,BKPR => "Aeroporti Ndërkombëtar i Prishtinës 'Adem Jashari'" 
              };

   determineUtf8Flags($data);

   my $outdir = makeTmpDir();

   # $usePerl = 1 --> use pure Perl implementation
   # $usePerl = 0 --> use XS implementation

   my $names = { 1 => 'pure_perl', 0 => 'xs' };

   for my $usePerl ( qw( 0 1 ) ) {
      my $fqfn = makeFullyQualifiedFilename($outdir,$$names{$usePerl});
      {
         open(my $fh,">:encoding(UTF-8)", $fqfn) || die "Could not open file '$fqfn' for writing: $!";
         $$data{used} = "Data::Dumper, $$names{$usePerl}";
         $$data{file} = $fqfn;
         print $fh Data::Dumper->new([$data])->Useperl($usePerl)->Purity(1)->Sortkeys(1)->Dump;
         close $fh || die "Could not close file '$fqfn' after writing: $!"
      }
      my $reData = slurpAndEval($fqfn,"data");
      determineUtf8Flags($reData);
      for my $key (sort keys %$data) {
         next if ($key eq 'used' || $key eq 'data' || $key =~ /utf8/);
         my $orig = $$data{$key};
         die "No key '$key' in data extracted from '$fqfn'" unless exists $$reData{$key};
         my $reValue = $$reData{$key};
         if ($reValue ne $orig) {
            print STDERR "Key '$key': Previously '$orig', afterwards '$reValue'\n"
         }
         else {
            print STDERR "Key '$key': No change\n"
         }
      }
   }
   print STDERR "Running a 'diff --side-by-side'!\n";
   system ("diff", "--side-by-side", makeFullyQualifiedFilename($outdir, $$names{0}), makeFullyQualifiedFilename($outdir, $$names{1}));
 }

 sub makeTmpDir {
   my $outdir = tempdir("test_XXXX", DIR => '/tmp') || die "Could not create temporary directory: $!";
   print STDERR "Output goes to files in directory '$outdir' (this directory will not be automatically removed later!)\n";
   return $outdir
 }

 sub makeFullyQualifiedFilename {
   my($dir,$impl) = @_;
   return catfile($dir,"$impl.dump")
 }

 sub slurpAndEval {
   my($fn,$name) = @_;
   my $txt;
   {
      open(my $fh, '<:encoding(UTF-8)', $fn) or die "Could not open file '$fn' for reading: $!";
      # https://perlmaven.com/slurp
      # - undefine the record terminator to NOT break apart input!
      # - make sure this is a local variable so as not to stress anyone else   
      local $/ = undef;
      $txt = <$fh>;
      close $fh;
      # redefine the record terminator to be '\n' (n.b. this must be a string, not a character!!!)
      $/ = "\n";
   }
   # Danger Will Robinson!! We are using EVAL, so the data better be gud (i.e. not include a call to rm -rf for example)!
   # Assume the text to eval assigns $VAR1
   # >>>
   my $VAR1; 
   eval($txt); 
   # <<<
   die "Error in eval of $name content from file '$fn': $@" unless $VAR1;
   my $len = scalar (keys %$VAR1);
   print STDERR "Read '$name' content from file '$fn' ($len elements found in undumped hash)\n";
   return $VAR1
 }

 sub determineUtf8Flags {
   my($data) = @_;
   for my $key (sort keys %$data) {
      next if ($key eq 'used' || $key eq 'data' || $key =~ /utf8/);
      my $str = $$data{$key};
      my $val;
      if (utf8::is_utf8($str)) { $val = 'yes' } else { $val = 'no' }
      $$data{"${key}_utf8"} = $val
   }
 }
	#!/usr/bin/perl

	# ===
	# Testing what Perl's Data::Dumper does with "high" characters e.g.
	#
	# å -> iso-8859-1 : 0xE5
	# Unicode UTF-16 : 0x00E5
	# Unicode UTF-8 : 0xC3A5
	#
	# See also:
	#
	# https://stackoverflow.com/questions/50489062/how-to-display-readable-utf-8-strings-with-datadumper
	#
	# Note that the "UTF-8" pragma is on.
	# The character 'å' is encoded in UTF-8 in this program file and
	# the pragma tells Perl that this is so!
	#
	# We also tell Perl that STDERR and STDOUT and the test files to write
	# are/shall-be UTF-8 encoded.
	#
	# We find:
	# ========
	#
	# The string:
	#
	# 'Nuuk (Godthåb)'
	#
	# - is written by Data::Dumper, pure Perl implementation, as UTF-8 (not ISO-8559-1)
	# but for higher characters, e.g. "Ч" (Cyrillic Che), the implementation switches
	# to ASCII-based escaping (perl string escaping) of UTF-16: "\x{427}" (Unicode 0x427)
	#
	# - is written by Data::Dumper, XS implementation, as ASCII-based escaping of
	# ISO-8859-1 (not UTF-8): "\x{e5}". For higher characters, the implementation
	# switches to ASCII-based escaping of UTF-16.
	# The implementation seems to eagerly escape anything beyong 7 bit.
	#
	# In both cases, reading back as UTF-8 works!
	# ===

	use strict;
	use warnings;
	use utf8; # Meaning "This lexical scope (i.e. file) contains utf8"

	use File::Temp qw(tempfile tempdir);
	use File::Spec::Functions qw(catfile);
	use Data::Dumper;

	# ---
	# To print accented chars correctly to STDERR/STDOUT, supposed to be in UTF-8.
	# https://perldoc.perl.org/perlunifaq.html
	# ---

	binmode STDERR, ':encoding(UTF-8)';
	binmode STDOUT, ':encoding(UTF-8)';

	# ---
	# Just call dat main!
	# ---

	_main();

	# ===
	# 1) Define data
	# 2) Create temporary directory
	# 3) For Perl and XS implementation of Data::Dumper:
	# 1) Dump data to file in said temporary directory
	# 2) Read file back and eval it
	# 3) Compare original data and data resulting from eval
	# ===

	sub _main {

	my $data = { EGKK => "London Gatwick"
	,BGAA => "Aasiaat (Egedesminde)"
	,BGSF => "Kangerlussuaq (Søndre Strømfjord)"
	,BGGH => "Nuuk (Godthåb)"
	,USHQ => "Белоя́рский"
	,USCC => "Челя́бинск"
	,TFFR => "Aéroport Guadeloupe - Pôle Caraïbes"
	,BKPR => "Aeroporti Ndërkombëtar i Prishtinës 'Adem Jashari'"
	};

	determineUtf8Flags($data);

	my $outdir = makeTmpDir();

	# $usePerl = 1 --> use pure Perl implementation
	# $usePerl = 0 --> use XS implementation

	my $names = { 1 => 'pure_perl', 0 => 'xs' };

	for my $usePerl ( qw( 0 1 ) ) {
	my $fqfn = makeFullyQualifiedFilename($outdir,$$names{$usePerl});
	{
	open(my $fh,">:encoding(UTF-8)", $fqfn) \|\| die "Could not open file '$fqfn' for writing: $!";
	$$data{used} = "Data::Dumper, $$names{$usePerl}";
	$$data{file} = $fqfn;
	print $fh Data::Dumper->new([$data])->Useperl($usePerl)->Purity(1)->Sortkeys(1)->Dump;
	close $fh \|\| die "Could not close file '$fqfn' after writing: $!"
	}
	my $reData = slurpAndEval($fqfn,"data");
	determineUtf8Flags($reData);
	for my $key (sort keys %$data) {
	next if ($key eq 'used' \|\| $key eq 'data' \|\| $key =~ /utf8/);
	my $orig = $$data{$key};
	die "No key '$key' in data extracted from '$fqfn'" unless exists $$reData{$key};
	my $reValue = $$reData{$key};
	if ($reValue ne $orig) {
	print STDERR "Key '$key': Previously '$orig', afterwards '$reValue'\n"
	}
	else {
	print STDERR "Key '$key': No change\n"
	}
	}
	}
	print STDERR "Running a 'diff --side-by-side'!\n";
	system ("diff", "--side-by-side", makeFullyQualifiedFilename($outdir, $$names{0}), makeFullyQualifiedFilename($outdir, $$names{1}));
	}

	sub makeTmpDir {
	my $outdir = tempdir("test_XXXX", DIR => '/tmp') \|\| die "Could not create temporary directory: $!";
	print STDERR "Output goes to files in directory '$outdir' (this directory will not be automatically removed later!)\n";
	return $outdir
	}

	sub makeFullyQualifiedFilename {
	my($dir,$impl) = @_;
	return catfile($dir,"$impl.dump")
	}

	sub slurpAndEval {
	my($fn,$name) = @_;
	my $txt;
	{
	open(my $fh, '<:encoding(UTF-8)', $fn) or die "Could not open file '$fn' for reading: $!";
	# https://perlmaven.com/slurp
	# - undefine the record terminator to NOT break apart input!
	# - make sure this is a local variable so as not to stress anyone else
	local $/ = undef;
	$txt = <$fh>;
	close $fh;
	# redefine the record terminator to be '\n' (n.b. this must be a string, not a character!!!)
	$/ = "\n";
	}
	# Danger Will Robinson!! We are using EVAL, so the data better be gud (i.e. not include a call to rm -rf for example)!
	# Assume the text to eval assigns $VAR1
	# >>>
	my $VAR1;
	eval($txt);
	# <<<
	die "Error in eval of $name content from file '$fn': $@" unless $VAR1;
	my $len = scalar (keys %$VAR1);
	print STDERR "Read '$name' content from file '$fn' ($len elements found in undumped hash)\n";
	return $VAR1
	}

	sub determineUtf8Flags {
	my($data) = @_;
	for my $key (sort keys %$data) {
	next if ($key eq 'used' \|\| $key eq 'data' \|\| $key =~ /utf8/);
	my $str = $$data{$key};
	my $val;
	if (utf8::is_utf8($str)) { $val = 'yes' } else { $val = 'no' }
	$$data{"${key}_utf8"} = $val
	}
	}