/usr/share/perl5/Test/utf8.pm is in libtest-utf8-perl 0.02-1.
This file is owned by root:root, with mode 0o644.
The actual contents of the file can be viewed below.
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 | package Test::utf8;
use 5.007003;
use strict;
use warnings;
use Encode;
use charnames ':full';
use vars qw(@ISA @EXPORT $VERSION %allowed $valid_utf8_regexp);
$VERSION = "0.02";
require Exporter;
@ISA = qw(Exporter);
@EXPORT = qw(is_valid_string is_dodgy_utf8 is_sane_utf8
is_within_ascii is_within_latin1 is_within_latin_1
is_flagged_utf8 isnt_flagged_utf8);
# A Regexp string to match valid UTF8 bytes
# this info comes from page 78 of "The Unicode Standard 4.0"
# published by the Unicode Consortium
$valid_utf8_regexp = <<'.' ;
[\x{00}-\x{7f}]
| [\x{c2}-\x{df}][\x{80}-\x{bf}]
| \x{e0} [\x{a0}-\x{bf}][\x{80}-\x{bf}]
| [\x{e1}-\x{ec}][\x{80}-\x{bf}][\x{80}-\x{bf}]
| \x{ed} [\x{80}-\x{9f}][\x{80}-\x{bf}]
| [\x{ee}-\x{ef}][\x{80}-\x{bf}][\x{80}-\x{bf}]
| \x{f0} [\x{90}-\x{bf}][\x{80}-\x{bf}]
| [\x{f1}-\x{f3}][\x{80}-\x{bf}][\x{80}-\x{bf}][\x{80}-\x{bf}]
| \x{f4} [\x{80}-\x{8f}][\x{80}-\x{bf}][\x{80}-\x{bf}]
.
=head1 NAME
Test::utf8 - handy utf8 tests
=head1 SYNOPSIS
is_valid_string($string); # check the string is valid
is_sane_utf8($string); # check not double encoded
is_flagged_utf8($string); # has utf8 flag set
is_within_latin_1($string); # but only has latin_1 chars in it
=head1 DESCRIPTION
This module is a collection of tests that's useful when dealing
with utf8 strings in Perl.
=head2 Validity
These two tests check if a string is valid, and if you've probably
made a mistake with your string
=over
=item is_valid_string($string, $testname)
This passes and returns true true if and only if the scalar isn't a
invalid string; In short, it checks that the utf8 flag hasn't been set
for a string that isn't a valid utf8 encoding.
=cut
sub is_valid_string($;$)
{
my $string = shift;
my $name = shift || "valid string test";
# check we're a utf8 string - if not, we pass.
unless (Encode::is_utf8($string))
{ return pass($name) }
# work out at what byte (if any) we have an invalid byte sequence
# and return the correct test result
my $pos = _invalid_sequence_at_byte($string);
ok(!defined($pos), $name)
or diag("malformed byte sequence starting at byte $pos");
}
sub _invalid_sequence_at_byte($)
{
my $string = shift;
# examine the bytes that make up the string (not the chars)
# by turning off the utf8 flag (no, use bytes doens't
# work, we're dealing with a regexp)
Encode::_utf8_off($string);
# work out the index of the first non matching byte
my $result = $string =~ m/^($valid_utf8_regexp)*/ogx;
# if we matched all the string return the empty list
my $pos = pos $string || 0;
return if $pos == length($string);
# otherwise return the position we found
return $pos
}
=item is_sane_utf8($string, $name)
This test fails if the string contains something that looks like it
might be dodgy utf8, i.e. containing something that looks like the
multi-byte sequence for a latin-1 character but perl hasn't been
instructed to treat as such. Strings that are not utf8 always
automatically pass.
Some examples may help:
# This will pass as it's a normal latin-1 string
is_sane_utf8("Hello L\x{e9}eon");
# this will fail because the \x{c3}\x{a9} looks like the
# utf8 byte sequence for e-acute
my $string = "Hello L\x{c3}\x{a9}on";
is_sane_utf8($string);
# this will pass because the utf8 is correctly interpreted as utf8
Encode::_utf8_on($string)
is_sane_utf8($string);
Obviously this isn't a hundred percent reliable. The edge case where
this will fail is where you have C<\x{c2}> (which is "LATIN CAPITAL
LETTER WITH CIRCUMFLEX") or C<\x{c3}> (which is "LATIN CAPITAL LETTER
WITH TILDE") followed by one of the latin-1 punctuation symbols.
# a capital letter A with tilde surrounded by smart quotes
# this will fail because it'll see the "\x{c2}\x{94}" and think
# it's actually the utf8 sequence for the end smart quote
is_sane_utf8("\x{93}\x{c2}\x{94}");
However, since this hardly comes up this test is reasonably reliable
in most cases. Still, care should be applied in cases where dynamic
data is placed next to latin-1 punctuation to avoid false negatives.
There exists two situations to cause this test to fail; The string
contains utf8 byte sequences and the string hasn't been flagged as
utf8 (this normally means that you got it from an external source like
a C library; When Perl needs to store a string internally as utf8 it
does it's own encoding and flagging transparently) or a utf8 flagged
string contains byte sequences that when translated to characters
themselves look like a utf8 byte sequence. The test diagnostics tells
you which is the case.
=cut
# build my regular expression out of the latin-1 bytes
# NOTE: This won't work if our locale is nonstandard will it?
my $re_bit = join "|", map { Encode::encode("utf8",chr($_)) } (127..255);
#binmode STDERR, ":utf8";
#print STDERR $re_bit;
sub is_sane_utf8($;$)
{
my $string = shift;
my $name = shift || "sane utf8";
# regexp in scalar context with 'g', meaning this loop will run for
# each match. Should only have to run it once, but will redo if
# the failing case turns out to be allowed in %allowed.
while ($string =~ /($re_bit)/o)
{
# work out what the double encoded string was
my $bytes = $1;
my $index = $+[0] - length($bytes);
my $codes = join '', map { sprintf '<%00x>', ord($_) } split //, $bytes;
# what charecter does that represent?
my $char = Encode::decode("utf8",$bytes);
my $ord = ord($char);
my $hex = sprintf '%00x', $ord;
$char = charnames::viacode($ord);
# print out diagnostic messages
fail($name);
diag(qq{Found dodgy chars "$codes" at char $index\n});
if (Encode::is_utf8($string))
{ diag("Chars in utf8 string look like utf8 byte sequence.") }
else
{ diag("String not flagged as utf8...was it meant to be?\n") }
diag("Probably originally a $char char - codepoint $ord (dec), $hex (hex)\n");
return 0;
}
# got this far, must have passed.
ok(1,$name);
return 1;
}
# historic name of method; deprecated
sub is_dodgy_utf8
{
# report errors not here but further up the calling stack
local $Test::Builder::Level = $Test::Builder::Level + 1;
# call without prototype with all args
&is_sane_utf8(@_);
}
=back
=head2 Checking the Range of Characters in a String
These routines allow you to check the range of characters in a string.
Note that these routines are blind to the actual encoding perl
internally uses to store the characters, they just check if the
string contains only characters that can be represented in the named
encoding.
=over
=item is_within_ascii
Tests that a string only contains characters that are in the ASCII
charecter set.
=cut
sub is_within_ascii($;$)
{
my $string = shift;
my $name = shift || "within ascii";
# look for anything that isn't ascii or pass
$string =~ /([^\x{00}-\x{7f}])/ or return pass($name);
# explain why we failed
my $dec = ord($1);
my $hex = sprintf '%02x', $dec;
fail($name);
diag("Char $+[0] not ASCII (it's $dec dec / $hex hex)");
return 0;
}
=item is_within_latin_1
Tests that a string only contains characters that are in latin-1.
=cut
sub is_within_latin_1($;$)
{
my $string = shift;
my $name = shift || "within latin-1";
# look for anything that isn't ascii or pass
$string =~ /([^\x{00}-\x{ff}])/ or return pass($name);
# explain why we failed
my $dec = ord($1);
my $hex = sprintf '%x', $dec;
fail($name);
diag("Char $+[0] not Latin-1 (it's $dec dec / $hex hex)");
return 0;
}
sub is_within_latin1
{
# report errors not here but further up the calling stack
local $Test::Builder::Level = $Test::Builder::Level + 1;
# call without prototype with all args
&is_within_latin_1(@_);
}
=back
=head2 Simple utf8 Flag Tests
Simply check if a scalar is or isn't flagged as utf8 by perl's
internals.
=over
=item is_flagged_utf8($string, $name)
Passes if the string is flagged by perl's internals as utf8, fails if
it's not.
=cut
sub is_flagged_utf8
{
my $string = shift;
my $name = shift || "flagged as utf8";
return ok(Encode::is_utf8($string),$name);
}
=item isnt_flagged_utf8($string,$name)
The opposite of C<is_flagged_utf8>, passes if and only if the string
isn't flagged as utf8 by perl's internals.
Note: you can refer to this function as C<isn't_flagged_utf8> if you
really want to.
=cut
sub isnt_flagged_utf8($;$)
{
my $string = shift;
my $name = shift || "not flagged as utf8";
return ok(!Encode::is_utf8($string), $name);
}
sub isn::t_flagged_utf8($;$)
{
my $string = shift;
my $name = shift || "not flagged as utf8";
return ok(!Encode::is_utf8($string), $name);
}
=back
=head1 AUTHOR
Copyright Mark Fowler 2004. All rights reserved.
This program is free software; you can redistribute it
and/or modify it under the same terms as Perl itself.
=head1 BUGS
None known. Please report any to me via the CPAN RT system. See
http://rt.cpan.org/ for more details.
=head1 SEE ALSO
L<Test::DoubleEncodedEntities> for testing for double encoded HTML
entities.
=cut
##########
# shortcuts for Test::Builder.
use Test::Builder;
my $Tester = Test::Builder->new();
sub ok
{
local $Test::Builder::Level = $Test::Builder::Level + 1;
$Tester->ok(@_)
}
sub diag
{
local $Test::Builder::Level = $Test::Builder::Level + 1;
$Tester->diag(@_)
}
sub fail
{
local $Test::Builder::Level = $Test::Builder::Level + 1;
ok(0,@_)
}
sub pass
{
local $Test::Builder::Level = $Test::Builder::Level + 1;
ok(1,@_)
}
1;
|