/usr/share/perl5/HTML/Encoding.pm is in libhtml-encoding-perl 0.61-1.
This file is owned by root:root, with mode 0o644.
The actual contents of the file can be viewed below.
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 | package HTML::Encoding;
use strict;
use warnings;
use HTML::Parser qw();
use HTTP::Headers::Util qw(split_header_words);
use Encode qw();
use base qw(Exporter);
our $VERSION = '0.61';
our @EXPORT_OK =
qw/
&encoding_from_meta_element
&xml_declaration_from_octets
&encoding_from_first_chars
&encoding_from_xml_declaration
&encoding_from_byte_order_mark
&encoding_from_content_type
&encoding_from_xml_document
&encoding_from_html_document
&encoding_from_http_message
/;
our $DEFAULT_ENCODINGS = [qw/
ISO-8859-1
UTF-16LE
UTF-16BE
UTF-32LE
UTF-32BE
UTF-8
/];
our %MAP =
(
BM => "\x{FEFF}",
CR => "\x{000D}",
LF => "\x{000A}",
SP => "\x{0020}",
TB => "\x{0009}",
QS => "\x{003F}",
NL => "\x{0085}",
LS => "\x{2028}",
LT => "<", # fixme
GT => ">", # fixme
);
sub _my_encode
{
my $seq;
eval
{
$seq = Encode::encode($_[0],
$_[1],
$_[2]);
};
return $seq unless $@;
return;
}
sub _my_decode
{
my $str;
eval
{
$str = Encode::decode($_[0],
$_[1],
$_[2]);
};
return $str unless $@;
return;
}
sub _make_character_map
{
my $encoding = shift;
my %data;
foreach my $sym (keys %MAP)
{
my $seq = _my_encode($encoding, "$MAP{$sym}", Encode::FB_CROAK);
$data{$sym} = $seq if defined $seq;
}
\%data;
}
# cache for U+XXXX octet sequences
our %CHARACTER_MAP_CACHE = ();
sub _get_character_map
{
my $encoding = shift;
# read from cache
return $CHARACTER_MAP_CACHE{$encoding}
if exists $CHARACTER_MAP_CACHE{$encoding};
# new cache entry
my $map = _make_character_map($encoding);
$CHARACTER_MAP_CACHE{$encoding} = $map;
# return new entry
return $map;
}
sub encoding_from_meta_element
{
my $text = shift;
my $enco = shift;
return unless defined $text;
return unless length $text;
return unless defined $enco;
return unless length $enco;
my $pars = HTML::Parser->new
(
api_version => 3,
@_
);
my $meta = [];
my $leng = length $text;
my $size = 8192;
my $data = '';
my $utf8 = '';
my $i = 0;
# todo: should finish when <body> or logically body//*
$pars->report_tags(qw/meta head/);
$pars->handler(start => $meta, "tagname,attr");
$pars->handler
(
end => sub { $_[0]->eof if $_[1] eq "head" },
"self,tagname"
);
$pars->parse(sub
{
return if $i > $leng;
$data .= substr $text, $i, $size;
$i += $size;
_my_decode($enco, $data, Encode::FB_QUIET);
});
my @resu;
foreach (grep { $_->[0] eq "meta" } @$meta)
{
my %hash = %{$_->[1]};
next unless defined $hash{'content'};
next unless exists $hash{'http-equiv'};
next unless lc $hash{'http-equiv'} eq "content-type";
my $char = encoding_from_content_type($hash{'content'});
push @resu, $char if defined $char and length $char;
}
return unless @resu;
return wantarray ? @resu : $resu[0];
}
sub xml_declaration_from_octets
{
my $text = shift;
my %o = @_;
my $encodings = $o{encodings} || $DEFAULT_ENCODINGS;
my %resu;
return unless defined $text;
return unless length $text;
foreach my $e (@$encodings)
{
my $map = _get_character_map($e);
# search for >
my $end = index $text, $map->{GT};
# search for <?
my $str = index $text, $map->{LT} . $map->{QS};
# skip this encoding unless ...
next unless $end > 0 and $str >= 0 and $end > $str;
# extract tentative XML declaration
my $decl = substr $text, $str, $end - $str + 1;
# decode XML declaration
my $deco = _my_decode($e, $decl, Encode::FB_CROAK);
# skip encoding if decoding failed
next unless defined $deco;
$resu{$deco}++;
}
# No XML declarations found
return unless keys %resu;
# sort by number of matches, most match first
my @sort = sort { $resu{$b} <=> $resu{$a} } keys %resu;
# in array context return all encodings,
# in scalar context return best match.
return wantarray ? @sort : $sort[0];
}
sub encoding_from_first_chars
{
my $text = shift;
my %o = @_;
my $encodings = $o{encodings} || $DEFAULT_ENCODINGS;
my $whitespace = $o{whitespace} || [qw/CR LF TB SP/];
return unless defined $text;
return unless length $text;
my %resu;
foreach my $e (@$encodings)
{
my $m = _get_character_map($e);
my $i = index $text, $m->{LT};
next unless $i >= 0;
my $t = substr $text, 0, $i;
my @y;
# construct \xXX\xXX string from octets, might make sense to
# have this in the map construction process
push@y,"(?:".join("",map{sprintf"\\x%02x",ord}split//,$m->{$_}).")"
foreach grep defined, @$whitespace;
my $x = join "|", @y;
$t =~ s/^($x)+//g;
$resu{$e} = $i + length $m->{LT} unless length $t;
}
# ...
return unless keys %resu;
# sort by match length, longest match first
my @sort = sort { $resu{$b} <=> $resu{$a} } keys %resu;
# in array context return all encodings,
# in scalar context return best match.
return wantarray ? @sort : $sort[0];
}
sub encoding_from_xml_declaration
{
my $decl = shift;
return unless defined $decl;
return unless length $decl;
# todo: move this to some better place...
my $ws = qr/[\x09\x85\x20\x0d\x0a\x{2028}]*/;
# skip if not an XML declaration
return unless $decl =~ /^<\?xml$ws/i;
# attempt to extract encoding pseudo attribute
return unless $decl =~ /encoding$ws=$ws'([^']+)'/i or
$decl =~ /encoding$ws=$ws"([^"]+)"/i;
# no encoding pseudo-attribute
return unless defined $1;
my $enco = $1;
# strip leading/trailing whitespace/quotes
$enco =~ s/^[\s'"]+|[\s'"]+$//g;
# collapse white-space
$enco =~ s/\s+/ /g;
# treat empty charset as if it were unspecified
return unless length $enco;
return $enco;
}
sub encoding_from_byte_order_mark
{
my $text = shift;
my %o = @_;
my $encodings = $o{encodings} || $DEFAULT_ENCODINGS;
my %resu;
return unless defined $text;
return unless length $text;
foreach my $e (@$encodings)
{
my $map = _get_character_map($e);
my $bom = $map->{BM};
# encoding cannot encode U+FEFF
next unless defined $bom;
# remember match length
$resu{$e} = length $bom if $text =~ /^(\Q$bom\E)/;
}
# does not start with BOM
return unless keys %resu;
# sort by match length, longest match first
my @sort = sort { $resu{$b} <=> $resu{$a} } keys %resu;
# in array context return all encodings,
# in scalar context return best match.
return wantarray ? @sort : $sort[0];
}
sub encoding_from_content_type
{
my $text = shift;
# nothing to do...
return unless defined $text and length $text;
# downgrade Unicode strings
$text = Encode::encode_utf8($text) if Encode::is_utf8($text);
# split parameters, only look at the first set
my %data = @{(split_header_words($text))[0]};
# extract first charset parameter if any
my $char;
foreach my $param (keys %data) {
$char = $data{$param} and last if 'charset' eq lc $param;
}
# no charset parameter
return unless defined $char;
# there are no special escapes so just remove \s
$char =~ tr/\\//d;
# strip leading/trailing whitespace/quotes
$char =~ s/^[\s'"]+|[\s'"]+$//g;
# collapse white-space
$char =~ s/\s+/ /g;
# treat empty charset as if it were unspecified
return unless length $char;
return $char
}
sub encoding_from_xml_document
{
my $text = shift;
my %o = @_;
my $encodings = $o{encodings} || $DEFAULT_ENCODINGS;
my %resu;
return unless defined $text;
return unless length $text;
my @boms = encoding_from_byte_order_mark($text, encodings => $encodings);
# BOM determines encoding
return wantarray ? (bom => \@boms) : $boms[0] if @boms;
# no BOM
my @decls = xml_declaration_from_octets($text, encodings => $encodings);
foreach my $decl (@decls)
{
my $enco = encoding_from_xml_declaration($decl);
$resu{$enco}++ if defined $enco and length $enco;
}
return unless keys %resu;
my @sort = sort { $resu{$b} <=> $resu{$a} } keys %resu;
# in array context return all encodings,
# in scalar context return best match.
return wantarray ? (xml => \@sort) : $sort[0];
}
sub encoding_from_html_document
{
my $text = shift;
my %o = @_;
my $encodings = $o{encodings} || $DEFAULT_ENCODINGS;
my $popts = $o{parser_options} || {};
my $xhtml = exists $o{xhtml} ? $o{xhtml} : 1;
return unless defined $text;
return unless length $text;
if ($xhtml)
{
my @xml = wantarray
? encoding_from_xml_document($text, encodings => $encodings)
: scalar encoding_from_xml_document($text, encodings => $encodings);
return wantarray
? @xml
: $xml[0]
if @xml and defined $xml[0];
}
else
{
my @boms = encoding_from_byte_order_mark($text, encodings => $encodings);
# BOM determines encoding
return wantarray ? (bom => \@boms) : $boms[0] if @boms;
}
# no BOM
my @resu;
# sanity check to exclude e.g. UTF-32
my @first = encoding_from_first_chars($text, encodings => $encodings);
# fall back to provided encoding list
@first = @$encodings unless @first;
foreach my $try (@first)
{
push @resu, encoding_from_meta_element($text, $try, %$popts);
}
return unless @resu;
return wantarray ? (meta => \@resu) : $resu[0];
}
sub encoding_from_http_message
{
my $mess = shift;
my %o = @_;
my $encodings = $o{encodings} || $DEFAULT_ENCODINGS;
my $is_html = $o{is_html} || qr{^text/html$}i;
my $is_xml = $o{is_xml} || qr{^.+/(?:.+\+)?xml$}i;
my $is_t_xml = $o{is_text_xml} || qr{^text/(?:.+\+)?xml$}i;
my $html_d = $o{html_default} || "ISO-8859-1";
my $xml_d = $o{xml_default} || "UTF-8";
my $txml = $o{text_xml_default};
my $xhtml = exists $o{xhtml} ? $o{xhtml} : 1;
my $default = exists $o{default} ? $o{default} : 1;
my $type = $mess->header('Content-Type');
my $charset = encoding_from_content_type($type);
if ($mess->content_type =~ $is_xml)
{
return wantarray ? (protocol => $charset) : $charset
if defined $charset;
# special case for text/xml at user option
return wantarray ? (protocol_default => $txml) : $txml
if defined $txml and $mess->content_type =~ $is_t_xml;
if (wantarray)
{
my @xml = encoding_from_xml_document($mess->content, encodings => $encodings);
return @xml if @xml;
}
else
{
my $xml = scalar encoding_from_xml_document($mess->content, encodings => $encodings);
return $xml if defined $xml;
}
return wantarray ? (default => $xml_d) : $xml_d if defined $default;
}
if ($mess->content_type =~ $is_html)
{
return wantarray ? (protocol => $charset) : $charset
if defined $charset;
if (wantarray)
{
my @html = encoding_from_html_document($mess->content, encodings => $encodings, xhtml => $xhtml);
return @html if @html;
}
else
{
my $html = scalar encoding_from_html_document($mess->content, encodings => $encodings, xhtml => $xhtml);
return $html if defined $html;
}
return wantarray ? (default => $html_d) : $html_d if defined $default;
}
return
}
1;
__END__
=pod
=head1 NAME
HTML::Encoding - Determine the encoding of HTML/XML/XHTML documents
=head1 SYNOPSIS
use HTML::Encoding 'encoding_from_http_message';
use LWP::UserAgent;
use Encode;
my $resp = LWP::UserAgent->new->get('http://www.example.org');
my $enco = encoding_from_http_message($resp);
my $utf8 = decode($enco => $resp->content);
=head1 WARNING
The interface and implementation are guranteed to change before this
module reaches version 1.00! Please send feedback to the author of
this module.
=head1 DESCRIPTION
HTML::Encoding helps to determine the encoding of HTML and XML/XHTML
documents...
=head1 DEFAULT ENCODINGS
Most routines need to know some suspected character encodings which
can be provided through the C<encodings> option. This option always
defaults to the $HTML::Encoding::DEFAULT_ENCODINGS array reference
which means the following encodings are considered by default:
* ISO-8859-1
* UTF-16LE
* UTF-16BE
* UTF-32LE
* UTF-32BE
* UTF-8
If you change the values or pass custom values to the routines note
that L<Encode> must support them in order for this module to work
correctly.
=head1 ENCODING SOURCES
C<encoding_from_xml_document>, C<encoding_from_html_document>, and
C<encoding_from_http_message> return in list context the encoding
source and the encoding name, possible encoding sources are
* protocol (Content-Type: text/html;charset=encoding)
* bom (leading U+FEFF)
* xml (<?xml version='1.0' encoding='encoding'?>)
* meta (<meta http-equiv=...)
* default (default fallback value)
* protocol_default (protocol default)
=head1 ROUTINES
Routines exported by this module at user option. By default, nothing
is exported.
=over 2
=item encoding_from_content_type($content_type)
Takes a byte string and uses L<HTTP::Headers::Util> to extract the
charset parameter from the C<Content-Type> header value and returns
its value or C<undef> (or an empty list in list context) if there
is no such value. Only the first component will be examined
(HTTP/1.1 only allows for one component), any backslash escapes in
strings will be unescaped, all leading and trailing quote marks
and white-space characters will be removed, all white-space will be
collapsed to a single space, empty charset values will be ignored
and no case folding is performed.
Examples:
+-----------------------------------------+-----------+
| encoding_from_content_type(...) | returns |
+-----------------------------------------+-----------+
| "text/html" | undef |
| "text/html,text/plain;charset=utf-8" | undef |
| "text/html;charset=" | undef |
| "text/html;charset=\"\\u\\t\\f\\-\\8\"" | 'utf-8' |
| "text/html;charset=utf\\-8" | 'utf\\-8' |
| "text/html;charset='utf-8'" | 'utf-8' |
| "text/html;charset=\" UTF-8 \"" | 'UTF-8' |
+-----------------------------------------+-----------+
If you pass a string with the UTF-8 flag turned on the string will
be converted to bytes before it is passed to L<HTTP::Headers::Util>.
The return value will thus never have the UTF-8 flag turned on (this
might change in future versions).
=item encoding_from_byte_order_mark($octets [, %options])
Takes a sequence of octets and attempts to read a byte order mark
at the beginning of the octet sequence. It will go through the list
of $options{encodings} or the list of default encodings if no
encodings are specified and match the beginning of the string against
any byte order mark octet sequence found.
The result can be ambiguous, for example qq(\xFF\xFE\x00\x00) could
be both, a complete BOM in UTF-32LE or a UTF-16LE BOM followed by a
U+0000 character. It is also possible that C<$octets> starts with
something that looks like a byte order mark but actually is not.
encoding_from_byte_order_mark sorts the list of possible encodings
by the length of their BOM octet sequence and returns in scalar
context only the encoding with the longest match, and all encodings
ordered by length of their BOM octet sequence in list context.
Examples:
+-------------------------+------------+-----------------------+
| Input | Encodings | Result |
+-------------------------+------------+-----------------------+
| "\xFF\xFE\x00\x00" | default | qw(UTF-32LE) |
| "\xFF\xFE\x00\x00" | default | qw(UTF-32LE UTF-16LE) |
| "\xEF\xBB\xBF" | default | qw(UTF-8) |
| "Hello World!" | default | undef |
| "\xDD\x73\x66\x73" | default | undef |
| "\xDD\x73\x66\x73" | UTF-EBCDIC | qw(UTF-EBCDIC) |
| "\x2B\x2F\x76\x38\x2D" | default | undef |
| "\x2B\x2F\x76\x38\x2D" | UTF-7 | qw(UTF-7) |
+-------------------------+------------+-----------------------+
Note however that for UTF-7 it is in theory possible that the U+FEFF
combines with other characters in which case such detection would fail,
for example consider:
+--------------------------------------+-----------+-----------+
| Input | Encodings | Result |
+--------------------------------------+-----------+-----------+
| "\x2B\x2F\x76\x38\x41\x39\x67\x2D" | default | undef |
| "\x2B\x2F\x76\x38\x41\x39\x67\x2D" | UTF-7 | undef |
+--------------------------------------+-----------+-----------+
This might change in future versions, although this is not very
relevant for most applications as there should never be need to use
UTF-7 in the encoding list for existing documents.
If no BOM can be found it returns C<undef> in scalar context and an
empty list in list context. This routine should not be used with
strings with the UTF-8 flag turned on.
=item encoding_from_xml_declaration($declaration)
Attempts to extract the value of the encoding pseudo-attribute in an XML
declaration or text declaration in the character string $declaration. If
there does not appear to be such a value it returns nothing. This would
typically be used with the return values of xml_declaration_from_octets.
Normalizes whitespaces like encoding_from_content_type.
Examples:
+-------------------------------------------+---------+
| encoding_from_xml_declaration(...) | Result |
+-------------------------------------------+---------+
| "<?xml version='1.0' encoding='utf-8'?>" | 'utf-8' |
| "<?xml encoding='utf-8'?>" | 'utf-8' |
| "<?xml encoding=\"utf-8\"?>" | 'utf-8' |
| "<?xml foo='bar' encoding='utf-8'?>" | 'utf-8' |
| "<?xml encoding='a' encoding='b'?>" | 'a' |
| "<?xml encoding=' a b '?>" | 'a b' |
| "<?xml-stylesheet encoding='utf-8'?>" | undef |
| " <?xml encoding='utf-8'?>" | undef |
| "<?xml encoding =\x{2028}'utf-8'?>" | 'utf-8' |
| "<?xml version='1.0' encoding=utf-8?>" | undef |
| "<?xml x='encoding=\"a\"' encoding='b'?>" | 'a' |
+-------------------------------------------+---------+
Note that encoding_from_xml_declaration() determines the encoding even
if the XML declaration is not well-formed or violates other requirements
of the relevant XML specification as long as it can find an encoding
pseudo-attribute in the provided string. This means XML processors must
apply further checks to determine whether the entity is well-formed, etc.
=item xml_declaration_from_octets($octets [, %options])
Attempts to find a ">" character in the byte string $octets using the
encodings in $encodings and upon success attempts to find a preceding
"<" character. Returns all the strings found this way in the order of
number of successful matches in list context and the best match in
scalar context. Should probably be combined with the only user of this
routine, encoding_from_xml_declaration... You can modify the list of
suspected encodings using $options{encodings};
=item encoding_from_first_chars($octets [, %options])
Assuming that documents start with "<" optionally preceded by whitespace
characters, encoding_from_first_chars attempts to determine an encoding
by matching $octets against something like /^[@{$options{whitespace}}]*</
in the various suspected $options{encodings}.
This is useful to distinguish e.g. UTF-16LE from UTF-8 if the byte string
does not start with a byte order mark nor an XML declaration (e.g. if the
document is a HTML document) to get at least a base encoding which can be
used to decode enough of the document to find <meta> elements using
encoding_from_meta_element. $options{whitespace} defaults to qw/CR LF SP TB/.
Returns nothing if unsuccessful. Returns the matching encodings in order
of the number of octets matched in list context and the best match in
scalar context.
Examples:
+---------------+----------+---------------------+
| String | Encoding | Result |
+---------------+----------+---------------------+
| '<!DOCTYPE ' | UTF-16LE | UTF-16LE |
| ' <!DOCTYPE ' | UTF-16LE | UTF-16LE |
| '...' | UTF-16LE | undef |
| '...<' | UTF-16LE | undef |
| '<' | UTF-8 | ISO-8859-1 or UTF-8 |
| "<!--\xF6-->" | UTF-8 | ISO-8859-1 or UTF-8 |
+---------------+----------+---------------------+
=item encoding_from_meta_element($octets, $encname [, %options])
Attempts to find <meta> elements in the document using HTML::Parser.
It will attempt to decode chunks of the byte string using $encname
to characters before passing the data to HTML::Parser. An optional
%options hash can be provided which will be passed to the HTML::Parser
constructor. It will stop processing the document if it encounters
* </head>
* encoding errors
* the end of the input
* ... (see todo)
If relevant <meta> elements, i.e. something like
<meta http-equiv=Content-Type content='...'>
are found, uses encoding_from_content_type to extract the charset
parameter. It returns all such encodings it could find in document
order in list context or the first encoding in scalar context (it
will currently look for others regardless of calling context) or
nothing if that fails for some reason.
Note that there are many edge cases where this does not yield in
"proper" results depending on the capabilities of the HTML::Parser
version and the options you pass for it, for example,
<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN" [
<!ENTITY content_type "text/html;charset=utf-8">
]>
<meta http-equiv="Content-Type" content="&content_type;">
<title></title>
<p>...</p>
This would likely not detect the C<utf-8> value if HTML::Parser
does not resolve the entity. This should however only be a concern
for documents specifically crafted to break the encoding detection.
=item encoding_from_xml_document($octets, [, %options])
Uses encoding_from_byte_order_mark to detect the encoding using a
byte order mark in the byte string and returns the return value of
that routine if it succeeds. Uses xml_declaration_from_octets and
encoding_from_xml_declaration and returns the encoding for which
the latter routine found most matches in scalar context, and all
encodings ordered by number of occurences in list context. It
does not return a value of neither byte order mark not inbound
declarations declare a character encoding.
Examples:
+----------------------------+----------+-----------+----------+
| Input | Encoding | Encodings | Result |
+----------------------------+----------+-----------+----------+
| "<?xml?>" | UTF-16 | default | UTF-16BE |
| "<?xml?>" | UTF-16LE | default | undef |
| "<?xml encoding='utf-8'?>" | UTF-16LE | default | utf-8 |
| "<?xml encoding='utf-8'?>" | UTF-16 | default | UTF-16BE |
| "<?xml encoding='cp37'?>" | CP37 | default | undef |
| "<?xml encoding='cp37'?>" | CP37 | CP37 | cp37 |
+----------------------------+----------+-----------+----------+
Lacking a return value from this routine and higher-level protocol
information (such as protocol encoding defaults) processors would
be required to assume that the document is UTF-8 encoded.
Note however that the return value depends on the set of suspected
encodings you pass to it. For example, by default, EBCDIC encodings
would not be considered and thus for
<?xml version='1.0' encoding='cp37'?>
this routine would return the undefined value. You can modify the
list of suspected encodings using $options{encodings}.
=item encoding_from_html_document($octets, [, %options])
Uses encoding_from_xml_document and encoding_from_meta_element to
determine the encoding of HTML documents. If $options{xhtml} is
set to a false value uses encoding_from_byte_order_mark and
encoding_from_meta_element to determine the encoding. The xhtml
option is on by default. The $options{encodings} can be used to
modify the suspected encodings and $options{parser_options} can
be used to modify the HTML::Parser options in
encoding_from_meta_element (see the relevant documentation).
Returns nothing if no declaration could be found, the winning
declaration in scalar context and a list of encoding source
and encoding name in list context, see ENCODING SOURCES.
...
Other problems arise from differences between HTML and XHTML syntax
and encoding detection rules, for example, the input could be
Content-Type: text/html
<?xml version='1.0' encoding='utf-8'?>
<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN"
"http://www.w3.org/TR/html4/strict.dtd">
<meta http-equiv = "Content-Type"
content = "text/html;charset=iso-8859-2">
<title></title>
<p>...</p>
This is a perfectly legal HTML 4.01 document and implementations
might be expected to consider the document ISO-8859-2 encoded as
XML rules for encoding detection do not apply to HTML documents.
This module attempts to avoid making decisions which rules apply
for a specific document and would thus by default return 'utf-8'
for this input.
On the other hand, if the input omits the encoding declaration,
Content-Type: text/html
<?xml version='1.0'?>
<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN"
"http://www.w3.org/TR/html4/strict.dtd">
<meta http-equiv = "Content-Type"
content = "text/html;charset=iso-8859-2">
<title></title>
<p>...</p>
It would return 'iso-8859-2'. Similar problems would arise from
other differences between HTML and XHTML, for example consider
Content-Type: text/html
<?foo >
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html ...
?>
...
<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN">
...
If this is processed using HTML rules, the first > will end the
processing instruction and the XHTML document type declaration
would be the relevant declaration for the document, if it is
processed using XHTML rules, the ?> will end the processing
instruction and the HTML document type declaration would be the
relevant declaration.
IOW, an application would need to assume a certain character
encoding (family) to process enough of the document to determine
whether it is XHTML or HTML and the result of this detection would
depend on which processing rules are assumed in order to process it.
It is thus in essence not possible to write a "perfect" detection
algorithm, which is why this routine attempts to avoid making any
decisions on this matter.
=item encoding_from_http_message($message [, %options])
Determines the encoding of HTML / XML / XHTML documents enclosed
in HTTP message. $message is an object compatible to L<HTTP::Message>,
e.g. a L<HTTP::Response> object. %options is a hash with the following
possible entries:
=over 2
=item encodings
array references of suspected character encodings, defaults to
C<$HTML::Encoding::DEFAULT_ENCODINGS>.
=item is_html
Regular expression matched against the content_type of the message
to determine whether to use HTML rules for the entity body, defaults
to C<qr{^text/html$}i>.
=item is_xml
Regular expression matched against the content_type of the message
to determine whether to use XML rules for the entity body, defaults
to C<qr{^.+/(?:.+\+)?xml$}i>.
=item is_text_xml
Regular expression matched against the content_type of the message
to determine whether to use text/html rules for the message, defaults
to C<qr{^text/(?:.+\+)?xml$}i>. This will only be checked if is_xml
matches aswell.
=item html_default
Default encoding for documents determined (by is_html) as HTML,
defaults to C<ISO-8859-1>.
=item xml_default
Default encoding for documents determined (by is_xml) as XML,
defaults to C<UTF-8>.
=item text_xml_default
Default encoding for documents determined (by is_text_xml) as text/xml,
defaults to C<undef> in which case the default is ignored. This should
be set to C<US-ASCII> if desired as this module is by default
inconsistent with RFC 3023 which requires that for text/xml documents
without a charset parameter in the HTTP header C<US-ASCII> is assumed.
This requirement is inconsistent with RFC 2616 (HTTP/1.1) which requires
to assume C<ISO-8859-1>, has been widely ignored and is thus disabled by
default.
=item xhtml
Whether the routine should look for an encoding declaration in the
XML declaration of the document (if any), defaults to C<1>.
=item default
Whether the relevant default value should be returned when no other
information can be determined, defaults to C<1>.
=back
This is furhter possibly inconsistent with XML MIME types that differ
in other ways from application/xml, for example if the MIME Type does
not allow for a charset parameter in which case applications might be
expected to ignore the charset parameter if erroneously provided.
=back
=head1 EBCDIC SUPPORT
By default, this module does not support EBCDIC encodings. To enable
support for EBCDIC encodings you can either change the
$HTML::Encodings::DEFAULT_ENCODINGS array reference or pass the
encodings to the routines you use using the encodings option, for
example
my @try = qw/UTF-8 UTF-16LE cp500 posix-bc .../;
my $enc = encoding_from_xml_document($doc, encodings => \@try);
Note that there are some subtle differences between various EBCDIC
encodings, for example C<!> is mapped to 0x5A in C<posix-bc> and
to 0x4F in C<cp500>; these differences might affect processing in
yet undetermined ways.
=head1 TODO
* bundle with test suite
* optimize some routines to give up once successful
* avoid transcoding for HTML::Parser if e.g. ISO-8859-1
* consider adding a "HTML5" modus of operation?
=head1 SEE ALSO
* http://www.w3.org/TR/REC-xml/#charencoding
* http://www.w3.org/TR/REC-xml/#sec-guessing
* http://www.w3.org/TR/xml11/#charencoding
* http://www.w3.org/TR/xml11/#sec-guessing
* http://www.w3.org/TR/html4/charset.html#h-5.2.2
* http://www.w3.org/TR/xhtml1/#C_9
* http://www.ietf.org/rfc/rfc2616.txt
* http://www.ietf.org/rfc/rfc2854.txt
* http://www.ietf.org/rfc/rfc3023.txt
* perlunicode
* Encode
* HTML::Parser
=head1 AUTHOR / COPYRIGHT / LICENSE
Copyright (c) 2004-2008 Bjoern Hoehrmann <bjoern@hoehrmann.de>.
This module is licensed under the same terms as Perl itself.
=cut
|