/usr/lib/perl5/Encode/CN/HZ.pm is in libencode-perl 2.57-1.
This file is owned by root:root, with mode 0o644.
The actual contents of the file can be viewed below.
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 | package Encode::CN::HZ;
use strict;
use warnings;
use utf8 ();
use vars qw($VERSION);
$VERSION = do { my @r = ( q$Revision: 2.7 $ =~ /\d+/g ); sprintf "%d." . "%02d" x $#r, @r };
use Encode qw(:fallbacks);
use parent qw(Encode::Encoding);
__PACKAGE__->Define('hz');
# HZ is a combination of ASCII and escaped GB, so we implement it
# with the GB2312(raw) encoding here. Cf. RFCs 1842 & 1843.
# not ported for EBCDIC. Which should be used, "~" or "\x7E"?
sub needs_lines { 1 }
sub decode ($$;$) {
my ( $obj, $str, $chk ) = @_;
my $GB = Encode::find_encoding('gb2312-raw');
my $ret = substr($str, 0, 0); # to propagate taintedness
my $in_ascii = 1; # default mode is ASCII.
while ( length $str ) {
if ($in_ascii) { # ASCII mode
if ( $str =~ s/^([\x00-\x7D\x7F]+)// ) { # no '~' => ASCII
$ret .= $1;
# EBCDIC should need ascii2native, but not ported.
}
elsif ( $str =~ s/^\x7E\x7E// ) { # escaped tilde
$ret .= '~';
}
elsif ( $str =~ s/^\x7E\cJ// ) { # '\cJ' == LF in ASCII
1; # no-op
}
elsif ( $str =~ s/^\x7E\x7B// ) { # '~{'
$in_ascii = 0; # to GB
}
else { # encounters an invalid escape, \x80 or greater
last;
}
}
else { # GB mode; the byte ranges are as in RFC 1843.
no warnings 'uninitialized';
if ( $str =~ s/^((?:[\x21-\x77][\x21-\x7E])+)// ) {
$ret .= $GB->decode( $1, $chk );
}
elsif ( $str =~ s/^\x7E\x7D// ) { # '~}'
$in_ascii = 1;
}
else { # invalid
last;
}
}
}
$_[1] = '' if $chk; # needs_lines guarantees no partial character
return $ret;
}
sub cat_decode {
my ( $obj, undef, $src, $pos, $trm, $chk ) = @_;
my ( $rdst, $rsrc, $rpos ) = \@_[ 1 .. 3 ];
my $GB = Encode::find_encoding('gb2312-raw');
my $ret = '';
my $in_ascii = 1; # default mode is ASCII.
my $ini_pos = pos($$rsrc);
substr( $src, 0, $pos ) = '';
my $ini_len = bytes::length($src);
# $trm is the first of the pair '~~', then 2nd tilde is to be removed.
# XXX: Is better C<$src =~ s/^\x7E// or die if ...>?
$src =~ s/^\x7E// if $trm eq "\x7E";
while ( length $src ) {
my $now;
if ($in_ascii) { # ASCII mode
if ( $src =~ s/^([\x00-\x7D\x7F])// ) { # no '~' => ASCII
$now = $1;
}
elsif ( $src =~ s/^\x7E\x7E// ) { # escaped tilde
$now = '~';
}
elsif ( $src =~ s/^\x7E\cJ// ) { # '\cJ' == LF in ASCII
next;
}
elsif ( $src =~ s/^\x7E\x7B// ) { # '~{'
$in_ascii = 0; # to GB
next;
}
else { # encounters an invalid escape, \x80 or greater
last;
}
}
else { # GB mode; the byte ranges are as in RFC 1843.
if ( $src =~ s/^((?:[\x21-\x77][\x21-\x7F])+)// ) {
$now = $GB->decode( $1, $chk );
}
elsif ( $src =~ s/^\x7E\x7D// ) { # '~}'
$in_ascii = 1;
next;
}
else { # invalid
last;
}
}
next if !defined $now;
$ret .= $now;
if ( $now eq $trm ) {
$$rdst .= $ret;
$$rpos = $ini_pos + $pos + $ini_len - bytes::length($src);
pos($$rsrc) = $ini_pos;
return 1;
}
}
$$rdst .= $ret;
$$rpos = $ini_pos + $pos + $ini_len - bytes::length($src);
pos($$rsrc) = $ini_pos;
return ''; # terminator not found
}
sub encode($$;$) {
my ( $obj, $str, $chk ) = @_;
my $GB = Encode::find_encoding('gb2312-raw');
my $ret = substr($str, 0, 0); # to propagate taintedness;
my $in_ascii = 1; # default mode is ASCII.
no warnings 'utf8'; # $str may be malformed UTF8 at the end of a chunk.
while ( length $str ) {
if ( $str =~ s/^([[:ascii:]]+)// ) {
my $tmp = $1;
$tmp =~ s/~/~~/g; # escapes tildes
if ( !$in_ascii ) {
$ret .= "\x7E\x7D"; # '~}'
$in_ascii = 1;
}
$ret .= pack 'a*', $tmp; # remove UTF8 flag.
}
elsif ( $str =~ s/(.)// ) {
my $s = $1;
my $tmp = $GB->encode( $s, $chk );
last if !defined $tmp;
if ( length $tmp == 2 ) { # maybe a valid GB char (XXX)
if ($in_ascii) {
$ret .= "\x7E\x7B"; # '~{'
$in_ascii = 0;
}
$ret .= $tmp;
}
elsif ( length $tmp ) { # maybe FALLBACK in ASCII (XXX)
if ( !$in_ascii ) {
$ret .= "\x7E\x7D"; # '~}'
$in_ascii = 1;
}
$ret .= $tmp;
}
}
else { # if $str is malformed UTF8 *and* if length $str != 0.
last;
}
}
$_[1] = $str if $chk;
# The state at the end of the chunk is discarded, even if in GB mode.
# That results in the combination of GB-OUT and GB-IN, i.e. "~}~{".
# Parhaps it is harmless, but further investigations may be required...
if ( !$in_ascii ) {
$ret .= "\x7E\x7D"; # '~}'
$in_ascii = 1;
}
utf8::encode($ret); # https://rt.cpan.org/Ticket/Display.html?id=35120
return $ret;
}
1;
__END__
=head1 NAME
Encode::CN::HZ -- internally used by Encode::CN
=cut
|