This file is indexed.

/usr/share/perl5/MARC/Charset/Compiler.pm is in libmarc-charset-perl 1.35-1.

This file is owned by root:root, with mode 0o644.

The actual contents of the file can be viewed below.

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
package MARC::Charset::Compiler; 

=head1 NAME

MARC::Charset::Compiler - compile XML mapping rules from LoC

=head1 SYNOPSIS

    $compiler = MARC::Charset::Compiler->new();
    $table = $compiler->compile('codetables.xml');

=head1 DESCRIPTION

MARC::Charset uses mapping rules from the Library of Congress for
generating a MARC::Charset::Table for looking up utf8 values based on the 
source MARC-8 character set and the character.

=head1 METHODS

=cut

use strict;
use warnings;

use base qw( XML::SAX::Base );
use XML::SAX::ParserFactory;
use Unicode::UCD qw(charinfo);
use MARC::Charset::Table;
use MARC::Charset::Code;


=head1 new()

The constructor.

=cut

sub new 
{
    my $self = bless {}, 'MARC::Charset::Compiler';
    $self->{table} = MARC::Charset::Table->brand_new();
    $self->{current_code} = undef;
    $self->{text} = '';
    return $self;
}


=head1 compile()

Pass in the path to an XML file to compile.

=cut

sub compile 
{
    my ($self, $file) = @_;

    my $factory = XML::SAX::ParserFactory->new();
    my $parser = $factory->parser(Handler => $self);
    $parser->parse_uri($file);
}


## SAX event handlers are below

sub start_element 
{
    my ($self, $data) = @_;
    my $name = $data->{Name};
    if ($name eq 'code')
    {
        $self->{current_code} = MARC::Charset::Code->new();
    }
    elsif ($name eq 'characterSet')
    {
        my $charset = $data->{Attributes}{'{}ISOcode'}{Value};
        warn('missing ISOcode in characterSet element') unless $charset;
        $self->{current_charset} = $charset;
    }
}


sub end_element
{
    my ($self, $data) = @_;
    my $name = $data->{Name};

    # normalize some names for method lookup
    $name = 'is_combining' if $name eq 'isCombining';

    # get the existing code if we have one
    my $code = $self->{current_code};

    # if we're ending a code element
    if ($code and $name eq 'code')
    {
        # if there is no ucs code, use what's in alt
        $code->ucs($code->alt()) unless $code->ucs;

        # can't process a code point that lacks a unicode representation
        die("invalid code: " . $code->to_string()) unless $code->ucs;
        
        # set the charset code
        $code->charset($self->{current_charset});

        # lookup the name from perl's character db
        my $info = charinfo(hex($code->ucs()));
        $code->name($info->{name}) if $info;

        # add it to the table
        $self->{table}->add_code($code);

        # start with a clean slate
        $self->{current_code} = undef;
    }
   
    elsif ($code and $name eq 'marc')
    {
        my $codepoint = $self->text();
        if ($self->{current_charset} eq '51' || 
            $self->{current_charset} eq '34' ||
            $self->{current_charset} eq '45')
        {
            # codetables.xml supplied by the Library of Congress mistakenly
            # lists the G1 value of characters in the extended Latin, extended
            # Cyrillic and extended Arabic sets rather than the G0 value.  
            # MARC::Charset's table uses the G0 value internally.

            if (hex($codepoint) >= 0xa1 && hex($codepoint) <= 0xfe) {
                $codepoint = sprintf("%x", hex($codepoint) - 128);
            }
        }
        $code->marc($codepoint);
    }
    # add these elements
    elsif ($code and $name =~ /^(marc|ucs|is_combining|alt|marc_right_half|marc_left_half)$/)
    {
        $code->$name($self->text());
    }

    # ending an element so forget all text
    $self->{text} = '';
}


sub characters 
{
    my ($self, $data) = @_;
    return unless $self->{current_code};
    my $text = $data->{Data};
    $self->{text} .= $data->{Data};
}


sub text 
{
    my $text = shift->{text};
    # collapse whitespace
    $text =~ s/\s\s+/ /g;
    # strip new lines
    $text =~ s/[\r\n]//g;
    # strip leading whitespace
    $text =~ s/^\s+//;
    # strip trailing whitespace
    $text =~ s/\s+$//;
    return $text;
}

1;