This file is indexed.

/usr/bin/tmxuniq is in libxml-tmx-perl 0.31-1.

This file is owned by root:root, with mode 0o755.

The actual contents of the file can be viewed below.

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
#!/usr/bin/perl -s
use DB_File;
use Fcntl ;
use Lingua::PT::PLNbase;
use XML::TMX::Reader;
use Digest::MD5 qw(md5_hex);
use Encode;

# Command-line switches, set by perl's -s switch processing (see shebang):
#   -cont   reuse the digest database file if present instead of truncating it
#   -id     store a running counter as the "id" property of every TU
#   -dig    store the MD5 digest as the "digest" property of every kept TU
#   -tok    tokenize segment text before comparing (used by n())
#   -o=FILE write the filtered TMX to FILE (default: "<input>._")
#   -fast   do not append the ids of dropped duplicates to the database
our ($cont,$id,$dig,$tok,$o,$fast);

# On-disk B-tree that maps a TU digest to the ";"-terminated list of the
# ids of the TUs that produced it.  The file is left in the current
# directory when the script ends.
my $dbfile = "__tmxuniq_$$.db";

my $open_flags = O_RDWR|O_CREAT;
$open_flags |= O_TRUNC unless $cont;

my %dic;
tie %dic, 'DB_File', $dbfile, $open_flags, 0640, $DB_BTREE
  or die "tmxuniq: cannot open database $dbfile: $!\n";

my $cid = 0;    # translation units seen so far, across all input files
my $rem = 0;    # translation units dropped as duplicates

# Shared by the periodic progress line and the per-file summary line.
my $stats_fmt =
  "\rTotal: %10d  Removed: %8d (%.2f%%)  Database size: %10d bytes";

for my $file (@ARGV){
    my $tm = XML::TMX::Reader->new($file);

    print STDERR "Processing...";

    $tm->for_tu
      (
       # NOTE(review): with -o and several input files every file is written
       # to the same output name; the POD documents -o for one argument only.
       { output => $o || "$file._" },
       sub {
           my $tu = shift;
           $cid++;
           $tu->{-prop}{id} = $cid if $id;

           # The key is the normalized text of every language segment, in
           # sorted language order; keys starting with "-" are skipped so
           # that properties such as the id above do not affect the digest.
           my $key = join("|||", map { n($tu->{$_}{-seg}) } sort grep { !/^-/ } keys %$tu);
           my $digest = md5_hex(encode_utf8($key));

           # Progress report every 10000 units ($cid is > 0 here, so the
           # percentage cannot divide by zero).
           unless ($cid % 10000) {
               my $size = -s $dbfile;
               printf STDERR $stats_fmt,
                   $cid, $rem, (100*$rem/$cid), $size;
           }

           if ($dic{$digest}) {
               # Duplicate: remember its id (unless -fast) and return undef
               # instead of a TU.
               $dic{$digest} .= "$cid;" unless $fast;
               $rem ++;
               return undef
           } else {
               $dic{$digest} = "$cid;";
               $tu->{-prop}{digest} = $digest if $dig;
               return {%$tu} ; # shallow copy of the TU hash
           }
       }
      );

    my $size = -s $dbfile;
    if ($cid) {
        printf STDERR "$stats_fmt\n",
          $cid, $rem, (100*$rem/$cid), $size;
    } else {
        printf STDERR "\rHuh.. empty TMX?\n";
    }
    undef $tm;
}

# Release the database (the original untied an unrelated, unused hash %h).
untie %dic;

# n($text): normalize the text of one segment so that copies differing only
# in dot runs or spacing produce the same comparison key.
#
#   - runs of six or more dots become exactly five dots;
#   - when the -tok switch is set, the text is passed through tokenize()
#     (imported from Lingua::PT::PLNbase) with rs => ' ';
#   - every whitespace run becomes one space, and a single leading and
#     trailing space is stripped.
#
# Returns the normalized string.
sub n {
    my ($text) = @_;

    $text =~ s{\.{6,}}{.....}g;

    if ($tok) {
        $text = tokenize({ rs => ' ' }, $text);
    }

    # After this substitution the only whitespace left is single spaces,
    # so trimming one space at each end is enough.
    $text =~ s{\s+}{ }g;
    $text =~ s{ $}{};
    $text =~ s{^ }{};

    return $text;
}

__END__

=head1 NAME

tmxuniq - removes duplicated translation units from TMXs

=head1 SYNOPSIS

 tmxuniq [options] -l=en:pt tmx1 ... 

=head1 DESCRIPTION

Removes duplicated translation units from a set of TMX (Translation
Memory eXchange) files.

=head1 OPTIONS

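 -id  -- insert a unique id property in each TU
 -dig -- insert a digest property in each TU
 -tok -- tokenize/normalize text
 -fast -- do not record the ids of the removed duplicates in the database
 -cont -- do not truncate an already existing digest database
 -o=out.tmx -- (with 1 argument) redefine output (default = input._)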

=head1 AUTHOR

J.Joao Almeida, jj@di.uminho.pt

=head1 SEE ALSO

perl(1).

=cut