/usr/bin/tmxuniq is in libxml-tmx-perl 0.31-1.
This file is owned by root:root, with mode 0o755.
The actual contents of the file can be viewed below.
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 | #!/usr/bin/perl -s
use DB_File;
use Fcntl ;
use Lingua::PT::PLNbase;
use XML::TMX::Reader;
use Digest::MD5 qw(md5_hex);
use Encode;
# Command-line switches. The script runs under "perl -s", so "-tok" sets
# $tok, "-o=out.tmx" sets $o, and so on.
our ($cont,$id,$dig,$tok,$o,$fast);

# %dic is the on-disk dictionary of digests already seen. It maps the MD5
# digest of a translation unit to a ";"-terminated list of unit numbers.
# It lives in a BTREE file named after the process id. The file is
# truncated on open unless -cont was given.
{
    my $db_flags = O_RDWR|O_CREAT;
    $db_flags |= O_TRUNC unless $cont;

    # An unchecked failure would leave %dic as a plain in-memory hash,
    # so stop with the system error instead.
    tie %dic, 'DB_File', "__tmxuniq_$$.db", $db_flags, 0640, $DB_BTREE
        or die "tmxuniq: cannot tie __tmxuniq_$$.db: $!\n";
}
# Counters shared by all input files. Neither they nor the digest
# dictionary %dic are reset between files, so a unit that repeats one
# from an earlier file is dropped too.
my $cid = 0;    # translation units read so far
my $rem = 0;    # translation units dropped as duplicates

# The loop below hands the same -o path to every input file.
warn "tmxuniq: -o=$o is shared by all " . scalar(@ARGV) . " input files\n"
    if $o && @ARGV > 1;

# Rewrites the one-line progress report on STDERR. $eol is "" for an
# in-place refresh and "\n" for the closing report of a file.
my $report = sub {
    my ($eol) = @_;
    my $size = -s "__tmxuniq_$$.db";
    printf STDERR
        "\rTotal: %10d Removed: %8d (%.2f%%) Database size: %10d bytes%s",
        $cid, $rem, (100*$rem/$cid), $size, $eol;
};

for my $file (@ARGV) {
    my $tm = XML::TMX::Reader->new($file);
    print STDERR "Processing...";

    $tm->for_tu(
        { output => $o || "$file._" },
        sub {
            my $tu = shift;
            $cid++;
            $tu->{-prop}{id} = $cid if $id;

            # The key is the normalised -seg text of every entry whose
            # name does not start with "-" (presumably the language
            # codes), joined in sorted order.
            my $key = join("|||", map { n($tu->{$_}{-seg}) } sort grep { !/^-/ } keys %$tu);
            my $digest = md5_hex(encode_utf8($key));

            # Refresh the progress line every 10000 units.
            $report->('') unless $cid % 10000;

            if ($dic{$digest}) {
                # Seen before: record this unit number (skipped with
                # -fast) and count the removal.
                $dic{$digest} .= "$cid;" unless $fast;
                $rem++;
                # NOTE(review): undef presumably tells for_tu to leave
                # the unit out of the output -- confirm against
                # XML::TMX::Reader.
                return undef;
            }

            # First occurrence: remember it and pass the unit on.
            $dic{$digest} = "$cid;";
            $tu->{-prop}{digest} = $digest if $dig;
            return {%$tu};    # shallow copy, kept from the original code
        }
    );

    if ($cid) {
        $report->("\n");
    }
    else {
        print STDERR "\rHuh.. empty TMX?\n";
    }
    undef $tm;
}

# Release the dictionary that was actually tied. The original untied %h,
# a hash that is never used.
untie %dic;
# n($text) -- normalise one segment before it becomes part of the key.
#
# Runs of six or more dots are cut down to five. With -tok the text is
# passed through tokenize() (presumably Lingua::PT::PLNbase's) with a
# blank as separator. Every run of whitespace then becomes a single
# blank, and one leading and one trailing blank are removed.
#
# Returns the normalised string. An undefined argument yields ''.
sub n {
    # Named $text rather than $a, which is sort's special variable.
    my $text = shift;
    return '' unless defined $text;

    $text =~ s/\.{6,}/...../g;
    $text = tokenize( { rs => ' ' } => $text ) if $tok;
    $text =~ s/\s+/ /g;
    $text =~ s/ $//;
    $text =~ s/^ //;
    return $text;
}
__END__
=head1 NAME
tmxuniq - removes duplicated translation units from TMXs
=head1 SYNOPSIS
tmxuniq [options] tmx1 ...
=head1 DESCRIPTION
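Removes duplicated translation units from a set of TMX (Translation
Memory eXchange) files. Two units are duplicates when all their segments
are identical after whitespace normalisation; the first one is kept.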
=head1 OPTIONS
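-id -- insert a unique id property in each TU
-dig -- insert a digest property in each TU
-tok -- tokenize/normalize text
-fast -- do not record the ids of removed TUs in the work database
-cont -- do not truncate the work database when opening it
-o=out.tmx -- (with one input file) redefine output (default = input._)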
=head1 AUTHOR
J.Joao Almeida, jj@di.uminho.pt
=head1 SEE ALSO
perl(1).
=cut
|