/usr/bin/tmxuniq is in libxml-tmx-perl 0.36-1.
This file is owned by root:root, with mode 0o755.
The actual contents of the file can be viewed below.
#!/usr/bin/perl -s
# PODNAME: tmxuniq
# ABSTRACT: removes duplicated translation units from TMXs
BEGIN {
    # Probe for Lingua::PT::PLNbase while the script is still being compiled,
    # so that a missing module yields an installation hint on STDERR.
    eval { require Lingua::PT::PLNbase };
    if ( my $load_error = $@ ) {
        print STDERR
            "This script requires Lingua::PT::PLNbase to run. We suggest installing\n"
          . "it using CPAN: http://www.cpan.org/modules/INSTALL.html\n";

        # NOTE(review): the process ends with status 0 even though nothing
        # was processed -- confirm whether callers rely on that.
        exit(0);
    }
}
use DB_File;
use Fcntl;
use XML::TMX::Reader;
use Digest::MD5 qw(md5_hex);
use Encode;
use Lingua::PT::PLNbase;    # the terminating semicolon was missing here

# Command-line switches, filled in by perl's -s option:
#   -cont  keep the contents of an existing work database (no O_TRUNC)
#   -id    store the running unit number as an "id" property of each unit
#   -dig   store the MD5 digest as a "digest" property of each kept unit
#   -tok   run the segment text through tokenize() before hashing
#   -o=F   output file name handed to for_tu (default: "<input>._")
#   -fast  do not append the numbers of dropped duplicates to the database
our ($cont, $id, $dig, $tok, $o, $fast);

# %dic maps the digest of a translation unit to the "n;n;..." list of unit
# numbers that produced it. It lives in an on-disk BTREE file whose name
# embeds the process id ($$).
my $open_flags = O_RDWR | O_CREAT;
$open_flags |= O_TRUNC unless $cont;

# Fail loudly if the database cannot be opened; otherwise %dic would be a
# plain in-memory hash and the reported database size would be meaningless.
tie %dic, 'DB_File', "__tmxuniq_$$.db", $open_flags, 0640, $DB_BTREE
    or die "Cannot open database __tmxuniq_$$.db: $!\n";
# Number of translation units read so far, summed over all input files.
# It doubles as the unit id when -id is given.
my $cid = 0;

# Number of duplicated translation units dropped so far.
my $rem = 0;

# Work database file; this is the same name used when %dic was tied.
my $db_file = "__tmxuniq_$$.db";

# Print the running totals on STDERR, overwriting the current line.
# $eol is appended verbatim ('' for in-place progress, "\n" for the summary).
# Must only be called when $cid is non-zero (it divides by $cid).
my $report = sub {
    my ($eol) = @_;
    my $size = -s $db_file;
    printf STDERR
        "\rTotal: %10d Removed: %8d (%.2f%%) Database size: %10d bytes%s",
        $cid, $rem, (100 * $rem / $cid), $size, $eol;
};

for my $file (@ARGV) {
    my $tm = XML::TMX::Reader->new($file);
    print STDERR "Processing...";

    # NOTE(review): with -o and several input files every file is sent to
    # the same output name -- presumably why the docs say "with 1 argument".
    $tm->for_tu(
        { output => $o || "$file._" },
        sub {
            my $tu = shift;
            $cid++;
            $tu->{-prop}{id} = $cid if $id;

            # The identity of a unit is the normalised text of all its
            # entries, taken in sorted key order; keys starting with "-"
            # (such as -prop) are left out.
            my $key = join("|||",
                map { n($tu->{$_}{-seg}) } sort grep { !/^-/ } keys %$tu);
            my $digest = md5_hex(encode_utf8($key));

            # Progress line every 10000 units.
            $report->('') unless $cid % 10000;

            if ($dic{$digest}) {
                # Seen before: optionally record this unit number, then
                # drop the unit.
                # NOTE(review): presumably for_tu discards a unit when the
                # callback returns undef -- confirm in XML::TMX::Reader.
                $dic{$digest} .= "$cid;" unless $fast;
                $rem++;
                return undef;
            }

            # First occurrence: remember it and keep the unit.
            $dic{$digest} = "$cid;";
            $tu->{-prop}{digest} = $digest if $dig;

            # Hand back a shallow copy (the original author noted that a
            # clone used to be taken here, reason unknown).
            return { %$tu };
        }
    );

    if ($cid) {
        $report->("\n");
    }
    else {
        printf STDERR "\rHuh.. empty TMX?\n";
    }
    undef $tm;
}

# Release the database. The original untied %h, a hash that was never tied.
untie %dic;
# Normalise the text of one segment before it is hashed.
#   * runs of six or more dots are cut down to five dots;
#   * when the -tok switch ($tok) is set the text goes through tokenize()
#     (NOTE(review): the meaning of the "rs" option is not visible here --
#     see the Lingua::PT::PLNbase documentation);
#   * every run of whitespace becomes a single space, and a leading or
#     trailing space is removed.
# Takes the segment text and returns the normalised copy.
sub n {
    my ($text) = @_;

    $text =~ s/\.{6,}/...../g;

    if ($tok) {
        $text = tokenize( { rs => ' ' } => $text );
    }

    $text =~ s/\s+/ /g;
    $text =~ s/^ //;
    $text =~ s/ $//;

    return $text;
}
__END__
=pod
=encoding UTF-8
=head1 NAME
tmxuniq - removes duplicated translation units from TMXs
=head1 VERSION
version 0.36
=head1 SYNOPSIS
tmxuniq [options] -l=en:pt tmx1 ...
=head1 DESCRIPTION
Removes duplicated translation units from a set of TMX (Translation
Memory eXchange) files.
=head1 OPTIONS
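-id -- insert a unique id property in each TU
-dig -- insert a digest property in each TU
-tok -- tokenize/normalize text
-o=out.tmx -- (with 1 argument) redefine output (default = input._)
-cont -- do not truncate the work database when opening it
-fast -- do not record the ids of removed duplicates in the work database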
=head1 SEE ALSO
perl(1).
=head1 AUTHORS
=over 4
=item *
Alberto Simões <ambs@cpan.org>
=item *
José João Almeida <jj@di.uminho.pt>
=back
=head1 COPYRIGHT AND LICENSE
This software is copyright (c) 2010-2017 by Projeto Natura <natura@di.uminho.pt>.
This is free software; you can redistribute it and/or modify it under
the same terms as the Perl 5 programming language system itself.
=cut
|