/usr/bin/tmxclean is in libxml-tmx-perl 0.31-1.
This file is owned by root:root, with mode 0o755.
The actual contents of the file can be viewed below.
#!/usr/bin/perl -s
use v5.10;
use strict;
use warnings;
use XML::TMX::Reader;
# Command-line switches.  They are package variables because perl's -s
# switch (on the #! line) stores "-name" / "-name=value" arguments in
# variables of the same name before the script body runs.
our (
    $junk,      # remove if one of the languages just have junk
    $output,    # output filename
    $eq,        # remove if seg(l1) = seg(l2)
    $len,       # remove if len(li) > 50 ∧ len(lj) > 2len(li)
    $v, $verbose
);

use File::Spec;

# Counters updated by cleaner() for every translation unit it is given.
my $cleaned   = 0;
my $processed = 0;

my $tmx    = shift or help();
my $reader = XML::TMX::Reader->new($tmx);

# Junk removal is on unless the user explicitly passed -junk=0.
$junk //= 1;

# -verbose was declared but never consulted; treat it as a synonym of -v.
$v ||= $verbose;

# Default output: the input file name prefixed with "_cleaned_".  The prefix
# is applied to the file name only, so an input such as "dir/file.tmx"
# yields "dir/_cleaned_file.tmx" instead of a path inside a directory
# ("_cleaned_dir/") that does not exist.  A bare file name still gives
# "_cleaned_<name>" exactly as before.
unless ($output) {
    my ($volume, $dir, $file) = File::Spec->splitpath($tmx);
    $output = File::Spec->catpath($volume, $dir, "_cleaned_$file");
}

print STDERR "loading..." if $v;

$reader->for_tu({ output => $output }, \&cleaner);

if ($v) {
    # A file with no translation units leaves $processed at 0; avoid the
    # division by zero the unguarded percentage would cause.
    my $percent = $processed ? 100 * $cleaned / $processed : 0;
    printf STDERR "\rRemoved %d/%d (%.3f%%).\n",
        $cleaned, $processed, $percent;
}
# Callback passed to XML::TMX::Reader's for_tu: decides whether a single
# translation unit is kept.
#
# Argument: a hash reference describing the unit.  Keys starting with "-"
# are skipped; every other key is treated as a language entry whose text is
# stored under {-seg}.
#
# The unit is marked for removal when any of these holds:
#   * -eq is set and two language entries carry the same segment text;
#   * -junk is set and a segment consists only of digits, "-", ".", ","
#     and whitespace, or contains no word characters at all;
#   * -len is set, the shortest segment is longer than 50 characters and
#     the longest one is more than twice as long.
#
# Returns undef for a unit to be removed and the original hash reference
# otherwise (presumably for_tu leaves out units for which undef is
# returned -- confirm against the XML::TMX::Reader documentation).
#
# Side effects: increments the file-level counters $processed and $cleaned
# and, when -v is set, prints a progress line to STDERR every 1000 units.
sub cleaner {
    my $langs = shift;
    $processed++;

    my $remove = 0;
    my %seen;       # segment text => number of times seen in this unit
    my @lengths;    # length of every language segment

    for my $lang (keys %$langs) {
        next if $lang =~ /^-/;

        # An entry without a segment counts as empty text; this keeps the
        # checks below from raising "uninitialized value" warnings while
        # giving the same verdict undef produced before.
        my $seg = $langs->{$lang}{-seg} // '';

        $remove = 1 if $eq   && $seen{$seg}++;
        $remove = 1 if $junk && $seg =~ /^[-.,0-9\s]+$/;
        $remove = 1 if $junk && $seg =~ /^\W*$/;
        push @lengths, length $seg;
    }

    # The length test needs at least one segment to compare.
    if ($len && @lengths) {
        my ($shortest, $longest) = (sort { $a <=> $b } @lengths)[0, -1];
        $remove = 1 if $shortest > 50 && $shortest * 2 < $longest;
    }

    $cleaned++ if $remove;

    if ($v && $processed % 1000 == 0) {
        printf STDERR "\rRemoved %d/%d (%.3f%%)...",
            $cleaned, $processed, 100 * $cleaned / $processed;
    }

    return $remove ? undef : $langs;
}
# Print the usage summary to STDERR and terminate with exit status 1.
# Called when no input file is given on the command line.  The summary
# lists every switch the script reads, not only -junk.
sub help {
    print STDERR <<'USAGE';
Usage: tmxclean [-junk=0] [-eq] [-len] [-v] [-output=FILE] <file.tmx>

Switches must come before the file name.

  -junk=0       keep units in which a segment has no letters
                (such units are removed by default)
  -eq           remove units in which two languages have the same segment
  -len          remove units whose shortest segment is longer than 50
                characters while the longest is more than twice as long
  -output=FILE  write the result to FILE
                (default: the input name prefixed with "_cleaned_")
  -v            report progress on STDERR
USAGE
    exit 1;
}
=encoding UTF-8
=head1 NAME
tmxclean - remove junk translation units from TMX files
=head1 SYNOPSIS
$ tmxclean [-junk=0] [-eq] [-len] [-v] [-output=out.tmx] file.tmx
=head1 DESCRIPTION
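Removes the translation units in which
1. a segment has no letters, i.e. it holds only digits, punctuation or whitespace (unless -junk=0)
2. seg(L1) = seg(L2) (if -eq)
3. the shortest segment is longer than 50 characters and the longest segment is more than twice as long (if -len)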
=head1 SEE ALSO
XML::TMX
=cut
|