/usr/bin/tmx-tokenize is in libxml-tmx-perl 0.36-1.
This file is owned by root:root, with mode 0o755.
The actual contents of the file can be viewed below.
#!/usr/bin/perl -s
# PODNAME: tmx-tokenize
# ABSTRACT: Tokenizes translation units on a tmx file.
use strict;
use warnings;
our $o;
eval { require FL3 };
die "This XML::TMX script requires Lingua::FreeLing3 to be installed\n" if $@;
FL3->import();
use XML::TMX::Reader '0.25';
use File::Basename qw(fileparse);

# Input TMX file: the first positional argument. Switches such as -o=FILE
# have already been removed from @ARGV by perl's -s option (see shebang).
my $file = shift or die "You must supply the name of the file to tokenize";
my $reader = XML::TMX::Reader->new($file);

# Default output name: the input's file name prefixed with "t_".
# Only the file-name component is prefixed, so "dir/file.tmx" becomes
# "dir/t_file.tmx" rather than "t_dir/file.tmx" (a path inside a directory
# that normally does not exist). A bare "file.tmx" still yields
# "t_file.tmx", exactly as before.
my ($basename, $dirname) = fileparse($file);
my $output = $file eq $basename
    ? "t_$basename"
    : "${dirname}t_$basename";

# An explicit -o=FILE given on the command line overrides the default.
$output = $o if $o;

binmode STDOUT, ":utf8";
# Run the callback below on every translation unit of the input file.
# The callback replaces the segment text of each supported language with
# its tokenized form (tokens joined by single spaces) and returns the
# modified unit.
$reader->for_tu( {
        -output => $output,
        -prop   => { tokenized => "true" },
        # NOTE(review): unlike the other options this key has no leading
        # dash -- confirm that XML::TMX::Reader::for_tu honours this spelling.
        verbose => 1
    },
    sub {
        my $tu = shift;

        for my $lang (keys %$tu) {
            # Only the primary language subtag decides which tokenizer is
            # used: "pt", "pt-BR" and "PT_br" select Portuguese, but "ca-ES"
            # (Catalan) must not be mistaken for Spanish just because the
            # region code contains "es". Keys that are not language codes
            # are skipped as well.
            next unless $lang =~ /\A(pt|es|it|ru|en|gl)(?:[-_]|\z)/i;
            my $ln = lc $1;

            # Leave missing or whitespace-only segments untouched.
            my $txt = $tu->{$lang}{-seg};
            next if !defined $txt || $txt =~ /^\s*$/;

            # tokenizer() is imported from FL3; with to_text => 1 the
            # result is dereferenced as a list and joined with spaces.
            $tu->{$lang}{-seg} = join(" ",
                @{ tokenizer($ln)->tokenize($txt, to_text => 1) });
        }

        return $tu;
    });
__END__
=pod
=encoding UTF-8
=head1 NAME
tmx-tokenize - Tokenizes translation units on a tmx file.
=head1 VERSION
version 0.36
=head1 SYNOPSIS
tmx-tokenize file.tmx # creates t_file.tmx
tmx-tokenize -o=out.tmx file.tmx
=head1 DESCRIPTION
Although this script is bundled in C<XML::TMX>, it has a soft
dependency on C<Lingua::FreeLing3>. Soft means that the dependency is
not ensured at install time, and other features of the module can
still be used without C<Lingua::FreeLing3>. Nevertheless, if you want
to use this tool you should install that module.
At the moment the supported languages are those handled through
FreeLing3: English, Spanish, Russian, Portuguese, Italian and Galician.
If your TMX file includes any other languages, their segments will be
kept unchanged. This behavior may change in the future, as a basic
regexp-based tokenizer might be implemented.
=head1 SEE ALSO
XML::TMX, Lingua::FreeLing3
=head1 AUTHORS
=over 4
=item *
Alberto Simões <ambs@cpan.org>
=item *
José João Almeida <jj@di.uminho.pt>
=back
=head1 COPYRIGHT AND LICENSE
This software is copyright (c) 2010-2017 by Projeto Natura <natura@di.uminho.pt>.
This is free software; you can redistribute it and/or modify it under
the same terms as the Perl 5 programming language system itself.
=cut