This file is indexed.

/usr/share/perl5/Text/Unicode/Equivalents.pm is in libtext-unicode-equivalents-perl 0.05-1.

This file is owned by root:root, with mode 0o644.

The actual contents of the file can be viewed below.

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
package Text::Unicode::Equivalents;

use strict;
use warnings;
use utf8;
use Unicode::Normalize qw(NFD getCanon getComposite getCombinClass);
use Unicode::UCD;
use Encode;
use Carp;

require 5.8.0;		# Had some trouble with Unicode character handling in 5.6.

require Exporter;

our @ISA = qw(Exporter);
our @EXPORT_OK = qw( all_strings );

our $VERSION = '0.05';	#   RMH 2011-06-27
#	Changes to Makefile.PL and tests.t to improve portability
#   Added comments about \X being different on 5.10 vs 5.12
# our $VERSION = '0.04';	#   RMH 2011-06-27
#	Perl 5.14 doesn't have unicore/UnicodeData.txt, so changing to use unicode/Decomposition.pl
# our $VERSION = '0.03';	#   RMH 2011-06-24
#   Change module name to Text::Unicode::Equivalents -- more acceptable to CPAN
#   Eliminate all but one public function, which is renamed all_strings()
#   Previous version didn't synthesize singletons
#   Eliminate hard-coding of %nonStarterComposites
#   Eliminate $ignoreSingletons parameter -- not very useful and implementation was squirrely anyay
# our $VERSION = '0.02';	#   RMH 2004-11-08
#   Added equivalents()
#   Rewrote permuteCompositeChar() so composes medial sequences as well as initial
#   As a result, it now composes 0308+0301 -> 0344
# our $VERSION = '0.01';	#   RMH 2003-05-02     Original

=head1 NAME

Text::Unicode::Equivalents - synthesize canonically equivalent strings

=head1 SYNOPSIS

	use Text::Unicode::Equivalents qw( all_strings);
	
	$aref = all_strings ($string);
	map {print "$_\n"} @{$aref};
	
=head1 DESCRIPTION

=cut

# The two things I can't seem to make the Unicode module do are to (1) compose two diacritics, e.g.,
# <0308+0301> => 0344 (Unicode calls such decompositions "non-starters" and won't compose them) and
# (2) *compose* a singleton.  So I use unicore/Decomposition.pl to generate two hashes:

my %sSingletonCompositions;	# keyed by single character string; returns its singleton composite, as a string.
my %cpNonStarterComposites;	# keyed by two-character string that has a non-starter composition; returns codepoint of the composite.

=over

=item all_string($s)

Given an arbitrary string, C<all_strings()> 
returns a reference to an unsorted array of all unique strings that are canonically
equivalent to the argument. 

=cut

sub all_strings
{
	my ($s, $trace) = @_;
	my $i;
	
	# If string starts with combining mark, prefix space so we get a proper cluster:
	my $spaceAdded;
	if ($s =~ /^\pM/)
	{
		$s = ' ' . $s ;
		$spaceAdded = 1;
	}
	
	# Split string into Extended Grapheme Clusters 
	
	# NB:
	# on Perl prior to v5.12, \X matches Unicode "combining character sequence", equivalent to (?>\PM\pM*)
	# on Perl v5.12 and later, \X matches Unicode "eXtended grapheme cluster"
	# Thus \X matches combining hangul jamo sequence such as "\x{1100}\x{1161}\x{11a8}" on 12.0, but not 10.1
	
	my @clusters = ($s =~ m/(\X)/g);	
	
	# Generate all canonically equivalent permutation of each cluster:
	for $i (0 .. $#clusters)
	{
		$clusters[$i] = _permute_cluster ($clusters[$i], $trace);
		# Note: result is a reference to an array!
	}
	
	# Now rebuild all possible combinations of the clusters:
	my $res = _generator (\@clusters);
	if ($spaceAdded)
	{
		# Need to remove that leading space from each:
		foreach $i (0 .. $#{$res})
		{
			$res->[$i] =~ s/^ //o;
		}
	}
	if ($trace)
	{
		map { printMessage ($_) } @$res;
	}
	
	return $res;
}

# Given a reference to a list of arrays of strings, C<generator()> returns reference to an unsorted list 
# of all strings that can be generated by concatenating together one string from each array in the list.

sub _generator
{
	my ($a,				# Initial parameter
	    $res, $i, $s	# Parameters used in recursion
	    ) = @_;
	unless ($res)
	{
		$res = {};
		$i = 0;
		$s = '';
	}
	if ($i > $#{$a})
	{
		$res->{$s} = 1;
	}
	else
	{
		foreach (@{$a->[$i]})
		{
			_generator ($a, $res, $i+1, $s . $_);
		}
	}
	return if $i > 1;
	return [ keys %{$res} ];
}	


# Given an L<Extended Grapheme Cluster|http://unicode.org/glossary/#extended_grapheme_cluster>
# (EGC) C<_permute_cluster()> returns a reference to an unsorted array of all unique strings that
# are canonically equivalent to the EGC
#
# returns undef if the parameter is not an EGC, i.e. does not match C</^\X$/>.

# Implemented by brute force evaluation of all permutations so isn't too clever.
# Could be made more efficient, but since EGCs are short the inefficiency isn't huge.

sub _permute_cluster {
	my ($s, $trace) = @_;
	
	# make sure argument is an EGC
	return undef unless $s =~ /^\X$/;
	
	# retrieve required data from UnicodeData.txt
	_getCompositions() unless %cpNonStarterComposites;
	
	my %res;	# Place to keep result strings (as keys so we eliminate duplicates)

	# compute and save NFD of original -- we'll use it to tell whether a candidate
	# is canonically equivalent to the original.
	my $origNFD = NFD($s);
	
	# Start with fully decomposed string:
	$s = $origNFD;
	if (length($s) == 1)
	{
		# we can short-circuit the computation if the length of the decomposed string == 1
		if (exists $sSingletonCompositions{$s})
		{	return [ $s, $sSingletonCompositions{$s} ];	}
		else
		{	return [ $s ]; }
	}

	# pick up the base character 
	my $base = substr($s, 0, 1);

	# Now calculate all permutations of everything else. We'll figure out whether a given
	# permutation is canonically equivalent to the original in a minute.
	
	my %strList;
	map { $strList{$base . $_ } = 1} @{_permute(substr($s,1))};

	# Try every one of the generated permutations of marks:
	foreach $s (keys %strList)
	{
		next if NFD($s) ne $origNFD;	# Not equivalent to original -- ignore it.
		next if exists $res{$s};		# Already seen this sequence

		# Now the fun! Generate every possible sequence from $s by composing pairs and singletons:
		
		my @work = ( [$s, 0] );
		while ($#work >= 0)
		{
			my ($s, $i) = @{pop @work};
			printMessage ('POP:', $s, $i, length($s)) if $trace;
			if ($i >= length ($s))
			{
				# We've worked our way to the end of the string. At this point we have
				# some combination of composition and decomposition that should be equivalent
				# to the original!
				$res{$s} = 1;	# Here's a keeper!
			}
			else
			{
				push @work, [ $s, $i+1 ];
				if (exists $sSingletonCompositions{substr($s, $i, 1)})
				{
					# recompose this singleton and save result for work
					printMessage("SING at $i:", substr($s, $i, 1), ' -> ', $sSingletonCompositions{substr($s, $i, 1)}) if $trace;
					my $s2 = $s;
					substr($s2, $i, 1) = $sSingletonCompositions{substr($s, $i, 1)};
					push @work, [ $s2, $i ];
				}
				while ($i+1 < length($s))
				{
					# Try to combine two chars:
					my $s2 = substr($s, $i, 2);
					my ($u1, $u2) = unpack ( 'UU', $s2);
					my $u = getComposite($u1, $u2) || $cpNonStarterComposites{$s2};
					printMessage ("COMP at $i:", sprintf('%04X',$u1), sprintf('%04X',$u2), '->', defined $u ? sprintf('%04X',$u) : 'undef') if $trace;
					last unless defined $u;
					my $c = pack('U', $u);
					substr($s, $i, 2) = $c;
					push @work, [$s, $i+1];
					if (exists $sSingletonCompositions{$c})
					{
						printMessage("SING at $i:", $c, '->', $sSingletonCompositions{$c}) if $trace;
						my $s2 = $s;
						substr($s2, $i, 1) = $sSingletonCompositions{$c};
						push @work, [$s2, $i+1];
					}
				}
			}
		}
	}

	# All done. Return the results
	[ keys(%res) ]
}

# I'm not happy with this hack. unicore/Decomposition.pl explicitly says the code is for internal use
# only, but I don't know any other reasonably efficient way to construct lists of Unicode compositions
# other than including my own copy of, for example UnicodeData.txt, but then I couldn't guarantee that
# my copy was in sync with the local Perl installation.  Oh well.

sub _getCompositions {
	# Next few lines stolen shamelessly from Unicode::UCD
    for (split /^/m, do "unicore/Decomposition.pl") {
        my ($start, $end, $decomp) = / ^ (.+?) \t (.*?) \t (.+?)
                                        \s* ( \# .* )?  # Optional comment
                                        $ /x;
        $end = $start if $end eq "";
    
    	if ($decomp =~ /^([[:xdigit:]]{4,6})$/o) {
			# Singleton decomposition -- keep a record of these:
			my $d = $1;
			foreach my $c (hex($start) .. hex($end)) {
				$sSingletonCompositions{pack('U', hex($d))} = pack('U', $c);	# NB: hash values are strings
			}
		}
		elsif ($decomp =~ /^([[:xdigit:]]{4,6})\s+([[:xdigit:]]{4,6})$/o) {
			# Possible non-starter decompsition
			my ($d1, $d2) = map{hex} ($1, $2);
			foreach my $c (hex($start) .. hex($end)) {
				$cpNonStarterComposites{pack('UU', $d1, $d2)} = $c if getCombinClass($c) || getCombinClass($d1);  # NB: hash values are codepoints
			}
		}
	}
}


# Given a string, return a reference to an unsorted array containing 
# all permutations of the string. Does not filter out duplicates which
# can result if one or more chars of the string are the same.

# adaptation of the array permutation algorithm in FAQ 4
#(see "How do I permute N elements of a list?")
#
# I tried to making $list a hash rather than an array so as to eliminate duplicates,
# but Perl 5.6.1 had trouble figuring out that some strings were in fact
# UTF-8, so some data got munged. A hash would probably work on 5.8.

sub _permute {
    my ($src,			# initial parameter
    	$res , $list	# Parameters used in recursion 
    	) = @_;
    unless ($list)
    {
    	$list = {};
    	$res = '';
    }
    unless ($src) {
        $list->{$res} = 1;
    } else {
        my($newsrc,$newres,$i);
        foreach $i (0 .. length($src)-1) {
            $newsrc = $src;
            $newres = $res . substr($newsrc, $i, 1, "");
            _permute($newsrc, $newres, $list);
        }
    }
	# All done. Return the results
	return [ keys %{$list} ];
}

sub printMessage
{
	my $s = join(' ', @_);
	print STDERR encode('ascii', $s, Encode::FB_PERLQQ) . "\n";
}

1;

=back

=head1 BUGS

Uses L<Unicode::Normalize>. On some systems (e.g. ActiveState 5.6.1) Unicode::Normalize is aware 
only of Unicode 3.0 and thus de/compositions introduced since Unicode 3.0 will not be used.

=head1 AUTHOR

Bob Hallissy

=head1 COPYRIGHT

  Copyright(C) 2003-2011, SIL International. 

  This package is published under the terms of the Perl Artistic License.