root/branches/boomer/extlib/I18N/LangTags.pm @ 1098

Revision 1098, 25.0 kB (checked in by hachi, 2 years ago)

Branching for boomer from release-19, rev 62318

Line 
1
2# Time-stamp: "2002-02-02 20:43:03 MST"
3# Sean M. Burke <sburke@cpan.org>
4
5require 5.000;
6package I18N::LangTags;
7use strict;
8use vars qw(@ISA @EXPORT @EXPORT_OK %EXPORT_TAGS $VERSION %Panic);
9require Exporter;
10@ISA = qw(Exporter);
11@EXPORT = qw();
12@EXPORT_OK = qw(is_language_tag same_language_tag
13                extract_language_tags super_languages
14                similarity_language_tag is_dialect_of
15                locale2language_tag alternate_language_tags
16                encode_language_tag panic_languages
17               );
18%EXPORT_TAGS = ('ALL' => \@EXPORT_OK);
19
20$VERSION = "0.27";
21
22=head1 NAME
23
24I18N::LangTags - functions for dealing with RFC3066-style language tags
25
26=head1 SYNOPSIS
27
28    use I18N::LangTags qw(is_language_tag same_language_tag
29                          extract_language_tags super_languages
30                          similarity_language_tag is_dialect_of
31                          locale2language_tag alternate_language_tags
32                          encode_language_tag panic_languages
33                         );
34
35...or whatever of those functions you want to import.  Those are
36all the exportable functions -- you're free to import only some,
37or none at all.  By default, none are imported.  If you say:
38
39    use I18N::LangTags qw(:ALL)
40
41...then all are exported.  (This saves you from having to use
42something less obvious like C<use I18N::LangTags qw(/./)>.)
43
44If you don't import any of these functions, assume a C<&I18N::LangTags::>
45in front of all the function names in the following examples.
46
47=head1 DESCRIPTION
48
49Language tags are a formalism, described in RFC 3066 (obsoleting
501766), for declaring what language form (language and possibly
51dialect) a given chunk of information is in.
52
53This library provides functions for common tasks involving language
54tags as they are needed in a variety of protocols and applications.
55
56Please see the "See Also" references for a thorough explanation
57of how to correctly use language tags.
58
59=over
60
61=cut
62
63###########################################################################
64
65=item * the function is_language_tag($lang1)
66
67Returns true iff $lang1 is a formally valid language tag.
68
69   is_language_tag("fr")            is TRUE
70   is_language_tag("x-jicarilla")   is FALSE
71       (Subtags can be 8 chars long at most -- 'jicarilla' is 9)
72
73   is_language_tag("sgn-US")    is TRUE
74       (That's American Sign Language)
75
76   is_language_tag("i-Klikitat")    is TRUE
77       (True without regard to the fact noone has actually
78        registered Klikitat -- it's a formally valid tag)
79
80   is_language_tag("fr-patois")     is TRUE
81       (Formally valid -- altho descriptively weak!)
82
83   is_language_tag("Spanish")       is FALSE
84   is_language_tag("french-patois") is FALSE
85       (No good -- first subtag has to match
86        /^([xXiI]|[a-zA-Z]{2,3})$/ -- see RFC3066)
87
88   is_language_tag("x-borg-prot2532") is TRUE
89       (Yes, subtags can contain digits, as of RFC3066)
90
91=cut
92
93sub is_language_tag {
94
95  ## Changes in the language tagging standards may have to be reflected here.
96
97  my($tag) = lc($_[0]);
98
99  return 0 if $tag eq "i" or $tag eq "x";
100  # Bad degenerate cases that the following
101  #  regexp would erroneously let pass
102
103  return $tag =~ 
104    /^(?:  # First subtag
105         [xi] | [a-z]{2,3}
106      )
107      (?:  # Subtags thereafter
108         -           # separator
109         [a-z0-9]{1,8}  # subtag 
110      )*
111    $/xs ? 1 : 0;
112}
113
114###########################################################################
115
116=item * the function extract_language_tags($whatever)
117
118Returns a list of whatever looks like formally valid language tags
119in $whatever.  Not very smart, so don't get too creative with
120what you want to feed it.
121
122  extract_language_tags("fr, fr-ca, i-mingo")
123    returns:   ('fr', 'fr-ca', 'i-mingo')
124
125  extract_language_tags("It's like this: I'm in fr -- French!")
126    returns:   ('It', 'in', 'fr')
127  (So don't just feed it any old thing.)
128
129The output is untainted.  If you don't know what tainting is,
130don't worry about it.
131
132=cut
133
134sub extract_language_tags {
135
136  ## Changes in the language tagging standards may have to be reflected here.
137
138  my($text) =
139    $_[0] =~ m/(.+)/  # to make for an untainted result
140    ? $1 : ''
141  ;
142 
143  return grep(!m/^[ixIX]$/s, # 'i' and 'x' aren't good tags
144    $text =~ 
145    m/
146      \b
147      (?:  # First subtag
148         [iIxX] | [a-zA-Z]{2,3}
149      )
150      (?:  # Subtags thereafter
151         -           # separator
152         [a-zA-Z0-9]{1,8}  # subtag 
153      )*
154      \b
155    /xsg
156  );
157}
158
159###########################################################################
160
161=item * the function same_language_tag($lang1, $lang2)
162
163Returns true iff $lang1 and $lang2 are acceptable variant tags
164representing the same language-form.
165
166   same_language_tag('x-kadara', 'i-kadara')  is TRUE
167      (The x/i- alternation doesn't matter)
168   same_language_tag('X-KADARA', 'i-kadara')  is TRUE
169      (...and neither does case)
170   same_language_tag('en',       'en-US')     is FALSE
171      (all-English is not the SAME as US English)
172   same_language_tag('x-kadara', 'x-kadar')   is FALSE
173      (these are totally unrelated tags)
174   same_language_tag('no-bok',    'nb')       is TRUE
175      (no-bok is a legacy tag for nb (Norwegian Bokmal))
176
177C<same_language_tag> works by just seeing whether
178C<encode_language_tag($lang1)> is the same as
179C<encode_language_tag($lang2)>.
180
181(Yes, I know this function is named a bit oddly.  Call it historic
182reasons.)
183
184=cut
185
186sub same_language_tag {
187  my $el1 = &encode_language_tag($_[0]);
188  return 0 unless defined $el1;
189   # this avoids the problem of
190   # encode_language_tag($lang1) eq and encode_language_tag($lang2)
191   # being true if $lang1 and $lang2 are both undef
192
193  return $el1 eq &encode_language_tag($_[1]) ? 1 : 0;
194}
195
196###########################################################################
197
198=item * the function similarity_language_tag($lang1, $lang2)
199
200Returns an integer representing the degree of similarity between
201tags $lang1 and $lang2 (the order of which does not matter), where
202similarity is the number of common elements on the left,
203without regard to case and to x/i- alternation.
204
205   similarity_language_tag('fr', 'fr-ca')           is 1
206      (one element in common)
207   similarity_language_tag('fr-ca', 'fr-FR')        is 1
208      (one element in common)
209
210   similarity_language_tag('fr-CA-joual',
211                           'fr-CA-PEI')             is 2
212   similarity_language_tag('fr-CA-joual', 'fr-CA')  is 2
213      (two elements in common)
214
215   similarity_language_tag('x-kadara', 'i-kadara')  is 1
216      (x/i- doesn't matter)
217
218   similarity_language_tag('en',       'x-kadar')   is 0
219   similarity_language_tag('x-kadara', 'x-kadar')   is 0
220      (unrelated tags -- no similarity)
221
222   similarity_language_tag('i-cree-syllabic',
223                           'i-cherokee-syllabic')   is 0
224      (no B<leftmost> elements in common!)
225
226=cut
227
228sub similarity_language_tag {
229  my $lang1 = &encode_language_tag($_[0]);
230  my $lang2 = &encode_language_tag($_[1]);
231   # And encode_language_tag takes care of the whole
232   #  no-nyn==nn, i-hakka==zh-hakka, etc, things
233   
234  # NB: (i-sil-...)?  (i-sgn-...)?
235
236  return undef if !defined($lang1) and !defined($lang2);
237  return 0 if !defined($lang1) or !defined($lang2);
238
239  my @l1_subtags = split('-', $lang1);
240  my @l2_subtags = split('-', $lang2);
241  my $similarity = 0;
242
243  while(@l1_subtags and @l2_subtags) {
244    if(shift(@l1_subtags) eq shift(@l2_subtags)) {
245      ++$similarity;
246    } else {
247      last;
248    } 
249  }
250  return $similarity;
251}
252
253###########################################################################
254
255=item * the function is_dialect_of($lang1, $lang2)
256
257Returns true iff language tag $lang1 represents a subform of
258language tag $lang2.
259
260B<Get the order right!  It doesn't work the other way around!>
261
262   is_dialect_of('en-US', 'en')            is TRUE
263     (American English IS a dialect of all-English)
264
265   is_dialect_of('fr-CA-joual', 'fr-CA')   is TRUE
266   is_dialect_of('fr-CA-joual', 'fr')      is TRUE
267     (Joual is a dialect of (a dialect of) French)
268
269   is_dialect_of('en', 'en-US')            is FALSE
270     (all-English is a NOT dialect of American English)
271
272   is_dialect_of('fr', 'en-CA')            is FALSE
273
274   is_dialect_of('en',    'en'   )         is TRUE
275   is_dialect_of('en-US', 'en-US')         is TRUE
276     (B<Note:> these are degenerate cases)
277
278   is_dialect_of('i-mingo-tom', 'x-Mingo') is TRUE
279     (the x/i thing doesn't matter, nor does case)
280
281   is_dialect_of('nn', 'no')               is TRUE
282     (because 'nn' (New Norse) is aliased to 'no-nyn',
283      as a special legacy case, and 'no-nyn' is a
284      subform of 'no' (Norwegian))
285
286=cut
287
288sub is_dialect_of {
289
290  my $lang1 = &encode_language_tag($_[0]);
291  my $lang2 = &encode_language_tag($_[1]);
292
293  return undef if !defined($lang1) and !defined($lang2);
294  return 0 if !defined($lang1) or !defined($lang2);
295
296  return 1 if $lang1 eq $lang2;
297  return 0 if length($lang1) < length($lang2);
298
299  $lang1 .= '-';
300  $lang2 .= '-';
301  return
302    (substr($lang1, 0, length($lang2)) eq $lang2) ? 1 : 0;
303}
304
305###########################################################################
306
307=item * the function super_languages($lang1)
308
309Returns a list of language tags that are superordinate tags to $lang1
310-- it gets this by removing subtags from the end of $lang1 until
311nothing (or just "i" or "x") is left.
312
313   super_languages("fr-CA-joual")  is  ("fr-CA", "fr")
314
315   super_languages("en-AU")  is  ("en")
316
317   super_languages("en")  is  empty-list, ()
318
319   super_languages("i-cherokee")  is  empty-list, ()
320    ...not ("i"), which would be illegal as well as pointless.
321
322If $lang1 is not a valid language tag, returns empty-list in
323a list context, undef in a scalar context.
324
325A notable and rather unavoidable problem with this method:
326"x-mingo-tom" has an "x" because the whole tag isn't an
327IANA-registered tag -- but super_languages('x-mingo-tom') is
328('x-mingo') -- which isn't really right, since 'i-mingo' is
329registered.  But this module has no way of knowing that.  (But note
330that same_language_tag('x-mingo', 'i-mingo') is TRUE.)
331
332More importantly, you assume I<at your peril> that superordinates of
333$lang1 are mutually intelligible with $lang1.  Consider this
334carefully.
335
336=cut 
337
338sub super_languages {
339  my $lang1 = $_[0];
340  return() unless defined($lang1) && &is_language_tag($lang1);
341
342  # a hack for those annoying new (2001) tags:
343  $lang1 =~ s/^nb\b/no-bok/i; # yes, backwards
344  $lang1 =~ s/^nn\b/no-nyn/i; # yes, backwards
345  $lang1 =~ s/^[ix](-hakka\b)/zh$1/i; # goes the right way
346   # i-hakka-bork-bjork-bjark => zh-hakka-bork-bjork-bjark
347
348  my @l1_subtags = split('-', $lang1);
349
350  ## Changes in the language tagging standards may have to be reflected here.
351
352  # NB: (i-sil-...)?
353
354  my @supers = ();
355  foreach my $bit (@l1_subtags) {
356    push @supers, 
357      scalar(@supers) ? ($supers[-1] . '-' . $bit) : $bit;
358  }
359  pop @supers if @supers;
360  shift @supers if @supers && $supers[0] =~ m<^[iIxX]$>s;
361  return reverse @supers;
362}
363
364###########################################################################
365
366=item * the function locale2language_tag($locale_identifier)
367
368This takes a locale name (like "en", "en_US", or "en_US.ISO8859-1")
369and maps it to a language tag.  If it's not mappable (as with,
370notably, "C" and "POSIX"), this returns empty-list in a list context,
371or undef in a scalar context.
372
373   locale2language_tag("en") is "en"
374
375   locale2language_tag("en_US") is "en-US"
376
377   locale2language_tag("en_US.ISO8859-1") is "en-US"
378
379   locale2language_tag("C") is undef or ()
380
381   locale2language_tag("POSIX") is undef or ()
382
383   locale2language_tag("POSIX") is undef or ()
384
385I'm not totally sure that locale names map satisfactorily to language
386tags.  Think REAL hard about how you use this.  YOU HAVE BEEN WARNED.
387
388The output is untainted.  If you don't know what tainting is,
389don't worry about it.
390
391=cut 
392
393sub locale2language_tag {
394  my $lang =
395    $_[0] =~ m/(.+)/  # to make for an untainted result
396    ? $1 : ''
397  ;
398
399  return $lang if &is_language_tag($lang); # like "en"
400
401  $lang =~ tr<_><->;  # "en_US" -> en-US
402  $lang =~ s<\.[-_a-zA-Z0-9\.]*><>s;  # "en_US.ISO8859-1" -> en-US
403
404  return $lang if &is_language_tag($lang);
405
406  return;
407}
408
409###########################################################################
410
411=item * the function encode_language_tag($lang1)
412
413This function, if given a language tag, returns an encoding of it such
414that:
415
416* tags representing different languages never get the same encoding.
417
418* tags representing the same language always get the same encoding.
419
420* an encoding of a formally valid language tag always is a string
421value that is defined, has length, and is true if considered as a
422boolean.
423
424Note that the encoding itself is B<not> a formally valid language tag.
425Note also that you cannot, currently, go from an encoding back to a
426language tag that it's an encoding of.
427
428Note also that you B<must> consider the encoded value as atomic; i.e.,
429you should not consider it as anything but an opaque, unanalysable
430string value.  (The internals of the encoding method may change in
431future versions, as the language tagging standard changes over time.)
432
433C<encode_language_tag> returns undef if given anything other than a
434formally valid language tag.
435
436The reason C<encode_language_tag> exists is because different language
437tags may represent the same language; this is normally treatable with
438C<same_language_tag>, but consider this situation:
439
440You have a data file that expresses greetings in different languages.
441Its format is "[language tag]=[how to say 'Hello']", like:
442
443          en-US=Hiho
444          fr=Bonjour
445          i-mingo=Hau'
446
447And suppose you write a program that reads that file and then runs as
448a daemon, answering client requests that specify a language tag and
449then expect the string that says how to greet in that language.  So an
450interaction looks like:
451
452          greeting-client asks:    fr
453          greeting-server answers: Bonjour
454
455So far so good.  But suppose the way you're implementing this is:
456
457          my %greetings;
458          die unless open(IN, "<in.dat");
459          while(<IN>) {
460            chomp;
461            next unless /^([^=]+)=(.+)/s;
462            my($lang, $expr) = ($1, $2);
463            $greetings{$lang} = $expr;
464          }
465          close(IN);
466
467at which point %greetings has the contents:
468
469          "en-US"   => "Hiho"
470          "fr"      => "Bonjour"
471          "i-mingo" => "Hau'"
472
473And suppose then that you answer client requests for language $wanted
474by just looking up $greetings{$wanted}.
475
476If the client asks for "fr", that will look up successfully in
477%greetings, to the value "Bonjour".  And if the client asks for
478"i-mingo", that will look up successfully in %greetings, to the value
479"Hau'".
480
481But if the client asks for "i-Mingo" or "x-mingo", or "Fr", then the
482lookup in %greetings fails.  That's the Wrong Thing.
483
484You could instead do lookups on $wanted with:
485
486          use I18N::LangTags qw(same_language_tag);
487          my $repsonse = '';
488          foreach my $l2 (keys %greetings) {
489            if(same_language_tag($wanted, $l2)) {
490              $response = $greetings{$l2};
491              last;
492            }
493          }
494
495But that's rather inefficient.  A better way to do it is to start your
496program with:
497
498          use I18N::LangTags qw(encode_language_tag);
499          my %greetings;
500          die unless open(IN, "<in.dat");
501          while(<IN>) {
502            chomp;
503            next unless /^([^=]+)=(.+)/s;
504            my($lang, $expr) = ($1, $2);
505            $greetings{
506                        encode_language_tag($lang)
507                      } = $expr;
508          }
509          close(IN);
510
511and then just answer client requests for language $wanted by just
512looking up
513
514          $greetings{encode_language_tag($wanted)}
515
516And that does the Right Thing.
517
518=cut
519
520sub encode_language_tag {
521  # Only similarity_language_tag() is allowed to analyse encodings!
522
523  ## Changes in the language tagging standards may have to be reflected here.
524
525  my($tag) = $_[0] || return undef;
526  return undef unless &is_language_tag($tag);
527
528  # For the moment, these legacy variances are few enough that
529  #  we can just handle them here with regexps.
530  $tag =~ s/^iw\b/he/i; # Hebrew
531  $tag =~ s/^in\b/id/i; # Indonesian
532  $tag =~ s/^[ix]-lux\b/lb/i;  # Luxemburger
533  $tag =~ s/^[ix]-navajo\b/nv/i;  # Navajo
534  $tag =~ s/^ji\b/yi/i;  # Yiddish
535  #
536  # These go FROM the simplex to complex form, to get
537  #  similarity-comparison right.  And that's okay, since
538  #  similarity_language_tag is the only thing that
539  #  analyzes our output.
540  $tag =~ s/^[ix]-hakka\b/zh-hakka/i;  # Hakka
541  $tag =~ s/^nb\b/no-bok/i;  # BACKWARDS for Bokmal
542  $tag =~ s/^nn\b/no-nyn/i;  # BACKWARDS for Nynorsk
543
544  $tag =~ s/^[xiXI]-//s;
545   # Just lop off any leading "x/i-"
546
547  return "~" . uc($tag);
548}
549
550#--------------------------------------------------------------------------
551
552=item * the function alternate_language_tags($lang1)
553
554This function, if given a language tag, returns all language tags that
555are alternate forms of this language tag.  (I.e., tags which refer to
556the same language.)  This is meant to handle legacy tags caused by
557the minor changes in language tag standards over the years; and
558the x-/i- alternation is also dealt with.
559
560Note that this function does I<not> try to equate new (and never-used,
561and unusable)
562ISO639-2 three-letter tags to old (and still in use) ISO639-1
563two-letter equivalents -- like "ara" -> "ar" -- because
564"ara" has I<never> been in use as an Internet language tag,
565and RFC 3066 stipulates that it never should be, since a shorter
566tag ("ar") exists.
567
568Examples:
569
570          alternate_language_tags('no-bok')       is ('nb')
571          alternate_language_tags('nb')           is ('no-bok')
572          alternate_language_tags('he')           is ('iw')
573          alternate_language_tags('iw')           is ('he')
574          alternate_language_tags('i-hakka')      is ('zh-hakka', 'x-hakka')
575          alternate_language_tags('zh-hakka')     is ('i-hakka', 'x-hakka')
576          alternate_language_tags('en')           is ()
577          alternate_language_tags('x-mingo-tom')  is ('i-mingo-tom')
578          alternate_language_tags('x-klikitat')   is ('i-klikitat')
579          alternate_language_tags('i-klikitat')   is ('x-klikitat')
580
581This function returns empty-list if given anything other than a formally
582valid language tag.
583
584=cut
585
586my %alt = qw( i x   x i   I X   X I );
587sub alternate_language_tags {
588  my $tag = $_[0];
589  return() unless &is_language_tag($tag);
590
591  my @em; # push 'em real goood!
592
593  # For the moment, these legacy variances are few enough that
594  #  we can just handle them here with regexps.
595 
596  if(     $tag =~ m/^[ix]-hakka\b(.*)/i) {push @em, "zh-hakka$1";
597  } elsif($tag =~ m/^zh-hakka\b(.*)/i) {  push @em, "x-hakka$1", "i-hakka$1";
598
599  } elsif($tag =~ m/^he\b(.*)/i) { push @em, "iw$1";
600  } elsif($tag =~ m/^iw\b(.*)/i) { push @em, "he$1";
601
602  } elsif($tag =~ m/^in\b(.*)/i) { push @em, "id$1";
603  } elsif($tag =~ m/^id\b(.*)/i) { push @em, "in$1";
604
605  } elsif($tag =~ m/^[ix]-lux\b(.*)/i) { push @em, "lb$1";
606  } elsif($tag =~ m/^lb\b(.*)/i) {       push @em, "i-lux$1", "x-lux$1";
607
608  } elsif($tag =~ m/^[ix]-navajo\b(.*)/i) { push @em, "nv$1";
609  } elsif($tag =~ m/^nv\b(.*)/i) {          push @em, "i-navajo$1", "x-navajo$1";
610
611  } elsif($tag =~ m/^yi\b(.*)/i) { push @em, "ji$1";
612  } elsif($tag =~ m/^ji\b(.*)/i) { push @em, "yi$1";
613
614  } elsif($tag =~ m/^nb\b(.*)/i) {     push @em, "no-bok$1";
615  } elsif($tag =~ m/^no-bok\b(.*)/i) { push @em, "nb$1";
616 
617  } elsif($tag =~ m/^nn\b(.*)/i) {     push @em, "no-nyn$1";
618  } elsif($tag =~ m/^no-nyn\b(.*)/i) { push @em, "nn$1";
619  }
620
621  push @em, $alt{$1} . $2 if $tag =~ /^([XIxi])(-.+)/;
622  return @em;
623}
624
625###########################################################################
626
627{
628  # Init %Panic...
629 
630  my @panic = (  # MUST all be lowercase!
631   # Only large ("national") languages make it in this list.
632   #  If you, as a user, are so bizarre that the /only/ language
633   #  you claim to accept is Galician, then no, we won't do you
634   #  the favor of providing Catalan as a panic-fallback for
635   #  you.  Because if I start trying to add "little languages" in
636   #  here, I'll just go crazy.
637
638   # Scandinavian lgs.  All based on opinion and hearsay.
639   'sv' => [qw(nb no da nn)],
640   'da' => [qw(nb no sv nn)], # I guess
641   [qw(no nn nb)], [qw(no nn nb sv da)],
642   'is' => [qw(da sv no nb nn)],
643   'fo' => [qw(da is no nb nn sv)], # I guess
644   
645   # I think this is about the extent of tolerable intelligibility
646   #  among large modern Romance languages.
647   'pt' => [qw(es ca it fr)], # Portuguese, Spanish, Catalan, Italian, French
648   'ca' => [qw(es pt it fr)],
649   'es' => [qw(ca it fr pt)],
650   'it' => [qw(es fr ca pt)],
651   'fr' => [qw(es it ca pt)],
652   
653   # Also assume that speakers of the main Indian languages prefer
654   #  to read/hear Hindi over English
655   [qw(
656     as bn gu kn ks kok ml mni mr ne or pa sa sd te ta ur
657   )] => 'hi',
658    # Assamese, Bengali, Gujarati, [Hindi,] Kannada (Kanarese), Kashmiri,
659    # Konkani, Malayalam, Meithei (Manipuri), Marathi, Nepali, Oriya,
660    # Punjabi, Sanskrit, Sindhi, Telugu, Tamil, and Urdu.
661   'hi' => [qw(bn pa as or)],
662   # I welcome finer data for the other Indian languages.
663   #  E.g., what should Oriya's list be, besides just Hindi?
664   
665   # And the panic languages for English is, of course, nil!
666
667   # My guesses at Slavic intelligibility:
668   ([qw(ru be uk)]) x 2,  # Russian, Belarusian, Ukranian
669   'sr' => 'hr', 'hr' => 'sr', # Serb + Croat
670   'cs' => 'sk', 'sk' => 'cs', # Czech + Slovak
671
672   'ms' => 'id', 'id' => 'ms', # Malay + Indonesian
673
674   'et' => 'fi', 'fi' => 'et', # Estonian + Finnish
675
676   #?? 'lo' => 'th', 'th' => 'lo', # Lao + Thai
677
678  );
679  my($k,$v);
680  while(@panic) {
681    ($k,$v) = splice(@panic,0,2);
682    foreach my $k (ref($k) ? @$k : $k) {
683      foreach my $v (ref($v) ? @$v : $v) {
684        push @{$Panic{$k} ||= []}, $v unless $k eq $v;
685      }
686    }
687  }
688}
689
690=item * the function @langs = panic_languages(@accept_languages)
691
692This function takes a list of 0 or more language
693tags that constitute a given user's Accept-Language list, and
694returns a list of tags for I<other> (non-super)
695languages that are probably acceptable to the user, to be
696used I<if all else fails>.
697
698For example, if a user accepts only 'ca' (Catalan) and
699'es' (Spanish), and the documents/interfaces you have
700available are just in German, Italian, and Chinese, then
701the user will most likely want the Italian one (and not
702the Chinese or German one!), instead of getting
703nothing.  So C<panic_languages('ca', 'es')> returns
704a list containing 'it' (Italian).
705
706English ('en') is I<always> in the return list, but
707whether it's at the very end or not depends
708on the input languages.  This function works by consulting
709an internal table that stipulates what common
710languages are "close" to each other.
711
712A useful construct you might consider using is:
713
714  @fallbacks = super_languages(@accept_languages);
715  push @fallbacks, panic_languages(
716    @accept_languages, @fallbacks,
717  );
718
719=cut
720
721sub panic_languages {
722  # When in panic or in doubt, run in circles, scream, and shout!
723  my(@out, %seen);
724  foreach my $t (@_) {
725    next unless $t;
726    next if $seen{$t}++; # so we don't return it or hit it again
727    # push @out, super_languages($t); # nah, keep that separate
728    push @out, @{ $Panic{lc $t} || next };
729  }
730  return grep !$seen{$_}++,  @out, 'en';
731}
732
733###########################################################################
7341;
735__END__
736
737=back
738
739=head1 ABOUT LOWERCASING
740
741I've considered making all the above functions that output language
742tags return all those tags strictly in lowercase.  Having all your
743language tags in lowercase does make some things easier.  But you
744might as well just lowercase as you like, or call
745C<encode_language_tag($lang1)> where appropriate.
746
747=head1 ABOUT UNICODE PLAINTEXT LANGUAGE TAGS
748
749In some future version of I18N::LangTags, I plan to include support
750for RFC2482-style language tags -- which are basically just normal
751language tags with their ASCII characters shifted into Plane 14.
752
753=head1 SEE ALSO
754
755* L<I18N::LangTags::List|I18N::LangTags::List>
756
757* RFC 3066, C<ftp://ftp.isi.edu/in-notes/rfc3066.txt>, "Tags for the
758Identification of Languages".  (Obsoletes RFC 1766)
759
760* RFC 2277, C<ftp://ftp.isi.edu/in-notes/rfc2277.txt>, "IETF Policy on
761Character Sets and Languages".
762
763* RFC 2231, C<ftp://ftp.isi.edu/in-notes/rfc2231.txt>, "MIME Parameter
764Value and Encoded Word Extensions: Character Sets, Languages, and
765Continuations".
766
767* RFC 2482, C<ftp://ftp.isi.edu/in-notes/rfc2482.txt>,
768"Language Tagging in Unicode Plain Text".
769
770* Locale::Codes, in
771C<http://www.perl.com/CPAN/modules/by-module/Locale/>
772
773* ISO 639, "Code for the representation of names of languages",
774C<http://www.indigo.ie/egt/standards/iso639/iso639-1-en.html>
775
776* ISO 639-2, "Codes for the representation of names of languages",
777including three-letter codes,
778C<http://lcweb.loc.gov/standards/iso639-2/bibcodes.html>
779
780* The IANA list of registered languages (hopefully up-to-date),
781C<ftp://ftp.isi.edu/in-notes/iana/assignments/languages/>
782
783=head1 COPYRIGHT
784
785Copyright (c) 1998-2001 Sean M. Burke. All rights reserved.
786
787This library is free software; you can redistribute it and/or
788modify it under the same terms as Perl itself.
789
790The programs and documentation in this dist are distributed in
791the hope that they will be useful, but without any warranty; without
792even the implied warranty of merchantability or fitness for a
793particular purpose.
794
795=head1 AUTHOR
796
797Sean M. Burke C<sburke@cpan.org>
798
799=cut
800
Note: See TracBrowser for help on using the browser.