| 1 | |
|---|
| 2 | # Time-stamp: "2002-02-02 20:43:03 MST" |
|---|
| 3 | # Sean M. Burke <sburke@cpan.org> |
|---|
| 4 | |
|---|
| 5 | require 5.000; |
|---|
| 6 | package I18N::LangTags; |
|---|
| 7 | use strict; |
|---|
| 8 | use vars qw(@ISA @EXPORT @EXPORT_OK %EXPORT_TAGS $VERSION %Panic); |
|---|
| 9 | require Exporter; |
|---|
| 10 | @ISA = qw(Exporter); |
|---|
| 11 | @EXPORT = qw(); |
|---|
| 12 | @EXPORT_OK = qw(is_language_tag same_language_tag |
|---|
| 13 | extract_language_tags super_languages |
|---|
| 14 | similarity_language_tag is_dialect_of |
|---|
| 15 | locale2language_tag alternate_language_tags |
|---|
| 16 | encode_language_tag panic_languages |
|---|
| 17 | ); |
|---|
| 18 | %EXPORT_TAGS = ('ALL' => \@EXPORT_OK); |
|---|
| 19 | |
|---|
| 20 | $VERSION = "0.27"; |
|---|
| 21 | |
|---|
| 22 | =head1 NAME |
|---|
| 23 | |
|---|
| 24 | I18N::LangTags - functions for dealing with RFC3066-style language tags |
|---|
| 25 | |
|---|
| 26 | =head1 SYNOPSIS |
|---|
| 27 | |
|---|
| 28 | use I18N::LangTags qw(is_language_tag same_language_tag |
|---|
| 29 | extract_language_tags super_languages |
|---|
| 30 | similarity_language_tag is_dialect_of |
|---|
| 31 | locale2language_tag alternate_language_tags |
|---|
| 32 | encode_language_tag panic_languages |
|---|
| 33 | ); |
|---|
| 34 | |
|---|
| 35 | ...or whatever of those functions you want to import. Those are |
|---|
| 36 | all the exportable functions -- you're free to import only some, |
|---|
| 37 | or none at all. By default, none are imported. If you say: |
|---|
| 38 | |
|---|
| 39 | use I18N::LangTags qw(:ALL) |
|---|
| 40 | |
|---|
| 41 | ...then all are exported. (This saves you from having to use |
|---|
| 42 | something less obvious like C<use I18N::LangTags qw(/./)>.) |
|---|
| 43 | |
|---|
| 44 | If you don't import any of these functions, assume a C<&I18N::LangTags::> |
|---|
| 45 | in front of all the function names in the following examples. |
|---|
| 46 | |
|---|
| 47 | =head1 DESCRIPTION |
|---|
| 48 | |
|---|
| 49 | Language tags are a formalism, described in RFC 3066 (obsoleting |
|---|
| 50 | 1766), for declaring what language form (language and possibly |
|---|
| 51 | dialect) a given chunk of information is in. |
|---|
| 52 | |
|---|
| 53 | This library provides functions for common tasks involving language |
|---|
| 54 | tags as they are needed in a variety of protocols and applications. |
|---|
| 55 | |
|---|
| 56 | Please see the "See Also" references for a thorough explanation |
|---|
| 57 | of how to correctly use language tags. |
|---|
| 58 | |
|---|
| 59 | =over |
|---|
| 60 | |
|---|
| 61 | =cut |
|---|
| 62 | |
|---|
| 63 | ########################################################################### |
|---|
| 64 | |
|---|
| 65 | =item * the function is_language_tag($lang1) |
|---|
| 66 | |
|---|
| 67 | Returns true iff $lang1 is a formally valid language tag. |
|---|
| 68 | |
|---|
| 69 | is_language_tag("fr") is TRUE |
|---|
| 70 | is_language_tag("x-jicarilla") is FALSE |
|---|
| 71 | (Subtags can be 8 chars long at most -- 'jicarilla' is 9) |
|---|
| 72 | |
|---|
| 73 | is_language_tag("sgn-US") is TRUE |
|---|
| 74 | (That's American Sign Language) |
|---|
| 75 | |
|---|
| 76 | is_language_tag("i-Klikitat") is TRUE |
|---|
| 77 | (True without regard to the fact noone has actually |
|---|
| 78 | registered Klikitat -- it's a formally valid tag) |
|---|
| 79 | |
|---|
| 80 | is_language_tag("fr-patois") is TRUE |
|---|
| 81 | (Formally valid -- altho descriptively weak!) |
|---|
| 82 | |
|---|
| 83 | is_language_tag("Spanish") is FALSE |
|---|
| 84 | is_language_tag("french-patois") is FALSE |
|---|
| 85 | (No good -- first subtag has to match |
|---|
| 86 | /^([xXiI]|[a-zA-Z]{2,3})$/ -- see RFC3066) |
|---|
| 87 | |
|---|
| 88 | is_language_tag("x-borg-prot2532") is TRUE |
|---|
| 89 | (Yes, subtags can contain digits, as of RFC3066) |
|---|
| 90 | |
|---|
| 91 | =cut |
|---|
| 92 | |
|---|
| 93 | sub is_language_tag { |
|---|
| 94 | |
|---|
| 95 | ## Changes in the language tagging standards may have to be reflected here. |
|---|
| 96 | |
|---|
| 97 | my($tag) = lc($_[0]); |
|---|
| 98 | |
|---|
| 99 | return 0 if $tag eq "i" or $tag eq "x"; |
|---|
| 100 | # Bad degenerate cases that the following |
|---|
| 101 | # regexp would erroneously let pass |
|---|
| 102 | |
|---|
| 103 | return $tag =~ |
|---|
| 104 | /^(?: # First subtag |
|---|
| 105 | [xi] | [a-z]{2,3} |
|---|
| 106 | ) |
|---|
| 107 | (?: # Subtags thereafter |
|---|
| 108 | - # separator |
|---|
| 109 | [a-z0-9]{1,8} # subtag |
|---|
| 110 | )* |
|---|
| 111 | $/xs ? 1 : 0; |
|---|
| 112 | } |
|---|
| 113 | |
|---|
| 114 | ########################################################################### |
|---|
| 115 | |
|---|
| 116 | =item * the function extract_language_tags($whatever) |
|---|
| 117 | |
|---|
| 118 | Returns a list of whatever looks like formally valid language tags |
|---|
| 119 | in $whatever. Not very smart, so don't get too creative with |
|---|
| 120 | what you want to feed it. |
|---|
| 121 | |
|---|
| 122 | extract_language_tags("fr, fr-ca, i-mingo") |
|---|
| 123 | returns: ('fr', 'fr-ca', 'i-mingo') |
|---|
| 124 | |
|---|
| 125 | extract_language_tags("It's like this: I'm in fr -- French!") |
|---|
| 126 | returns: ('It', 'in', 'fr') |
|---|
| 127 | (So don't just feed it any old thing.) |
|---|
| 128 | |
|---|
| 129 | The output is untainted. If you don't know what tainting is, |
|---|
| 130 | don't worry about it. |
|---|
| 131 | |
|---|
| 132 | =cut |
|---|
| 133 | |
|---|
| 134 | sub extract_language_tags { |
|---|
| 135 | |
|---|
| 136 | ## Changes in the language tagging standards may have to be reflected here. |
|---|
| 137 | |
|---|
| 138 | my($text) = |
|---|
| 139 | $_[0] =~ m/(.+)/ # to make for an untainted result |
|---|
| 140 | ? $1 : '' |
|---|
| 141 | ; |
|---|
| 142 | |
|---|
| 143 | return grep(!m/^[ixIX]$/s, # 'i' and 'x' aren't good tags |
|---|
| 144 | $text =~ |
|---|
| 145 | m/ |
|---|
| 146 | \b |
|---|
| 147 | (?: # First subtag |
|---|
| 148 | [iIxX] | [a-zA-Z]{2,3} |
|---|
| 149 | ) |
|---|
| 150 | (?: # Subtags thereafter |
|---|
| 151 | - # separator |
|---|
| 152 | [a-zA-Z0-9]{1,8} # subtag |
|---|
| 153 | )* |
|---|
| 154 | \b |
|---|
| 155 | /xsg |
|---|
| 156 | ); |
|---|
| 157 | } |
|---|
| 158 | |
|---|
| 159 | ########################################################################### |
|---|
| 160 | |
|---|
| 161 | =item * the function same_language_tag($lang1, $lang2) |
|---|
| 162 | |
|---|
| 163 | Returns true iff $lang1 and $lang2 are acceptable variant tags |
|---|
| 164 | representing the same language-form. |
|---|
| 165 | |
|---|
| 166 | same_language_tag('x-kadara', 'i-kadara') is TRUE |
|---|
| 167 | (The x/i- alternation doesn't matter) |
|---|
| 168 | same_language_tag('X-KADARA', 'i-kadara') is TRUE |
|---|
| 169 | (...and neither does case) |
|---|
| 170 | same_language_tag('en', 'en-US') is FALSE |
|---|
| 171 | (all-English is not the SAME as US English) |
|---|
| 172 | same_language_tag('x-kadara', 'x-kadar') is FALSE |
|---|
| 173 | (these are totally unrelated tags) |
|---|
| 174 | same_language_tag('no-bok', 'nb') is TRUE |
|---|
| 175 | (no-bok is a legacy tag for nb (Norwegian Bokmal)) |
|---|
| 176 | |
|---|
| 177 | C<same_language_tag> works by just seeing whether |
|---|
| 178 | C<encode_language_tag($lang1)> is the same as |
|---|
| 179 | C<encode_language_tag($lang2)>. |
|---|
| 180 | |
|---|
| 181 | (Yes, I know this function is named a bit oddly. Call it historic |
|---|
| 182 | reasons.) |
|---|
| 183 | |
|---|
| 184 | =cut |
|---|
| 185 | |
|---|
| 186 | sub same_language_tag { |
|---|
| 187 | my $el1 = &encode_language_tag($_[0]); |
|---|
| 188 | return 0 unless defined $el1; |
|---|
| 189 | # this avoids the problem of |
|---|
| 190 | # encode_language_tag($lang1) eq and encode_language_tag($lang2) |
|---|
| 191 | # being true if $lang1 and $lang2 are both undef |
|---|
| 192 | |
|---|
| 193 | return $el1 eq &encode_language_tag($_[1]) ? 1 : 0; |
|---|
| 194 | } |
|---|
| 195 | |
|---|
| 196 | ########################################################################### |
|---|
| 197 | |
|---|
| 198 | =item * the function similarity_language_tag($lang1, $lang2) |
|---|
| 199 | |
|---|
| 200 | Returns an integer representing the degree of similarity between |
|---|
| 201 | tags $lang1 and $lang2 (the order of which does not matter), where |
|---|
| 202 | similarity is the number of common elements on the left, |
|---|
| 203 | without regard to case and to x/i- alternation. |
|---|
| 204 | |
|---|
| 205 | similarity_language_tag('fr', 'fr-ca') is 1 |
|---|
| 206 | (one element in common) |
|---|
| 207 | similarity_language_tag('fr-ca', 'fr-FR') is 1 |
|---|
| 208 | (one element in common) |
|---|
| 209 | |
|---|
| 210 | similarity_language_tag('fr-CA-joual', |
|---|
| 211 | 'fr-CA-PEI') is 2 |
|---|
| 212 | similarity_language_tag('fr-CA-joual', 'fr-CA') is 2 |
|---|
| 213 | (two elements in common) |
|---|
| 214 | |
|---|
| 215 | similarity_language_tag('x-kadara', 'i-kadara') is 1 |
|---|
| 216 | (x/i- doesn't matter) |
|---|
| 217 | |
|---|
| 218 | similarity_language_tag('en', 'x-kadar') is 0 |
|---|
| 219 | similarity_language_tag('x-kadara', 'x-kadar') is 0 |
|---|
| 220 | (unrelated tags -- no similarity) |
|---|
| 221 | |
|---|
| 222 | similarity_language_tag('i-cree-syllabic', |
|---|
| 223 | 'i-cherokee-syllabic') is 0 |
|---|
| 224 | (no B<leftmost> elements in common!) |
|---|
| 225 | |
|---|
| 226 | =cut |
|---|
| 227 | |
|---|
| 228 | sub similarity_language_tag { |
|---|
| 229 | my $lang1 = &encode_language_tag($_[0]); |
|---|
| 230 | my $lang2 = &encode_language_tag($_[1]); |
|---|
| 231 | # And encode_language_tag takes care of the whole |
|---|
| 232 | # no-nyn==nn, i-hakka==zh-hakka, etc, things |
|---|
| 233 | |
|---|
| 234 | # NB: (i-sil-...)? (i-sgn-...)? |
|---|
| 235 | |
|---|
| 236 | return undef if !defined($lang1) and !defined($lang2); |
|---|
| 237 | return 0 if !defined($lang1) or !defined($lang2); |
|---|
| 238 | |
|---|
| 239 | my @l1_subtags = split('-', $lang1); |
|---|
| 240 | my @l2_subtags = split('-', $lang2); |
|---|
| 241 | my $similarity = 0; |
|---|
| 242 | |
|---|
| 243 | while(@l1_subtags and @l2_subtags) { |
|---|
| 244 | if(shift(@l1_subtags) eq shift(@l2_subtags)) { |
|---|
| 245 | ++$similarity; |
|---|
| 246 | } else { |
|---|
| 247 | last; |
|---|
| 248 | } |
|---|
| 249 | } |
|---|
| 250 | return $similarity; |
|---|
| 251 | } |
|---|
| 252 | |
|---|
| 253 | ########################################################################### |
|---|
| 254 | |
|---|
| 255 | =item * the function is_dialect_of($lang1, $lang2) |
|---|
| 256 | |
|---|
| 257 | Returns true iff language tag $lang1 represents a subform of |
|---|
| 258 | language tag $lang2. |
|---|
| 259 | |
|---|
| 260 | B<Get the order right! It doesn't work the other way around!> |
|---|
| 261 | |
|---|
| 262 | is_dialect_of('en-US', 'en') is TRUE |
|---|
| 263 | (American English IS a dialect of all-English) |
|---|
| 264 | |
|---|
| 265 | is_dialect_of('fr-CA-joual', 'fr-CA') is TRUE |
|---|
| 266 | is_dialect_of('fr-CA-joual', 'fr') is TRUE |
|---|
| 267 | (Joual is a dialect of (a dialect of) French) |
|---|
| 268 | |
|---|
| 269 | is_dialect_of('en', 'en-US') is FALSE |
|---|
| 270 | (all-English is a NOT dialect of American English) |
|---|
| 271 | |
|---|
| 272 | is_dialect_of('fr', 'en-CA') is FALSE |
|---|
| 273 | |
|---|
| 274 | is_dialect_of('en', 'en' ) is TRUE |
|---|
| 275 | is_dialect_of('en-US', 'en-US') is TRUE |
|---|
| 276 | (B<Note:> these are degenerate cases) |
|---|
| 277 | |
|---|
| 278 | is_dialect_of('i-mingo-tom', 'x-Mingo') is TRUE |
|---|
| 279 | (the x/i thing doesn't matter, nor does case) |
|---|
| 280 | |
|---|
| 281 | is_dialect_of('nn', 'no') is TRUE |
|---|
| 282 | (because 'nn' (New Norse) is aliased to 'no-nyn', |
|---|
| 283 | as a special legacy case, and 'no-nyn' is a |
|---|
| 284 | subform of 'no' (Norwegian)) |
|---|
| 285 | |
|---|
| 286 | =cut |
|---|
| 287 | |
|---|
| 288 | sub is_dialect_of { |
|---|
| 289 | |
|---|
| 290 | my $lang1 = &encode_language_tag($_[0]); |
|---|
| 291 | my $lang2 = &encode_language_tag($_[1]); |
|---|
| 292 | |
|---|
| 293 | return undef if !defined($lang1) and !defined($lang2); |
|---|
| 294 | return 0 if !defined($lang1) or !defined($lang2); |
|---|
| 295 | |
|---|
| 296 | return 1 if $lang1 eq $lang2; |
|---|
| 297 | return 0 if length($lang1) < length($lang2); |
|---|
| 298 | |
|---|
| 299 | $lang1 .= '-'; |
|---|
| 300 | $lang2 .= '-'; |
|---|
| 301 | return |
|---|
| 302 | (substr($lang1, 0, length($lang2)) eq $lang2) ? 1 : 0; |
|---|
| 303 | } |
|---|
| 304 | |
|---|
| 305 | ########################################################################### |
|---|
| 306 | |
|---|
| 307 | =item * the function super_languages($lang1) |
|---|
| 308 | |
|---|
| 309 | Returns a list of language tags that are superordinate tags to $lang1 |
|---|
| 310 | -- it gets this by removing subtags from the end of $lang1 until |
|---|
| 311 | nothing (or just "i" or "x") is left. |
|---|
| 312 | |
|---|
| 313 | super_languages("fr-CA-joual") is ("fr-CA", "fr") |
|---|
| 314 | |
|---|
| 315 | super_languages("en-AU") is ("en") |
|---|
| 316 | |
|---|
| 317 | super_languages("en") is empty-list, () |
|---|
| 318 | |
|---|
| 319 | super_languages("i-cherokee") is empty-list, () |
|---|
| 320 | ...not ("i"), which would be illegal as well as pointless. |
|---|
| 321 | |
|---|
| 322 | If $lang1 is not a valid language tag, returns empty-list in |
|---|
| 323 | a list context, undef in a scalar context. |
|---|
| 324 | |
|---|
| 325 | A notable and rather unavoidable problem with this method: |
|---|
| 326 | "x-mingo-tom" has an "x" because the whole tag isn't an |
|---|
| 327 | IANA-registered tag -- but super_languages('x-mingo-tom') is |
|---|
| 328 | ('x-mingo') -- which isn't really right, since 'i-mingo' is |
|---|
| 329 | registered. But this module has no way of knowing that. (But note |
|---|
| 330 | that same_language_tag('x-mingo', 'i-mingo') is TRUE.) |
|---|
| 331 | |
|---|
| 332 | More importantly, you assume I<at your peril> that superordinates of |
|---|
| 333 | $lang1 are mutually intelligible with $lang1. Consider this |
|---|
| 334 | carefully. |
|---|
| 335 | |
|---|
| 336 | =cut |
|---|
| 337 | |
|---|
| 338 | sub super_languages { |
|---|
| 339 | my $lang1 = $_[0]; |
|---|
| 340 | return() unless defined($lang1) && &is_language_tag($lang1); |
|---|
| 341 | |
|---|
| 342 | # a hack for those annoying new (2001) tags: |
|---|
| 343 | $lang1 =~ s/^nb\b/no-bok/i; # yes, backwards |
|---|
| 344 | $lang1 =~ s/^nn\b/no-nyn/i; # yes, backwards |
|---|
| 345 | $lang1 =~ s/^[ix](-hakka\b)/zh$1/i; # goes the right way |
|---|
| 346 | # i-hakka-bork-bjork-bjark => zh-hakka-bork-bjork-bjark |
|---|
| 347 | |
|---|
| 348 | my @l1_subtags = split('-', $lang1); |
|---|
| 349 | |
|---|
| 350 | ## Changes in the language tagging standards may have to be reflected here. |
|---|
| 351 | |
|---|
| 352 | # NB: (i-sil-...)? |
|---|
| 353 | |
|---|
| 354 | my @supers = (); |
|---|
| 355 | foreach my $bit (@l1_subtags) { |
|---|
| 356 | push @supers, |
|---|
| 357 | scalar(@supers) ? ($supers[-1] . '-' . $bit) : $bit; |
|---|
| 358 | } |
|---|
| 359 | pop @supers if @supers; |
|---|
| 360 | shift @supers if @supers && $supers[0] =~ m<^[iIxX]$>s; |
|---|
| 361 | return reverse @supers; |
|---|
| 362 | } |
|---|
| 363 | |
|---|
| 364 | ########################################################################### |
|---|
| 365 | |
|---|
| 366 | =item * the function locale2language_tag($locale_identifier) |
|---|
| 367 | |
|---|
| 368 | This takes a locale name (like "en", "en_US", or "en_US.ISO8859-1") |
|---|
| 369 | and maps it to a language tag. If it's not mappable (as with, |
|---|
| 370 | notably, "C" and "POSIX"), this returns empty-list in a list context, |
|---|
| 371 | or undef in a scalar context. |
|---|
| 372 | |
|---|
| 373 | locale2language_tag("en") is "en" |
|---|
| 374 | |
|---|
| 375 | locale2language_tag("en_US") is "en-US" |
|---|
| 376 | |
|---|
| 377 | locale2language_tag("en_US.ISO8859-1") is "en-US" |
|---|
| 378 | |
|---|
| 379 | locale2language_tag("C") is undef or () |
|---|
| 380 | |
|---|
| 381 | locale2language_tag("POSIX") is undef or () |
|---|
| 382 | |
|---|
| 383 | locale2language_tag("POSIX") is undef or () |
|---|
| 384 | |
|---|
| 385 | I'm not totally sure that locale names map satisfactorily to language |
|---|
| 386 | tags. Think REAL hard about how you use this. YOU HAVE BEEN WARNED. |
|---|
| 387 | |
|---|
| 388 | The output is untainted. If you don't know what tainting is, |
|---|
| 389 | don't worry about it. |
|---|
| 390 | |
|---|
| 391 | =cut |
|---|
| 392 | |
|---|
| 393 | sub locale2language_tag { |
|---|
| 394 | my $lang = |
|---|
| 395 | $_[0] =~ m/(.+)/ # to make for an untainted result |
|---|
| 396 | ? $1 : '' |
|---|
| 397 | ; |
|---|
| 398 | |
|---|
| 399 | return $lang if &is_language_tag($lang); # like "en" |
|---|
| 400 | |
|---|
| 401 | $lang =~ tr<_><->; # "en_US" -> en-US |
|---|
| 402 | $lang =~ s<\.[-_a-zA-Z0-9\.]*><>s; # "en_US.ISO8859-1" -> en-US |
|---|
| 403 | |
|---|
| 404 | return $lang if &is_language_tag($lang); |
|---|
| 405 | |
|---|
| 406 | return; |
|---|
| 407 | } |
|---|
| 408 | |
|---|
| 409 | ########################################################################### |
|---|
| 410 | |
|---|
| 411 | =item * the function encode_language_tag($lang1) |
|---|
| 412 | |
|---|
| 413 | This function, if given a language tag, returns an encoding of it such |
|---|
| 414 | that: |
|---|
| 415 | |
|---|
| 416 | * tags representing different languages never get the same encoding. |
|---|
| 417 | |
|---|
| 418 | * tags representing the same language always get the same encoding. |
|---|
| 419 | |
|---|
| 420 | * an encoding of a formally valid language tag always is a string |
|---|
| 421 | value that is defined, has length, and is true if considered as a |
|---|
| 422 | boolean. |
|---|
| 423 | |
|---|
| 424 | Note that the encoding itself is B<not> a formally valid language tag. |
|---|
| 425 | Note also that you cannot, currently, go from an encoding back to a |
|---|
| 426 | language tag that it's an encoding of. |
|---|
| 427 | |
|---|
| 428 | Note also that you B<must> consider the encoded value as atomic; i.e., |
|---|
| 429 | you should not consider it as anything but an opaque, unanalysable |
|---|
| 430 | string value. (The internals of the encoding method may change in |
|---|
| 431 | future versions, as the language tagging standard changes over time.) |
|---|
| 432 | |
|---|
| 433 | C<encode_language_tag> returns undef if given anything other than a |
|---|
| 434 | formally valid language tag. |
|---|
| 435 | |
|---|
| 436 | The reason C<encode_language_tag> exists is because different language |
|---|
| 437 | tags may represent the same language; this is normally treatable with |
|---|
| 438 | C<same_language_tag>, but consider this situation: |
|---|
| 439 | |
|---|
| 440 | You have a data file that expresses greetings in different languages. |
|---|
| 441 | Its format is "[language tag]=[how to say 'Hello']", like: |
|---|
| 442 | |
|---|
| 443 | en-US=Hiho |
|---|
| 444 | fr=Bonjour |
|---|
| 445 | i-mingo=Hau' |
|---|
| 446 | |
|---|
| 447 | And suppose you write a program that reads that file and then runs as |
|---|
| 448 | a daemon, answering client requests that specify a language tag and |
|---|
| 449 | then expect the string that says how to greet in that language. So an |
|---|
| 450 | interaction looks like: |
|---|
| 451 | |
|---|
| 452 | greeting-client asks: fr |
|---|
| 453 | greeting-server answers: Bonjour |
|---|
| 454 | |
|---|
| 455 | So far so good. But suppose the way you're implementing this is: |
|---|
| 456 | |
|---|
| 457 | my %greetings; |
|---|
| 458 | die unless open(IN, "<in.dat"); |
|---|
| 459 | while(<IN>) { |
|---|
| 460 | chomp; |
|---|
| 461 | next unless /^([^=]+)=(.+)/s; |
|---|
| 462 | my($lang, $expr) = ($1, $2); |
|---|
| 463 | $greetings{$lang} = $expr; |
|---|
| 464 | } |
|---|
| 465 | close(IN); |
|---|
| 466 | |
|---|
| 467 | at which point %greetings has the contents: |
|---|
| 468 | |
|---|
| 469 | "en-US" => "Hiho" |
|---|
| 470 | "fr" => "Bonjour" |
|---|
| 471 | "i-mingo" => "Hau'" |
|---|
| 472 | |
|---|
| 473 | And suppose then that you answer client requests for language $wanted |
|---|
| 474 | by just looking up $greetings{$wanted}. |
|---|
| 475 | |
|---|
| 476 | If the client asks for "fr", that will look up successfully in |
|---|
| 477 | %greetings, to the value "Bonjour". And if the client asks for |
|---|
| 478 | "i-mingo", that will look up successfully in %greetings, to the value |
|---|
| 479 | "Hau'". |
|---|
| 480 | |
|---|
| 481 | But if the client asks for "i-Mingo" or "x-mingo", or "Fr", then the |
|---|
| 482 | lookup in %greetings fails. That's the Wrong Thing. |
|---|
| 483 | |
|---|
| 484 | You could instead do lookups on $wanted with: |
|---|
| 485 | |
|---|
| 486 | use I18N::LangTags qw(same_language_tag); |
|---|
| 487 | my $repsonse = ''; |
|---|
| 488 | foreach my $l2 (keys %greetings) { |
|---|
| 489 | if(same_language_tag($wanted, $l2)) { |
|---|
| 490 | $response = $greetings{$l2}; |
|---|
| 491 | last; |
|---|
| 492 | } |
|---|
| 493 | } |
|---|
| 494 | |
|---|
| 495 | But that's rather inefficient. A better way to do it is to start your |
|---|
| 496 | program with: |
|---|
| 497 | |
|---|
| 498 | use I18N::LangTags qw(encode_language_tag); |
|---|
| 499 | my %greetings; |
|---|
| 500 | die unless open(IN, "<in.dat"); |
|---|
| 501 | while(<IN>) { |
|---|
| 502 | chomp; |
|---|
| 503 | next unless /^([^=]+)=(.+)/s; |
|---|
| 504 | my($lang, $expr) = ($1, $2); |
|---|
| 505 | $greetings{ |
|---|
| 506 | encode_language_tag($lang) |
|---|
| 507 | } = $expr; |
|---|
| 508 | } |
|---|
| 509 | close(IN); |
|---|
| 510 | |
|---|
| 511 | and then just answer client requests for language $wanted by just |
|---|
| 512 | looking up |
|---|
| 513 | |
|---|
| 514 | $greetings{encode_language_tag($wanted)} |
|---|
| 515 | |
|---|
| 516 | And that does the Right Thing. |
|---|
| 517 | |
|---|
| 518 | =cut |
|---|
| 519 | |
|---|
| 520 | sub encode_language_tag { |
|---|
| 521 | # Only similarity_language_tag() is allowed to analyse encodings! |
|---|
| 522 | |
|---|
| 523 | ## Changes in the language tagging standards may have to be reflected here. |
|---|
| 524 | |
|---|
| 525 | my($tag) = $_[0] || return undef; |
|---|
| 526 | return undef unless &is_language_tag($tag); |
|---|
| 527 | |
|---|
| 528 | # For the moment, these legacy variances are few enough that |
|---|
| 529 | # we can just handle them here with regexps. |
|---|
| 530 | $tag =~ s/^iw\b/he/i; # Hebrew |
|---|
| 531 | $tag =~ s/^in\b/id/i; # Indonesian |
|---|
| 532 | $tag =~ s/^[ix]-lux\b/lb/i; # Luxemburger |
|---|
| 533 | $tag =~ s/^[ix]-navajo\b/nv/i; # Navajo |
|---|
| 534 | $tag =~ s/^ji\b/yi/i; # Yiddish |
|---|
| 535 | # |
|---|
| 536 | # These go FROM the simplex to complex form, to get |
|---|
| 537 | # similarity-comparison right. And that's okay, since |
|---|
| 538 | # similarity_language_tag is the only thing that |
|---|
| 539 | # analyzes our output. |
|---|
| 540 | $tag =~ s/^[ix]-hakka\b/zh-hakka/i; # Hakka |
|---|
| 541 | $tag =~ s/^nb\b/no-bok/i; # BACKWARDS for Bokmal |
|---|
| 542 | $tag =~ s/^nn\b/no-nyn/i; # BACKWARDS for Nynorsk |
|---|
| 543 | |
|---|
| 544 | $tag =~ s/^[xiXI]-//s; |
|---|
| 545 | # Just lop off any leading "x/i-" |
|---|
| 546 | |
|---|
| 547 | return "~" . uc($tag); |
|---|
| 548 | } |
|---|
| 549 | |
|---|
| 550 | #-------------------------------------------------------------------------- |
|---|
| 551 | |
|---|
| 552 | =item * the function alternate_language_tags($lang1) |
|---|
| 553 | |
|---|
| 554 | This function, if given a language tag, returns all language tags that |
|---|
| 555 | are alternate forms of this language tag. (I.e., tags which refer to |
|---|
| 556 | the same language.) This is meant to handle legacy tags caused by |
|---|
| 557 | the minor changes in language tag standards over the years; and |
|---|
| 558 | the x-/i- alternation is also dealt with. |
|---|
| 559 | |
|---|
| 560 | Note that this function does I<not> try to equate new (and never-used, |
|---|
| 561 | and unusable) |
|---|
| 562 | ISO639-2 three-letter tags to old (and still in use) ISO639-1 |
|---|
| 563 | two-letter equivalents -- like "ara" -> "ar" -- because |
|---|
| 564 | "ara" has I<never> been in use as an Internet language tag, |
|---|
| 565 | and RFC 3066 stipulates that it never should be, since a shorter |
|---|
| 566 | tag ("ar") exists. |
|---|
| 567 | |
|---|
| 568 | Examples: |
|---|
| 569 | |
|---|
| 570 | alternate_language_tags('no-bok') is ('nb') |
|---|
| 571 | alternate_language_tags('nb') is ('no-bok') |
|---|
| 572 | alternate_language_tags('he') is ('iw') |
|---|
| 573 | alternate_language_tags('iw') is ('he') |
|---|
| 574 | alternate_language_tags('i-hakka') is ('zh-hakka', 'x-hakka') |
|---|
| 575 | alternate_language_tags('zh-hakka') is ('i-hakka', 'x-hakka') |
|---|
| 576 | alternate_language_tags('en') is () |
|---|
| 577 | alternate_language_tags('x-mingo-tom') is ('i-mingo-tom') |
|---|
| 578 | alternate_language_tags('x-klikitat') is ('i-klikitat') |
|---|
| 579 | alternate_language_tags('i-klikitat') is ('x-klikitat') |
|---|
| 580 | |
|---|
| 581 | This function returns empty-list if given anything other than a formally |
|---|
| 582 | valid language tag. |
|---|
| 583 | |
|---|
| 584 | =cut |
|---|
| 585 | |
|---|
| 586 | my %alt = qw( i x x i I X X I ); |
|---|
| 587 | sub alternate_language_tags { |
|---|
| 588 | my $tag = $_[0]; |
|---|
| 589 | return() unless &is_language_tag($tag); |
|---|
| 590 | |
|---|
| 591 | my @em; # push 'em real goood! |
|---|
| 592 | |
|---|
| 593 | # For the moment, these legacy variances are few enough that |
|---|
| 594 | # we can just handle them here with regexps. |
|---|
| 595 | |
|---|
| 596 | if( $tag =~ m/^[ix]-hakka\b(.*)/i) {push @em, "zh-hakka$1"; |
|---|
| 597 | } elsif($tag =~ m/^zh-hakka\b(.*)/i) { push @em, "x-hakka$1", "i-hakka$1"; |
|---|
| 598 | |
|---|
| 599 | } elsif($tag =~ m/^he\b(.*)/i) { push @em, "iw$1"; |
|---|
| 600 | } elsif($tag =~ m/^iw\b(.*)/i) { push @em, "he$1"; |
|---|
| 601 | |
|---|
| 602 | } elsif($tag =~ m/^in\b(.*)/i) { push @em, "id$1"; |
|---|
| 603 | } elsif($tag =~ m/^id\b(.*)/i) { push @em, "in$1"; |
|---|
| 604 | |
|---|
| 605 | } elsif($tag =~ m/^[ix]-lux\b(.*)/i) { push @em, "lb$1"; |
|---|
| 606 | } elsif($tag =~ m/^lb\b(.*)/i) { push @em, "i-lux$1", "x-lux$1"; |
|---|
| 607 | |
|---|
| 608 | } elsif($tag =~ m/^[ix]-navajo\b(.*)/i) { push @em, "nv$1"; |
|---|
| 609 | } elsif($tag =~ m/^nv\b(.*)/i) { push @em, "i-navajo$1", "x-navajo$1"; |
|---|
| 610 | |
|---|
| 611 | } elsif($tag =~ m/^yi\b(.*)/i) { push @em, "ji$1"; |
|---|
| 612 | } elsif($tag =~ m/^ji\b(.*)/i) { push @em, "yi$1"; |
|---|
| 613 | |
|---|
| 614 | } elsif($tag =~ m/^nb\b(.*)/i) { push @em, "no-bok$1"; |
|---|
| 615 | } elsif($tag =~ m/^no-bok\b(.*)/i) { push @em, "nb$1"; |
|---|
| 616 | |
|---|
| 617 | } elsif($tag =~ m/^nn\b(.*)/i) { push @em, "no-nyn$1"; |
|---|
| 618 | } elsif($tag =~ m/^no-nyn\b(.*)/i) { push @em, "nn$1"; |
|---|
| 619 | } |
|---|
| 620 | |
|---|
| 621 | push @em, $alt{$1} . $2 if $tag =~ /^([XIxi])(-.+)/; |
|---|
| 622 | return @em; |
|---|
| 623 | } |
|---|
| 624 | |
|---|
| 625 | ########################################################################### |
|---|
| 626 | |
|---|
| 627 | { |
|---|
| 628 | # Init %Panic... |
|---|
| 629 | |
|---|
| 630 | my @panic = ( # MUST all be lowercase! |
|---|
| 631 | # Only large ("national") languages make it in this list. |
|---|
| 632 | # If you, as a user, are so bizarre that the /only/ language |
|---|
| 633 | # you claim to accept is Galician, then no, we won't do you |
|---|
| 634 | # the favor of providing Catalan as a panic-fallback for |
|---|
| 635 | # you. Because if I start trying to add "little languages" in |
|---|
| 636 | # here, I'll just go crazy. |
|---|
| 637 | |
|---|
| 638 | # Scandinavian lgs. All based on opinion and hearsay. |
|---|
| 639 | 'sv' => [qw(nb no da nn)], |
|---|
| 640 | 'da' => [qw(nb no sv nn)], # I guess |
|---|
| 641 | [qw(no nn nb)], [qw(no nn nb sv da)], |
|---|
| 642 | 'is' => [qw(da sv no nb nn)], |
|---|
| 643 | 'fo' => [qw(da is no nb nn sv)], # I guess |
|---|
| 644 | |
|---|
| 645 | # I think this is about the extent of tolerable intelligibility |
|---|
| 646 | # among large modern Romance languages. |
|---|
| 647 | 'pt' => [qw(es ca it fr)], # Portuguese, Spanish, Catalan, Italian, French |
|---|
| 648 | 'ca' => [qw(es pt it fr)], |
|---|
| 649 | 'es' => [qw(ca it fr pt)], |
|---|
| 650 | 'it' => [qw(es fr ca pt)], |
|---|
| 651 | 'fr' => [qw(es it ca pt)], |
|---|
| 652 | |
|---|
| 653 | # Also assume that speakers of the main Indian languages prefer |
|---|
| 654 | # to read/hear Hindi over English |
|---|
| 655 | [qw( |
|---|
| 656 | as bn gu kn ks kok ml mni mr ne or pa sa sd te ta ur |
|---|
| 657 | )] => 'hi', |
|---|
| 658 | # Assamese, Bengali, Gujarati, [Hindi,] Kannada (Kanarese), Kashmiri, |
|---|
| 659 | # Konkani, Malayalam, Meithei (Manipuri), Marathi, Nepali, Oriya, |
|---|
| 660 | # Punjabi, Sanskrit, Sindhi, Telugu, Tamil, and Urdu. |
|---|
| 661 | 'hi' => [qw(bn pa as or)], |
|---|
| 662 | # I welcome finer data for the other Indian languages. |
|---|
| 663 | # E.g., what should Oriya's list be, besides just Hindi? |
|---|
| 664 | |
|---|
| 665 | # And the panic languages for English is, of course, nil! |
|---|
| 666 | |
|---|
| 667 | # My guesses at Slavic intelligibility: |
|---|
| 668 | ([qw(ru be uk)]) x 2, # Russian, Belarusian, Ukranian |
|---|
| 669 | 'sr' => 'hr', 'hr' => 'sr', # Serb + Croat |
|---|
| 670 | 'cs' => 'sk', 'sk' => 'cs', # Czech + Slovak |
|---|
| 671 | |
|---|
| 672 | 'ms' => 'id', 'id' => 'ms', # Malay + Indonesian |
|---|
| 673 | |
|---|
| 674 | 'et' => 'fi', 'fi' => 'et', # Estonian + Finnish |
|---|
| 675 | |
|---|
| 676 | #?? 'lo' => 'th', 'th' => 'lo', # Lao + Thai |
|---|
| 677 | |
|---|
| 678 | ); |
|---|
| 679 | my($k,$v); |
|---|
| 680 | while(@panic) { |
|---|
| 681 | ($k,$v) = splice(@panic,0,2); |
|---|
| 682 | foreach my $k (ref($k) ? @$k : $k) { |
|---|
| 683 | foreach my $v (ref($v) ? @$v : $v) { |
|---|
| 684 | push @{$Panic{$k} ||= []}, $v unless $k eq $v; |
|---|
| 685 | } |
|---|
| 686 | } |
|---|
| 687 | } |
|---|
| 688 | } |
|---|
| 689 | |
|---|
| 690 | =item * the function @langs = panic_languages(@accept_languages) |
|---|
| 691 | |
|---|
| 692 | This function takes a list of 0 or more language |
|---|
| 693 | tags that constitute a given user's Accept-Language list, and |
|---|
| 694 | returns a list of tags for I<other> (non-super) |
|---|
| 695 | languages that are probably acceptable to the user, to be |
|---|
| 696 | used I<if all else fails>. |
|---|
| 697 | |
|---|
| 698 | For example, if a user accepts only 'ca' (Catalan) and |
|---|
| 699 | 'es' (Spanish), and the documents/interfaces you have |
|---|
| 700 | available are just in German, Italian, and Chinese, then |
|---|
| 701 | the user will most likely want the Italian one (and not |
|---|
| 702 | the Chinese or German one!), instead of getting |
|---|
| 703 | nothing. So C<panic_languages('ca', 'es')> returns |
|---|
| 704 | a list containing 'it' (Italian). |
|---|
| 705 | |
|---|
| 706 | English ('en') is I<always> in the return list, but |
|---|
| 707 | whether it's at the very end or not depends |
|---|
| 708 | on the input languages. This function works by consulting |
|---|
| 709 | an internal table that stipulates what common |
|---|
| 710 | languages are "close" to each other. |
|---|
| 711 | |
|---|
| 712 | A useful construct you might consider using is: |
|---|
| 713 | |
|---|
| 714 | @fallbacks = super_languages(@accept_languages); |
|---|
| 715 | push @fallbacks, panic_languages( |
|---|
| 716 | @accept_languages, @fallbacks, |
|---|
| 717 | ); |
|---|
| 718 | |
|---|
| 719 | =cut |
|---|
| 720 | |
|---|
| 721 | sub panic_languages { |
|---|
| 722 | # When in panic or in doubt, run in circles, scream, and shout! |
|---|
| 723 | my(@out, %seen); |
|---|
| 724 | foreach my $t (@_) { |
|---|
| 725 | next unless $t; |
|---|
| 726 | next if $seen{$t}++; # so we don't return it or hit it again |
|---|
| 727 | # push @out, super_languages($t); # nah, keep that separate |
|---|
| 728 | push @out, @{ $Panic{lc $t} || next }; |
|---|
| 729 | } |
|---|
| 730 | return grep !$seen{$_}++, @out, 'en'; |
|---|
| 731 | } |
|---|
| 732 | |
|---|
| 733 | ########################################################################### |
|---|
| 734 | 1; |
|---|
| 735 | __END__ |
|---|
| 736 | |
|---|
| 737 | =back |
|---|
| 738 | |
|---|
| 739 | =head1 ABOUT LOWERCASING |
|---|
| 740 | |
|---|
| 741 | I've considered making all the above functions that output language |
|---|
| 742 | tags return all those tags strictly in lowercase. Having all your |
|---|
| 743 | language tags in lowercase does make some things easier. But you |
|---|
| 744 | might as well just lowercase as you like, or call |
|---|
| 745 | C<encode_language_tag($lang1)> where appropriate. |
|---|
| 746 | |
|---|
| 747 | =head1 ABOUT UNICODE PLAINTEXT LANGUAGE TAGS |
|---|
| 748 | |
|---|
| 749 | In some future version of I18N::LangTags, I plan to include support |
|---|
| 750 | for RFC2482-style language tags -- which are basically just normal |
|---|
| 751 | language tags with their ASCII characters shifted into Plane 14. |
|---|
| 752 | |
|---|
| 753 | =head1 SEE ALSO |
|---|
| 754 | |
|---|
| 755 | * L<I18N::LangTags::List|I18N::LangTags::List> |
|---|
| 756 | |
|---|
| 757 | * RFC 3066, C<ftp://ftp.isi.edu/in-notes/rfc3066.txt>, "Tags for the |
|---|
| 758 | Identification of Languages". (Obsoletes RFC 1766) |
|---|
| 759 | |
|---|
| 760 | * RFC 2277, C<ftp://ftp.isi.edu/in-notes/rfc2277.txt>, "IETF Policy on |
|---|
| 761 | Character Sets and Languages". |
|---|
| 762 | |
|---|
| 763 | * RFC 2231, C<ftp://ftp.isi.edu/in-notes/rfc2231.txt>, "MIME Parameter |
|---|
| 764 | Value and Encoded Word Extensions: Character Sets, Languages, and |
|---|
| 765 | Continuations". |
|---|
| 766 | |
|---|
| 767 | * RFC 2482, C<ftp://ftp.isi.edu/in-notes/rfc2482.txt>, |
|---|
| 768 | "Language Tagging in Unicode Plain Text". |
|---|
| 769 | |
|---|
| 770 | * Locale::Codes, in |
|---|
| 771 | C<http://www.perl.com/CPAN/modules/by-module/Locale/> |
|---|
| 772 | |
|---|
| 773 | * ISO 639, "Code for the representation of names of languages", |
|---|
| 774 | C<http://www.indigo.ie/egt/standards/iso639/iso639-1-en.html> |
|---|
| 775 | |
|---|
| 776 | * ISO 639-2, "Codes for the representation of names of languages", |
|---|
| 777 | including three-letter codes, |
|---|
| 778 | C<http://lcweb.loc.gov/standards/iso639-2/bibcodes.html> |
|---|
| 779 | |
|---|
| 780 | * The IANA list of registered languages (hopefully up-to-date), |
|---|
| 781 | C<ftp://ftp.isi.edu/in-notes/iana/assignments/languages/> |
|---|
| 782 | |
|---|
| 783 | =head1 COPYRIGHT |
|---|
| 784 | |
|---|
| 785 | Copyright (c) 1998-2001 Sean M. Burke. All rights reserved. |
|---|
| 786 | |
|---|
| 787 | This library is free software; you can redistribute it and/or |
|---|
| 788 | modify it under the same terms as Perl itself. |
|---|
| 789 | |
|---|
| 790 | The programs and documentation in this dist are distributed in |
|---|
| 791 | the hope that they will be useful, but without any warranty; without |
|---|
| 792 | even the implied warranty of merchantability or fitness for a |
|---|
| 793 | particular purpose. |
|---|
| 794 | |
|---|
| 795 | =head1 AUTHOR |
|---|
| 796 | |
|---|
| 797 | Sean M. Burke C<sburke@cpan.org> |
|---|
| 798 | |
|---|
| 799 | =cut |
|---|
| 800 | |
|---|