# -*- indent-tabs-mode: nil; -*- # vim:ft=perl:et:sw=4 # $Id$ # Sympa - SYsteme de Multi-Postage Automatique # # Copyright (c) 1997, 1998, 1999 Institut Pasteur & Christophe Wolfhugel # Copyright (c) 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, # 2006, 2007, 2008, 2009, 2010, 2011 Comite Reseau des Universites # Copyright (c) 2011, 2012, 2013, 2014, 2015, 2016, 2017 GIP RENATER # Copyright 2017, 2018 The Sympa Community. See the AUTHORS.md file at the # top-level directory of this distribution and at # . # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program. If not, see . package Sympa::Language; use strict; use warnings; use base qw(Class::Singleton); use Encode qw(); use Locale::Messages; use POSIX qw(); use Sympa::Constants; BEGIN { ## Using the Pure Perl implementation of gettext ## This is required on Solaris : native implementation of gettext does not ## map ll_RR with ll. # libintl-perl 1.22 (virtually 1.23) or later is recommended to use # 'gettext_dumb' package which is independent from POSIX locale. If older # version is used. falls back to 'gettext_pp'. my $package = Locale::Messages->select_package('gettext_dumb'); Locale::Messages->select_package('gettext_pp') unless $package and $package eq 'gettext_dumb'; ## Workaround: Prevent from searching catalogs in /usr/share/locale. undef $Locale::gettext_pp::__gettext_pp_default_dir; ## Define what catalogs are used Locale::Messages::bindtextdomain(sympa => Sympa::Constants::LOCALEDIR); Locale::Messages::bindtextdomain(web_help => Sympa::Constants::LOCALEDIR); Locale::Messages::textdomain('sympa'); ## Get translations by internal encoding. Locale::Messages::bind_textdomain_codeset(sympa => 'utf-8'); Locale::Messages::bind_textdomain_codeset(web_help => 'utf-8'); } # Constructor for Class::Singleton. sub _new_instance { my $class = shift; my $self = $class->SUPER::_new_instance(); ## Initialize lang/locale. $self->set_lang('en'); return $self; } ## The map to get language tag from older non-POSIX locale naming. my %language_equiv = ( 'cn' => 'zh-CN', 'tw' => 'zh-TW', 'cz' => 'cs', 'us' => 'en-US', ); ## The map to get appropriate POSIX locale name from language code. ## Why this is required is that on many systems locales often have canonic ## "ll_RR.ENCODING" names only. n.b. This format can not express all ## languages in proper way, e.g. Common Arabic ("ar"), Esperanto ("eo"). ## ## This map is also used to convert old-style Sympa "locales" to language ## tags ('en' is special case. cf. set_lang()). my %lang2oldlocale = ( 'af' => 'af_ZA', 'ar' => 'ar_SY', 'br' => 'br_FR', 'bg' => 'bg_BG', 'ca' => 'ca_ES', 'cs' => 'cs_CZ', 'de' => 'de_DE', 'el' => 'el_GR', 'es' => 'es_ES', 'et' => 'et_EE', 'eu' => 'eu_ES', 'fi' => 'fi_FI', 'fr' => 'fr_FR', 'gl' => 'gl_ES', 'hu' => 'hu_HU', 'id' => 'id_ID', 'it' => 'it_IT', 'ja' => 'ja_JP', 'ko' => 'ko_KR', 'la' => 'la_VA', # from OpenOffice.org 'ml' => 'ml_IN', 'nb' => 'nb_NO', 'nn' => 'nn_NO', 'nl' => 'nl_NL', 'oc' => 'oc_FR', 'pl' => 'pl_PL', 'pt' => 'pt_PT', 'rm' => 'rm_CH', # CLDR 'ro' => 'ro_RO', 'ru' => 'ru_RU', 'sv' => 'sv_SE', 'tr' => 'tr_TR', 'vi' => 'vi_VN', ); ## Regexp for old style canonical locale used by Sympa-6.2a or earlier. my $oldlocale_re = qr/^([a-z]{2})_([A-Z]{2})(?![A-Z])/i; ## Regexp for IETF language tag described in RFC 5646 (BCP 47), modified. my $language_tag_re = qr/^ ([a-z]{2}(?:-[a-z]{3}){1,3} | [a-z]{2,3}) # language (and ext.) (?:-([a-z]{4}))? # script (?:-([a-z]{2}))? # region (no UN M.49) (?:-( # variant (?:[a-z0-9]{5,} | [0-9][a-z0-9]{3,}) (?:-[a-z0-9]{5,} | -[0-9][a-z0-9]{3,})* ))? $/ix; ## A tiny subset of script codes and gettext modifier names. ## Keys are ISO 15924 script codes (Titlecased, four characters). ## Values are property value aliases standardised by Unicode Consortium ## (lowercased). cf. . my %script2modifier = ( 'Arab' => 'arabic', 'Cyrl' => 'cyrillic', 'Deva' => 'devanagari', 'Dsrt' => 'deseret', 'Glag' => 'glagolitic', 'Grek' => 'greek', 'Guru' => 'gurmukhi', 'Hebr' => 'hebrew', 'Latn' => 'latin', 'Mong' => 'mongolian', 'Shaw' => 'shaw', # found in Debian "en@shaw" locale. 'Tfng' => 'tifinagh', ); sub canonic_lang { my $lang = shift; return unless $lang; ## Compatibility: older non-POSIX locale names. if ($language_equiv{$lang}) { $lang = $language_equiv{$lang}; } ## Compatibility: names used as "lang" or "locale" by Sympa <= 6.2a. elsif ($lang =~ $oldlocale_re) { $lang = _oldlocale2lang(lc($1) . '_' . uc($2)); } my @subtags; # unknown format. return unless @subtags = ($lang =~ $language_tag_re); ## Canonicalize cases of subtags: ll-ext-Scri-RR-variant-... $subtags[0] = lc $subtags[0]; $subtags[1] =~ s/^(\w)(\w+)/uc($1) . lc($2)/e if $subtags[1]; $subtags[2] = uc $subtags[2] if $subtags[2]; $subtags[3] = lc $subtags[3] if $subtags[3]; ##XXX Maybe more canonicalizations here. ## Check subtags, # won't support language extension subtags. return unless $subtags[0] =~ /^[a-z]{2,3}$/; # won't allow multiple variant subtags. $subtags[3] =~ s/-.+// if $subtags[3]; ##XXX Maybe more checks here. return @subtags if wantarray; return join '-', grep {$_} @subtags; } sub implicated_langs { my @langs = @_; die 'missing langs parameter' unless @langs; my @implicated_langs = (); foreach my $lang (@langs) { my @subtags = canonic_lang($lang); while (@subtags) { my $l = join '-', grep {$_} @subtags; @implicated_langs = ((grep { $_ ne $l } @implicated_langs), $l); ## Workaround: ## - "zh-Hans-CN", "zh-Hant-TW", ... may occasionally be ## identified with "zh-CN", "zh-TW" etc. Add them to ## implication list. if ($l =~ /^zh-(Hans|Hant)-[A-Z]{2}\b/) { $l = join '-', grep {$_} @subtags[0, 2 .. $#subtags]; @implicated_langs = ((grep { $_ ne $l } @implicated_langs), $l); } 1 until pop @subtags; } } return @implicated_langs; } ## Parses content of HTTP 1.1 Accept-Charset, Accept-Encoding or ## Accept-Language request header field. ## Returns an array of arrayrefs [ITEM, WEIGHT]. ## ## NOTE: This might be moved to utility package such as tools.pm. sub parse_http_accept_string { my $accept_string = shift || ''; $accept_string =~ s/^\s+//; $accept_string =~ s/\s+$//; $accept_string ||= '*'; my @pairs = split /\s*,\s*/, $accept_string; my @ret = (); foreach my $pair (@pairs) { my ($item, $weight) = split /\s*;\s*/, $pair, 2; if ( defined $weight and $weight =~ s/^q\s*=\s*//i and $weight =~ /^(\d+(\.\d*)?|\.\d+)$/) { $weight += 0.0; } else { $weight = 1.0; } push @ret, [$item => $weight]; } return @ret; } sub negotiate_lang { my $accept_string = shift || '*'; my @supported_languages = grep {$_} map { split /[\s,]+/, $_ } @_; ## parse Accept-Language: header field. ## unknown languages are ignored. my @accept_languages = grep { $_->[0] eq '*' or $_->[0] = canonic_lang($_->[0]) } parse_http_accept_string($accept_string); return unless @accept_languages; ## try to find the best language. my $best_lang = undef; my $best_weight = 0.0; foreach my $supported_lang (@supported_languages) { my @supported_pfxs = implicated_langs($supported_lang); foreach my $pair (@accept_languages) { my ($accept_lang, $weight) = @$pair; if ($accept_lang eq '*' or grep { $accept_lang eq $_ } @supported_pfxs) { unless ($best_lang and $weight <= $best_weight) { $best_lang = $supported_pfxs[0]; # canonic form $best_weight = $weight; } } } } return $best_lang; } ##sub GetSupportedLanguages { ##DEPRECATED: use Sympa::get_supported_languages(). ## Supported languages are defined by 'supported_lang' sympa.conf parameter. ## Old name: PushLang() sub push_lang { my $self = shift; my @langs = @_; push @{$self->{previous_lang}}, $self->get_lang; $self->set_lang(@langs); return 1; } ## Old name: PopLang() sub pop_lang { my $self = shift; die 'calling pop_lang() without push_lang()' unless @{$self->{previous_lang}}; my $lang = pop @{$self->{previous_lang}}; $self->set_lang($lang); return 1; } ## Old name: SetLang() sub set_lang { my $self = shift; my @langs = @_; my $locale; foreach my $lang (@langs) { # Canonicalize lang. # Note: 'en' is always allowed. Use 'en-US' and so on to provide NLS # for English. next unless $lang = canonic_lang($lang); # Try to set POSIX locale and gettext locale, and get lang actually # set. # Note: Macrolanguage 'zh', 'zh-Hans' or 'zh-Hant' may fallback to # lang with available region. if ($locale = _resolve_gettext_locale(lang2locale($lang))) { ($lang) = grep { lang2locale($_) eq $locale } implicated_langs($lang); } elsif ($lang =~ /^zh\b/) { my @rr; if ($lang =~ /^zh-Hans\b/) { @rr = qw(CN SG HK MO TW); # try simp. first } elsif ($lang =~ /^zh-Hant\b/) { @rr = qw(HK MO TW CN SG); # try trad. first } else { @rr = qw(CN HK MO SG TW); } foreach my $rr (@rr) { $lang = "zh-$rr"; last if $locale = _resolve_gettext_locale(lang2locale($lang)); } } next unless $locale and $lang; # The locale is the gettext catalog name; lang is the IETF language # tag. Ex: locale = pt_BR ; lang = pt-BR # locale_numeric and locale_time are POSIX locales for LC_NUMERIC and # POSIX::LC_TIME catogories, respectively. As of 6.2b, they became # optional: # If setting each of them failed, 'C' locale will be set. $self->{lang} = $lang; $self->{locale} = $locale; $self->{locale_numeric} = _find_posix_locale(POSIX::LC_NUMERIC(), $locale) || 'C'; $self->{locale_time} = _find_posix_locale(POSIX::LC_TIME(), $locale) || 'C'; return $lang; } return; } ## Trys to set gettext locale and returns actually set locale. ## Mandatory parameter is gettext locale name. sub _resolve_gettext_locale { my $locale = shift or die 'missing locale parameter'; # 'en' is always allowed. return $locale if $locale eq 'en'; # Workaround: # - "nb" and "nn" are recommended not to have "_NO" region suffix: # Both of them are official languages in Norway. However, current Sympa # provides "nb_NO" NLS catalog. $locale =~ s/^(nb|nn)\b/${1}_NO/; ## Check if catalog is loaded. local %ENV; $ENV{'LANGUAGE'} = $locale; my $metadata = Locale::Messages::gettext(''); # get header unless ($metadata) { ## If a sub-locale of 'en' (en-CA, en@shaw, ...) failed, fallback to ## 'en'. Otherwise fails. if ($locale =~ /^en(?![a-z])/) { $locale = 'en'; } else { return; } } elsif ($metadata =~ /(?:\A|\n)Language:\s*([\@\w]+)/i) { ## Get precise name of gettext locale if possible. $locale = $1; } ## Workaround for "nb" and "nn": See above. $locale =~ s/^(nb|nn)_NO\b/$1/; return $locale; } # Trys to set POSIX locale which affects to strftime, sprintf etc. sub _find_posix_locale { my $type = shift; my $locale = shift; # Special case: 'en' is an alias of 'C' locale. Use 'en_US' and so on for # real English. return 'C' if $locale eq 'en'; my $orig_locale = POSIX::setlocale($type); ## From "ll@modifier", gets "ll", "ll_RR" and "@modifier". my ($loc, $mod) = split /(?=\@)/, $locale, 2; my $machloc = $loc; $machloc =~ s/^([a-z]{2,3})(?!_)/$lang2oldlocale{$1} || $1/e; $mod ||= ''; ## Set POSIX locale my $posix_locale; my @try; ## Add codeset. ## UpperCase required for FreeBSD; dashless required on HP-UX; ## null codeset is last resort. foreach my $cs ('.utf-8', '.UTF-8', '.utf8', '') { ## Truncate locale similarly in gettext: full locale, and omit ## region then modifier. push @try, map { sprintf $_, $cs } ("$machloc%s$mod", "$loc%s$mod", "$loc%s"); } foreach my $try (@try) { if (POSIX::setlocale($type, $try)) { $posix_locale = $try; last; } } POSIX::setlocale($type, $orig_locale); return $posix_locale; } ## Old name: GetLangName() ## Note: Optional $lang argument was deprecated. sub native_name { my $self = shift; die 'extra argument(s)' if @_; my $name; unless ($self->{lang} and $self->{lang} ne 'en') { $name = 'English'; } else { ## Workaround for nb/nn. my $locale = $self->{locale}; $locale =~ s/^(nb|nn)\b/${1}_NO/; local %ENV; $ENV{'LANGUAGE'} = $locale; my $metadata = Locale::Messages::gettext(''); # get header if ($metadata =~ /(?:\A|\n)Language-Team:\s*(.+)/i) { $name = $1; $name =~ s/\s*\<\S+\>//; } } return (defined $name and $name =~ /\S/) ? $name : ''; } ## Old name: GetLang() sub get_lang { my $self = shift; return $self->{lang} || 'en'; # the last resort } # DEPRECATED: use Conf::lang2charset(). # sub GetCharset; ## DEPRECATED: Use canonic_lang(). ## sub Locale2Lang; # Internal function. # Convert language tag to gettext locale name. sub lang2locale { my $lang = shift; my $locale; my @subtags; ## unknown format. return unless @subtags = canonic_lang($lang); ## convert from "ll-Scri-RR" to "ll_RR@scriptname", or ## from "ll-RR-variant" to "ll_RR@variant". $locale = $subtags[0]; if ($subtags[2]) { $locale .= '_' . $subtags[2]; } if ($subtags[1]) { $locale .= '@' . ($script2modifier{$subtags[1]} || $subtags[1]); } elsif ($subtags[3]) { $locale .= '@' . $subtags[3]; } return $locale; } # Internal function. # Get language tag from old-style "locale". # Note: Old name is Locale2Lang(). # Note: Use canonic_lang(). sub _oldlocale2lang { my $oldlocale = shift; my @parts = split /[\W_]/, $oldlocale; my $lang; if ($lang = {reverse %lang2oldlocale}->{$oldlocale}) { return $lang; } elsif (scalar @parts > 1 and length $parts[1]) { return join '-', lc $parts[0], uc $parts[1]; } else { return lc $parts[0]; } } # Convert language tag to old style "locale". # Note: This function in earlier releases was named Lang2Locale(). sub lang2oldlocale { my $lang = shift; my $oldlocale; my @subtags; ## unknown format. return unless @subtags = canonic_lang($lang); ## 'zh-Hans' and 'zh-Hant' cannot map to useful POSIX locale. Map them to ## 'zh_CN' and 'zh_TW'. ## 'zh' cannot map. if ($subtags[0] eq 'zh' and $subtags[1] and not $subtags[2]) { if ($subtags[1] eq 'Hans') { $subtags[2] = 'CN'; } elsif ($subtags[1] eq 'Hant') { $subtags[2] = 'TW'; } } unless ($subtags[2]) { if ($lang2oldlocale{$subtags[0]}) { return $lang2oldlocale{$subtags[0]}; } } else { return join '_', $subtags[0], $subtags[2]; } ## unconvertible locale name return; } # Note: older name is sympa_dgettext(). sub dgettext { my $self = shift; my $textdomain = shift; my $msgid = shift; # Returns meta information on the catalog. # Note: currently, charset is always 'utf-8'; encoding won't be used. unless (defined $msgid) { return; } elsif ($msgid eq '') { # prevents meta information to be returned return ''; } elsif ($msgid eq '_language_') { return $self->native_name; } elsif ($msgid eq '_charset_') { return 'UTF-8'; } elsif ($msgid eq '_encoding_') { return '8bit'; } my $gettext_locale; unless ($self->{lang} and $self->{lang} ne 'en') { $gettext_locale = 'en_US'; } else { $gettext_locale = $self->{locale}; # Workaround for nb/nn. $gettext_locale =~ s/^(nb|nn)\b/${1}_NO/; } local %ENV; $ENV{'LANGUAGE'} = $gettext_locale; return Locale::Messages::dgettext($textdomain, $msgid); } sub gettext { my $self = shift; my $msgid = shift; return $self->dgettext('', $msgid); } sub gettext_sprintf { my $self = shift; my $format = shift; my @args = @_; my $orig_locale = POSIX::setlocale(POSIX::LC_NUMERIC()); ## if lang has not been set or 'en' is set, fallback to native sprintf(). unless ($self->{lang} and $self->{lang} ne 'en') { POSIX::setlocale(POSIX::LC_NUMERIC(), 'C'); } else { $format = $self->gettext($format); POSIX::setlocale(POSIX::LC_NUMERIC(), $self->{locale_numeric}); } my $ret = sprintf($format, @args); POSIX::setlocale(POSIX::LC_NUMERIC(), $orig_locale); return $ret; } my %date_part_names = ( '%a' => { 'index' => 6, 'gettext_id' => 'Sun:Mon:Tue:Wed:Thu:Fri:Sat' }, '%A' => { 'index' => 6, 'gettext_id' => 'Sunday:Monday:Tuesday:Wednesday:Thursday:Friday:Saturday' }, '%b' => { 'index' => 4, 'gettext_id' => 'Jan:Feb:Mar:Apr:May:Jun:Jul:Aug:Sep:Oct:Nov:Dec' }, '%B' => { 'index' => 4, 'gettext_id' => 'January:February:March:April:May:June:July:August:September:October:November:December' }, '%p' => { 'index' => 2, 'gettext_id' => 'AM:PM' }, ); sub gettext_strftime { my $self = shift; my $format = shift; my @args = @_; my $orig_locale = POSIX::setlocale(POSIX::LC_TIME()); ## if lang has not been set or 'en' is set, fallback to native ## POSIX::strftime(). unless ($self->{lang} and $self->{lang} ne 'en') { POSIX::setlocale(POSIX::LC_TIME(), 'C'); } else { $format = $self->gettext($format); ## If POSIX locale was not set, emulate format strings. unless ($self->{locale_time} and $self->{locale_time} ne 'C' and $self->{locale_time} ne 'POSIX') { my %names; foreach my $k (keys %date_part_names) { $names{$k} = [ split /:/, $self->gettext($date_part_names{$k}->{'gettext_id'}) ]; } $format =~ s{(\%[EO]?.)}{ my $index; if ( $names{$1} and defined( $index = $args[$date_part_names{$1}->{'index'}] ) ) { $index = ($index < 12) ? 0 : 1 if $1 eq '%p'; $names{$1}->[$index]; } else { $1; } }eg; } POSIX::setlocale(POSIX::LC_TIME(), $self->{locale_time}); } my $ret = POSIX::strftime($format, @args); Encode::_utf8_off($ret); POSIX::setlocale(POSIX::LC_TIME(), $orig_locale); return $ret; } sub maketext { my $self = shift; my $textdomain = shift; my $template = shift; my @args = @_; my $orig_locale = POSIX::setlocale(POSIX::LC_NUMERIC()); unless ($self->{lang} and $self->{lang} ne 'en') { POSIX::setlocale(POSIX::LC_NUMERIC(), 'C'); } else { $template = $self->dgettext($textdomain, $template); POSIX::setlocale(POSIX::LC_NUMERIC(), $self->{locale_numeric}); } my $ret = $template; # replace parameters in string $ret =~ s/[%]([%]|\d+)/($1 eq '%') ? '%' : $args[$1 - 1]/eg; POSIX::setlocale(POSIX::LC_NUMERIC(), $orig_locale); return $ret; } 1; __END__ =encoding utf-8 =head1 NAME Sympa::Language - Handling languages and locales =head1 SYNOPSIS use Sympa::Language; my $language = Sympa::Language->instance; $language->set_lang('zh-TW', 'zh', 'en'); print $language->gettext('Lorem ipsum dolor sit amet.'); =head1 DESCRIPTION This package provides interfaces for i18n (internationalization) of Sympa. The language tags are used to determine each language. A language tag consists of one or more subtags: language, script, region and variant. Below are some examples. =over 4 =item * C - Arabic language =item * C - Ainu language =item * C - Portuguese language in Brazil =item * C - Belarusian language in Latin script =item * C - Valencian variant of Catalan =back Other two sorts of identifiers are derived from language tags: gettext locales and POSIX locales. The gettext locales determine each translation catalog. It consists of one to three parts: language, territory and modifier. For example, their equivalents of language tags above are C, C, C, C and C, respectively. The POSIX locales determine each I. They have similar forms to gettext locales and are used by this package internally. =head2 Functions =head3 Manipulating language tags =over 4 =item canonic_lang ( $lang ) I. Canonicalizes language tag according to RFC 5646 (BCP 47) and returns it. Parameter: =over =item $lang Language tag or similar thing. Old style "locale" by Sympa (see also L) will also be accepted. =back Returns: Canonicalized language tag. In array context, returns an array C<(I, I