943 lines
		
	
	
		
			22 KiB
		
	
	
	
		
			Perl
		
	
	
	
	
	
			
		
		
	
	
			943 lines
		
	
	
		
			22 KiB
		
	
	
	
		
			Perl
		
	
	
	
	
	
| # -*- indent-tabs-mode: nil; -*-
 | |
| # vim:ft=perl:et:sw=4
 | |
| # $Id$
 | |
| 
 | |
| # Sympa - SYsteme de Multi-Postage Automatique
 | |
| #
 | |
| # Copyright (c) 1997, 1998, 1999 Institut Pasteur & Christophe Wolfhugel
 | |
| # Copyright (c) 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005,
 | |
| # 2006, 2007, 2008, 2009, 2010, 2011 Comite Reseau des Universites
 | |
| # Copyright (c) 2011, 2012, 2013, 2014, 2015, 2016, 2017 GIP RENATER
 | |
| # Copyright 2018, 2020 The Sympa Community. See the AUTHORS.md
 | |
| # file at the top-level directory of this distribution and at
 | |
| # <https://github.com/sympa-community/sympa.git>.
 | |
| #
 | |
| # This program is free software; you can redistribute it and/or modify
 | |
| # it under the terms of the GNU General Public License as published by
 | |
| # the Free Software Foundation; either version 2 of the License, or
 | |
| # (at your option) any later version.
 | |
| #
 | |
| # This program is distributed in the hope that it will be useful,
 | |
| # but WITHOUT ANY WARRANTY; without even the implied warranty of
 | |
| # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 | |
| # GNU General Public License for more details.
 | |
| #
 | |
| # You should have received a copy of the GNU General Public License
 | |
| # along with this program.  If not, see <http://www.gnu.org/licenses/>.
 | |
| 
 | |
| package Sympa::Tools::Text;
 | |
| 
 | |
| use strict;
 | |
| use warnings;
 | |
| use Encode qw();
 | |
| use English qw(-no_match_vars);
 | |
| use Encode::MIME::Header;    # 'MIME-Q' encoding.
 | |
| use HTML::Entities qw();
 | |
| use MIME::EncWords;
 | |
| use Text::LineFold;
 | |
| use Unicode::GCString;
 | |
| use URI::Escape qw();
 | |
| use if ($] < 5.016), qw(Unicode::CaseFold fc);
 | |
| use if (5.016 <= $]), qw(feature fc);
 | |
| BEGIN { eval 'use Unicode::Normalize qw()'; }
 | |
| BEGIN { eval 'use Unicode::UTF8 qw()'; }
 | |
| 
 | |
| use Sympa::Language;
 | |
| use Sympa::Regexps;
 | |
| 
 | |
| # Old name: tools::addrencode().
 | |
| sub addrencode {
 | |
|     my $addr    = shift;
 | |
|     my $phrase  = (shift || '');
 | |
|     my $charset = (shift || 'utf8');
 | |
|     my $comment = (shift || '');
 | |
| 
 | |
|     return undef unless $addr =~ /\S/;
 | |
| 
 | |
|     if ($phrase =~ /[^\s\x21-\x7E]/) {
 | |
|         $phrase = MIME::EncWords::encode_mimewords(
 | |
|             Encode::decode('utf8', $phrase),
 | |
|             'Encoding'    => 'A',
 | |
|             'Charset'     => $charset,
 | |
|             'Replacement' => 'FALLBACK',
 | |
|             'Field'       => 'Resent-Sender', # almost longest
 | |
|             'Minimal'     => 'DISPNAME',      # needs MIME::EncWords >= 1.012.
 | |
|         );
 | |
|     } elsif ($phrase =~ /\S/) {
 | |
|         $phrase =~ s/([\\\"])/\\$1/g;
 | |
|         $phrase = '"' . $phrase . '"';
 | |
|     }
 | |
|     if ($comment =~ /[^\s\x21-\x27\x2A-\x5B\x5D-\x7E]/) {
 | |
|         $comment = MIME::EncWords::encode_mimewords(
 | |
|             Encode::decode('utf8', $comment),
 | |
|             'Encoding'    => 'A',
 | |
|             'Charset'     => $charset,
 | |
|             'Replacement' => 'FALLBACK',
 | |
|             'Minimal'     => 'DISPNAME',
 | |
|         );
 | |
|     } elsif ($comment =~ /\S/) {
 | |
|         $comment =~ s/([\\\"])/\\$1/g;
 | |
|     }
 | |
| 
 | |
|     return
 | |
|           ($phrase =~ /\S/  ? "$phrase "    : '')
 | |
|         . ($comment =~ /\S/ ? "($comment) " : '')
 | |
|         . "<$addr>";
 | |
| }
 | |
| 
 | |
| # Old names: tools::clean_email(), tools::get_canonical_email().
 | |
| sub canonic_email {
 | |
|     my $email = shift;
 | |
| 
 | |
|     return undef unless defined $email;
 | |
| 
 | |
|     # Remove leading and trailing white spaces.
 | |
|     $email =~ s/\A\s+//;
 | |
|     $email =~ s/\s+\z//;
 | |
| 
 | |
|     # Lower-case.
 | |
|     $email =~ tr/A-Z/a-z/;
 | |
| 
 | |
|     return (length $email) ? $email : undef;
 | |
| }
 | |
| 
 | |
| # Old name: tools::clean_msg_id().
 | |
| sub canonic_message_id {
 | |
|     my $msg_id = shift;
 | |
| 
 | |
|     return $msg_id unless defined $msg_id;
 | |
| 
 | |
|     chomp $msg_id;
 | |
| 
 | |
|     if ($msg_id =~ /\<(.+)\>/) {
 | |
|         $msg_id = $1;
 | |
|     }
 | |
| 
 | |
|     return $msg_id;
 | |
| }
 | |
| 
 | |
| sub canonic_text {
 | |
|     my $text = shift;
 | |
| 
 | |
|     return undef unless defined $text;
 | |
| 
 | |
|     # Normalize text. See also discussion on
 | |
|     # https://listes.renater.fr/sympa/arc/sympa-developpers/2018-03/thrd1.html
 | |
|     #
 | |
|     # N.B.: Corresponding modules are optional by now, and should be
 | |
|     # mandatory in the future.
 | |
|     my $utext;
 | |
|     if (Encode::is_utf8($text)) {
 | |
|         $utext = $text;
 | |
|     } elsif ($Unicode::UTF8::VERSION) {
 | |
|         no warnings 'utf8';
 | |
|         $utext = Unicode::UTF8::decode_utf8($text);
 | |
|     } else {
 | |
|         $utext = Encode::decode_utf8($text);
 | |
|     }
 | |
|     if ($Unicode::Normalize::VERSION) {
 | |
|         $utext = Unicode::Normalize::normalize('NFC', $utext);
 | |
|     }
 | |
| 
 | |
|     # Remove DOS linefeeds (^M) that cause problems with Outlook 98, AOL,
 | |
|     # and EIMS:
 | |
|     $utext =~ s/\r\n|\r/\n/g;
 | |
| 
 | |
|     if (Encode::is_utf8($text)) {
 | |
|         return $utext;
 | |
|     } else {
 | |
|         return Encode::encode_utf8($utext);
 | |
|     }
 | |
| }
 | |
| 
 | |
| sub slurp {
 | |
|     my $path = shift;
 | |
| 
 | |
|     my $ifh;
 | |
|     return undef unless open $ifh, '<', $path;
 | |
|     my $text = do { local $RS; <$ifh> };
 | |
|     close $ifh;
 | |
| 
 | |
|     return canonic_text($text);
 | |
| }
 | |
| 
 | |
| sub wrap_text {
 | |
|     my $text = shift;
 | |
|     my $init = shift;
 | |
|     my $subs = shift;
 | |
|     my $cols = shift;
 | |
| 
 | |
|     $init //= '';
 | |
|     $subs //= '';
 | |
|     $cols //= 78;
 | |
|     return $text unless $cols;
 | |
| 
 | |
|     my $email_re = Sympa::Regexps::email();
 | |
|     my $linefold = Text::LineFold->new(
 | |
|         Language   => Sympa::Language->instance->get_lang,
 | |
|         Prep       => 'NONBREAKURI',
 | |
|         prep       => [$email_re, sub { shift; @_ }],
 | |
|         ColumnsMax => $cols,
 | |
|         Format     => sub {
 | |
|             shift;
 | |
|             my $event = shift;
 | |
|             my $str   = shift;
 | |
|             if ($event =~ /^eo/)     { return "\n"; }
 | |
|             if ($event =~ /^so[tp]/) { return $init . $str; }
 | |
|             if ($event eq 'sol')     { return $subs . $str; }
 | |
|             undef;
 | |
|         },
 | |
|     );
 | |
| 
 | |
|     my $t = Encode::is_utf8($text) ? $text : Encode::decode_utf8($text);
 | |
| 
 | |
|     my $ret = '';
 | |
|     while (1000 < length $t) {
 | |
|         my $s = substr $t, 0, 1000;
 | |
|         $ret .= $linefold->break_partial($s);
 | |
|         $t = substr $t, 1000;
 | |
|     }
 | |
|     $ret .= $linefold->break_partial($t) if length $t;
 | |
|     $ret .= $linefold->break_partial(undef);
 | |
| 
 | |
|     return Encode::is_utf8($text) ? $ret : Encode::encode_utf8($ret);
 | |
| }
 | |
| 
 | |
| sub decode_filesystem_safe {
 | |
|     my $str = shift;
 | |
|     return '' unless defined $str and length $str;
 | |
| 
 | |
|     $str = Encode::encode_utf8($str) if Encode::is_utf8($str);
 | |
|     # On case-insensitive filesystem "_XX" along with "_xx" should be decoded.
 | |
|     $str =~ s/_([0-9A-Fa-f]{2})/chr hex "0x$1"/eg;
 | |
|     return $str;
 | |
| }
 | |
| 
 | |
| sub decode_html {
 | |
|     my $str = shift;
 | |
| 
 | |
|     Encode::encode_utf8(
 | |
|         HTML::Entities::decode_entities(Encode::decode_utf8($str)));
 | |
| }
 | |
| 
 | |
| sub encode_filesystem_safe {
 | |
|     my $str = shift;
 | |
|     return '' unless defined $str and length $str;
 | |
| 
 | |
|     $str = Encode::encode_utf8($str) if Encode::is_utf8($str);
 | |
|     $str =~ s/([^-+.0-9\@A-Za-z])/sprintf '_%02x', ord $1/eg;
 | |
|     return $str;
 | |
| }
 | |
| 
 | |
| sub encode_html {
 | |
|     my $str = shift;
 | |
|     my $additional_unsafe = shift || '';
 | |
| 
 | |
|     HTML::Entities::encode_entities($str, '<>&"' . $additional_unsafe);
 | |
| }
 | |
| 
 | |
| sub encode_uri {
 | |
|     my $str     = shift;
 | |
|     my %options = @_;
 | |
| 
 | |
|     # Note: URI-1.35 (URI::Escape 3.28) or later is required.
 | |
|     return Encode::encode_utf8(
 | |
|         URI::Escape::uri_escape_utf8(
 | |
|             Encode::decode_utf8($str),
 | |
|             '^-A-Za-z0-9._~' . (exists $options{omit} ? $options{omit} : '')
 | |
|         )
 | |
|     );
 | |
| }
 | |
| 
 | |
| # Old name: tools::escape_chars().
 | |
| sub escape_chars {
 | |
|     my $s          = shift;
 | |
|     my $except     = shift;                            ## Exceptions
 | |
|     my $ord_except = ord $except if defined $except;
 | |
| 
 | |
|     ## Escape chars
 | |
|     ##  !"#$%&'()+,:;<=>?[] AND accented chars
 | |
|     ## escape % first
 | |
|     foreach my $i (
 | |
|         0x25,
 | |
|         0x20 .. 0x24,
 | |
|         0x26 .. 0x2c,
 | |
|         0x3a .. 0x3f,
 | |
|         0x5b, 0x5d,
 | |
|         0x80 .. 0x9f,
 | |
|         0xa0 .. 0xff
 | |
|     ) {
 | |
|         next if defined $ord_except and $i == $ord_except;
 | |
|         my $hex_i = sprintf "%lx", $i;
 | |
|         $s =~ s/\x$hex_i/%$hex_i/g;
 | |
|     }
 | |
|     ## Special traetment for '/'
 | |
|     $s =~ s/\//%a5/g unless defined $except and $except eq '/';
 | |
| 
 | |
|     return $s;
 | |
| }
 | |
| 
 | |
| # Old name: tt2::escape_url().
 | |
| # DEPRECATED.  Use Sympa::Tools::Text::escape_uri() or
 | |
| # Sympa::Tools::Text::mailtourl().
 | |
| #sub escape_url;
 | |
| 
 | |
| sub foldcase {
 | |
|     my $str = shift;
 | |
| 
 | |
|     return '' unless defined $str and length $str;
 | |
|     # Perl 5.16.0 and later have built-in fc(). Earlier uses Unicode::CaseFold.
 | |
|     return Encode::encode_utf8(fc(Encode::decode_utf8($str)));
 | |
| }
 | |
| 
 | |
| my %legacy_charsets = (
 | |
|     'ar'    => [qw(iso-8859-6)],
 | |
|     'bs'    => [qw(iso-8859-2)],
 | |
|     'cs'    => [qw(iso-8859-2)],
 | |
|     'eo'    => [qw(iso-8859-3)],
 | |
|     'et'    => [qw(iso-8859-4)],
 | |
|     'he'    => [qw(iso-8859-8)],
 | |
|     'hr'    => [qw(iso-8859-2)],
 | |
|     'hu'    => [qw(iso-8859-2)],
 | |
|     'ja'    => [qw(euc-jp cp932 MacJapanese)],
 | |
|     'kl'    => [qw(iso-8859-4)],
 | |
|     'ko'    => [qw(cp949)],
 | |
|     'lt'    => [qw(iso-8859-4)],
 | |
|     'lv'    => [qw(iso-8859-4)],
 | |
|     'mt'    => [qw(iso-8859-3)],
 | |
|     'pl'    => [qw(iso-8859-2)],
 | |
|     'ro'    => [qw(iso-8859-2)],
 | |
|     'ru'    => [qw(koi8-r cp1251)],               # cp866? MacCyrillic?
 | |
|     'sk'    => [qw(iso-8859-2)],
 | |
|     'sl'    => [qw(iso-8859-2)],
 | |
|     'th'    => [qw(iso-8859-11 cp874 MacThai)],
 | |
|     'tr'    => [qw(iso-8859-9)],
 | |
|     'uk'    => [qw(koi8-u)],                      # MacUkrainian?
 | |
|     'zh-CN' => [qw(euc-cn)],
 | |
|     'zh-TW' => [qw(big5-eten)],
 | |
| );
 | |
| 
 | |
| sub guessed_to_utf8 {
 | |
|     my $text  = shift;
 | |
|     my @langs = @_;
 | |
| 
 | |
|     return Encode::encode_utf8($text) if Encode::is_utf8($text);
 | |
|     return $text
 | |
|         unless defined $text
 | |
|         and length $text
 | |
|         and $text =~ /[^\x00-\x7F]/;
 | |
| 
 | |
|     my $utf8;
 | |
|     if ($Unicode::UTF8::VERSION) {
 | |
|         $utf8 =
 | |
|             eval { Unicode::UTF8::decode_utf8($text, Encode::FB_CROAK()) };
 | |
|     }
 | |
|     unless (defined $utf8) {
 | |
|         foreach my $charset (map { $_ ? @$_ : () } @legacy_charsets{@langs}) {
 | |
|             $utf8 =
 | |
|                 eval { Encode::decode($charset, $text, Encode::FB_CROAK()) };
 | |
|             last if defined $utf8;
 | |
|         }
 | |
|     }
 | |
|     unless (defined $utf8) {
 | |
|         $utf8 = Encode::decode('iso-8859-1', $text);
 | |
|     }
 | |
| 
 | |
|     # Apply NFC: e.g. for modified-NFD by Mac OS X.
 | |
|     $utf8 = Unicode::Normalize::normalize('NFC', $utf8)
 | |
|         if $Unicode::Normalize::VERSION;
 | |
| 
 | |
|     return Encode::encode_utf8($utf8);
 | |
| }
 | |
| 
 | |
| sub mailtourl {
 | |
|     my $text    = shift;
 | |
|     my %options = @_;
 | |
| 
 | |
|     my $dtext =
 | |
|           (not defined $text)   ? ''
 | |
|         : $options{decode_html} ? Sympa::Tools::Text::decode_html($text)
 | |
|         :                         $text;
 | |
|     $dtext =~ s/\A\s+//;
 | |
|     $dtext =~ s/\s+\z//;
 | |
|     $dtext =~ s/(?:\r\n|\r|\n)(?=[ \t])//g;
 | |
|     $dtext =~ s/\r\n|\r|\n/ /g;
 | |
| 
 | |
|     # The ``@'' in email address should not be encoded because some MUAs
 | |
|     # aren't able to decode ``%40'' in e-mail address of mailto: URL.
 | |
|     # Contrary, ``@'' in query component should be encoded because some
 | |
|     # MUAs take it for a delimiter to separate URL from the rest.
 | |
|     my ($format, $utext, $qsep);
 | |
|     if ($dtext =~ /[()<>\[\]:;,\"\s]/) {
 | |
|         # Use "to" header if source text includes any of RFC 5322
 | |
|         # "specials", minus ``@'' and ``\'', plus whitespaces.
 | |
|         $format = 'mailto:?to=%s%s';
 | |
|         $utext  = Sympa::Tools::Text::encode_uri($dtext);
 | |
|         $qsep   = '&';
 | |
|     } else {
 | |
|         $format = 'mailto:%s%s';
 | |
|         $utext  = Sympa::Tools::Text::encode_uri($dtext, omit => '@');
 | |
|         $qsep   = '?';
 | |
|     }
 | |
|     my $qstring = _url_query_string(
 | |
|         $options{query},
 | |
|         decode_html => $options{decode_html},
 | |
|         leadchar    => $qsep,
 | |
|         sepchar     => '&',
 | |
|         trim_values => 1,
 | |
|     );
 | |
| 
 | |
|     return sprintf $format, $utext, $qstring;
 | |
| }
 | |
| 
 | |
| sub _url_query_string {
 | |
|     my $query   = shift;
 | |
|     my %options = @_;
 | |
| 
 | |
|     unless (ref $query eq 'HASH' and %$query) {
 | |
|         return '';
 | |
|     } else {
 | |
|         my $decode_html = $options{decode_html};
 | |
|         my $trim_values = $options{trim_values};
 | |
|         return ($options{leadchar} || '?') . join(
 | |
|             ($options{sepchar} || ';'),
 | |
|             map {
 | |
|                 my ($dkey, $dval) = map {
 | |
|                           (not defined $_) ? ''
 | |
|                         : $decode_html ? Sympa::Tools::Text::decode_html($_)
 | |
|                         :                $_;
 | |
|                 } ($_, $query->{$_});
 | |
|                 if ($trim_values and lc $dkey ne 'body') {
 | |
|                     $dval =~ s/\A\s+//;
 | |
|                     $dval =~ s/\s+\z//;
 | |
|                     $dval =~ s/(?:\r\n|\r|\n)(?=[ \t])//g;
 | |
|                     $dval =~ s/\r\n|\r|\n/ /g;
 | |
|                 }
 | |
| 
 | |
|                 sprintf '%s=%s',
 | |
|                     Sympa::Tools::Text::encode_uri($dkey),
 | |
|                     Sympa::Tools::Text::encode_uri($dval);
 | |
|             } sort keys %$query
 | |
|         );
 | |
|     }
 | |
| }
 | |
| 
 | |
| sub pad {
 | |
|     my $str   = shift;
 | |
|     my $width = shift;
 | |
| 
 | |
|     return $str unless $width and defined $str;
 | |
| 
 | |
|     my $ustr = Encode::is_utf8($str) ? $str : Encode::decode_utf8($str);
 | |
|     my $cols = Unicode::GCString->new($ustr)->columns;
 | |
| 
 | |
|     unless ($cols < abs $width) {
 | |
|         return $str;
 | |
|     } elsif ($width < 0) {
 | |
|         return $str . (' ' x (-$width - $cols));
 | |
|     } else {
 | |
|         return (' ' x ($width - $cols)) . $str;
 | |
|     }
 | |
| }
 | |
| 
 | |
| # Old name: tools::qdecode_filename().
 | |
| sub qdecode_filename {
 | |
|     my $filename = shift;
 | |
| 
 | |
|     ## We don't use MIME::Words here because it does not encode properly
 | |
|     ## Unicode
 | |
|     ## Check if string is already Q-encoded first
 | |
|     #if ($filename =~ /\=\?UTF-8\?/) {
 | |
|     $filename = Encode::encode_utf8(Encode::decode('MIME-Q', $filename));
 | |
|     #}
 | |
| 
 | |
|     return $filename;
 | |
| }
 | |
| 
 | |
| # Old name: tools::qencode_filename().
 | |
| sub qencode_filename {
 | |
|     my $filename = shift;
 | |
| 
 | |
|     ## We don't use MIME::Words here because it does not encode properly
 | |
|     ## Unicode
 | |
|     ## Check if string is already Q-encoded first
 | |
|     ## Also check if the string contains 8bit chars
 | |
|     unless ($filename =~ /\=\?UTF-8\?/
 | |
|         || $filename =~ /^[\x00-\x7f]*$/) {
 | |
| 
 | |
|         ## Don't encode elements such as .desc. or .url or .moderate
 | |
|         ## or .extension
 | |
|         my $part = $filename;
 | |
|         my ($leading, $trailing);
 | |
|         $leading  = $1 if ($part =~ s/^(\.desc\.)//);    ## leading .desc
 | |
|         $trailing = $1 if ($part =~ s/((\.\w+)+)$//);    ## trailing .xx
 | |
| 
 | |
|         my $encoded_part = MIME::EncWords::encode_mimewords(
 | |
|             $part,
 | |
|             Charset    => 'utf8',
 | |
|             Encoding   => 'q',
 | |
|             MaxLineLen => 1000,
 | |
|             Minimal    => 'NO'
 | |
|         );
 | |
| 
 | |
|         $filename = $leading . $encoded_part . $trailing;
 | |
|     }
 | |
| 
 | |
|     return $filename;
 | |
| }
 | |
| 
 | |
| # Old name: tools::unescape_chars().
 | |
| sub unescape_chars {
 | |
|     my $s = shift;
 | |
| 
 | |
|     $s =~ s/%a5/\//g;    ## Special traetment for '/'
 | |
|     foreach my $i (0x20 .. 0x2c, 0x3a .. 0x3f, 0x5b, 0x5d, 0x80 .. 0x9f,
 | |
|         0xa0 .. 0xff) {
 | |
|         my $hex_i = sprintf "%lx", $i;
 | |
|         my $hex_s = sprintf "%c",  $i;
 | |
|         $s =~ s/%$hex_i/$hex_s/g;
 | |
|     }
 | |
| 
 | |
|     return $s;
 | |
| }
 | |
| 
 | |
| # Old name: tools::valid_email().
 | |
| sub valid_email {
 | |
|     my $email = shift;
 | |
| 
 | |
|     my $email_re = Sympa::Regexps::email();
 | |
|     return undef unless $email =~ /^${email_re}$/;
 | |
| 
 | |
|     # Forbidden characters.
 | |
|     return undef if $email =~ /[\|\$\*\?\!]/;
 | |
| 
 | |
|     return 1;
 | |
| }
 | |
| 
 | |
| sub weburl {
 | |
|     my $base    = shift;
 | |
|     my $paths   = shift;
 | |
|     my %options = @_;
 | |
| 
 | |
|     my @paths = map {
 | |
|         Sympa::Tools::Text::encode_uri(
 | |
|               (not defined $_)      ? ''
 | |
|             : $options{decode_html} ? Sympa::Tools::Text::decode_html($_)
 | |
|             :                         $_
 | |
|         );
 | |
|     } @{$paths || []};
 | |
| 
 | |
|     my $qstring = _url_query_string(
 | |
|         $options{query},
 | |
|         decode_html => $options{decode_html},
 | |
|         sepchar     => '&',
 | |
|     );
 | |
| 
 | |
|     my $fstring;
 | |
|     my $fragment = $options{fragment};
 | |
|     if (defined $fragment) {
 | |
|         $fstring = '#'
 | |
|             . Sympa::Tools::Text::encode_uri(
 | |
|             $options{decode_html}
 | |
|             ? Sympa::Tools::Text::decode_html($fragment)
 | |
|             : $fragment
 | |
|             );
 | |
|     } else {
 | |
|         $fstring = '';
 | |
|     }
 | |
| 
 | |
|     return sprintf '%s%s%s', join('/', grep { defined $_ } ($base, @paths)),
 | |
|         $qstring, $fstring;
 | |
| }
 | |
| 
 | |
| 1;
 | |
| __END__
 | |
| 
 | |
| =encoding utf-8
 | |
| 
 | |
| =head1 NAME
 | |
| 
 | |
| Sympa::Tools::Text - Text-related functions
 | |
| 
 | |
| =head1 DESCRIPTION
 | |
| 
 | |
| This package provides some text-related functions.
 | |
| 
 | |
| =head2 Functions
 | |
| 
 | |
| =over
 | |
| 
 | |
| =item addrencode ( $addr, [ $phrase, [ $charset, [ $comment ] ] ] )
 | |
| 
 | |
| Returns formatted (and encoded) name-addr as RFC5322 3.4.
 | |
| 
 | |
| =item canonic_email ( $email )
 | |
| 
 | |
| I<Function>.
 | |
| Returns canonical form of e-mail address.
 | |
| 
 | |
| Leading and trailing white spaces are removed.
 | |
| Latin letters without accents are lower-cased.
 | |
| 
 | |
| For malformed inputs returns C<undef>.
 | |
| 
 | |
| =item canonic_message_id ( $message_id )
 | |
| 
 | |
| Returns canonical form of message ID without trailing or leading whitespaces
 | |
| or C<E<lt>>, C<E<gt>>.
 | |
| 
 | |
| =item canonic_text ( $text )
 | |
| 
 | |
| Canonicalizes text.
 | |
| C<$text> should be a binary string encoded by UTF-8 character set or
 | |
| a Unicode string.
 | |
| Forbidden sequences in binary string will be replaced by
 | |
| U+FFFD REPLACEMENT CHARACTERs, and Normalization Form C (NFC) will be applied.
 | |
| 
 | |
| =item decode_filesystem_safe ( $str )
 | |
| 
 | |
| I<Function>.
 | |
| Decodes a string encoded by encode_filesystem_safe().
 | |
| 
 | |
| Parameter:
 | |
| 
 | |
| =over
 | |
| 
 | |
| =item $str
 | |
| 
 | |
| String to be decoded.
 | |
| 
 | |
| =back
 | |
| 
 | |
| Returns:
 | |
| 
 | |
| Decoded string, stripped C<utf8> flag if any.
 | |
| 
 | |
| =item decode_html ( $str )
 | |
| 
 | |
| I<Function>.
 | |
| Decodes HTML entities in a string encoded by UTF-8 or a Unicode string.
 | |
| 
 | |
| Parameter:
 | |
| 
 | |
| =over
 | |
| 
 | |
| =item $str
 | |
| 
 | |
| String to be decoded.
 | |
| 
 | |
| =back
 | |
| 
 | |
| Returns:
 | |
| 
 | |
| Decoded string, stripped C<utf8> flag if any.
 | |
| 
 | |
| =item encode_filesystem_safe ( $str )
 | |
| 
 | |
| I<Function>.
 | |
| Encodes a string $str to be suitable for filesystem.
 | |
| 
 | |
| Parameter:
 | |
| 
 | |
| =over
 | |
| 
 | |
| =item $str
 | |
| 
 | |
| String to be encoded.
 | |
| 
 | |
| =back
 | |
| 
 | |
| Returns:
 | |
| 
 | |
| Encoded string, stripped C<utf8> flag if any.
 | |
| All bytes except C<'-'>, C<'+'>, C<'.'>, C<'@'>
 | |
| and alphanumeric characters are encoded to sequences C<'_'> followed by
 | |
| two hexdigits.
 | |
| 
 | |
| Note that C<'/'> will also be encoded.
 | |
| 
 | |
| =item encode_html ( $str, [ $additional_unsafe ] )
 | |
| 
 | |
| I<Function>.
 | |
| Encodes characters in a string $str to HTML entities.
 | |
| By default
 | |
| C<'E<lt>'>, C<'E<gt>'>, C<'E<amp>'> and C<'E<quot>'> are encoded.
 | |
| 
 | |
| Parameter:
 | |
| 
 | |
| =over
 | |
| 
 | |
| =item $str
 | |
| 
 | |
| String to be encoded.
 | |
| 
 | |
| =item $additional_unsafe
 | |
| 
 | |
| Character or range of characters additionally encoded as entity references.
 | |
| 
 | |
| This optional parameter was introduced on Sympa 6.2.37b.3.
 | |
| 
 | |
| =back
 | |
| 
 | |
| Returns:
 | |
| 
 | |
| Encoded string, I<not> stripping utf8 flag if any.
 | |
| 
 | |
| =item encode_uri ( $str, [ omit => $chars ] )
 | |
| 
 | |
| I<Function>.
 | |
| Encodes potentially unsafe characters in the string using "percent" encoding
 | |
| suitable for URIs.
 | |
| 
 | |
| Parameters:
 | |
| 
 | |
| =over
 | |
| 
 | |
| =item $str
 | |
| 
 | |
| String to be encoded.
 | |
| 
 | |
| =item omit =E<gt> $chars
 | |
| 
 | |
| By default, all characters except those defined as "unreserved" in RFC 3986
 | |
| are encoded, that is, C<[^-A-Za-z0-9._~]>.
 | |
| If this parameter is given, it will prevent encoding additional characters.
 | |
| 
 | |
| =back
 | |
| 
 | |
| Returns:
 | |
| 
 | |
| Encoded string, stripped C<utf8> flag if any.
 | |
| 
 | |
| =item escape_chars ( $str )
 | |
| 
 | |
| Escape weird characters.
 | |
| 
 | |
| ToDo: This should be obsoleted in the future release: Would be better to use
 | |
| L</encode_filesystem_safe>.
 | |
| 
 | |
| =item escape_url ( $str )
 | |
| 
 | |
| DEPRECATED.
 | |
| Would be better to use L</"encode_uri"> or L</"mailtourl">.
 | |
| 
 | |
| =item foldcase ( $str )
 | |
| 
 | |
| I<Function>.
 | |
| Returns "fold-case" string suitable for case-insensitive match.
 | |
| For example, a code below looks for a needle in haystack not regarding case,
 | |
| even if they are non-ASCII UTF-8 strings.
 | |
| 
 | |
|   $haystack = Sympa::Tools::Text::foldcase($HayStack);
 | |
|   $needle   = Sympa::Tools::Text::foldcase($NeedLe);
 | |
|   if (index $haystack, $needle >= 0) {
 | |
|       ...
 | |
|   }
 | |
| 
 | |
| Parameter:
 | |
| 
 | |
| =over
 | |
| 
 | |
| =item $str
 | |
| 
 | |
| A string.
 | |
| 
 | |
| =back
 | |
| 
 | |
| =item guessed_to_utf8( $text, [ lang, ... ] )
 | |
| 
 | |
| I<Function>.
 | |
| Guesses text charset considering language context
 | |
| and returns the text reencoded by UTF-8.
 | |
| 
 | |
| Parameters:
 | |
| 
 | |
| =over
 | |
| 
 | |
| =item $text
 | |
| 
 | |
| Text to be reencoded.
 | |
| 
 | |
| =item lang, ...
 | |
| 
 | |
| Language tag(s) which may be given by L<Sympa::Language/"implicated_langs">.
 | |
| 
 | |
| =back
 | |
| 
 | |
| Returns:
 | |
| 
 | |
| Reencoded text.
 | |
| If any charsets could not be guessed, C<iso-8859-1> will be used
 | |
| as the last resort, just because it covers full range of 8-bit.
 | |
| 
 | |
| =item mailtourl ( $email, [ decode_html =E<gt> 1 ],
 | |
| [ query =E<gt> {key =E<gt> val, ...} ] )
 | |
| 
 | |
| I<Function>.
 | |
| Constructs a C<mailto:> URL for given e-mail.
 | |
| 
 | |
| Parameters:
 | |
| 
 | |
| =over
 | |
| 
 | |
| =item $email
 | |
| 
 | |
| E-mail address.
 | |
| 
 | |
| =item decode_html =E<gt> 1
 | |
| 
 | |
| If set, arguments are assumed to include HTML entities.
 | |
| 
 | |
| =item query =E<gt> {key =E<gt> val, ...}
 | |
| 
 | |
| Optional query.
 | |
| 
 | |
| =back
 | |
| 
 | |
| Returns:
 | |
| 
 | |
| Constructed URL.
 | |
| 
 | |
| =item pad ( $str, $width )
 | |
| 
 | |
| Pads space a string so that result will not be narrower than given width.
 | |
| 
 | |
| Parameters:
 | |
| 
 | |
| =over
 | |
| 
 | |
| =item $str
 | |
| 
 | |
| A string.
 | |
| 
 | |
| =item $width
 | |
| 
 | |
| If $width is false value or width of $str is not less than $width,
 | |
| does nothing.
 | |
| If $width is less than C<0>, pads right.
 | |
| Otherwise, pads left.
 | |
| 
 | |
| =back
 | |
| 
 | |
| Returns:
 | |
| 
 | |
| Padded string.
 | |
| 
 | |
| =item qdecode_filename ( $filename )
 | |
| 
 | |
| Q-Decodes web file name.
 | |
| 
 | |
| ToDo:
 | |
| This should be obsoleted in the future release: Would be better to use
 | |
| L</decode_filesystem_safe>.
 | |
| 
 | |
| =item qencode_filename ( $filename )
 | |
| 
 | |
| Q-Encodes web file name.
 | |
| 
 | |
| ToDo:
 | |
| This should be obsoleted in the future release: Would be better to use
 | |
| L</encode_filesystem_safe>.
 | |
| 
 | |
| =item slurp ( $file )
 | |
| 
 | |
| Get entire content of the file.
 | |
| Normalization by canonic_text() is applied.
 | |
| C<$file> is the path to text file.
 | |
| 
 | |
| =item unescape_chars ( $str )
 | |
| 
 | |
| Unescape weird characters.
 | |
| 
 | |
| ToDo: This should be obsoleted in the future release: Would be better to use
 | |
| L</decode_filesystem_safe>.
 | |
| 
 | |
| =item valid_email ( $string )
 | |
| 
 | |
| Basic check of an email address.
 | |
| 
 | |
| =item weburl ( $base, \@paths, [ decode_html =E<gt> 1 ],
 | |
| [ fragment =E<gt> $fragment ], [ query =E<gt> \%query ] )
 | |
| 
 | |
| Constructs a C<http:> or C<https:> URL under given base URI.
 | |
| 
 | |
| Parameters:
 | |
| 
 | |
| =over
 | |
| 
 | |
| =item $base
 | |
| 
 | |
| Base URI.
 | |
| 
 | |
| =item \@paths
 | |
| 
 | |
| Additional path components.
 | |
| 
 | |
| =item decode_html =E<gt> 1
 | |
| 
 | |
| If set, arguments are assumed to include HTML entities.
 | |
| Exception is $base:
 | |
| It is assumed not to include entities.
 | |
| 
 | |
| =item fragment =E<gt> $fragment
 | |
| 
 | |
| Optional fragment.
 | |
| 
 | |
| =item query =E<gt> \%query
 | |
| 
 | |
| Optional query.
 | |
| 
 | |
| =back
 | |
| 
 | |
| Returns:
 | |
| 
 | |
| A URI.
 | |
| 
 | |
| =item wrap_text ( $text, [ $init_tab, [ $subsequent_tab, [ $cols ] ] ] )
 | |
| 
 | |
| I<Function>.
 | |
| Returns line-wrapped text.
 | |
| 
 | |
| Parameters:
 | |
| 
 | |
| =over
 | |
| 
 | |
| =item $text
 | |
| 
 | |
| The text to be folded.
 | |
| 
 | |
| =item $init_tab
 | |
| 
 | |
| Indentation prepended to the first line of paragraph.
 | |
| Default is C<''>, no indentation.
 | |
| 
 | |
| =item $subsequent_tab
 | |
| 
 | |
| Indentation prepended to each subsequent line of folded paragraph.
 | |
| Default is C<''>, no indentation.
 | |
| 
 | |
| =item $cols
 | |
| 
 | |
| Max number of columns of folded text.
 | |
| Default is C<78>.
 | |
| 
 | |
| =back
 | |
| 
 | |
| =back
 | |
| 
 | |
| =head1 HISTORY
 | |
| 
 | |
| L<Sympa::Tools::Text> appeared on Sympa 6.2a.41.
 | |
| 
 | |
| decode_filesystem_safe() and encode_filesystem_safe() were added
 | |
| on Sympa 6.2.10.
 | |
| 
 | |
| decode_html(), encode_html(), encode_uri() and mailtourl()
 | |
| were added on Sympa 6.2.14, and escape_url() was deprecated.
 | |
| 
 | |
| guessed_to_utf8() and pad() were added on Sympa 6.2.17.
 | |
| 
 | |
| canonic_text() and slurp() were added on Sympa 6.2.53b.
 | |
| 
 | |
| =cut
 |