# -*- indent-tabs-mode: nil; -*- # vim:ft=perl:et:sw=4 # $Id$ # Sympa - SYsteme de Multi-Postage Automatique # # Copyright (c) 1997, 1998, 1999 Institut Pasteur & Christophe Wolfhugel # Copyright (c) 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, # 2006, 2007, 2008, 2009, 2010, 2011 Comite Reseau des Universites # Copyright (c) 2011, 2012, 2013, 2014, 2015, 2016, 2017 GIP RENATER # Copyright 2018, 2020 The Sympa Community. See the AUTHORS.md # file at the top-level directory of this distribution and at # . # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program. If not, see . package Sympa::Tools::Text; use strict; use warnings; use Encode qw(); use English qw(-no_match_vars); use Encode::MIME::Header; # 'MIME-Q' encoding. use HTML::Entities qw(); use MIME::EncWords; use Text::LineFold; use Unicode::GCString; use URI::Escape qw(); use if ($] < 5.016), qw(Unicode::CaseFold fc); use if (5.016 <= $]), qw(feature fc); BEGIN { eval 'use Unicode::Normalize qw()'; } BEGIN { eval 'use Unicode::UTF8 qw()'; } use Sympa::Language; use Sympa::Regexps; # Old name: tools::addrencode(). sub addrencode { my $addr = shift; my $phrase = (shift || ''); my $charset = (shift || 'utf8'); my $comment = (shift || ''); return undef unless $addr =~ /\S/; if ($phrase =~ /[^\s\x21-\x7E]/) { $phrase = MIME::EncWords::encode_mimewords( Encode::decode('utf8', $phrase), 'Encoding' => 'A', 'Charset' => $charset, 'Replacement' => 'FALLBACK', 'Field' => 'Resent-Sender', # almost longest 'Minimal' => 'DISPNAME', # needs MIME::EncWords >= 1.012. ); } elsif ($phrase =~ /\S/) { $phrase =~ s/([\\\"])/\\$1/g; $phrase = '"' . $phrase . '"'; } if ($comment =~ /[^\s\x21-\x27\x2A-\x5B\x5D-\x7E]/) { $comment = MIME::EncWords::encode_mimewords( Encode::decode('utf8', $comment), 'Encoding' => 'A', 'Charset' => $charset, 'Replacement' => 'FALLBACK', 'Minimal' => 'DISPNAME', ); } elsif ($comment =~ /\S/) { $comment =~ s/([\\\"])/\\$1/g; } return ($phrase =~ /\S/ ? "$phrase " : '') . ($comment =~ /\S/ ? "($comment) " : '') . "<$addr>"; } # Old names: tools::clean_email(), tools::get_canonical_email(). sub canonic_email { my $email = shift; return undef unless defined $email; # Remove leading and trailing white spaces. $email =~ s/\A\s+//; $email =~ s/\s+\z//; # Lower-case. $email =~ tr/A-Z/a-z/; return (length $email) ? $email : undef; } # Old name: tools::clean_msg_id(). sub canonic_message_id { my $msg_id = shift; return $msg_id unless defined $msg_id; chomp $msg_id; if ($msg_id =~ /\<(.+)\>/) { $msg_id = $1; } return $msg_id; } sub canonic_text { my $text = shift; return undef unless defined $text; # Normalize text. See also discussion on # https://listes.renater.fr/sympa/arc/sympa-developpers/2018-03/thrd1.html # # N.B.: Corresponding modules are optional by now, and should be # mandatory in the future. my $utext; if (Encode::is_utf8($text)) { $utext = $text; } elsif ($Unicode::UTF8::VERSION) { no warnings 'utf8'; $utext = Unicode::UTF8::decode_utf8($text); } else { $utext = Encode::decode_utf8($text); } if ($Unicode::Normalize::VERSION) { $utext = Unicode::Normalize::normalize('NFC', $utext); } # Remove DOS linefeeds (^M) that cause problems with Outlook 98, AOL, # and EIMS: $utext =~ s/\r\n|\r/\n/g; if (Encode::is_utf8($text)) { return $utext; } else { return Encode::encode_utf8($utext); } } sub slurp { my $path = shift; my $ifh; return undef unless open $ifh, '<', $path; my $text = do { local $RS; <$ifh> }; close $ifh; return canonic_text($text); } sub wrap_text { my $text = shift; my $init = shift; my $subs = shift; my $cols = shift; $init //= ''; $subs //= ''; $cols //= 78; return $text unless $cols; my $email_re = Sympa::Regexps::email(); my $linefold = Text::LineFold->new( Language => Sympa::Language->instance->get_lang, Prep => 'NONBREAKURI', prep => [$email_re, sub { shift; @_ }], ColumnsMax => $cols, Format => sub { shift; my $event = shift; my $str = shift; if ($event =~ /^eo/) { return "\n"; } if ($event =~ /^so[tp]/) { return $init . $str; } if ($event eq 'sol') { return $subs . $str; } undef; }, ); my $t = Encode::is_utf8($text) ? $text : Encode::decode_utf8($text); my $ret = ''; while (1000 < length $t) { my $s = substr $t, 0, 1000; $ret .= $linefold->break_partial($s); $t = substr $t, 1000; } $ret .= $linefold->break_partial($t) if length $t; $ret .= $linefold->break_partial(undef); return Encode::is_utf8($text) ? $ret : Encode::encode_utf8($ret); } sub decode_filesystem_safe { my $str = shift; return '' unless defined $str and length $str; $str = Encode::encode_utf8($str) if Encode::is_utf8($str); # On case-insensitive filesystem "_XX" along with "_xx" should be decoded. $str =~ s/_([0-9A-Fa-f]{2})/chr hex "0x$1"/eg; return $str; } sub decode_html { my $str = shift; Encode::encode_utf8( HTML::Entities::decode_entities(Encode::decode_utf8($str))); } sub encode_filesystem_safe { my $str = shift; return '' unless defined $str and length $str; $str = Encode::encode_utf8($str) if Encode::is_utf8($str); $str =~ s/([^-+.0-9\@A-Za-z])/sprintf '_%02x', ord $1/eg; return $str; } sub encode_html { my $str = shift; my $additional_unsafe = shift || ''; HTML::Entities::encode_entities($str, '<>&"' . $additional_unsafe); } sub encode_uri { my $str = shift; my %options = @_; # Note: URI-1.35 (URI::Escape 3.28) or later is required. return Encode::encode_utf8( URI::Escape::uri_escape_utf8( Encode::decode_utf8($str), '^-A-Za-z0-9._~' . (exists $options{omit} ? $options{omit} : '') ) ); } # Old name: tools::escape_chars(). sub escape_chars { my $s = shift; my $except = shift; ## Exceptions my $ord_except = ord $except if defined $except; ## Escape chars ## !"#$%&'()+,:;<=>?[] AND accented chars ## escape % first foreach my $i ( 0x25, 0x20 .. 0x24, 0x26 .. 0x2c, 0x3a .. 0x3f, 0x5b, 0x5d, 0x80 .. 0x9f, 0xa0 .. 0xff ) { next if defined $ord_except and $i == $ord_except; my $hex_i = sprintf "%lx", $i; $s =~ s/\x$hex_i/%$hex_i/g; } ## Special traetment for '/' $s =~ s/\//%a5/g unless defined $except and $except eq '/'; return $s; } # Old name: tt2::escape_url(). # DEPRECATED. Use Sympa::Tools::Text::escape_uri() or # Sympa::Tools::Text::mailtourl(). #sub escape_url; sub foldcase { my $str = shift; return '' unless defined $str and length $str; # Perl 5.16.0 and later have built-in fc(). Earlier uses Unicode::CaseFold. return Encode::encode_utf8(fc(Encode::decode_utf8($str))); } my %legacy_charsets = ( 'ar' => [qw(iso-8859-6)], 'bs' => [qw(iso-8859-2)], 'cs' => [qw(iso-8859-2)], 'eo' => [qw(iso-8859-3)], 'et' => [qw(iso-8859-4)], 'he' => [qw(iso-8859-8)], 'hr' => [qw(iso-8859-2)], 'hu' => [qw(iso-8859-2)], 'ja' => [qw(euc-jp cp932 MacJapanese)], 'kl' => [qw(iso-8859-4)], 'ko' => [qw(cp949)], 'lt' => [qw(iso-8859-4)], 'lv' => [qw(iso-8859-4)], 'mt' => [qw(iso-8859-3)], 'pl' => [qw(iso-8859-2)], 'ro' => [qw(iso-8859-2)], 'ru' => [qw(koi8-r cp1251)], # cp866? MacCyrillic? 'sk' => [qw(iso-8859-2)], 'sl' => [qw(iso-8859-2)], 'th' => [qw(iso-8859-11 cp874 MacThai)], 'tr' => [qw(iso-8859-9)], 'uk' => [qw(koi8-u)], # MacUkrainian? 'zh-CN' => [qw(euc-cn)], 'zh-TW' => [qw(big5-eten)], ); sub guessed_to_utf8 { my $text = shift; my @langs = @_; return Encode::encode_utf8($text) if Encode::is_utf8($text); return $text unless defined $text and length $text and $text =~ /[^\x00-\x7F]/; my $utf8; if ($Unicode::UTF8::VERSION) { $utf8 = eval { Unicode::UTF8::decode_utf8($text, Encode::FB_CROAK()) }; } unless (defined $utf8) { foreach my $charset (map { $_ ? @$_ : () } @legacy_charsets{@langs}) { $utf8 = eval { Encode::decode($charset, $text, Encode::FB_CROAK()) }; last if defined $utf8; } } unless (defined $utf8) { $utf8 = Encode::decode('iso-8859-1', $text); } # Apply NFC: e.g. for modified-NFD by Mac OS X. $utf8 = Unicode::Normalize::normalize('NFC', $utf8) if $Unicode::Normalize::VERSION; return Encode::encode_utf8($utf8); } sub mailtourl { my $text = shift; my %options = @_; my $dtext = (not defined $text) ? '' : $options{decode_html} ? Sympa::Tools::Text::decode_html($text) : $text; $dtext =~ s/\A\s+//; $dtext =~ s/\s+\z//; $dtext =~ s/(?:\r\n|\r|\n)(?=[ \t])//g; $dtext =~ s/\r\n|\r|\n/ /g; # The ``@'' in email address should not be encoded because some MUAs # aren't able to decode ``%40'' in e-mail address of mailto: URL. # Contrary, ``@'' in query component should be encoded because some # MUAs take it for a delimiter to separate URL from the rest. my ($format, $utext, $qsep); if ($dtext =~ /[()<>\[\]:;,\"\s]/) { # Use "to" header if source text includes any of RFC 5322 # "specials", minus ``@'' and ``\'', plus whitespaces. $format = 'mailto:?to=%s%s'; $utext = Sympa::Tools::Text::encode_uri($dtext); $qsep = '&'; } else { $format = 'mailto:%s%s'; $utext = Sympa::Tools::Text::encode_uri($dtext, omit => '@'); $qsep = '?'; } my $qstring = _url_query_string( $options{query}, decode_html => $options{decode_html}, leadchar => $qsep, sepchar => '&', trim_values => 1, ); return sprintf $format, $utext, $qstring; } sub _url_query_string { my $query = shift; my %options = @_; unless (ref $query eq 'HASH' and %$query) { return ''; } else { my $decode_html = $options{decode_html}; my $trim_values = $options{trim_values}; return ($options{leadchar} || '?') . join( ($options{sepchar} || ';'), map { my ($dkey, $dval) = map { (not defined $_) ? '' : $decode_html ? Sympa::Tools::Text::decode_html($_) : $_; } ($_, $query->{$_}); if ($trim_values and lc $dkey ne 'body') { $dval =~ s/\A\s+//; $dval =~ s/\s+\z//; $dval =~ s/(?:\r\n|\r|\n)(?=[ \t])//g; $dval =~ s/\r\n|\r|\n/ /g; } sprintf '%s=%s', Sympa::Tools::Text::encode_uri($dkey), Sympa::Tools::Text::encode_uri($dval); } sort keys %$query ); } } sub pad { my $str = shift; my $width = shift; return $str unless $width and defined $str; my $ustr = Encode::is_utf8($str) ? $str : Encode::decode_utf8($str); my $cols = Unicode::GCString->new($ustr)->columns; unless ($cols < abs $width) { return $str; } elsif ($width < 0) { return $str . (' ' x (-$width - $cols)); } else { return (' ' x ($width - $cols)) . $str; } } # Old name: tools::qdecode_filename(). sub qdecode_filename { my $filename = shift; ## We don't use MIME::Words here because it does not encode properly ## Unicode ## Check if string is already Q-encoded first #if ($filename =~ /\=\?UTF-8\?/) { $filename = Encode::encode_utf8(Encode::decode('MIME-Q', $filename)); #} return $filename; } # Old name: tools::qencode_filename(). sub qencode_filename { my $filename = shift; ## We don't use MIME::Words here because it does not encode properly ## Unicode ## Check if string is already Q-encoded first ## Also check if the string contains 8bit chars unless ($filename =~ /\=\?UTF-8\?/ || $filename =~ /^[\x00-\x7f]*$/) { ## Don't encode elements such as .desc. or .url or .moderate ## or .extension my $part = $filename; my ($leading, $trailing); $leading = $1 if ($part =~ s/^(\.desc\.)//); ## leading .desc $trailing = $1 if ($part =~ s/((\.\w+)+)$//); ## trailing .xx my $encoded_part = MIME::EncWords::encode_mimewords( $part, Charset => 'utf8', Encoding => 'q', MaxLineLen => 1000, Minimal => 'NO' ); $filename = $leading . $encoded_part . $trailing; } return $filename; } # Old name: tools::unescape_chars(). sub unescape_chars { my $s = shift; $s =~ s/%a5/\//g; ## Special traetment for '/' foreach my $i (0x20 .. 0x2c, 0x3a .. 0x3f, 0x5b, 0x5d, 0x80 .. 0x9f, 0xa0 .. 0xff) { my $hex_i = sprintf "%lx", $i; my $hex_s = sprintf "%c", $i; $s =~ s/%$hex_i/$hex_s/g; } return $s; } # Old name: tools::valid_email(). sub valid_email { my $email = shift; my $email_re = Sympa::Regexps::email(); return undef unless $email =~ /^${email_re}$/; # Forbidden characters. return undef if $email =~ /[\|\$\*\?\!]/; return 1; } sub weburl { my $base = shift; my $paths = shift; my %options = @_; my @paths = map { Sympa::Tools::Text::encode_uri( (not defined $_) ? '' : $options{decode_html} ? Sympa::Tools::Text::decode_html($_) : $_ ); } @{$paths || []}; my $qstring = _url_query_string( $options{query}, decode_html => $options{decode_html}, sepchar => '&', ); my $fstring; my $fragment = $options{fragment}; if (defined $fragment) { $fstring = '#' . Sympa::Tools::Text::encode_uri( $options{decode_html} ? Sympa::Tools::Text::decode_html($fragment) : $fragment ); } else { $fstring = ''; } return sprintf '%s%s%s', join('/', grep { defined $_ } ($base, @paths)), $qstring, $fstring; } 1; __END__ =encoding utf-8 =head1 NAME Sympa::Tools::Text - Text-related functions =head1 DESCRIPTION This package provides some text-related functions. =head2 Functions =over =item addrencode ( $addr, [ $phrase, [ $charset, [ $comment ] ] ] ) Returns formatted (and encoded) name-addr as RFC5322 3.4. =item canonic_email ( $email ) I. Returns canonical form of e-mail address. Leading and trailing white spaces are removed. Latin letters without accents are lower-cased. For malformed inputs returns C. =item canonic_message_id ( $message_id ) Returns canonical form of message ID without trailing or leading whitespaces or C>, C>. =item canonic_text ( $text ) Canonicalizes text. C<$text> should be a binary string encoded by UTF-8 character set or a Unicode string. Forbidden sequences in binary string will be replaced by U+FFFD REPLACEMENT CHARACTERs, and Normalization Form C (NFC) will be applied. =item decode_filesystem_safe ( $str ) I. Decodes a string encoded by encode_filesystem_safe(). Parameter: =over =item $str String to be decoded. =back Returns: Decoded string, stripped C flag if any. =item decode_html ( $str ) I. Decodes HTML entities in a string encoded by UTF-8 or a Unicode string. Parameter: =over =item $str String to be decoded. =back Returns: Decoded string, stripped C flag if any. =item encode_filesystem_safe ( $str ) I. Encodes a string $str to be suitable for filesystem. Parameter: =over =item $str String to be encoded. =back Returns: Encoded string, stripped C flag if any. All bytes except C<'-'>, C<'+'>, C<'.'>, C<'@'> and alphanumeric characters are encoded to sequences C<'_'> followed by two hexdigits. Note that C<'/'> will also be encoded. =item encode_html ( $str, [ $additional_unsafe ] ) I. Encodes characters in a string $str to HTML entities. By default C<'E'>, C<'E'>, C<'E'> and C<'E'> are encoded. Parameter: =over =item $str String to be encoded. =item $additional_unsafe Character or range of characters additionally encoded as entity references. This optional parameter was introduced on Sympa 6.2.37b.3. =back Returns: Encoded string, I stripping utf8 flag if any. =item encode_uri ( $str, [ omit => $chars ] ) I. Encodes potentially unsafe characters in the string using "percent" encoding suitable for URIs. Parameters: =over =item $str String to be encoded. =item omit =E $chars By default, all characters except those defined as "unreserved" in RFC 3986 are encoded, that is, C<[^-A-Za-z0-9._~]>. If this parameter is given, it will prevent encoding additional characters. =back Returns: Encoded string, stripped C flag if any. =item escape_chars ( $str ) Escape weird characters. ToDo: This should be obsoleted in the future release: Would be better to use L. =item escape_url ( $str ) DEPRECATED. Would be better to use L or L. =item foldcase ( $str ) I. Returns "fold-case" string suitable for case-insensitive match. For example, a code below looks for a needle in haystack not regarding case, even if they are non-ASCII UTF-8 strings. $haystack = Sympa::Tools::Text::foldcase($HayStack); $needle = Sympa::Tools::Text::foldcase($NeedLe); if (index $haystack, $needle >= 0) { ... } Parameter: =over =item $str A string. =back =item guessed_to_utf8( $text, [ lang, ... ] ) I. Guesses text charset considering language context and returns the text reencoded by UTF-8. Parameters: =over =item $text Text to be reencoded. =item lang, ... Language tag(s) which may be given by L. =back Returns: Reencoded text. If any charsets could not be guessed, C will be used as the last resort, just because it covers full range of 8-bit. =item mailtourl ( $email, [ decode_html =E 1 ], [ query =E {key =E val, ...} ] ) I. Constructs a C URL for given e-mail. Parameters: =over =item $email E-mail address. =item decode_html =E 1 If set, arguments are assumed to include HTML entities. =item query =E {key =E val, ...} Optional query. =back Returns: Constructed URL. =item pad ( $str, $width ) Pads space a string so that result will not be narrower than given width. Parameters: =over =item $str A string. =item $width If $width is false value or width of $str is not less than $width, does nothing. If $width is less than C<0>, pads right. Otherwise, pads left. =back Returns: Padded string. =item qdecode_filename ( $filename ) Q-Decodes web file name. ToDo: This should be obsoleted in the future release: Would be better to use L. =item qencode_filename ( $filename ) Q-Encodes web file name. ToDo: This should be obsoleted in the future release: Would be better to use L. =item slurp ( $file ) Get entire content of the file. Normalization by canonic_text() is applied. C<$file> is the path to text file. =item unescape_chars ( $str ) Unescape weird characters. ToDo: This should be obsoleted in the future release: Would be better to use L. =item valid_email ( $string ) Basic check of an email address. =item weburl ( $base, \@paths, [ decode_html =E 1 ], [ fragment =E $fragment ], [ query =E \%query ] ) Constructs a C or C URL under given base URI. Parameters: =over =item $base Base URI. =item \@paths Additional path components. =item decode_html =E 1 If set, arguments are assumed to include HTML entities. Exception is $base: It is assumed not to include entities. =item fragment =E $fragment Optional fragment. =item query =E \%query Optional query. =back Returns: A URI. =item wrap_text ( $text, [ $init_tab, [ $subsequent_tab, [ $cols ] ] ] ) I. Returns line-wrapped text. Parameters: =over =item $text The text to be folded. =item $init_tab Indentation prepended to the first line of paragraph. Default is C<''>, no indentation. =item $subsequent_tab Indentation prepended to each subsequent line of folded paragraph. Default is C<''>, no indentation. =item $cols Max number of columns of folded text. Default is C<78>. =back =back =head1 HISTORY L appeared on Sympa 6.2a.41. decode_filesystem_safe() and encode_filesystem_safe() were added on Sympa 6.2.10. decode_html(), encode_html(), encode_uri() and mailtourl() were added on Sympa 6.2.14, and escape_url() was deprecated. guessed_to_utf8() and pad() were added on Sympa 6.2.17. canonic_text() and slurp() were added on Sympa 6.2.53b. =cut