943 lines
22 KiB
Perl
Raw Normal View History

2024-06-03 18:43:35 +02:00
# -*- indent-tabs-mode: nil; -*-
# vim:ft=perl:et:sw=4
# $Id$
# Sympa - SYsteme de Multi-Postage Automatique
#
# Copyright (c) 1997, 1998, 1999 Institut Pasteur & Christophe Wolfhugel
# Copyright (c) 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005,
# 2006, 2007, 2008, 2009, 2010, 2011 Comite Reseau des Universites
# Copyright (c) 2011, 2012, 2013, 2014, 2015, 2016, 2017 GIP RENATER
# Copyright 2018, 2020 The Sympa Community. See the AUTHORS.md
# file at the top-level directory of this distribution and at
# <https://github.com/sympa-community/sympa.git>.
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
package Sympa::Tools::Text;
use strict;
use warnings;
use Encode qw();
use English qw(-no_match_vars);
use Encode::MIME::Header; # 'MIME-Q' encoding.
use HTML::Entities qw();
use MIME::EncWords;
use Text::LineFold;
use Unicode::GCString;
use URI::Escape qw();
use if ($] < 5.016), qw(Unicode::CaseFold fc);
use if (5.016 <= $]), qw(feature fc);
BEGIN { eval 'use Unicode::Normalize qw()'; }
BEGIN { eval 'use Unicode::UTF8 qw()'; }
use Sympa::Language;
use Sympa::Regexps;
# Old name: tools::addrencode().
sub addrencode {
my $addr = shift;
my $phrase = (shift || '');
my $charset = (shift || 'utf8');
my $comment = (shift || '');
return undef unless $addr =~ /\S/;
if ($phrase =~ /[^\s\x21-\x7E]/) {
$phrase = MIME::EncWords::encode_mimewords(
Encode::decode('utf8', $phrase),
'Encoding' => 'A',
'Charset' => $charset,
'Replacement' => 'FALLBACK',
'Field' => 'Resent-Sender', # almost longest
'Minimal' => 'DISPNAME', # needs MIME::EncWords >= 1.012.
);
} elsif ($phrase =~ /\S/) {
$phrase =~ s/([\\\"])/\\$1/g;
$phrase = '"' . $phrase . '"';
}
if ($comment =~ /[^\s\x21-\x27\x2A-\x5B\x5D-\x7E]/) {
$comment = MIME::EncWords::encode_mimewords(
Encode::decode('utf8', $comment),
'Encoding' => 'A',
'Charset' => $charset,
'Replacement' => 'FALLBACK',
'Minimal' => 'DISPNAME',
);
} elsif ($comment =~ /\S/) {
$comment =~ s/([\\\"])/\\$1/g;
}
return
($phrase =~ /\S/ ? "$phrase " : '')
. ($comment =~ /\S/ ? "($comment) " : '')
. "<$addr>";
}
# Old names: tools::clean_email(), tools::get_canonical_email().
sub canonic_email {
my $email = shift;
return undef unless defined $email;
# Remove leading and trailing white spaces.
$email =~ s/\A\s+//;
$email =~ s/\s+\z//;
# Lower-case.
$email =~ tr/A-Z/a-z/;
return (length $email) ? $email : undef;
}
# Old name: tools::clean_msg_id().
sub canonic_message_id {
my $msg_id = shift;
return $msg_id unless defined $msg_id;
chomp $msg_id;
if ($msg_id =~ /\<(.+)\>/) {
$msg_id = $1;
}
return $msg_id;
}
sub canonic_text {
my $text = shift;
return undef unless defined $text;
# Normalize text. See also discussion on
# https://listes.renater.fr/sympa/arc/sympa-developpers/2018-03/thrd1.html
#
# N.B.: Corresponding modules are optional by now, and should be
# mandatory in the future.
my $utext;
if (Encode::is_utf8($text)) {
$utext = $text;
} elsif ($Unicode::UTF8::VERSION) {
no warnings 'utf8';
$utext = Unicode::UTF8::decode_utf8($text);
} else {
$utext = Encode::decode_utf8($text);
}
if ($Unicode::Normalize::VERSION) {
$utext = Unicode::Normalize::normalize('NFC', $utext);
}
# Remove DOS linefeeds (^M) that cause problems with Outlook 98, AOL,
# and EIMS:
$utext =~ s/\r\n|\r/\n/g;
if (Encode::is_utf8($text)) {
return $utext;
} else {
return Encode::encode_utf8($utext);
}
}
sub slurp {
my $path = shift;
my $ifh;
return undef unless open $ifh, '<', $path;
my $text = do { local $RS; <$ifh> };
close $ifh;
return canonic_text($text);
}
sub wrap_text {
my $text = shift;
my $init = shift;
my $subs = shift;
my $cols = shift;
$init //= '';
$subs //= '';
$cols //= 78;
return $text unless $cols;
my $email_re = Sympa::Regexps::email();
my $linefold = Text::LineFold->new(
Language => Sympa::Language->instance->get_lang,
Prep => 'NONBREAKURI',
prep => [$email_re, sub { shift; @_ }],
ColumnsMax => $cols,
Format => sub {
shift;
my $event = shift;
my $str = shift;
if ($event =~ /^eo/) { return "\n"; }
if ($event =~ /^so[tp]/) { return $init . $str; }
if ($event eq 'sol') { return $subs . $str; }
undef;
},
);
my $t = Encode::is_utf8($text) ? $text : Encode::decode_utf8($text);
my $ret = '';
while (1000 < length $t) {
my $s = substr $t, 0, 1000;
$ret .= $linefold->break_partial($s);
$t = substr $t, 1000;
}
$ret .= $linefold->break_partial($t) if length $t;
$ret .= $linefold->break_partial(undef);
return Encode::is_utf8($text) ? $ret : Encode::encode_utf8($ret);
}
sub decode_filesystem_safe {
my $str = shift;
return '' unless defined $str and length $str;
$str = Encode::encode_utf8($str) if Encode::is_utf8($str);
# On case-insensitive filesystem "_XX" along with "_xx" should be decoded.
$str =~ s/_([0-9A-Fa-f]{2})/chr hex "0x$1"/eg;
return $str;
}
sub decode_html {
my $str = shift;
Encode::encode_utf8(
HTML::Entities::decode_entities(Encode::decode_utf8($str)));
}
sub encode_filesystem_safe {
my $str = shift;
return '' unless defined $str and length $str;
$str = Encode::encode_utf8($str) if Encode::is_utf8($str);
$str =~ s/([^-+.0-9\@A-Za-z])/sprintf '_%02x', ord $1/eg;
return $str;
}
sub encode_html {
my $str = shift;
my $additional_unsafe = shift || '';
HTML::Entities::encode_entities($str, '<>&"' . $additional_unsafe);
}
sub encode_uri {
my $str = shift;
my %options = @_;
# Note: URI-1.35 (URI::Escape 3.28) or later is required.
return Encode::encode_utf8(
URI::Escape::uri_escape_utf8(
Encode::decode_utf8($str),
'^-A-Za-z0-9._~' . (exists $options{omit} ? $options{omit} : '')
)
);
}
# Old name: tools::escape_chars().
sub escape_chars {
my $s = shift;
my $except = shift; ## Exceptions
my $ord_except = ord $except if defined $except;
## Escape chars
## !"#$%&'()+,:;<=>?[] AND accented chars
## escape % first
foreach my $i (
0x25,
0x20 .. 0x24,
0x26 .. 0x2c,
0x3a .. 0x3f,
0x5b, 0x5d,
0x80 .. 0x9f,
0xa0 .. 0xff
) {
next if defined $ord_except and $i == $ord_except;
my $hex_i = sprintf "%lx", $i;
$s =~ s/\x$hex_i/%$hex_i/g;
}
## Special traetment for '/'
$s =~ s/\//%a5/g unless defined $except and $except eq '/';
return $s;
}
# Old name: tt2::escape_url().
# DEPRECATED. Use Sympa::Tools::Text::escape_uri() or
# Sympa::Tools::Text::mailtourl().
#sub escape_url;
sub foldcase {
my $str = shift;
return '' unless defined $str and length $str;
# Perl 5.16.0 and later have built-in fc(). Earlier uses Unicode::CaseFold.
return Encode::encode_utf8(fc(Encode::decode_utf8($str)));
}
my %legacy_charsets = (
'ar' => [qw(iso-8859-6)],
'bs' => [qw(iso-8859-2)],
'cs' => [qw(iso-8859-2)],
'eo' => [qw(iso-8859-3)],
'et' => [qw(iso-8859-4)],
'he' => [qw(iso-8859-8)],
'hr' => [qw(iso-8859-2)],
'hu' => [qw(iso-8859-2)],
'ja' => [qw(euc-jp cp932 MacJapanese)],
'kl' => [qw(iso-8859-4)],
'ko' => [qw(cp949)],
'lt' => [qw(iso-8859-4)],
'lv' => [qw(iso-8859-4)],
'mt' => [qw(iso-8859-3)],
'pl' => [qw(iso-8859-2)],
'ro' => [qw(iso-8859-2)],
'ru' => [qw(koi8-r cp1251)], # cp866? MacCyrillic?
'sk' => [qw(iso-8859-2)],
'sl' => [qw(iso-8859-2)],
'th' => [qw(iso-8859-11 cp874 MacThai)],
'tr' => [qw(iso-8859-9)],
'uk' => [qw(koi8-u)], # MacUkrainian?
'zh-CN' => [qw(euc-cn)],
'zh-TW' => [qw(big5-eten)],
);
sub guessed_to_utf8 {
my $text = shift;
my @langs = @_;
return Encode::encode_utf8($text) if Encode::is_utf8($text);
return $text
unless defined $text
and length $text
and $text =~ /[^\x00-\x7F]/;
my $utf8;
if ($Unicode::UTF8::VERSION) {
$utf8 =
eval { Unicode::UTF8::decode_utf8($text, Encode::FB_CROAK()) };
}
unless (defined $utf8) {
foreach my $charset (map { $_ ? @$_ : () } @legacy_charsets{@langs}) {
$utf8 =
eval { Encode::decode($charset, $text, Encode::FB_CROAK()) };
last if defined $utf8;
}
}
unless (defined $utf8) {
$utf8 = Encode::decode('iso-8859-1', $text);
}
# Apply NFC: e.g. for modified-NFD by Mac OS X.
$utf8 = Unicode::Normalize::normalize('NFC', $utf8)
if $Unicode::Normalize::VERSION;
return Encode::encode_utf8($utf8);
}
sub mailtourl {
my $text = shift;
my %options = @_;
my $dtext =
(not defined $text) ? ''
: $options{decode_html} ? Sympa::Tools::Text::decode_html($text)
: $text;
$dtext =~ s/\A\s+//;
$dtext =~ s/\s+\z//;
$dtext =~ s/(?:\r\n|\r|\n)(?=[ \t])//g;
$dtext =~ s/\r\n|\r|\n/ /g;
# The ``@'' in email address should not be encoded because some MUAs
# aren't able to decode ``%40'' in e-mail address of mailto: URL.
# Contrary, ``@'' in query component should be encoded because some
# MUAs take it for a delimiter to separate URL from the rest.
my ($format, $utext, $qsep);
if ($dtext =~ /[()<>\[\]:;,\"\s]/) {
# Use "to" header if source text includes any of RFC 5322
# "specials", minus ``@'' and ``\'', plus whitespaces.
$format = 'mailto:?to=%s%s';
$utext = Sympa::Tools::Text::encode_uri($dtext);
$qsep = '&';
} else {
$format = 'mailto:%s%s';
$utext = Sympa::Tools::Text::encode_uri($dtext, omit => '@');
$qsep = '?';
}
my $qstring = _url_query_string(
$options{query},
decode_html => $options{decode_html},
leadchar => $qsep,
sepchar => '&',
trim_values => 1,
);
return sprintf $format, $utext, $qstring;
}
sub _url_query_string {
my $query = shift;
my %options = @_;
unless (ref $query eq 'HASH' and %$query) {
return '';
} else {
my $decode_html = $options{decode_html};
my $trim_values = $options{trim_values};
return ($options{leadchar} || '?') . join(
($options{sepchar} || ';'),
map {
my ($dkey, $dval) = map {
(not defined $_) ? ''
: $decode_html ? Sympa::Tools::Text::decode_html($_)
: $_;
} ($_, $query->{$_});
if ($trim_values and lc $dkey ne 'body') {
$dval =~ s/\A\s+//;
$dval =~ s/\s+\z//;
$dval =~ s/(?:\r\n|\r|\n)(?=[ \t])//g;
$dval =~ s/\r\n|\r|\n/ /g;
}
sprintf '%s=%s',
Sympa::Tools::Text::encode_uri($dkey),
Sympa::Tools::Text::encode_uri($dval);
} sort keys %$query
);
}
}
sub pad {
my $str = shift;
my $width = shift;
return $str unless $width and defined $str;
my $ustr = Encode::is_utf8($str) ? $str : Encode::decode_utf8($str);
my $cols = Unicode::GCString->new($ustr)->columns;
unless ($cols < abs $width) {
return $str;
} elsif ($width < 0) {
return $str . (' ' x (-$width - $cols));
} else {
return (' ' x ($width - $cols)) . $str;
}
}
# Old name: tools::qdecode_filename().
sub qdecode_filename {
my $filename = shift;
## We don't use MIME::Words here because it does not encode properly
## Unicode
## Check if string is already Q-encoded first
#if ($filename =~ /\=\?UTF-8\?/) {
$filename = Encode::encode_utf8(Encode::decode('MIME-Q', $filename));
#}
return $filename;
}
# Old name: tools::qencode_filename().
sub qencode_filename {
my $filename = shift;
## We don't use MIME::Words here because it does not encode properly
## Unicode
## Check if string is already Q-encoded first
## Also check if the string contains 8bit chars
unless ($filename =~ /\=\?UTF-8\?/
|| $filename =~ /^[\x00-\x7f]*$/) {
## Don't encode elements such as .desc. or .url or .moderate
## or .extension
my $part = $filename;
my ($leading, $trailing);
$leading = $1 if ($part =~ s/^(\.desc\.)//); ## leading .desc
$trailing = $1 if ($part =~ s/((\.\w+)+)$//); ## trailing .xx
my $encoded_part = MIME::EncWords::encode_mimewords(
$part,
Charset => 'utf8',
Encoding => 'q',
MaxLineLen => 1000,
Minimal => 'NO'
);
$filename = $leading . $encoded_part . $trailing;
}
return $filename;
}
# Old name: tools::unescape_chars().
sub unescape_chars {
my $s = shift;
$s =~ s/%a5/\//g; ## Special traetment for '/'
foreach my $i (0x20 .. 0x2c, 0x3a .. 0x3f, 0x5b, 0x5d, 0x80 .. 0x9f,
0xa0 .. 0xff) {
my $hex_i = sprintf "%lx", $i;
my $hex_s = sprintf "%c", $i;
$s =~ s/%$hex_i/$hex_s/g;
}
return $s;
}
# Old name: tools::valid_email().
sub valid_email {
my $email = shift;
my $email_re = Sympa::Regexps::email();
return undef unless $email =~ /^${email_re}$/;
# Forbidden characters.
return undef if $email =~ /[\|\$\*\?\!]/;
return 1;
}
sub weburl {
my $base = shift;
my $paths = shift;
my %options = @_;
my @paths = map {
Sympa::Tools::Text::encode_uri(
(not defined $_) ? ''
: $options{decode_html} ? Sympa::Tools::Text::decode_html($_)
: $_
);
} @{$paths || []};
my $qstring = _url_query_string(
$options{query},
decode_html => $options{decode_html},
sepchar => '&',
);
my $fstring;
my $fragment = $options{fragment};
if (defined $fragment) {
$fstring = '#'
. Sympa::Tools::Text::encode_uri(
$options{decode_html}
? Sympa::Tools::Text::decode_html($fragment)
: $fragment
);
} else {
$fstring = '';
}
return sprintf '%s%s%s', join('/', grep { defined $_ } ($base, @paths)),
$qstring, $fstring;
}
1;
__END__
=encoding utf-8
=head1 NAME
Sympa::Tools::Text - Text-related functions
=head1 DESCRIPTION
This package provides some text-related functions.
=head2 Functions
=over
=item addrencode ( $addr, [ $phrase, [ $charset, [ $comment ] ] ] )
Returns formatted (and encoded) name-addr as RFC5322 3.4.
=item canonic_email ( $email )
I<Function>.
Returns canonical form of e-mail address.
Leading and trailing white spaces are removed.
Latin letters without accents are lower-cased.
For malformed inputs returns C<undef>.
=item canonic_message_id ( $message_id )
Returns canonical form of message ID without trailing or leading whitespaces
or C<E<lt>>, C<E<gt>>.
=item canonic_text ( $text )
Canonicalizes text.
C<$text> should be a binary string encoded by UTF-8 character set or
a Unicode string.
Forbidden sequences in binary string will be replaced by
U+FFFD REPLACEMENT CHARACTERs, and Normalization Form C (NFC) will be applied.
=item decode_filesystem_safe ( $str )
I<Function>.
Decodes a string encoded by encode_filesystem_safe().
Parameter:
=over
=item $str
String to be decoded.
=back
Returns:
Decoded string, stripped C<utf8> flag if any.
=item decode_html ( $str )
I<Function>.
Decodes HTML entities in a string encoded by UTF-8 or a Unicode string.
Parameter:
=over
=item $str
String to be decoded.
=back
Returns:
Decoded string, stripped C<utf8> flag if any.
=item encode_filesystem_safe ( $str )
I<Function>.
Encodes a string $str to be suitable for filesystem.
Parameter:
=over
=item $str
String to be encoded.
=back
Returns:
Encoded string, stripped C<utf8> flag if any.
All bytes except C<'-'>, C<'+'>, C<'.'>, C<'@'>
and alphanumeric characters are encoded to sequences C<'_'> followed by
two hexdigits.
Note that C<'/'> will also be encoded.
=item encode_html ( $str, [ $additional_unsafe ] )
I<Function>.
Encodes characters in a string $str to HTML entities.
By default
C<'E<lt>'>, C<'E<gt>'>, C<'E<amp>'> and C<'E<quot>'> are encoded.
Parameter:
=over
=item $str
String to be encoded.
=item $additional_unsafe
Character or range of characters additionally encoded as entity references.
This optional parameter was introduced on Sympa 6.2.37b.3.
=back
Returns:
Encoded string, I<not> stripping utf8 flag if any.
=item encode_uri ( $str, [ omit => $chars ] )
I<Function>.
Encodes potentially unsafe characters in the string using "percent" encoding
suitable for URIs.
Parameters:
=over
=item $str
String to be encoded.
=item omit =E<gt> $chars
By default, all characters except those defined as "unreserved" in RFC 3986
are encoded, that is, C<[^-A-Za-z0-9._~]>.
If this parameter is given, it will prevent encoding additional characters.
=back
Returns:
Encoded string, stripped C<utf8> flag if any.
=item escape_chars ( $str )
Escape weird characters.
ToDo: This should be obsoleted in the future release: Would be better to use
L</encode_filesystem_safe>.
=item escape_url ( $str )
DEPRECATED.
Would be better to use L</"encode_uri"> or L</"mailtourl">.
=item foldcase ( $str )
I<Function>.
Returns "fold-case" string suitable for case-insensitive match.
For example, a code below looks for a needle in haystack not regarding case,
even if they are non-ASCII UTF-8 strings.
$haystack = Sympa::Tools::Text::foldcase($HayStack);
$needle = Sympa::Tools::Text::foldcase($NeedLe);
if (index $haystack, $needle >= 0) {
...
}
Parameter:
=over
=item $str
A string.
=back
=item guessed_to_utf8( $text, [ lang, ... ] )
I<Function>.
Guesses text charset considering language context
and returns the text reencoded by UTF-8.
Parameters:
=over
=item $text
Text to be reencoded.
=item lang, ...
Language tag(s) which may be given by L<Sympa::Language/"implicated_langs">.
=back
Returns:
Reencoded text.
If any charsets could not be guessed, C<iso-8859-1> will be used
as the last resort, just because it covers full range of 8-bit.
=item mailtourl ( $email, [ decode_html =E<gt> 1 ],
[ query =E<gt> {key =E<gt> val, ...} ] )
I<Function>.
Constructs a C<mailto:> URL for given e-mail.
Parameters:
=over
=item $email
E-mail address.
=item decode_html =E<gt> 1
If set, arguments are assumed to include HTML entities.
=item query =E<gt> {key =E<gt> val, ...}
Optional query.
=back
Returns:
Constructed URL.
=item pad ( $str, $width )
Pads space a string so that result will not be narrower than given width.
Parameters:
=over
=item $str
A string.
=item $width
If $width is false value or width of $str is not less than $width,
does nothing.
If $width is less than C<0>, pads right.
Otherwise, pads left.
=back
Returns:
Padded string.
=item qdecode_filename ( $filename )
Q-Decodes web file name.
ToDo:
This should be obsoleted in the future release: Would be better to use
L</decode_filesystem_safe>.
=item qencode_filename ( $filename )
Q-Encodes web file name.
ToDo:
This should be obsoleted in the future release: Would be better to use
L</encode_filesystem_safe>.
=item slurp ( $file )
Get entire content of the file.
Normalization by canonic_text() is applied.
C<$file> is the path to text file.
=item unescape_chars ( $str )
Unescape weird characters.
ToDo: This should be obsoleted in the future release: Would be better to use
L</decode_filesystem_safe>.
=item valid_email ( $string )
Basic check of an email address.
=item weburl ( $base, \@paths, [ decode_html =E<gt> 1 ],
[ fragment =E<gt> $fragment ], [ query =E<gt> \%query ] )
Constructs a C<http:> or C<https:> URL under given base URI.
Parameters:
=over
=item $base
Base URI.
=item \@paths
Additional path components.
=item decode_html =E<gt> 1
If set, arguments are assumed to include HTML entities.
Exception is $base:
It is assumed not to include entities.
=item fragment =E<gt> $fragment
Optional fragment.
=item query =E<gt> \%query
Optional query.
=back
Returns:
A URI.
=item wrap_text ( $text, [ $init_tab, [ $subsequent_tab, [ $cols ] ] ] )
I<Function>.
Returns line-wrapped text.
Parameters:
=over
=item $text
The text to be folded.
=item $init_tab
Indentation prepended to the first line of paragraph.
Default is C<''>, no indentation.
=item $subsequent_tab
Indentation prepended to each subsequent line of folded paragraph.
Default is C<''>, no indentation.
=item $cols
Max number of columns of folded text.
Default is C<78>.
=back
=back
=head1 HISTORY
L<Sympa::Tools::Text> appeared on Sympa 6.2a.41.
decode_filesystem_safe() and encode_filesystem_safe() were added
on Sympa 6.2.10.
decode_html(), encode_html(), encode_uri() and mailtourl()
were added on Sympa 6.2.14, and escape_url() was deprecated.
guessed_to_utf8() and pad() were added on Sympa 6.2.17.
canonic_text() and slurp() were added on Sympa 6.2.53b.
=cut