943 lines
22 KiB
Perl
943 lines
22 KiB
Perl
|
# -*- indent-tabs-mode: nil; -*-
|
||
|
# vim:ft=perl:et:sw=4
|
||
|
# $Id$
|
||
|
|
||
|
# Sympa - SYsteme de Multi-Postage Automatique
|
||
|
#
|
||
|
# Copyright (c) 1997, 1998, 1999 Institut Pasteur & Christophe Wolfhugel
|
||
|
# Copyright (c) 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005,
|
||
|
# 2006, 2007, 2008, 2009, 2010, 2011 Comite Reseau des Universites
|
||
|
# Copyright (c) 2011, 2012, 2013, 2014, 2015, 2016, 2017 GIP RENATER
|
||
|
# Copyright 2018, 2020 The Sympa Community. See the AUTHORS.md
|
||
|
# file at the top-level directory of this distribution and at
|
||
|
# <https://github.com/sympa-community/sympa.git>.
|
||
|
#
|
||
|
# This program is free software; you can redistribute it and/or modify
|
||
|
# it under the terms of the GNU General Public License as published by
|
||
|
# the Free Software Foundation; either version 2 of the License, or
|
||
|
# (at your option) any later version.
|
||
|
#
|
||
|
# This program is distributed in the hope that it will be useful,
|
||
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||
|
# GNU General Public License for more details.
|
||
|
#
|
||
|
# You should have received a copy of the GNU General Public License
|
||
|
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||
|
|
||
|
package Sympa::Tools::Text;
|
||
|
|
||
|
use strict;
|
||
|
use warnings;
|
||
|
use Encode qw();
|
||
|
use English qw(-no_match_vars);
|
||
|
use Encode::MIME::Header; # 'MIME-Q' encoding.
|
||
|
use HTML::Entities qw();
|
||
|
use MIME::EncWords;
|
||
|
use Text::LineFold;
|
||
|
use Unicode::GCString;
|
||
|
use URI::Escape qw();
|
||
|
use if ($] < 5.016), qw(Unicode::CaseFold fc);
|
||
|
use if (5.016 <= $]), qw(feature fc);
|
||
|
BEGIN { eval 'use Unicode::Normalize qw()'; }
|
||
|
BEGIN { eval 'use Unicode::UTF8 qw()'; }
|
||
|
|
||
|
use Sympa::Language;
|
||
|
use Sympa::Regexps;
|
||
|
|
||
|
# Old name: tools::addrencode().
|
||
|
sub addrencode {
|
||
|
my $addr = shift;
|
||
|
my $phrase = (shift || '');
|
||
|
my $charset = (shift || 'utf8');
|
||
|
my $comment = (shift || '');
|
||
|
|
||
|
return undef unless $addr =~ /\S/;
|
||
|
|
||
|
if ($phrase =~ /[^\s\x21-\x7E]/) {
|
||
|
$phrase = MIME::EncWords::encode_mimewords(
|
||
|
Encode::decode('utf8', $phrase),
|
||
|
'Encoding' => 'A',
|
||
|
'Charset' => $charset,
|
||
|
'Replacement' => 'FALLBACK',
|
||
|
'Field' => 'Resent-Sender', # almost longest
|
||
|
'Minimal' => 'DISPNAME', # needs MIME::EncWords >= 1.012.
|
||
|
);
|
||
|
} elsif ($phrase =~ /\S/) {
|
||
|
$phrase =~ s/([\\\"])/\\$1/g;
|
||
|
$phrase = '"' . $phrase . '"';
|
||
|
}
|
||
|
if ($comment =~ /[^\s\x21-\x27\x2A-\x5B\x5D-\x7E]/) {
|
||
|
$comment = MIME::EncWords::encode_mimewords(
|
||
|
Encode::decode('utf8', $comment),
|
||
|
'Encoding' => 'A',
|
||
|
'Charset' => $charset,
|
||
|
'Replacement' => 'FALLBACK',
|
||
|
'Minimal' => 'DISPNAME',
|
||
|
);
|
||
|
} elsif ($comment =~ /\S/) {
|
||
|
$comment =~ s/([\\\"])/\\$1/g;
|
||
|
}
|
||
|
|
||
|
return
|
||
|
($phrase =~ /\S/ ? "$phrase " : '')
|
||
|
. ($comment =~ /\S/ ? "($comment) " : '')
|
||
|
. "<$addr>";
|
||
|
}
|
||
|
|
||
|
# Old names: tools::clean_email(), tools::get_canonical_email().
|
||
|
sub canonic_email {
|
||
|
my $email = shift;
|
||
|
|
||
|
return undef unless defined $email;
|
||
|
|
||
|
# Remove leading and trailing white spaces.
|
||
|
$email =~ s/\A\s+//;
|
||
|
$email =~ s/\s+\z//;
|
||
|
|
||
|
# Lower-case.
|
||
|
$email =~ tr/A-Z/a-z/;
|
||
|
|
||
|
return (length $email) ? $email : undef;
|
||
|
}
|
||
|
|
||
|
# Old name: tools::clean_msg_id().
|
||
|
sub canonic_message_id {
|
||
|
my $msg_id = shift;
|
||
|
|
||
|
return $msg_id unless defined $msg_id;
|
||
|
|
||
|
chomp $msg_id;
|
||
|
|
||
|
if ($msg_id =~ /\<(.+)\>/) {
|
||
|
$msg_id = $1;
|
||
|
}
|
||
|
|
||
|
return $msg_id;
|
||
|
}
|
||
|
|
||
|
sub canonic_text {
|
||
|
my $text = shift;
|
||
|
|
||
|
return undef unless defined $text;
|
||
|
|
||
|
# Normalize text. See also discussion on
|
||
|
# https://listes.renater.fr/sympa/arc/sympa-developpers/2018-03/thrd1.html
|
||
|
#
|
||
|
# N.B.: Corresponding modules are optional by now, and should be
|
||
|
# mandatory in the future.
|
||
|
my $utext;
|
||
|
if (Encode::is_utf8($text)) {
|
||
|
$utext = $text;
|
||
|
} elsif ($Unicode::UTF8::VERSION) {
|
||
|
no warnings 'utf8';
|
||
|
$utext = Unicode::UTF8::decode_utf8($text);
|
||
|
} else {
|
||
|
$utext = Encode::decode_utf8($text);
|
||
|
}
|
||
|
if ($Unicode::Normalize::VERSION) {
|
||
|
$utext = Unicode::Normalize::normalize('NFC', $utext);
|
||
|
}
|
||
|
|
||
|
# Remove DOS linefeeds (^M) that cause problems with Outlook 98, AOL,
|
||
|
# and EIMS:
|
||
|
$utext =~ s/\r\n|\r/\n/g;
|
||
|
|
||
|
if (Encode::is_utf8($text)) {
|
||
|
return $utext;
|
||
|
} else {
|
||
|
return Encode::encode_utf8($utext);
|
||
|
}
|
||
|
}
|
||
|
|
||
|
sub slurp {
|
||
|
my $path = shift;
|
||
|
|
||
|
my $ifh;
|
||
|
return undef unless open $ifh, '<', $path;
|
||
|
my $text = do { local $RS; <$ifh> };
|
||
|
close $ifh;
|
||
|
|
||
|
return canonic_text($text);
|
||
|
}
|
||
|
|
||
|
sub wrap_text {
|
||
|
my $text = shift;
|
||
|
my $init = shift;
|
||
|
my $subs = shift;
|
||
|
my $cols = shift;
|
||
|
|
||
|
$init //= '';
|
||
|
$subs //= '';
|
||
|
$cols //= 78;
|
||
|
return $text unless $cols;
|
||
|
|
||
|
my $email_re = Sympa::Regexps::email();
|
||
|
my $linefold = Text::LineFold->new(
|
||
|
Language => Sympa::Language->instance->get_lang,
|
||
|
Prep => 'NONBREAKURI',
|
||
|
prep => [$email_re, sub { shift; @_ }],
|
||
|
ColumnsMax => $cols,
|
||
|
Format => sub {
|
||
|
shift;
|
||
|
my $event = shift;
|
||
|
my $str = shift;
|
||
|
if ($event =~ /^eo/) { return "\n"; }
|
||
|
if ($event =~ /^so[tp]/) { return $init . $str; }
|
||
|
if ($event eq 'sol') { return $subs . $str; }
|
||
|
undef;
|
||
|
},
|
||
|
);
|
||
|
|
||
|
my $t = Encode::is_utf8($text) ? $text : Encode::decode_utf8($text);
|
||
|
|
||
|
my $ret = '';
|
||
|
while (1000 < length $t) {
|
||
|
my $s = substr $t, 0, 1000;
|
||
|
$ret .= $linefold->break_partial($s);
|
||
|
$t = substr $t, 1000;
|
||
|
}
|
||
|
$ret .= $linefold->break_partial($t) if length $t;
|
||
|
$ret .= $linefold->break_partial(undef);
|
||
|
|
||
|
return Encode::is_utf8($text) ? $ret : Encode::encode_utf8($ret);
|
||
|
}
|
||
|
|
||
|
sub decode_filesystem_safe {
|
||
|
my $str = shift;
|
||
|
return '' unless defined $str and length $str;
|
||
|
|
||
|
$str = Encode::encode_utf8($str) if Encode::is_utf8($str);
|
||
|
# On case-insensitive filesystem "_XX" along with "_xx" should be decoded.
|
||
|
$str =~ s/_([0-9A-Fa-f]{2})/chr hex "0x$1"/eg;
|
||
|
return $str;
|
||
|
}
|
||
|
|
||
|
sub decode_html {
|
||
|
my $str = shift;
|
||
|
|
||
|
Encode::encode_utf8(
|
||
|
HTML::Entities::decode_entities(Encode::decode_utf8($str)));
|
||
|
}
|
||
|
|
||
|
sub encode_filesystem_safe {
|
||
|
my $str = shift;
|
||
|
return '' unless defined $str and length $str;
|
||
|
|
||
|
$str = Encode::encode_utf8($str) if Encode::is_utf8($str);
|
||
|
$str =~ s/([^-+.0-9\@A-Za-z])/sprintf '_%02x', ord $1/eg;
|
||
|
return $str;
|
||
|
}
|
||
|
|
||
|
sub encode_html {
|
||
|
my $str = shift;
|
||
|
my $additional_unsafe = shift || '';
|
||
|
|
||
|
HTML::Entities::encode_entities($str, '<>&"' . $additional_unsafe);
|
||
|
}
|
||
|
|
||
|
sub encode_uri {
|
||
|
my $str = shift;
|
||
|
my %options = @_;
|
||
|
|
||
|
# Note: URI-1.35 (URI::Escape 3.28) or later is required.
|
||
|
return Encode::encode_utf8(
|
||
|
URI::Escape::uri_escape_utf8(
|
||
|
Encode::decode_utf8($str),
|
||
|
'^-A-Za-z0-9._~' . (exists $options{omit} ? $options{omit} : '')
|
||
|
)
|
||
|
);
|
||
|
}
|
||
|
|
||
|
# Old name: tools::escape_chars().
|
||
|
sub escape_chars {
|
||
|
my $s = shift;
|
||
|
my $except = shift; ## Exceptions
|
||
|
my $ord_except = ord $except if defined $except;
|
||
|
|
||
|
## Escape chars
|
||
|
## !"#$%&'()+,:;<=>?[] AND accented chars
|
||
|
## escape % first
|
||
|
foreach my $i (
|
||
|
0x25,
|
||
|
0x20 .. 0x24,
|
||
|
0x26 .. 0x2c,
|
||
|
0x3a .. 0x3f,
|
||
|
0x5b, 0x5d,
|
||
|
0x80 .. 0x9f,
|
||
|
0xa0 .. 0xff
|
||
|
) {
|
||
|
next if defined $ord_except and $i == $ord_except;
|
||
|
my $hex_i = sprintf "%lx", $i;
|
||
|
$s =~ s/\x$hex_i/%$hex_i/g;
|
||
|
}
|
||
|
## Special traetment for '/'
|
||
|
$s =~ s/\//%a5/g unless defined $except and $except eq '/';
|
||
|
|
||
|
return $s;
|
||
|
}
|
||
|
|
||
|
# Old name: tt2::escape_url().
|
||
|
# DEPRECATED. Use Sympa::Tools::Text::escape_uri() or
|
||
|
# Sympa::Tools::Text::mailtourl().
|
||
|
#sub escape_url;
|
||
|
|
||
|
sub foldcase {
|
||
|
my $str = shift;
|
||
|
|
||
|
return '' unless defined $str and length $str;
|
||
|
# Perl 5.16.0 and later have built-in fc(). Earlier uses Unicode::CaseFold.
|
||
|
return Encode::encode_utf8(fc(Encode::decode_utf8($str)));
|
||
|
}
|
||
|
|
||
|
my %legacy_charsets = (
|
||
|
'ar' => [qw(iso-8859-6)],
|
||
|
'bs' => [qw(iso-8859-2)],
|
||
|
'cs' => [qw(iso-8859-2)],
|
||
|
'eo' => [qw(iso-8859-3)],
|
||
|
'et' => [qw(iso-8859-4)],
|
||
|
'he' => [qw(iso-8859-8)],
|
||
|
'hr' => [qw(iso-8859-2)],
|
||
|
'hu' => [qw(iso-8859-2)],
|
||
|
'ja' => [qw(euc-jp cp932 MacJapanese)],
|
||
|
'kl' => [qw(iso-8859-4)],
|
||
|
'ko' => [qw(cp949)],
|
||
|
'lt' => [qw(iso-8859-4)],
|
||
|
'lv' => [qw(iso-8859-4)],
|
||
|
'mt' => [qw(iso-8859-3)],
|
||
|
'pl' => [qw(iso-8859-2)],
|
||
|
'ro' => [qw(iso-8859-2)],
|
||
|
'ru' => [qw(koi8-r cp1251)], # cp866? MacCyrillic?
|
||
|
'sk' => [qw(iso-8859-2)],
|
||
|
'sl' => [qw(iso-8859-2)],
|
||
|
'th' => [qw(iso-8859-11 cp874 MacThai)],
|
||
|
'tr' => [qw(iso-8859-9)],
|
||
|
'uk' => [qw(koi8-u)], # MacUkrainian?
|
||
|
'zh-CN' => [qw(euc-cn)],
|
||
|
'zh-TW' => [qw(big5-eten)],
|
||
|
);
|
||
|
|
||
|
sub guessed_to_utf8 {
|
||
|
my $text = shift;
|
||
|
my @langs = @_;
|
||
|
|
||
|
return Encode::encode_utf8($text) if Encode::is_utf8($text);
|
||
|
return $text
|
||
|
unless defined $text
|
||
|
and length $text
|
||
|
and $text =~ /[^\x00-\x7F]/;
|
||
|
|
||
|
my $utf8;
|
||
|
if ($Unicode::UTF8::VERSION) {
|
||
|
$utf8 =
|
||
|
eval { Unicode::UTF8::decode_utf8($text, Encode::FB_CROAK()) };
|
||
|
}
|
||
|
unless (defined $utf8) {
|
||
|
foreach my $charset (map { $_ ? @$_ : () } @legacy_charsets{@langs}) {
|
||
|
$utf8 =
|
||
|
eval { Encode::decode($charset, $text, Encode::FB_CROAK()) };
|
||
|
last if defined $utf8;
|
||
|
}
|
||
|
}
|
||
|
unless (defined $utf8) {
|
||
|
$utf8 = Encode::decode('iso-8859-1', $text);
|
||
|
}
|
||
|
|
||
|
# Apply NFC: e.g. for modified-NFD by Mac OS X.
|
||
|
$utf8 = Unicode::Normalize::normalize('NFC', $utf8)
|
||
|
if $Unicode::Normalize::VERSION;
|
||
|
|
||
|
return Encode::encode_utf8($utf8);
|
||
|
}
|
||
|
|
||
|
sub mailtourl {
|
||
|
my $text = shift;
|
||
|
my %options = @_;
|
||
|
|
||
|
my $dtext =
|
||
|
(not defined $text) ? ''
|
||
|
: $options{decode_html} ? Sympa::Tools::Text::decode_html($text)
|
||
|
: $text;
|
||
|
$dtext =~ s/\A\s+//;
|
||
|
$dtext =~ s/\s+\z//;
|
||
|
$dtext =~ s/(?:\r\n|\r|\n)(?=[ \t])//g;
|
||
|
$dtext =~ s/\r\n|\r|\n/ /g;
|
||
|
|
||
|
# The ``@'' in email address should not be encoded because some MUAs
|
||
|
# aren't able to decode ``%40'' in e-mail address of mailto: URL.
|
||
|
# Contrary, ``@'' in query component should be encoded because some
|
||
|
# MUAs take it for a delimiter to separate URL from the rest.
|
||
|
my ($format, $utext, $qsep);
|
||
|
if ($dtext =~ /[()<>\[\]:;,\"\s]/) {
|
||
|
# Use "to" header if source text includes any of RFC 5322
|
||
|
# "specials", minus ``@'' and ``\'', plus whitespaces.
|
||
|
$format = 'mailto:?to=%s%s';
|
||
|
$utext = Sympa::Tools::Text::encode_uri($dtext);
|
||
|
$qsep = '&';
|
||
|
} else {
|
||
|
$format = 'mailto:%s%s';
|
||
|
$utext = Sympa::Tools::Text::encode_uri($dtext, omit => '@');
|
||
|
$qsep = '?';
|
||
|
}
|
||
|
my $qstring = _url_query_string(
|
||
|
$options{query},
|
||
|
decode_html => $options{decode_html},
|
||
|
leadchar => $qsep,
|
||
|
sepchar => '&',
|
||
|
trim_values => 1,
|
||
|
);
|
||
|
|
||
|
return sprintf $format, $utext, $qstring;
|
||
|
}
|
||
|
|
||
|
sub _url_query_string {
|
||
|
my $query = shift;
|
||
|
my %options = @_;
|
||
|
|
||
|
unless (ref $query eq 'HASH' and %$query) {
|
||
|
return '';
|
||
|
} else {
|
||
|
my $decode_html = $options{decode_html};
|
||
|
my $trim_values = $options{trim_values};
|
||
|
return ($options{leadchar} || '?') . join(
|
||
|
($options{sepchar} || ';'),
|
||
|
map {
|
||
|
my ($dkey, $dval) = map {
|
||
|
(not defined $_) ? ''
|
||
|
: $decode_html ? Sympa::Tools::Text::decode_html($_)
|
||
|
: $_;
|
||
|
} ($_, $query->{$_});
|
||
|
if ($trim_values and lc $dkey ne 'body') {
|
||
|
$dval =~ s/\A\s+//;
|
||
|
$dval =~ s/\s+\z//;
|
||
|
$dval =~ s/(?:\r\n|\r|\n)(?=[ \t])//g;
|
||
|
$dval =~ s/\r\n|\r|\n/ /g;
|
||
|
}
|
||
|
|
||
|
sprintf '%s=%s',
|
||
|
Sympa::Tools::Text::encode_uri($dkey),
|
||
|
Sympa::Tools::Text::encode_uri($dval);
|
||
|
} sort keys %$query
|
||
|
);
|
||
|
}
|
||
|
}
|
||
|
|
||
|
sub pad {
|
||
|
my $str = shift;
|
||
|
my $width = shift;
|
||
|
|
||
|
return $str unless $width and defined $str;
|
||
|
|
||
|
my $ustr = Encode::is_utf8($str) ? $str : Encode::decode_utf8($str);
|
||
|
my $cols = Unicode::GCString->new($ustr)->columns;
|
||
|
|
||
|
unless ($cols < abs $width) {
|
||
|
return $str;
|
||
|
} elsif ($width < 0) {
|
||
|
return $str . (' ' x (-$width - $cols));
|
||
|
} else {
|
||
|
return (' ' x ($width - $cols)) . $str;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
# Old name: tools::qdecode_filename().
|
||
|
sub qdecode_filename {
|
||
|
my $filename = shift;
|
||
|
|
||
|
## We don't use MIME::Words here because it does not encode properly
|
||
|
## Unicode
|
||
|
## Check if string is already Q-encoded first
|
||
|
#if ($filename =~ /\=\?UTF-8\?/) {
|
||
|
$filename = Encode::encode_utf8(Encode::decode('MIME-Q', $filename));
|
||
|
#}
|
||
|
|
||
|
return $filename;
|
||
|
}
|
||
|
|
||
|
# Old name: tools::qencode_filename().
|
||
|
sub qencode_filename {
|
||
|
my $filename = shift;
|
||
|
|
||
|
## We don't use MIME::Words here because it does not encode properly
|
||
|
## Unicode
|
||
|
## Check if string is already Q-encoded first
|
||
|
## Also check if the string contains 8bit chars
|
||
|
unless ($filename =~ /\=\?UTF-8\?/
|
||
|
|| $filename =~ /^[\x00-\x7f]*$/) {
|
||
|
|
||
|
## Don't encode elements such as .desc. or .url or .moderate
|
||
|
## or .extension
|
||
|
my $part = $filename;
|
||
|
my ($leading, $trailing);
|
||
|
$leading = $1 if ($part =~ s/^(\.desc\.)//); ## leading .desc
|
||
|
$trailing = $1 if ($part =~ s/((\.\w+)+)$//); ## trailing .xx
|
||
|
|
||
|
my $encoded_part = MIME::EncWords::encode_mimewords(
|
||
|
$part,
|
||
|
Charset => 'utf8',
|
||
|
Encoding => 'q',
|
||
|
MaxLineLen => 1000,
|
||
|
Minimal => 'NO'
|
||
|
);
|
||
|
|
||
|
$filename = $leading . $encoded_part . $trailing;
|
||
|
}
|
||
|
|
||
|
return $filename;
|
||
|
}
|
||
|
|
||
|
# Old name: tools::unescape_chars().
|
||
|
sub unescape_chars {
|
||
|
my $s = shift;
|
||
|
|
||
|
$s =~ s/%a5/\//g; ## Special traetment for '/'
|
||
|
foreach my $i (0x20 .. 0x2c, 0x3a .. 0x3f, 0x5b, 0x5d, 0x80 .. 0x9f,
|
||
|
0xa0 .. 0xff) {
|
||
|
my $hex_i = sprintf "%lx", $i;
|
||
|
my $hex_s = sprintf "%c", $i;
|
||
|
$s =~ s/%$hex_i/$hex_s/g;
|
||
|
}
|
||
|
|
||
|
return $s;
|
||
|
}
|
||
|
|
||
|
# Old name: tools::valid_email().
|
||
|
sub valid_email {
|
||
|
my $email = shift;
|
||
|
|
||
|
my $email_re = Sympa::Regexps::email();
|
||
|
return undef unless $email =~ /^${email_re}$/;
|
||
|
|
||
|
# Forbidden characters.
|
||
|
return undef if $email =~ /[\|\$\*\?\!]/;
|
||
|
|
||
|
return 1;
|
||
|
}
|
||
|
|
||
|
sub weburl {
|
||
|
my $base = shift;
|
||
|
my $paths = shift;
|
||
|
my %options = @_;
|
||
|
|
||
|
my @paths = map {
|
||
|
Sympa::Tools::Text::encode_uri(
|
||
|
(not defined $_) ? ''
|
||
|
: $options{decode_html} ? Sympa::Tools::Text::decode_html($_)
|
||
|
: $_
|
||
|
);
|
||
|
} @{$paths || []};
|
||
|
|
||
|
my $qstring = _url_query_string(
|
||
|
$options{query},
|
||
|
decode_html => $options{decode_html},
|
||
|
sepchar => '&',
|
||
|
);
|
||
|
|
||
|
my $fstring;
|
||
|
my $fragment = $options{fragment};
|
||
|
if (defined $fragment) {
|
||
|
$fstring = '#'
|
||
|
. Sympa::Tools::Text::encode_uri(
|
||
|
$options{decode_html}
|
||
|
? Sympa::Tools::Text::decode_html($fragment)
|
||
|
: $fragment
|
||
|
);
|
||
|
} else {
|
||
|
$fstring = '';
|
||
|
}
|
||
|
|
||
|
return sprintf '%s%s%s', join('/', grep { defined $_ } ($base, @paths)),
|
||
|
$qstring, $fstring;
|
||
|
}
|
||
|
|
||
|
1;
|
||
|
__END__
|
||
|
|
||
|
=encoding utf-8
|
||
|
|
||
|
=head1 NAME
|
||
|
|
||
|
Sympa::Tools::Text - Text-related functions
|
||
|
|
||
|
=head1 DESCRIPTION
|
||
|
|
||
|
This package provides some text-related functions.
|
||
|
|
||
|
=head2 Functions
|
||
|
|
||
|
=over
|
||
|
|
||
|
=item addrencode ( $addr, [ $phrase, [ $charset, [ $comment ] ] ] )
|
||
|
|
||
|
Returns formatted (and encoded) name-addr as RFC5322 3.4.
|
||
|
|
||
|
=item canonic_email ( $email )
|
||
|
|
||
|
I<Function>.
|
||
|
Returns canonical form of e-mail address.
|
||
|
|
||
|
Leading and trailing white spaces are removed.
|
||
|
Latin letters without accents are lower-cased.
|
||
|
|
||
|
For malformed inputs returns C<undef>.
|
||
|
|
||
|
=item canonic_message_id ( $message_id )
|
||
|
|
||
|
Returns canonical form of message ID without trailing or leading whitespaces
|
||
|
or C<E<lt>>, C<E<gt>>.
|
||
|
|
||
|
=item canonic_text ( $text )
|
||
|
|
||
|
Canonicalizes text.
|
||
|
C<$text> should be a binary string encoded by UTF-8 character set or
|
||
|
a Unicode string.
|
||
|
Forbidden sequences in binary string will be replaced by
|
||
|
U+FFFD REPLACEMENT CHARACTERs, and Normalization Form C (NFC) will be applied.
|
||
|
|
||
|
=item decode_filesystem_safe ( $str )
|
||
|
|
||
|
I<Function>.
|
||
|
Decodes a string encoded by encode_filesystem_safe().
|
||
|
|
||
|
Parameter:
|
||
|
|
||
|
=over
|
||
|
|
||
|
=item $str
|
||
|
|
||
|
String to be decoded.
|
||
|
|
||
|
=back
|
||
|
|
||
|
Returns:
|
||
|
|
||
|
Decoded string, stripped C<utf8> flag if any.
|
||
|
|
||
|
=item decode_html ( $str )
|
||
|
|
||
|
I<Function>.
|
||
|
Decodes HTML entities in a string encoded by UTF-8 or a Unicode string.
|
||
|
|
||
|
Parameter:
|
||
|
|
||
|
=over
|
||
|
|
||
|
=item $str
|
||
|
|
||
|
String to be decoded.
|
||
|
|
||
|
=back
|
||
|
|
||
|
Returns:
|
||
|
|
||
|
Decoded string, stripped C<utf8> flag if any.
|
||
|
|
||
|
=item encode_filesystem_safe ( $str )
|
||
|
|
||
|
I<Function>.
|
||
|
Encodes a string $str to be suitable for filesystem.
|
||
|
|
||
|
Parameter:
|
||
|
|
||
|
=over
|
||
|
|
||
|
=item $str
|
||
|
|
||
|
String to be encoded.
|
||
|
|
||
|
=back
|
||
|
|
||
|
Returns:
|
||
|
|
||
|
Encoded string, stripped C<utf8> flag if any.
|
||
|
All bytes except C<'-'>, C<'+'>, C<'.'>, C<'@'>
|
||
|
and alphanumeric characters are encoded to sequences C<'_'> followed by
|
||
|
two hexdigits.
|
||
|
|
||
|
Note that C<'/'> will also be encoded.
|
||
|
|
||
|
=item encode_html ( $str, [ $additional_unsafe ] )
|
||
|
|
||
|
I<Function>.
|
||
|
Encodes characters in a string $str to HTML entities.
|
||
|
By default
|
||
|
C<'E<lt>'>, C<'E<gt>'>, C<'E<amp>'> and C<'E<quot>'> are encoded.
|
||
|
|
||
|
Parameter:
|
||
|
|
||
|
=over
|
||
|
|
||
|
=item $str
|
||
|
|
||
|
String to be encoded.
|
||
|
|
||
|
=item $additional_unsafe
|
||
|
|
||
|
Character or range of characters additionally encoded as entity references.
|
||
|
|
||
|
This optional parameter was introduced on Sympa 6.2.37b.3.
|
||
|
|
||
|
=back
|
||
|
|
||
|
Returns:
|
||
|
|
||
|
Encoded string, I<not> stripping utf8 flag if any.
|
||
|
|
||
|
=item encode_uri ( $str, [ omit => $chars ] )
|
||
|
|
||
|
I<Function>.
|
||
|
Encodes potentially unsafe characters in the string using "percent" encoding
|
||
|
suitable for URIs.
|
||
|
|
||
|
Parameters:
|
||
|
|
||
|
=over
|
||
|
|
||
|
=item $str
|
||
|
|
||
|
String to be encoded.
|
||
|
|
||
|
=item omit =E<gt> $chars
|
||
|
|
||
|
By default, all characters except those defined as "unreserved" in RFC 3986
|
||
|
are encoded, that is, C<[^-A-Za-z0-9._~]>.
|
||
|
If this parameter is given, it will prevent encoding additional characters.
|
||
|
|
||
|
=back
|
||
|
|
||
|
Returns:
|
||
|
|
||
|
Encoded string, stripped C<utf8> flag if any.
|
||
|
|
||
|
=item escape_chars ( $str )
|
||
|
|
||
|
Escape weird characters.
|
||
|
|
||
|
ToDo: This should be obsoleted in the future release: Would be better to use
|
||
|
L</encode_filesystem_safe>.
|
||
|
|
||
|
=item escape_url ( $str )
|
||
|
|
||
|
DEPRECATED.
|
||
|
Would be better to use L</"encode_uri"> or L</"mailtourl">.
|
||
|
|
||
|
=item foldcase ( $str )
|
||
|
|
||
|
I<Function>.
|
||
|
Returns "fold-case" string suitable for case-insensitive match.
|
||
|
For example, a code below looks for a needle in haystack not regarding case,
|
||
|
even if they are non-ASCII UTF-8 strings.
|
||
|
|
||
|
$haystack = Sympa::Tools::Text::foldcase($HayStack);
|
||
|
$needle = Sympa::Tools::Text::foldcase($NeedLe);
|
||
|
if (index $haystack, $needle >= 0) {
|
||
|
...
|
||
|
}
|
||
|
|
||
|
Parameter:
|
||
|
|
||
|
=over
|
||
|
|
||
|
=item $str
|
||
|
|
||
|
A string.
|
||
|
|
||
|
=back
|
||
|
|
||
|
=item guessed_to_utf8( $text, [ lang, ... ] )
|
||
|
|
||
|
I<Function>.
|
||
|
Guesses text charset considering language context
|
||
|
and returns the text reencoded by UTF-8.
|
||
|
|
||
|
Parameters:
|
||
|
|
||
|
=over
|
||
|
|
||
|
=item $text
|
||
|
|
||
|
Text to be reencoded.
|
||
|
|
||
|
=item lang, ...
|
||
|
|
||
|
Language tag(s) which may be given by L<Sympa::Language/"implicated_langs">.
|
||
|
|
||
|
=back
|
||
|
|
||
|
Returns:
|
||
|
|
||
|
Reencoded text.
|
||
|
If any charsets could not be guessed, C<iso-8859-1> will be used
|
||
|
as the last resort, just because it covers full range of 8-bit.
|
||
|
|
||
|
=item mailtourl ( $email, [ decode_html =E<gt> 1 ],
|
||
|
[ query =E<gt> {key =E<gt> val, ...} ] )
|
||
|
|
||
|
I<Function>.
|
||
|
Constructs a C<mailto:> URL for given e-mail.
|
||
|
|
||
|
Parameters:
|
||
|
|
||
|
=over
|
||
|
|
||
|
=item $email
|
||
|
|
||
|
E-mail address.
|
||
|
|
||
|
=item decode_html =E<gt> 1
|
||
|
|
||
|
If set, arguments are assumed to include HTML entities.
|
||
|
|
||
|
=item query =E<gt> {key =E<gt> val, ...}
|
||
|
|
||
|
Optional query.
|
||
|
|
||
|
=back
|
||
|
|
||
|
Returns:
|
||
|
|
||
|
Constructed URL.
|
||
|
|
||
|
=item pad ( $str, $width )
|
||
|
|
||
|
Pads space a string so that result will not be narrower than given width.
|
||
|
|
||
|
Parameters:
|
||
|
|
||
|
=over
|
||
|
|
||
|
=item $str
|
||
|
|
||
|
A string.
|
||
|
|
||
|
=item $width
|
||
|
|
||
|
If $width is false value or width of $str is not less than $width,
|
||
|
does nothing.
|
||
|
If $width is less than C<0>, pads right.
|
||
|
Otherwise, pads left.
|
||
|
|
||
|
=back
|
||
|
|
||
|
Returns:
|
||
|
|
||
|
Padded string.
|
||
|
|
||
|
=item qdecode_filename ( $filename )
|
||
|
|
||
|
Q-Decodes web file name.
|
||
|
|
||
|
ToDo:
|
||
|
This should be obsoleted in the future release: Would be better to use
|
||
|
L</decode_filesystem_safe>.
|
||
|
|
||
|
=item qencode_filename ( $filename )
|
||
|
|
||
|
Q-Encodes web file name.
|
||
|
|
||
|
ToDo:
|
||
|
This should be obsoleted in the future release: Would be better to use
|
||
|
L</encode_filesystem_safe>.
|
||
|
|
||
|
=item slurp ( $file )
|
||
|
|
||
|
Get entire content of the file.
|
||
|
Normalization by canonic_text() is applied.
|
||
|
C<$file> is the path to text file.
|
||
|
|
||
|
=item unescape_chars ( $str )
|
||
|
|
||
|
Unescape weird characters.
|
||
|
|
||
|
ToDo: This should be obsoleted in the future release: Would be better to use
|
||
|
L</decode_filesystem_safe>.
|
||
|
|
||
|
=item valid_email ( $string )
|
||
|
|
||
|
Basic check of an email address.
|
||
|
|
||
|
=item weburl ( $base, \@paths, [ decode_html =E<gt> 1 ],
|
||
|
[ fragment =E<gt> $fragment ], [ query =E<gt> \%query ] )
|
||
|
|
||
|
Constructs a C<http:> or C<https:> URL under given base URI.
|
||
|
|
||
|
Parameters:
|
||
|
|
||
|
=over
|
||
|
|
||
|
=item $base
|
||
|
|
||
|
Base URI.
|
||
|
|
||
|
=item \@paths
|
||
|
|
||
|
Additional path components.
|
||
|
|
||
|
=item decode_html =E<gt> 1
|
||
|
|
||
|
If set, arguments are assumed to include HTML entities.
|
||
|
Exception is $base:
|
||
|
It is assumed not to include entities.
|
||
|
|
||
|
=item fragment =E<gt> $fragment
|
||
|
|
||
|
Optional fragment.
|
||
|
|
||
|
=item query =E<gt> \%query
|
||
|
|
||
|
Optional query.
|
||
|
|
||
|
=back
|
||
|
|
||
|
Returns:
|
||
|
|
||
|
A URI.
|
||
|
|
||
|
=item wrap_text ( $text, [ $init_tab, [ $subsequent_tab, [ $cols ] ] ] )
|
||
|
|
||
|
I<Function>.
|
||
|
Returns line-wrapped text.
|
||
|
|
||
|
Parameters:
|
||
|
|
||
|
=over
|
||
|
|
||
|
=item $text
|
||
|
|
||
|
The text to be folded.
|
||
|
|
||
|
=item $init_tab
|
||
|
|
||
|
Indentation prepended to the first line of paragraph.
|
||
|
Default is C<''>, no indentation.
|
||
|
|
||
|
=item $subsequent_tab
|
||
|
|
||
|
Indentation prepended to each subsequent line of folded paragraph.
|
||
|
Default is C<''>, no indentation.
|
||
|
|
||
|
=item $cols
|
||
|
|
||
|
Max number of columns of folded text.
|
||
|
Default is C<78>.
|
||
|
|
||
|
=back
|
||
|
|
||
|
=back
|
||
|
|
||
|
=head1 HISTORY
|
||
|
|
||
|
L<Sympa::Tools::Text> appeared on Sympa 6.2a.41.
|
||
|
|
||
|
decode_filesystem_safe() and encode_filesystem_safe() were added
|
||
|
on Sympa 6.2.10.
|
||
|
|
||
|
decode_html(), encode_html(), encode_uri() and mailtourl()
|
||
|
were added on Sympa 6.2.14, and escape_url() was deprecated.
|
||
|
|
||
|
guessed_to_utf8() and pad() were added on Sympa 6.2.17.
|
||
|
|
||
|
canonic_text() and slurp() were added on Sympa 6.2.53b.
|
||
|
|
||
|
=cut
|