first commit
This commit is contained in:
942
dockers/apikaz/source/Sympa/Tools/Text.pm
Normal file
942
dockers/apikaz/source/Sympa/Tools/Text.pm
Normal file
@ -0,0 +1,942 @@
|
||||
# -*- indent-tabs-mode: nil; -*-
|
||||
# vim:ft=perl:et:sw=4
|
||||
# $Id$
|
||||
|
||||
# Sympa - SYsteme de Multi-Postage Automatique
|
||||
#
|
||||
# Copyright (c) 1997, 1998, 1999 Institut Pasteur & Christophe Wolfhugel
|
||||
# Copyright (c) 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005,
|
||||
# 2006, 2007, 2008, 2009, 2010, 2011 Comite Reseau des Universites
|
||||
# Copyright (c) 2011, 2012, 2013, 2014, 2015, 2016, 2017 GIP RENATER
|
||||
# Copyright 2018, 2020 The Sympa Community. See the AUTHORS.md
|
||||
# file at the top-level directory of this distribution and at
|
||||
# <https://github.com/sympa-community/sympa.git>.
|
||||
#
|
||||
# This program is free software; you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License as published by
|
||||
# the Free Software Foundation; either version 2 of the License, or
|
||||
# (at your option) any later version.
|
||||
#
|
||||
# This program is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
# GNU General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU General Public License
|
||||
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
package Sympa::Tools::Text;
|
||||
|
||||
use strict;
|
||||
use warnings;
|
||||
use Encode qw();
|
||||
use English qw(-no_match_vars);
|
||||
use Encode::MIME::Header; # 'MIME-Q' encoding.
|
||||
use HTML::Entities qw();
|
||||
use MIME::EncWords;
|
||||
use Text::LineFold;
|
||||
use Unicode::GCString;
|
||||
use URI::Escape qw();
|
||||
use if ($] < 5.016), qw(Unicode::CaseFold fc);
|
||||
use if (5.016 <= $]), qw(feature fc);
|
||||
BEGIN { eval 'use Unicode::Normalize qw()'; }
|
||||
BEGIN { eval 'use Unicode::UTF8 qw()'; }
|
||||
|
||||
use Sympa::Language;
|
||||
use Sympa::Regexps;
|
||||
|
||||
# Old name: tools::addrencode().
|
||||
sub addrencode {
|
||||
my $addr = shift;
|
||||
my $phrase = (shift || '');
|
||||
my $charset = (shift || 'utf8');
|
||||
my $comment = (shift || '');
|
||||
|
||||
return undef unless $addr =~ /\S/;
|
||||
|
||||
if ($phrase =~ /[^\s\x21-\x7E]/) {
|
||||
$phrase = MIME::EncWords::encode_mimewords(
|
||||
Encode::decode('utf8', $phrase),
|
||||
'Encoding' => 'A',
|
||||
'Charset' => $charset,
|
||||
'Replacement' => 'FALLBACK',
|
||||
'Field' => 'Resent-Sender', # almost longest
|
||||
'Minimal' => 'DISPNAME', # needs MIME::EncWords >= 1.012.
|
||||
);
|
||||
} elsif ($phrase =~ /\S/) {
|
||||
$phrase =~ s/([\\\"])/\\$1/g;
|
||||
$phrase = '"' . $phrase . '"';
|
||||
}
|
||||
if ($comment =~ /[^\s\x21-\x27\x2A-\x5B\x5D-\x7E]/) {
|
||||
$comment = MIME::EncWords::encode_mimewords(
|
||||
Encode::decode('utf8', $comment),
|
||||
'Encoding' => 'A',
|
||||
'Charset' => $charset,
|
||||
'Replacement' => 'FALLBACK',
|
||||
'Minimal' => 'DISPNAME',
|
||||
);
|
||||
} elsif ($comment =~ /\S/) {
|
||||
$comment =~ s/([\\\"])/\\$1/g;
|
||||
}
|
||||
|
||||
return
|
||||
($phrase =~ /\S/ ? "$phrase " : '')
|
||||
. ($comment =~ /\S/ ? "($comment) " : '')
|
||||
. "<$addr>";
|
||||
}
|
||||
|
||||
# Old names: tools::clean_email(), tools::get_canonical_email().
|
||||
sub canonic_email {
|
||||
my $email = shift;
|
||||
|
||||
return undef unless defined $email;
|
||||
|
||||
# Remove leading and trailing white spaces.
|
||||
$email =~ s/\A\s+//;
|
||||
$email =~ s/\s+\z//;
|
||||
|
||||
# Lower-case.
|
||||
$email =~ tr/A-Z/a-z/;
|
||||
|
||||
return (length $email) ? $email : undef;
|
||||
}
|
||||
|
||||
# Old name: tools::clean_msg_id().
|
||||
sub canonic_message_id {
|
||||
my $msg_id = shift;
|
||||
|
||||
return $msg_id unless defined $msg_id;
|
||||
|
||||
chomp $msg_id;
|
||||
|
||||
if ($msg_id =~ /\<(.+)\>/) {
|
||||
$msg_id = $1;
|
||||
}
|
||||
|
||||
return $msg_id;
|
||||
}
|
||||
|
||||
sub canonic_text {
|
||||
my $text = shift;
|
||||
|
||||
return undef unless defined $text;
|
||||
|
||||
# Normalize text. See also discussion on
|
||||
# https://listes.renater.fr/sympa/arc/sympa-developpers/2018-03/thrd1.html
|
||||
#
|
||||
# N.B.: Corresponding modules are optional by now, and should be
|
||||
# mandatory in the future.
|
||||
my $utext;
|
||||
if (Encode::is_utf8($text)) {
|
||||
$utext = $text;
|
||||
} elsif ($Unicode::UTF8::VERSION) {
|
||||
no warnings 'utf8';
|
||||
$utext = Unicode::UTF8::decode_utf8($text);
|
||||
} else {
|
||||
$utext = Encode::decode_utf8($text);
|
||||
}
|
||||
if ($Unicode::Normalize::VERSION) {
|
||||
$utext = Unicode::Normalize::normalize('NFC', $utext);
|
||||
}
|
||||
|
||||
# Remove DOS linefeeds (^M) that cause problems with Outlook 98, AOL,
|
||||
# and EIMS:
|
||||
$utext =~ s/\r\n|\r/\n/g;
|
||||
|
||||
if (Encode::is_utf8($text)) {
|
||||
return $utext;
|
||||
} else {
|
||||
return Encode::encode_utf8($utext);
|
||||
}
|
||||
}
|
||||
|
||||
sub slurp {
|
||||
my $path = shift;
|
||||
|
||||
my $ifh;
|
||||
return undef unless open $ifh, '<', $path;
|
||||
my $text = do { local $RS; <$ifh> };
|
||||
close $ifh;
|
||||
|
||||
return canonic_text($text);
|
||||
}
|
||||
|
||||
sub wrap_text {
|
||||
my $text = shift;
|
||||
my $init = shift;
|
||||
my $subs = shift;
|
||||
my $cols = shift;
|
||||
|
||||
$init //= '';
|
||||
$subs //= '';
|
||||
$cols //= 78;
|
||||
return $text unless $cols;
|
||||
|
||||
my $email_re = Sympa::Regexps::email();
|
||||
my $linefold = Text::LineFold->new(
|
||||
Language => Sympa::Language->instance->get_lang,
|
||||
Prep => 'NONBREAKURI',
|
||||
prep => [$email_re, sub { shift; @_ }],
|
||||
ColumnsMax => $cols,
|
||||
Format => sub {
|
||||
shift;
|
||||
my $event = shift;
|
||||
my $str = shift;
|
||||
if ($event =~ /^eo/) { return "\n"; }
|
||||
if ($event =~ /^so[tp]/) { return $init . $str; }
|
||||
if ($event eq 'sol') { return $subs . $str; }
|
||||
undef;
|
||||
},
|
||||
);
|
||||
|
||||
my $t = Encode::is_utf8($text) ? $text : Encode::decode_utf8($text);
|
||||
|
||||
my $ret = '';
|
||||
while (1000 < length $t) {
|
||||
my $s = substr $t, 0, 1000;
|
||||
$ret .= $linefold->break_partial($s);
|
||||
$t = substr $t, 1000;
|
||||
}
|
||||
$ret .= $linefold->break_partial($t) if length $t;
|
||||
$ret .= $linefold->break_partial(undef);
|
||||
|
||||
return Encode::is_utf8($text) ? $ret : Encode::encode_utf8($ret);
|
||||
}
|
||||
|
||||
sub decode_filesystem_safe {
|
||||
my $str = shift;
|
||||
return '' unless defined $str and length $str;
|
||||
|
||||
$str = Encode::encode_utf8($str) if Encode::is_utf8($str);
|
||||
# On case-insensitive filesystem "_XX" along with "_xx" should be decoded.
|
||||
$str =~ s/_([0-9A-Fa-f]{2})/chr hex "0x$1"/eg;
|
||||
return $str;
|
||||
}
|
||||
|
||||
sub decode_html {
|
||||
my $str = shift;
|
||||
|
||||
Encode::encode_utf8(
|
||||
HTML::Entities::decode_entities(Encode::decode_utf8($str)));
|
||||
}
|
||||
|
||||
sub encode_filesystem_safe {
|
||||
my $str = shift;
|
||||
return '' unless defined $str and length $str;
|
||||
|
||||
$str = Encode::encode_utf8($str) if Encode::is_utf8($str);
|
||||
$str =~ s/([^-+.0-9\@A-Za-z])/sprintf '_%02x', ord $1/eg;
|
||||
return $str;
|
||||
}
|
||||
|
||||
sub encode_html {
|
||||
my $str = shift;
|
||||
my $additional_unsafe = shift || '';
|
||||
|
||||
HTML::Entities::encode_entities($str, '<>&"' . $additional_unsafe);
|
||||
}
|
||||
|
||||
sub encode_uri {
|
||||
my $str = shift;
|
||||
my %options = @_;
|
||||
|
||||
# Note: URI-1.35 (URI::Escape 3.28) or later is required.
|
||||
return Encode::encode_utf8(
|
||||
URI::Escape::uri_escape_utf8(
|
||||
Encode::decode_utf8($str),
|
||||
'^-A-Za-z0-9._~' . (exists $options{omit} ? $options{omit} : '')
|
||||
)
|
||||
);
|
||||
}
|
||||
|
||||
# Old name: tools::escape_chars().
|
||||
sub escape_chars {
|
||||
my $s = shift;
|
||||
my $except = shift; ## Exceptions
|
||||
my $ord_except = ord $except if defined $except;
|
||||
|
||||
## Escape chars
|
||||
## !"#$%&'()+,:;<=>?[] AND accented chars
|
||||
## escape % first
|
||||
foreach my $i (
|
||||
0x25,
|
||||
0x20 .. 0x24,
|
||||
0x26 .. 0x2c,
|
||||
0x3a .. 0x3f,
|
||||
0x5b, 0x5d,
|
||||
0x80 .. 0x9f,
|
||||
0xa0 .. 0xff
|
||||
) {
|
||||
next if defined $ord_except and $i == $ord_except;
|
||||
my $hex_i = sprintf "%lx", $i;
|
||||
$s =~ s/\x$hex_i/%$hex_i/g;
|
||||
}
|
||||
## Special traetment for '/'
|
||||
$s =~ s/\//%a5/g unless defined $except and $except eq '/';
|
||||
|
||||
return $s;
|
||||
}
|
||||
|
||||
# Old name: tt2::escape_url().
|
||||
# DEPRECATED. Use Sympa::Tools::Text::escape_uri() or
|
||||
# Sympa::Tools::Text::mailtourl().
|
||||
#sub escape_url;
|
||||
|
||||
sub foldcase {
|
||||
my $str = shift;
|
||||
|
||||
return '' unless defined $str and length $str;
|
||||
# Perl 5.16.0 and later have built-in fc(). Earlier uses Unicode::CaseFold.
|
||||
return Encode::encode_utf8(fc(Encode::decode_utf8($str)));
|
||||
}
|
||||
|
||||
my %legacy_charsets = (
|
||||
'ar' => [qw(iso-8859-6)],
|
||||
'bs' => [qw(iso-8859-2)],
|
||||
'cs' => [qw(iso-8859-2)],
|
||||
'eo' => [qw(iso-8859-3)],
|
||||
'et' => [qw(iso-8859-4)],
|
||||
'he' => [qw(iso-8859-8)],
|
||||
'hr' => [qw(iso-8859-2)],
|
||||
'hu' => [qw(iso-8859-2)],
|
||||
'ja' => [qw(euc-jp cp932 MacJapanese)],
|
||||
'kl' => [qw(iso-8859-4)],
|
||||
'ko' => [qw(cp949)],
|
||||
'lt' => [qw(iso-8859-4)],
|
||||
'lv' => [qw(iso-8859-4)],
|
||||
'mt' => [qw(iso-8859-3)],
|
||||
'pl' => [qw(iso-8859-2)],
|
||||
'ro' => [qw(iso-8859-2)],
|
||||
'ru' => [qw(koi8-r cp1251)], # cp866? MacCyrillic?
|
||||
'sk' => [qw(iso-8859-2)],
|
||||
'sl' => [qw(iso-8859-2)],
|
||||
'th' => [qw(iso-8859-11 cp874 MacThai)],
|
||||
'tr' => [qw(iso-8859-9)],
|
||||
'uk' => [qw(koi8-u)], # MacUkrainian?
|
||||
'zh-CN' => [qw(euc-cn)],
|
||||
'zh-TW' => [qw(big5-eten)],
|
||||
);
|
||||
|
||||
sub guessed_to_utf8 {
|
||||
my $text = shift;
|
||||
my @langs = @_;
|
||||
|
||||
return Encode::encode_utf8($text) if Encode::is_utf8($text);
|
||||
return $text
|
||||
unless defined $text
|
||||
and length $text
|
||||
and $text =~ /[^\x00-\x7F]/;
|
||||
|
||||
my $utf8;
|
||||
if ($Unicode::UTF8::VERSION) {
|
||||
$utf8 =
|
||||
eval { Unicode::UTF8::decode_utf8($text, Encode::FB_CROAK()) };
|
||||
}
|
||||
unless (defined $utf8) {
|
||||
foreach my $charset (map { $_ ? @$_ : () } @legacy_charsets{@langs}) {
|
||||
$utf8 =
|
||||
eval { Encode::decode($charset, $text, Encode::FB_CROAK()) };
|
||||
last if defined $utf8;
|
||||
}
|
||||
}
|
||||
unless (defined $utf8) {
|
||||
$utf8 = Encode::decode('iso-8859-1', $text);
|
||||
}
|
||||
|
||||
# Apply NFC: e.g. for modified-NFD by Mac OS X.
|
||||
$utf8 = Unicode::Normalize::normalize('NFC', $utf8)
|
||||
if $Unicode::Normalize::VERSION;
|
||||
|
||||
return Encode::encode_utf8($utf8);
|
||||
}
|
||||
|
||||
sub mailtourl {
|
||||
my $text = shift;
|
||||
my %options = @_;
|
||||
|
||||
my $dtext =
|
||||
(not defined $text) ? ''
|
||||
: $options{decode_html} ? Sympa::Tools::Text::decode_html($text)
|
||||
: $text;
|
||||
$dtext =~ s/\A\s+//;
|
||||
$dtext =~ s/\s+\z//;
|
||||
$dtext =~ s/(?:\r\n|\r|\n)(?=[ \t])//g;
|
||||
$dtext =~ s/\r\n|\r|\n/ /g;
|
||||
|
||||
# The ``@'' in email address should not be encoded because some MUAs
|
||||
# aren't able to decode ``%40'' in e-mail address of mailto: URL.
|
||||
# Contrary, ``@'' in query component should be encoded because some
|
||||
# MUAs take it for a delimiter to separate URL from the rest.
|
||||
my ($format, $utext, $qsep);
|
||||
if ($dtext =~ /[()<>\[\]:;,\"\s]/) {
|
||||
# Use "to" header if source text includes any of RFC 5322
|
||||
# "specials", minus ``@'' and ``\'', plus whitespaces.
|
||||
$format = 'mailto:?to=%s%s';
|
||||
$utext = Sympa::Tools::Text::encode_uri($dtext);
|
||||
$qsep = '&';
|
||||
} else {
|
||||
$format = 'mailto:%s%s';
|
||||
$utext = Sympa::Tools::Text::encode_uri($dtext, omit => '@');
|
||||
$qsep = '?';
|
||||
}
|
||||
my $qstring = _url_query_string(
|
||||
$options{query},
|
||||
decode_html => $options{decode_html},
|
||||
leadchar => $qsep,
|
||||
sepchar => '&',
|
||||
trim_values => 1,
|
||||
);
|
||||
|
||||
return sprintf $format, $utext, $qstring;
|
||||
}
|
||||
|
||||
sub _url_query_string {
|
||||
my $query = shift;
|
||||
my %options = @_;
|
||||
|
||||
unless (ref $query eq 'HASH' and %$query) {
|
||||
return '';
|
||||
} else {
|
||||
my $decode_html = $options{decode_html};
|
||||
my $trim_values = $options{trim_values};
|
||||
return ($options{leadchar} || '?') . join(
|
||||
($options{sepchar} || ';'),
|
||||
map {
|
||||
my ($dkey, $dval) = map {
|
||||
(not defined $_) ? ''
|
||||
: $decode_html ? Sympa::Tools::Text::decode_html($_)
|
||||
: $_;
|
||||
} ($_, $query->{$_});
|
||||
if ($trim_values and lc $dkey ne 'body') {
|
||||
$dval =~ s/\A\s+//;
|
||||
$dval =~ s/\s+\z//;
|
||||
$dval =~ s/(?:\r\n|\r|\n)(?=[ \t])//g;
|
||||
$dval =~ s/\r\n|\r|\n/ /g;
|
||||
}
|
||||
|
||||
sprintf '%s=%s',
|
||||
Sympa::Tools::Text::encode_uri($dkey),
|
||||
Sympa::Tools::Text::encode_uri($dval);
|
||||
} sort keys %$query
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
sub pad {
|
||||
my $str = shift;
|
||||
my $width = shift;
|
||||
|
||||
return $str unless $width and defined $str;
|
||||
|
||||
my $ustr = Encode::is_utf8($str) ? $str : Encode::decode_utf8($str);
|
||||
my $cols = Unicode::GCString->new($ustr)->columns;
|
||||
|
||||
unless ($cols < abs $width) {
|
||||
return $str;
|
||||
} elsif ($width < 0) {
|
||||
return $str . (' ' x (-$width - $cols));
|
||||
} else {
|
||||
return (' ' x ($width - $cols)) . $str;
|
||||
}
|
||||
}
|
||||
|
||||
# Old name: tools::qdecode_filename().
|
||||
sub qdecode_filename {
|
||||
my $filename = shift;
|
||||
|
||||
## We don't use MIME::Words here because it does not encode properly
|
||||
## Unicode
|
||||
## Check if string is already Q-encoded first
|
||||
#if ($filename =~ /\=\?UTF-8\?/) {
|
||||
$filename = Encode::encode_utf8(Encode::decode('MIME-Q', $filename));
|
||||
#}
|
||||
|
||||
return $filename;
|
||||
}
|
||||
|
||||
# Old name: tools::qencode_filename().
|
||||
sub qencode_filename {
|
||||
my $filename = shift;
|
||||
|
||||
## We don't use MIME::Words here because it does not encode properly
|
||||
## Unicode
|
||||
## Check if string is already Q-encoded first
|
||||
## Also check if the string contains 8bit chars
|
||||
unless ($filename =~ /\=\?UTF-8\?/
|
||||
|| $filename =~ /^[\x00-\x7f]*$/) {
|
||||
|
||||
## Don't encode elements such as .desc. or .url or .moderate
|
||||
## or .extension
|
||||
my $part = $filename;
|
||||
my ($leading, $trailing);
|
||||
$leading = $1 if ($part =~ s/^(\.desc\.)//); ## leading .desc
|
||||
$trailing = $1 if ($part =~ s/((\.\w+)+)$//); ## trailing .xx
|
||||
|
||||
my $encoded_part = MIME::EncWords::encode_mimewords(
|
||||
$part,
|
||||
Charset => 'utf8',
|
||||
Encoding => 'q',
|
||||
MaxLineLen => 1000,
|
||||
Minimal => 'NO'
|
||||
);
|
||||
|
||||
$filename = $leading . $encoded_part . $trailing;
|
||||
}
|
||||
|
||||
return $filename;
|
||||
}
|
||||
|
||||
# Old name: tools::unescape_chars().
|
||||
sub unescape_chars {
|
||||
my $s = shift;
|
||||
|
||||
$s =~ s/%a5/\//g; ## Special traetment for '/'
|
||||
foreach my $i (0x20 .. 0x2c, 0x3a .. 0x3f, 0x5b, 0x5d, 0x80 .. 0x9f,
|
||||
0xa0 .. 0xff) {
|
||||
my $hex_i = sprintf "%lx", $i;
|
||||
my $hex_s = sprintf "%c", $i;
|
||||
$s =~ s/%$hex_i/$hex_s/g;
|
||||
}
|
||||
|
||||
return $s;
|
||||
}
|
||||
|
||||
# Old name: tools::valid_email().
|
||||
sub valid_email {
|
||||
my $email = shift;
|
||||
|
||||
my $email_re = Sympa::Regexps::email();
|
||||
return undef unless $email =~ /^${email_re}$/;
|
||||
|
||||
# Forbidden characters.
|
||||
return undef if $email =~ /[\|\$\*\?\!]/;
|
||||
|
||||
return 1;
|
||||
}
|
||||
|
||||
sub weburl {
|
||||
my $base = shift;
|
||||
my $paths = shift;
|
||||
my %options = @_;
|
||||
|
||||
my @paths = map {
|
||||
Sympa::Tools::Text::encode_uri(
|
||||
(not defined $_) ? ''
|
||||
: $options{decode_html} ? Sympa::Tools::Text::decode_html($_)
|
||||
: $_
|
||||
);
|
||||
} @{$paths || []};
|
||||
|
||||
my $qstring = _url_query_string(
|
||||
$options{query},
|
||||
decode_html => $options{decode_html},
|
||||
sepchar => '&',
|
||||
);
|
||||
|
||||
my $fstring;
|
||||
my $fragment = $options{fragment};
|
||||
if (defined $fragment) {
|
||||
$fstring = '#'
|
||||
. Sympa::Tools::Text::encode_uri(
|
||||
$options{decode_html}
|
||||
? Sympa::Tools::Text::decode_html($fragment)
|
||||
: $fragment
|
||||
);
|
||||
} else {
|
||||
$fstring = '';
|
||||
}
|
||||
|
||||
return sprintf '%s%s%s', join('/', grep { defined $_ } ($base, @paths)),
|
||||
$qstring, $fstring;
|
||||
}
|
||||
|
||||
1;
|
||||
__END__
|
||||
|
||||
=encoding utf-8
|
||||
|
||||
=head1 NAME
|
||||
|
||||
Sympa::Tools::Text - Text-related functions
|
||||
|
||||
=head1 DESCRIPTION
|
||||
|
||||
This package provides some text-related functions.
|
||||
|
||||
=head2 Functions
|
||||
|
||||
=over
|
||||
|
||||
=item addrencode ( $addr, [ $phrase, [ $charset, [ $comment ] ] ] )
|
||||
|
||||
Returns formatted (and encoded) name-addr as RFC5322 3.4.
|
||||
|
||||
=item canonic_email ( $email )
|
||||
|
||||
I<Function>.
|
||||
Returns canonical form of e-mail address.
|
||||
|
||||
Leading and trailing white spaces are removed.
|
||||
Latin letters without accents are lower-cased.
|
||||
|
||||
For malformed inputs returns C<undef>.
|
||||
|
||||
=item canonic_message_id ( $message_id )
|
||||
|
||||
Returns canonical form of message ID without trailing or leading whitespaces
|
||||
or C<E<lt>>, C<E<gt>>.
|
||||
|
||||
=item canonic_text ( $text )
|
||||
|
||||
Canonicalizes text.
|
||||
C<$text> should be a binary string encoded by UTF-8 character set or
|
||||
a Unicode string.
|
||||
Forbidden sequences in binary string will be replaced by
|
||||
U+FFFD REPLACEMENT CHARACTERs, and Normalization Form C (NFC) will be applied.
|
||||
|
||||
=item decode_filesystem_safe ( $str )
|
||||
|
||||
I<Function>.
|
||||
Decodes a string encoded by encode_filesystem_safe().
|
||||
|
||||
Parameter:
|
||||
|
||||
=over
|
||||
|
||||
=item $str
|
||||
|
||||
String to be decoded.
|
||||
|
||||
=back
|
||||
|
||||
Returns:
|
||||
|
||||
Decoded string, stripped C<utf8> flag if any.
|
||||
|
||||
=item decode_html ( $str )
|
||||
|
||||
I<Function>.
|
||||
Decodes HTML entities in a string encoded by UTF-8 or a Unicode string.
|
||||
|
||||
Parameter:
|
||||
|
||||
=over
|
||||
|
||||
=item $str
|
||||
|
||||
String to be decoded.
|
||||
|
||||
=back
|
||||
|
||||
Returns:
|
||||
|
||||
Decoded string, stripped C<utf8> flag if any.
|
||||
|
||||
=item encode_filesystem_safe ( $str )
|
||||
|
||||
I<Function>.
|
||||
Encodes a string $str to be suitable for filesystem.
|
||||
|
||||
Parameter:
|
||||
|
||||
=over
|
||||
|
||||
=item $str
|
||||
|
||||
String to be encoded.
|
||||
|
||||
=back
|
||||
|
||||
Returns:
|
||||
|
||||
Encoded string, stripped C<utf8> flag if any.
|
||||
All bytes except C<'-'>, C<'+'>, C<'.'>, C<'@'>
|
||||
and alphanumeric characters are encoded to sequences C<'_'> followed by
|
||||
two hexdigits.
|
||||
|
||||
Note that C<'/'> will also be encoded.
|
||||
|
||||
=item encode_html ( $str, [ $additional_unsafe ] )
|
||||
|
||||
I<Function>.
|
||||
Encodes characters in a string $str to HTML entities.
|
||||
By default
|
||||
C<'E<lt>'>, C<'E<gt>'>, C<'E<amp>'> and C<'E<quot>'> are encoded.
|
||||
|
||||
Parameter:
|
||||
|
||||
=over
|
||||
|
||||
=item $str
|
||||
|
||||
String to be encoded.
|
||||
|
||||
=item $additional_unsafe
|
||||
|
||||
Character or range of characters additionally encoded as entity references.
|
||||
|
||||
This optional parameter was introduced on Sympa 6.2.37b.3.
|
||||
|
||||
=back
|
||||
|
||||
Returns:
|
||||
|
||||
Encoded string, I<not> stripping utf8 flag if any.
|
||||
|
||||
=item encode_uri ( $str, [ omit => $chars ] )
|
||||
|
||||
I<Function>.
|
||||
Encodes potentially unsafe characters in the string using "percent" encoding
|
||||
suitable for URIs.
|
||||
|
||||
Parameters:
|
||||
|
||||
=over
|
||||
|
||||
=item $str
|
||||
|
||||
String to be encoded.
|
||||
|
||||
=item omit =E<gt> $chars
|
||||
|
||||
By default, all characters except those defined as "unreserved" in RFC 3986
|
||||
are encoded, that is, C<[^-A-Za-z0-9._~]>.
|
||||
If this parameter is given, it will prevent encoding additional characters.
|
||||
|
||||
=back
|
||||
|
||||
Returns:
|
||||
|
||||
Encoded string, stripped C<utf8> flag if any.
|
||||
|
||||
=item escape_chars ( $str )
|
||||
|
||||
Escape weird characters.
|
||||
|
||||
ToDo: This should be obsoleted in the future release: Would be better to use
|
||||
L</encode_filesystem_safe>.
|
||||
|
||||
=item escape_url ( $str )
|
||||
|
||||
DEPRECATED.
|
||||
Would be better to use L</"encode_uri"> or L</"mailtourl">.
|
||||
|
||||
=item foldcase ( $str )
|
||||
|
||||
I<Function>.
|
||||
Returns "fold-case" string suitable for case-insensitive match.
|
||||
For example, a code below looks for a needle in haystack not regarding case,
|
||||
even if they are non-ASCII UTF-8 strings.
|
||||
|
||||
$haystack = Sympa::Tools::Text::foldcase($HayStack);
|
||||
$needle = Sympa::Tools::Text::foldcase($NeedLe);
|
||||
if (index $haystack, $needle >= 0) {
|
||||
...
|
||||
}
|
||||
|
||||
Parameter:
|
||||
|
||||
=over
|
||||
|
||||
=item $str
|
||||
|
||||
A string.
|
||||
|
||||
=back
|
||||
|
||||
=item guessed_to_utf8( $text, [ lang, ... ] )
|
||||
|
||||
I<Function>.
|
||||
Guesses text charset considering language context
|
||||
and returns the text reencoded by UTF-8.
|
||||
|
||||
Parameters:
|
||||
|
||||
=over
|
||||
|
||||
=item $text
|
||||
|
||||
Text to be reencoded.
|
||||
|
||||
=item lang, ...
|
||||
|
||||
Language tag(s) which may be given by L<Sympa::Language/"implicated_langs">.
|
||||
|
||||
=back
|
||||
|
||||
Returns:
|
||||
|
||||
Reencoded text.
|
||||
If any charsets could not be guessed, C<iso-8859-1> will be used
|
||||
as the last resort, just because it covers full range of 8-bit.
|
||||
|
||||
=item mailtourl ( $email, [ decode_html =E<gt> 1 ],
|
||||
[ query =E<gt> {key =E<gt> val, ...} ] )
|
||||
|
||||
I<Function>.
|
||||
Constructs a C<mailto:> URL for given e-mail.
|
||||
|
||||
Parameters:
|
||||
|
||||
=over
|
||||
|
||||
=item $email
|
||||
|
||||
E-mail address.
|
||||
|
||||
=item decode_html =E<gt> 1
|
||||
|
||||
If set, arguments are assumed to include HTML entities.
|
||||
|
||||
=item query =E<gt> {key =E<gt> val, ...}
|
||||
|
||||
Optional query.
|
||||
|
||||
=back
|
||||
|
||||
Returns:
|
||||
|
||||
Constructed URL.
|
||||
|
||||
=item pad ( $str, $width )
|
||||
|
||||
Pads space a string so that result will not be narrower than given width.
|
||||
|
||||
Parameters:
|
||||
|
||||
=over
|
||||
|
||||
=item $str
|
||||
|
||||
A string.
|
||||
|
||||
=item $width
|
||||
|
||||
If $width is false value or width of $str is not less than $width,
|
||||
does nothing.
|
||||
If $width is less than C<0>, pads right.
|
||||
Otherwise, pads left.
|
||||
|
||||
=back
|
||||
|
||||
Returns:
|
||||
|
||||
Padded string.
|
||||
|
||||
=item qdecode_filename ( $filename )
|
||||
|
||||
Q-Decodes web file name.
|
||||
|
||||
ToDo:
|
||||
This should be obsoleted in the future release: Would be better to use
|
||||
L</decode_filesystem_safe>.
|
||||
|
||||
=item qencode_filename ( $filename )
|
||||
|
||||
Q-Encodes web file name.
|
||||
|
||||
ToDo:
|
||||
This should be obsoleted in the future release: Would be better to use
|
||||
L</encode_filesystem_safe>.
|
||||
|
||||
=item slurp ( $file )
|
||||
|
||||
Get entire content of the file.
|
||||
Normalization by canonic_text() is applied.
|
||||
C<$file> is the path to text file.
|
||||
|
||||
=item unescape_chars ( $str )
|
||||
|
||||
Unescape weird characters.
|
||||
|
||||
ToDo: This should be obsoleted in the future release: Would be better to use
|
||||
L</decode_filesystem_safe>.
|
||||
|
||||
=item valid_email ( $string )
|
||||
|
||||
Basic check of an email address.
|
||||
|
||||
=item weburl ( $base, \@paths, [ decode_html =E<gt> 1 ],
|
||||
[ fragment =E<gt> $fragment ], [ query =E<gt> \%query ] )
|
||||
|
||||
Constructs a C<http:> or C<https:> URL under given base URI.
|
||||
|
||||
Parameters:
|
||||
|
||||
=over
|
||||
|
||||
=item $base
|
||||
|
||||
Base URI.
|
||||
|
||||
=item \@paths
|
||||
|
||||
Additional path components.
|
||||
|
||||
=item decode_html =E<gt> 1
|
||||
|
||||
If set, arguments are assumed to include HTML entities.
|
||||
Exception is $base:
|
||||
It is assumed not to include entities.
|
||||
|
||||
=item fragment =E<gt> $fragment
|
||||
|
||||
Optional fragment.
|
||||
|
||||
=item query =E<gt> \%query
|
||||
|
||||
Optional query.
|
||||
|
||||
=back
|
||||
|
||||
Returns:
|
||||
|
||||
A URI.
|
||||
|
||||
=item wrap_text ( $text, [ $init_tab, [ $subsequent_tab, [ $cols ] ] ] )
|
||||
|
||||
I<Function>.
|
||||
Returns line-wrapped text.
|
||||
|
||||
Parameters:
|
||||
|
||||
=over
|
||||
|
||||
=item $text
|
||||
|
||||
The text to be folded.
|
||||
|
||||
=item $init_tab
|
||||
|
||||
Indentation prepended to the first line of paragraph.
|
||||
Default is C<''>, no indentation.
|
||||
|
||||
=item $subsequent_tab
|
||||
|
||||
Indentation prepended to each subsequent line of folded paragraph.
|
||||
Default is C<''>, no indentation.
|
||||
|
||||
=item $cols
|
||||
|
||||
Max number of columns of folded text.
|
||||
Default is C<78>.
|
||||
|
||||
=back
|
||||
|
||||
=back
|
||||
|
||||
=head1 HISTORY
|
||||
|
||||
L<Sympa::Tools::Text> appeared on Sympa 6.2a.41.
|
||||
|
||||
decode_filesystem_safe() and encode_filesystem_safe() were added
|
||||
on Sympa 6.2.10.
|
||||
|
||||
decode_html(), encode_html(), encode_uri() and mailtourl()
|
||||
were added on Sympa 6.2.14, and escape_url() was deprecated.
|
||||
|
||||
guessed_to_utf8() and pad() were added on Sympa 6.2.17.
|
||||
|
||||
canonic_text() and slurp() were added on Sympa 6.2.53b.
|
||||
|
||||
=cut
|
Reference in New Issue
Block a user