first commit

This commit is contained in:
2024-06-03 18:43:35 +02:00
parent 2da01a3f6e
commit f501d519af
883 changed files with 71550 additions and 2 deletions

View File

@ -0,0 +1,506 @@
# -*- indent-tabs-mode: nil; -*-
# vim:ft=perl:et:sw=4
# $Id$
# Sympa - SYsteme de Multi-Postage Automatique
#
# Copyright (c) 1997, 1998, 1999 Institut Pasteur & Christophe Wolfhugel
# Copyright (c) 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005,
# 2006, 2007, 2008, 2009, 2010, 2011 Comite Reseau des Universites
# Copyright (c) 2011, 2012, 2013, 2014, 2015, 2016, 2017 GIP RENATER
# Copyright 2018 The Sympa Community. See the AUTHORS.md file at the
# top-level directory of this distribution and at
# <https://github.com/sympa-community/sympa.git>.
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
package Sympa::Tools::Data;
use strict;
use warnings;
use Encode qw();
use English qw(-no_match_vars);
use POSIX qw();
use XML::LibXML qw();
BEGIN { eval 'use Clone qw()'; }
use Sympa::Tools::Text;
## This applies recursively to a data structure
## The transformation subroutine is passed as a ref
sub recursive_transformation {
my ($var, $subref) = @_;
return unless (ref($var));
if (ref($var) eq 'ARRAY') {
foreach my $index (0 .. $#{$var}) {
if (ref($var->[$index])) {
recursive_transformation($var->[$index], $subref);
} else {
$var->[$index] = &{$subref}($var->[$index]);
}
}
} elsif (ref($var) eq 'HASH') {
foreach my $key (keys %{$var}) {
if (ref($var->{$key})) {
recursive_transformation($var->{$key}, $subref);
} else {
$var->{$key} = &{$subref}($var->{$key});
}
}
}
return;
}
## Dump a variable's content
sub dump_var {
my ($var, $level, $fd) = @_;
return undef unless ($fd);
if (ref($var)) {
if (ref($var) eq 'ARRAY') {
foreach my $index (0 .. $#{$var}) {
print $fd "\t" x $level . $index . "\n";
dump_var($var->[$index], $level + 1, $fd);
}
} elsif (ref($var) eq 'HASH'
|| ref($var) eq 'Sympa::Scenario'
|| ref($var) eq 'Sympa::List'
|| ref($var) eq 'CGI::Fast') {
foreach my $key (sort keys %{$var}) {
print $fd "\t" x $level . '_' . $key . '_' . "\n";
dump_var($var->{$key}, $level + 1, $fd);
}
} else {
printf $fd "\t" x $level . "'%s'" . "\n", ref($var);
}
} else {
if (defined $var) {
print $fd "\t" x $level . "'$var'" . "\n";
} else {
print $fd "\t" x $level . "UNDEF\n";
}
}
}
## Dump a variable's content
sub dump_html_var {
my ($var) = shift;
my $html = '';
if (ref($var)) {
if (ref($var) eq 'ARRAY') {
$html .= '<ul>';
foreach my $index (0 .. $#{$var}) {
$html .= '<li> ' . $index . ':';
$html .= dump_html_var($var->[$index]);
$html .= '</li>';
}
$html .= '</ul>';
} elsif (ref($var) eq 'HASH'
|| ref($var) eq 'Sympa::Scenario'
|| ref($var) eq 'Sympa::List') {
$html .= '<ul>';
foreach my $key (sort keys %{$var}) {
$html .= '<li>' . $key . '=';
$html .= dump_html_var($var->{$key});
$html .= '</li>';
}
$html .= '</ul>';
} else {
$html .= 'EEEEEEEEEEEEEEEEEEEEE' . ref($var);
}
} else {
if (defined $var) {
$html .= Sympa::Tools::Text::encode_html($var);
} else {
$html .= 'UNDEF';
}
}
return $html;
}
# Duplicates a complex variable (faster).
# CAUTION: This duplicates blessed elements even if they are
# singleton/multiton; this breaks subroutine references.
sub clone_var {
return Clone::clone($_[0]) if $Clone::VERSION;
goto &dup_var; # '&' needed
}
## Duplictate a complex variable
sub dup_var {
my ($var) = @_;
if (ref($var)) {
if (ref($var) eq 'ARRAY') {
my $new_var = [];
foreach my $index (0 .. $#{$var}) {
$new_var->[$index] = dup_var($var->[$index]);
}
return $new_var;
} elsif (ref($var) eq 'HASH') {
my $new_var = {};
foreach my $key (sort keys %{$var}) {
$new_var->{$key} = dup_var($var->{$key});
}
return $new_var;
}
}
return $var;
}
####################################################
# get_array_from_splitted_string
####################################################
# return an array made on a string splited by ','.
# It removes spaces.
#
#
# IN : -$string (+): string to split
#
# OUT : -ref(ARRAY)
#
######################################################
# Note: This is used only by Sympa::List.
sub get_array_from_splitted_string {
my ($string) = @_;
my @array;
foreach my $word (split /,/, $string) {
$word =~ s/^\s+//;
$word =~ s/\s+$//;
push @array, $word;
}
return \@array;
}
####################################################
# diff_on_arrays
####################################################
# Makes set operation on arrays (seen as set, with no double) :
# - deleted : A \ B
# - added : B \ A
# - intersection : A /\ B
# - union : A \/ B
#
# IN : -$setA : ref(ARRAY) - set
# -$setB : ref(ARRAY) - set
#
# OUT : -ref(HASH) with keys :
# deleted, added, intersection, union
#
#######################################################
sub diff_on_arrays {
my ($setA, $setB) = @_;
my $result = {
'intersection' => [],
'union' => [],
'added' => [],
'deleted' => []
};
my %deleted;
my %added;
my %intersection;
my %union;
my %hashA;
my %hashB;
foreach my $eltA (@$setA) {
$hashA{$eltA} = 1;
$deleted{$eltA} = 1;
$union{$eltA} = 1;
}
foreach my $eltB (@$setB) {
$hashB{$eltB} = 1;
$added{$eltB} = 1;
if ($hashA{$eltB}) {
$intersection{$eltB} = 1;
$deleted{$eltB} = 0;
} else {
$union{$eltB} = 1;
}
}
foreach my $eltA (@$setA) {
if ($hashB{$eltA}) {
$added{$eltA} = 0;
}
}
foreach my $elt (keys %deleted) {
next unless $elt;
push @{$result->{'deleted'}}, $elt if ($deleted{$elt});
}
foreach my $elt (keys %added) {
next unless $elt;
push @{$result->{'added'}}, $elt if ($added{$elt});
}
foreach my $elt (keys %intersection) {
next unless $elt;
push @{$result->{'intersection'}}, $elt if ($intersection{$elt});
}
foreach my $elt (keys %union) {
next unless $elt;
push @{$result->{'union'}}, $elt if ($union{$elt});
}
return $result;
}
####################################################
# is_in_array
####################################################
# Test if a value is on an array
#
# IN : -$setA : ref(ARRAY) - set
# -$value : a serached value
#
# OUT : boolean
#######################################################
sub is_in_array {
my $set = shift;
die 'missing parameter "$value"' unless @_;
my $value = shift;
if (defined $value) {
foreach my $elt (@{$set || []}) {
next unless defined $elt;
return 1 if $elt eq $value;
}
} else {
foreach my $elt (@{$set || []}) {
return 1 unless defined $elt;
}
}
return undef;
}
=over
=item smart_eq ( $a, $b )
I<Function>.
Check if two strings are identical.
Parameters:
=over
=item $a, $b
Operands.
If both of them are undefined, they are equal.
If only one of them is undefined, the are not equal.
If C<$b> is a L<Regexp> object and it matches to C<$a>, they are equal.
Otherwise, they are compared as strings.
=back
Returns:
If arguments matched, true value. Otherwise false value.
=back
=cut
sub smart_eq {
die 'missing argument' if scalar @_ < 2;
my ($a, $b) = @_;
if (defined $a and defined $b) {
if (ref $b eq 'Regexp') {
return 1 if $a =~ $b;
} else {
return 1 if $a eq $b;
}
} elsif (!defined $a and !defined $b) {
return 1;
}
return undef;
}
## convert a string formated as var1="value1";var2="value2"; into a hash.
## Used when extracting from session table some session properties or when
## extracting users preference from user table
## Current encoding is NOT compatible with encoding of values with '"'
##
sub string_2_hash {
my $data = shift;
my %hash;
pos($data) = 0;
while ($data =~ /\G;?(\w+)\=\"((\\[\"\\]|[^\"])*)\"(?=(;|\z))/g) {
my ($var, $val) = ($1, $2);
$val =~ s/\\([\"\\])/$1/g;
$hash{$var} = $val;
}
return (%hash);
}
## convert a hash into a string formated as var1="value1";var2="value2"; into
## a hash
sub hash_2_string {
my $refhash = shift;
return undef unless ref $refhash eq 'HASH';
my $data_string;
foreach my $var (keys %$refhash) {
next unless length $var;
my $val = $refhash->{$var};
$val = '' unless defined $val;
$val =~ s/([\"\\])/\\$1/g;
$data_string .= ';' . $var . '="' . $val . '"';
}
return ($data_string);
}
## compare 2 scalars, string/numeric independant
sub smart_lessthan {
my ($stra, $strb) = @_;
$stra =~ s/^\s+//;
$stra =~ s/\s+$//;
$strb =~ s/^\s+//;
$strb =~ s/\s+$//;
$ERRNO = 0;
my ($numa, $unparsed) = POSIX::strtod($stra);
my $numb;
$numb = POSIX::strtod($strb)
unless ($ERRNO || $unparsed != 0);
if (($stra eq '') || ($strb eq '') || ($unparsed != 0) || $ERRNO) {
return $stra lt $strb;
} else {
return $stra < $strb;
}
}
=over
=item sort_uniq ( [ \&comp ], @items )
Returns sorted array of unique elements in the list.
Parameters:
=over
=item \&comp
Optional subroutine reference to compare each pairs of elements.
It should take two arguments and return negative, zero or positive result.
=item @items
Items to be sorted.
=back
This function was added on Sympa 6.2.16.
=back
=cut
sub sort_uniq {
my $comp;
if (ref $_[0] eq 'CODE') {
$comp = shift;
}
my %items;
@items{@_} = ();
if ($comp) {
return sort { $comp->($a, $b) } keys %items;
} else {
return sort keys %items;
}
}
# Create a custom attribute from an XML description
# IN : A string, XML formed data as stored in database
# OUT : HASH data storing custome attributes.
# Old name: Sympa::List::parseCustomAttribute().
sub decode_custom_attribute {
my $xmldoc = shift;
return undef unless defined $xmldoc and length $xmldoc;
my $parser = XML::LibXML->new();
my $tree;
## We should use eval to parse to prevent the program to crash if it fails
if (ref($xmldoc) eq 'GLOB') {
$tree = eval { $parser->parse_fh($xmldoc) };
} else {
$tree = eval { $parser->parse_string($xmldoc) };
}
return undef unless defined $tree;
my $doc = $tree->getDocumentElement;
my @custom_attr = $doc->getChildrenByTagName('custom_attribute');
my %ca;
foreach my $ca (@custom_attr) {
my $id = Encode::encode_utf8($ca->getAttribute('id'));
my $value = Encode::encode_utf8($ca->getElementsByTagName('value'));
$ca{$id} = {value => $value};
}
return \%ca;
}
# Create an XML Custom attribute to be stored into data base.
# IN : HASH data storing custome attributes
# OUT : string, XML formed data to be stored in database
# Old name: Sympa::List::createXMLCustomAttribute().
sub encode_custom_attribute {
my $custom_attr = shift;
return
'<?xml version="1.0" encoding="UTF-8" ?><custom_attributes></custom_attributes>'
if (not defined $custom_attr);
my $XMLstr = '<?xml version="1.0" encoding="UTF-8" ?><custom_attributes>';
foreach my $k (sort keys %{$custom_attr}) {
my $value = $custom_attr->{$k}{value};
$value = '' unless defined $value;
$XMLstr .=
"<custom_attribute id=\"$k\"><value>"
. Sympa::Tools::Text::encode_html($value, '\000-\037')
. "</value></custom_attribute>";
}
$XMLstr .= "</custom_attributes>";
$XMLstr =~ s/\s*\n\s*/ /g;
return $XMLstr;
}
1;

View File

@ -0,0 +1,942 @@
# -*- indent-tabs-mode: nil; -*-
# vim:ft=perl:et:sw=4
# $Id$
# Sympa - SYsteme de Multi-Postage Automatique
#
# Copyright (c) 1997, 1998, 1999 Institut Pasteur & Christophe Wolfhugel
# Copyright (c) 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005,
# 2006, 2007, 2008, 2009, 2010, 2011 Comite Reseau des Universites
# Copyright (c) 2011, 2012, 2013, 2014, 2015, 2016, 2017 GIP RENATER
# Copyright 2018, 2020 The Sympa Community. See the AUTHORS.md
# file at the top-level directory of this distribution and at
# <https://github.com/sympa-community/sympa.git>.
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
package Sympa::Tools::Text;
use strict;
use warnings;
use Encode qw();
use English qw(-no_match_vars);
use Encode::MIME::Header; # 'MIME-Q' encoding.
use HTML::Entities qw();
use MIME::EncWords;
use Text::LineFold;
use Unicode::GCString;
use URI::Escape qw();
use if ($] < 5.016), qw(Unicode::CaseFold fc);
use if (5.016 <= $]), qw(feature fc);
BEGIN { eval 'use Unicode::Normalize qw()'; }
BEGIN { eval 'use Unicode::UTF8 qw()'; }
use Sympa::Language;
use Sympa::Regexps;
# Old name: tools::addrencode().
sub addrencode {
my $addr = shift;
my $phrase = (shift || '');
my $charset = (shift || 'utf8');
my $comment = (shift || '');
return undef unless $addr =~ /\S/;
if ($phrase =~ /[^\s\x21-\x7E]/) {
$phrase = MIME::EncWords::encode_mimewords(
Encode::decode('utf8', $phrase),
'Encoding' => 'A',
'Charset' => $charset,
'Replacement' => 'FALLBACK',
'Field' => 'Resent-Sender', # almost longest
'Minimal' => 'DISPNAME', # needs MIME::EncWords >= 1.012.
);
} elsif ($phrase =~ /\S/) {
$phrase =~ s/([\\\"])/\\$1/g;
$phrase = '"' . $phrase . '"';
}
if ($comment =~ /[^\s\x21-\x27\x2A-\x5B\x5D-\x7E]/) {
$comment = MIME::EncWords::encode_mimewords(
Encode::decode('utf8', $comment),
'Encoding' => 'A',
'Charset' => $charset,
'Replacement' => 'FALLBACK',
'Minimal' => 'DISPNAME',
);
} elsif ($comment =~ /\S/) {
$comment =~ s/([\\\"])/\\$1/g;
}
return
($phrase =~ /\S/ ? "$phrase " : '')
. ($comment =~ /\S/ ? "($comment) " : '')
. "<$addr>";
}
# Old names: tools::clean_email(), tools::get_canonical_email().
sub canonic_email {
my $email = shift;
return undef unless defined $email;
# Remove leading and trailing white spaces.
$email =~ s/\A\s+//;
$email =~ s/\s+\z//;
# Lower-case.
$email =~ tr/A-Z/a-z/;
return (length $email) ? $email : undef;
}
# Old name: tools::clean_msg_id().
sub canonic_message_id {
my $msg_id = shift;
return $msg_id unless defined $msg_id;
chomp $msg_id;
if ($msg_id =~ /\<(.+)\>/) {
$msg_id = $1;
}
return $msg_id;
}
sub canonic_text {
my $text = shift;
return undef unless defined $text;
# Normalize text. See also discussion on
# https://listes.renater.fr/sympa/arc/sympa-developpers/2018-03/thrd1.html
#
# N.B.: Corresponding modules are optional by now, and should be
# mandatory in the future.
my $utext;
if (Encode::is_utf8($text)) {
$utext = $text;
} elsif ($Unicode::UTF8::VERSION) {
no warnings 'utf8';
$utext = Unicode::UTF8::decode_utf8($text);
} else {
$utext = Encode::decode_utf8($text);
}
if ($Unicode::Normalize::VERSION) {
$utext = Unicode::Normalize::normalize('NFC', $utext);
}
# Remove DOS linefeeds (^M) that cause problems with Outlook 98, AOL,
# and EIMS:
$utext =~ s/\r\n|\r/\n/g;
if (Encode::is_utf8($text)) {
return $utext;
} else {
return Encode::encode_utf8($utext);
}
}
sub slurp {
my $path = shift;
my $ifh;
return undef unless open $ifh, '<', $path;
my $text = do { local $RS; <$ifh> };
close $ifh;
return canonic_text($text);
}
sub wrap_text {
my $text = shift;
my $init = shift;
my $subs = shift;
my $cols = shift;
$init //= '';
$subs //= '';
$cols //= 78;
return $text unless $cols;
my $email_re = Sympa::Regexps::email();
my $linefold = Text::LineFold->new(
Language => Sympa::Language->instance->get_lang,
Prep => 'NONBREAKURI',
prep => [$email_re, sub { shift; @_ }],
ColumnsMax => $cols,
Format => sub {
shift;
my $event = shift;
my $str = shift;
if ($event =~ /^eo/) { return "\n"; }
if ($event =~ /^so[tp]/) { return $init . $str; }
if ($event eq 'sol') { return $subs . $str; }
undef;
},
);
my $t = Encode::is_utf8($text) ? $text : Encode::decode_utf8($text);
my $ret = '';
while (1000 < length $t) {
my $s = substr $t, 0, 1000;
$ret .= $linefold->break_partial($s);
$t = substr $t, 1000;
}
$ret .= $linefold->break_partial($t) if length $t;
$ret .= $linefold->break_partial(undef);
return Encode::is_utf8($text) ? $ret : Encode::encode_utf8($ret);
}
sub decode_filesystem_safe {
my $str = shift;
return '' unless defined $str and length $str;
$str = Encode::encode_utf8($str) if Encode::is_utf8($str);
# On case-insensitive filesystem "_XX" along with "_xx" should be decoded.
$str =~ s/_([0-9A-Fa-f]{2})/chr hex "0x$1"/eg;
return $str;
}
sub decode_html {
my $str = shift;
Encode::encode_utf8(
HTML::Entities::decode_entities(Encode::decode_utf8($str)));
}
sub encode_filesystem_safe {
my $str = shift;
return '' unless defined $str and length $str;
$str = Encode::encode_utf8($str) if Encode::is_utf8($str);
$str =~ s/([^-+.0-9\@A-Za-z])/sprintf '_%02x', ord $1/eg;
return $str;
}
sub encode_html {
my $str = shift;
my $additional_unsafe = shift || '';
HTML::Entities::encode_entities($str, '<>&"' . $additional_unsafe);
}
sub encode_uri {
my $str = shift;
my %options = @_;
# Note: URI-1.35 (URI::Escape 3.28) or later is required.
return Encode::encode_utf8(
URI::Escape::uri_escape_utf8(
Encode::decode_utf8($str),
'^-A-Za-z0-9._~' . (exists $options{omit} ? $options{omit} : '')
)
);
}
# Old name: tools::escape_chars().
sub escape_chars {
my $s = shift;
my $except = shift; ## Exceptions
my $ord_except = ord $except if defined $except;
## Escape chars
## !"#$%&'()+,:;<=>?[] AND accented chars
## escape % first
foreach my $i (
0x25,
0x20 .. 0x24,
0x26 .. 0x2c,
0x3a .. 0x3f,
0x5b, 0x5d,
0x80 .. 0x9f,
0xa0 .. 0xff
) {
next if defined $ord_except and $i == $ord_except;
my $hex_i = sprintf "%lx", $i;
$s =~ s/\x$hex_i/%$hex_i/g;
}
## Special traetment for '/'
$s =~ s/\//%a5/g unless defined $except and $except eq '/';
return $s;
}
# Old name: tt2::escape_url().
# DEPRECATED. Use Sympa::Tools::Text::escape_uri() or
# Sympa::Tools::Text::mailtourl().
#sub escape_url;
sub foldcase {
my $str = shift;
return '' unless defined $str and length $str;
# Perl 5.16.0 and later have built-in fc(). Earlier uses Unicode::CaseFold.
return Encode::encode_utf8(fc(Encode::decode_utf8($str)));
}
my %legacy_charsets = (
'ar' => [qw(iso-8859-6)],
'bs' => [qw(iso-8859-2)],
'cs' => [qw(iso-8859-2)],
'eo' => [qw(iso-8859-3)],
'et' => [qw(iso-8859-4)],
'he' => [qw(iso-8859-8)],
'hr' => [qw(iso-8859-2)],
'hu' => [qw(iso-8859-2)],
'ja' => [qw(euc-jp cp932 MacJapanese)],
'kl' => [qw(iso-8859-4)],
'ko' => [qw(cp949)],
'lt' => [qw(iso-8859-4)],
'lv' => [qw(iso-8859-4)],
'mt' => [qw(iso-8859-3)],
'pl' => [qw(iso-8859-2)],
'ro' => [qw(iso-8859-2)],
'ru' => [qw(koi8-r cp1251)], # cp866? MacCyrillic?
'sk' => [qw(iso-8859-2)],
'sl' => [qw(iso-8859-2)],
'th' => [qw(iso-8859-11 cp874 MacThai)],
'tr' => [qw(iso-8859-9)],
'uk' => [qw(koi8-u)], # MacUkrainian?
'zh-CN' => [qw(euc-cn)],
'zh-TW' => [qw(big5-eten)],
);
sub guessed_to_utf8 {
my $text = shift;
my @langs = @_;
return Encode::encode_utf8($text) if Encode::is_utf8($text);
return $text
unless defined $text
and length $text
and $text =~ /[^\x00-\x7F]/;
my $utf8;
if ($Unicode::UTF8::VERSION) {
$utf8 =
eval { Unicode::UTF8::decode_utf8($text, Encode::FB_CROAK()) };
}
unless (defined $utf8) {
foreach my $charset (map { $_ ? @$_ : () } @legacy_charsets{@langs}) {
$utf8 =
eval { Encode::decode($charset, $text, Encode::FB_CROAK()) };
last if defined $utf8;
}
}
unless (defined $utf8) {
$utf8 = Encode::decode('iso-8859-1', $text);
}
# Apply NFC: e.g. for modified-NFD by Mac OS X.
$utf8 = Unicode::Normalize::normalize('NFC', $utf8)
if $Unicode::Normalize::VERSION;
return Encode::encode_utf8($utf8);
}
sub mailtourl {
my $text = shift;
my %options = @_;
my $dtext =
(not defined $text) ? ''
: $options{decode_html} ? Sympa::Tools::Text::decode_html($text)
: $text;
$dtext =~ s/\A\s+//;
$dtext =~ s/\s+\z//;
$dtext =~ s/(?:\r\n|\r|\n)(?=[ \t])//g;
$dtext =~ s/\r\n|\r|\n/ /g;
# The ``@'' in email address should not be encoded because some MUAs
# aren't able to decode ``%40'' in e-mail address of mailto: URL.
# Contrary, ``@'' in query component should be encoded because some
# MUAs take it for a delimiter to separate URL from the rest.
my ($format, $utext, $qsep);
if ($dtext =~ /[()<>\[\]:;,\"\s]/) {
# Use "to" header if source text includes any of RFC 5322
# "specials", minus ``@'' and ``\'', plus whitespaces.
$format = 'mailto:?to=%s%s';
$utext = Sympa::Tools::Text::encode_uri($dtext);
$qsep = '&';
} else {
$format = 'mailto:%s%s';
$utext = Sympa::Tools::Text::encode_uri($dtext, omit => '@');
$qsep = '?';
}
my $qstring = _url_query_string(
$options{query},
decode_html => $options{decode_html},
leadchar => $qsep,
sepchar => '&',
trim_values => 1,
);
return sprintf $format, $utext, $qstring;
}
sub _url_query_string {
my $query = shift;
my %options = @_;
unless (ref $query eq 'HASH' and %$query) {
return '';
} else {
my $decode_html = $options{decode_html};
my $trim_values = $options{trim_values};
return ($options{leadchar} || '?') . join(
($options{sepchar} || ';'),
map {
my ($dkey, $dval) = map {
(not defined $_) ? ''
: $decode_html ? Sympa::Tools::Text::decode_html($_)
: $_;
} ($_, $query->{$_});
if ($trim_values and lc $dkey ne 'body') {
$dval =~ s/\A\s+//;
$dval =~ s/\s+\z//;
$dval =~ s/(?:\r\n|\r|\n)(?=[ \t])//g;
$dval =~ s/\r\n|\r|\n/ /g;
}
sprintf '%s=%s',
Sympa::Tools::Text::encode_uri($dkey),
Sympa::Tools::Text::encode_uri($dval);
} sort keys %$query
);
}
}
sub pad {
my $str = shift;
my $width = shift;
return $str unless $width and defined $str;
my $ustr = Encode::is_utf8($str) ? $str : Encode::decode_utf8($str);
my $cols = Unicode::GCString->new($ustr)->columns;
unless ($cols < abs $width) {
return $str;
} elsif ($width < 0) {
return $str . (' ' x (-$width - $cols));
} else {
return (' ' x ($width - $cols)) . $str;
}
}
# Old name: tools::qdecode_filename().
sub qdecode_filename {
my $filename = shift;
## We don't use MIME::Words here because it does not encode properly
## Unicode
## Check if string is already Q-encoded first
#if ($filename =~ /\=\?UTF-8\?/) {
$filename = Encode::encode_utf8(Encode::decode('MIME-Q', $filename));
#}
return $filename;
}
# Old name: tools::qencode_filename().
sub qencode_filename {
my $filename = shift;
## We don't use MIME::Words here because it does not encode properly
## Unicode
## Check if string is already Q-encoded first
## Also check if the string contains 8bit chars
unless ($filename =~ /\=\?UTF-8\?/
|| $filename =~ /^[\x00-\x7f]*$/) {
## Don't encode elements such as .desc. or .url or .moderate
## or .extension
my $part = $filename;
my ($leading, $trailing);
$leading = $1 if ($part =~ s/^(\.desc\.)//); ## leading .desc
$trailing = $1 if ($part =~ s/((\.\w+)+)$//); ## trailing .xx
my $encoded_part = MIME::EncWords::encode_mimewords(
$part,
Charset => 'utf8',
Encoding => 'q',
MaxLineLen => 1000,
Minimal => 'NO'
);
$filename = $leading . $encoded_part . $trailing;
}
return $filename;
}
# Old name: tools::unescape_chars().
sub unescape_chars {
my $s = shift;
$s =~ s/%a5/\//g; ## Special traetment for '/'
foreach my $i (0x20 .. 0x2c, 0x3a .. 0x3f, 0x5b, 0x5d, 0x80 .. 0x9f,
0xa0 .. 0xff) {
my $hex_i = sprintf "%lx", $i;
my $hex_s = sprintf "%c", $i;
$s =~ s/%$hex_i/$hex_s/g;
}
return $s;
}
# Old name: tools::valid_email().
sub valid_email {
my $email = shift;
my $email_re = Sympa::Regexps::email();
return undef unless $email =~ /^${email_re}$/;
# Forbidden characters.
return undef if $email =~ /[\|\$\*\?\!]/;
return 1;
}
sub weburl {
my $base = shift;
my $paths = shift;
my %options = @_;
my @paths = map {
Sympa::Tools::Text::encode_uri(
(not defined $_) ? ''
: $options{decode_html} ? Sympa::Tools::Text::decode_html($_)
: $_
);
} @{$paths || []};
my $qstring = _url_query_string(
$options{query},
decode_html => $options{decode_html},
sepchar => '&',
);
my $fstring;
my $fragment = $options{fragment};
if (defined $fragment) {
$fstring = '#'
. Sympa::Tools::Text::encode_uri(
$options{decode_html}
? Sympa::Tools::Text::decode_html($fragment)
: $fragment
);
} else {
$fstring = '';
}
return sprintf '%s%s%s', join('/', grep { defined $_ } ($base, @paths)),
$qstring, $fstring;
}
1;
__END__
=encoding utf-8
=head1 NAME
Sympa::Tools::Text - Text-related functions
=head1 DESCRIPTION
This package provides some text-related functions.
=head2 Functions
=over
=item addrencode ( $addr, [ $phrase, [ $charset, [ $comment ] ] ] )
Returns formatted (and encoded) name-addr as RFC5322 3.4.
=item canonic_email ( $email )
I<Function>.
Returns canonical form of e-mail address.
Leading and trailing white spaces are removed.
Latin letters without accents are lower-cased.
For malformed inputs returns C<undef>.
=item canonic_message_id ( $message_id )
Returns canonical form of message ID without trailing or leading whitespaces
or C<E<lt>>, C<E<gt>>.
=item canonic_text ( $text )
Canonicalizes text.
C<$text> should be a binary string encoded by UTF-8 character set or
a Unicode string.
Forbidden sequences in binary string will be replaced by
U+FFFD REPLACEMENT CHARACTERs, and Normalization Form C (NFC) will be applied.
=item decode_filesystem_safe ( $str )
I<Function>.
Decodes a string encoded by encode_filesystem_safe().
Parameter:
=over
=item $str
String to be decoded.
=back
Returns:
Decoded string, stripped C<utf8> flag if any.
=item decode_html ( $str )
I<Function>.
Decodes HTML entities in a string encoded by UTF-8 or a Unicode string.
Parameter:
=over
=item $str
String to be decoded.
=back
Returns:
Decoded string, stripped C<utf8> flag if any.
=item encode_filesystem_safe ( $str )
I<Function>.
Encodes a string $str to be suitable for filesystem.
Parameter:
=over
=item $str
String to be encoded.
=back
Returns:
Encoded string, stripped C<utf8> flag if any.
All bytes except C<'-'>, C<'+'>, C<'.'>, C<'@'>
and alphanumeric characters are encoded to sequences C<'_'> followed by
two hexdigits.
Note that C<'/'> will also be encoded.
=item encode_html ( $str, [ $additional_unsafe ] )
I<Function>.
Encodes characters in a string $str to HTML entities.
By default
C<'E<lt>'>, C<'E<gt>'>, C<'E<amp>'> and C<'E<quot>'> are encoded.
Parameter:
=over
=item $str
String to be encoded.
=item $additional_unsafe
Character or range of characters additionally encoded as entity references.
This optional parameter was introduced on Sympa 6.2.37b.3.
=back
Returns:
Encoded string, I<not> stripping utf8 flag if any.
=item encode_uri ( $str, [ omit => $chars ] )
I<Function>.
Encodes potentially unsafe characters in the string using "percent" encoding
suitable for URIs.
Parameters:
=over
=item $str
String to be encoded.
=item omit =E<gt> $chars
By default, all characters except those defined as "unreserved" in RFC 3986
are encoded, that is, C<[^-A-Za-z0-9._~]>.
If this parameter is given, it will prevent encoding additional characters.
=back
Returns:
Encoded string, stripped C<utf8> flag if any.
=item escape_chars ( $str )
Escape weird characters.
ToDo: This should be obsoleted in the future release: Would be better to use
L</encode_filesystem_safe>.
=item escape_url ( $str )
DEPRECATED.
Would be better to use L</"encode_uri"> or L</"mailtourl">.
=item foldcase ( $str )
I<Function>.
Returns "fold-case" string suitable for case-insensitive match.
For example, a code below looks for a needle in haystack not regarding case,
even if they are non-ASCII UTF-8 strings.
$haystack = Sympa::Tools::Text::foldcase($HayStack);
$needle = Sympa::Tools::Text::foldcase($NeedLe);
if (index $haystack, $needle >= 0) {
...
}
Parameter:
=over
=item $str
A string.
=back
=item guessed_to_utf8( $text, [ lang, ... ] )
I<Function>.
Guesses text charset considering language context
and returns the text reencoded by UTF-8.
Parameters:
=over
=item $text
Text to be reencoded.
=item lang, ...
Language tag(s) which may be given by L<Sympa::Language/"implicated_langs">.
=back
Returns:
Reencoded text.
If any charsets could not be guessed, C<iso-8859-1> will be used
as the last resort, just because it covers full range of 8-bit.
=item mailtourl ( $email, [ decode_html =E<gt> 1 ],
[ query =E<gt> {key =E<gt> val, ...} ] )
I<Function>.
Constructs a C<mailto:> URL for given e-mail.
Parameters:
=over
=item $email
E-mail address.
=item decode_html =E<gt> 1
If set, arguments are assumed to include HTML entities.
=item query =E<gt> {key =E<gt> val, ...}
Optional query.
=back
Returns:
Constructed URL.
=item pad ( $str, $width )
Pads space a string so that result will not be narrower than given width.
Parameters:
=over
=item $str
A string.
=item $width
If $width is false value or width of $str is not less than $width,
does nothing.
If $width is less than C<0>, pads right.
Otherwise, pads left.
=back
Returns:
Padded string.
=item qdecode_filename ( $filename )
Q-Decodes web file name.
ToDo:
This should be obsoleted in the future release: Would be better to use
L</decode_filesystem_safe>.
=item qencode_filename ( $filename )
Q-Encodes web file name.
ToDo:
This should be obsoleted in the future release: Would be better to use
L</encode_filesystem_safe>.
=item slurp ( $file )
Get entire content of the file.
Normalization by canonic_text() is applied.
C<$file> is the path to text file.
=item unescape_chars ( $str )
Unescape weird characters.
ToDo: This should be obsoleted in the future release: Would be better to use
L</decode_filesystem_safe>.
=item valid_email ( $string )
Basic check of an email address.
=item weburl ( $base, \@paths, [ decode_html =E<gt> 1 ],
[ fragment =E<gt> $fragment ], [ query =E<gt> \%query ] )
Constructs a C<http:> or C<https:> URL under given base URI.
Parameters:
=over
=item $base
Base URI.
=item \@paths
Additional path components.
=item decode_html =E<gt> 1
If set, arguments are assumed to include HTML entities.
Exception is $base:
It is assumed not to include entities.
=item fragment =E<gt> $fragment
Optional fragment.
=item query =E<gt> \%query
Optional query.
=back
Returns:
A URI.
=item wrap_text ( $text, [ $init_tab, [ $subsequent_tab, [ $cols ] ] ] )
I<Function>.
Returns line-wrapped text.
Parameters:
=over
=item $text
The text to be folded.
=item $init_tab
Indentation prepended to the first line of paragraph.
Default is C<''>, no indentation.
=item $subsequent_tab
Indentation prepended to each subsequent line of folded paragraph.
Default is C<''>, no indentation.
=item $cols
Max number of columns of folded text.
Default is C<78>.
=back
=back
=head1 HISTORY
L<Sympa::Tools::Text> appeared on Sympa 6.2a.41.
decode_filesystem_safe() and encode_filesystem_safe() were added
on Sympa 6.2.10.
decode_html(), encode_html(), encode_uri() and mailtourl()
were added on Sympa 6.2.14, and escape_url() was deprecated.
guessed_to_utf8() and pad() were added on Sympa 6.2.17.
canonic_text() and slurp() were added on Sympa 6.2.53b.
=cut