Dépollution des courriels par substitution des pièces jointes par un lien temporaire;
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 
 
 

515 lines
20 KiB

////////////////////////////////////////////////////////////////////////////
// Copyright KAZ 2021 //
// //
// contact (at) kaz.bzh //
// //
// This software is a filter to shrink email by attachment extraction. //
// //
// This software is governed by the CeCILL-B license under French law and //
// abiding by the rules of distribution of free software. You can use, //
// modify and/or redistribute the software under the terms of the //
// CeCILL-B license as circulated by CEA, CNRS and INRIA at the following //
// URL "http://www.cecill.info". //
// //
// As a counterpart to the access to the source code and rights to copy, //
// modify and redistribute granted by the license, users are provided //
// only with a limited warranty and the software's author, the holder of //
// the economic rights, and the successive licensors have only limited //
// liability. //
// //
// In this respect, the user's attention is drawn to the risks associated //
// with loading, using, modifying and/or developing or reproducing the //
// software by the user in light of its specific status of free software, //
// that may mean that it is complicated to manipulate, and that also //
// therefore means that it is reserved for developers and experienced //
// professionals having in-depth computer knowledge. Users are therefore //
// encouraged to load and test the software's suitability as regards //
// their requirements in conditions enabling the security of their //
// systems and/or data to be ensured and, more generally, to use and //
// operate it in the same conditions as regards security. //
// //
// The fact that you are presently reading this means that you have had //
// knowledge of the CeCILL-B license and that you accept its terms. //
////////////////////////////////////////////////////////////////////////////
#include <iostream>
#include <vector>
#include <set>
#include <fstream>
#include <iomanip>
#include <math.h>
#include <algorithm>
#include <unistd.h>
#include <boost/algorithm/string.hpp>
#include "kazDebug.hpp"
#include "kazMisc.hpp"
#include "SizeArg.hpp"
#include "Attachment.hpp"
using namespace std;
using namespace kaz;
// ================================================================================
// Header names used as keys of the MIME environment (env). They are written in
// lower case because readMime () lower-cases every header name before storing it.
const string Attachment::contentTypeToken ("content-type");
const string Attachment::contentDispositionToken ("content-disposition");
const string Attachment::contentTransferEncodingToken ("content-transfer-encoding");
// Values looked for inside the content-transfer-encoding header.
const string Attachment::base64Token ("base64");
const string Attachment::quotedPrintableToken ("quoted-printable");
const string Attachment::contentIDToken ("content-id");
// Sub-type names; they are also spliced into textRegEx and multiRegEx below,
// so they must stay defined before those regular expressions.
const string Attachment::PLAIN ("plain");
const string Attachment::HTML ("html");
const string Attachment::RELATED ("related");
const string Attachment::ALTERNATIVE ("alternative");
// Captures everything that follows "name*=" (also matches "filename*=").
const regex Attachment::nameCharsetRegEx (".*name\\*=(.*)");
// Captures the quoted value of name="..." (also matches filename="...").
const regex Attachment::nameRegEx (".*name=\"([^\"]*)\".*");
// Examples of boundary parameters the next expression has to cope with:
// boundary="----=_Part_796779_1154936629.1668080348646"
// boundary="------------040709000505010508040808"
// boundary="----------=_1668606031-941125-91"
// boundary="_004_PAVPR10MB6792713B313048E3A259B215B2079PAVPR10MB6792EURP_";
// boundary="_000_PAVPR10MB6792713B313048E3A259B215B2079PAVPR10MB6792EURP_"
// boundary=--boundary_1351_64006126-2b0e-4a3b-98ac-4797d1634188
// boundary=--boundary_1352_7e294c9a-cfab-44a0-bfb3-7310380ac7cb;
// Captures the boundary value, with or without surrounding quotes, stopping at
// the first quote, semicolon or space.
const regex Attachment::boundaryRegEx (".*boundary=\"?([^\"; ]*)\"?;?.*");
// Captures the text enclosed between '<' and '>' (used on the content-id header).
const regex Attachment::cidDefRegEx (".*<([^>]*)>.*");
// Captures "plain" or "html" from a text/... content type.
const regex Attachment::textRegEx (".*text/("+PLAIN+"|"+HTML+").*");
// Captures the multipart sub-type (mixed, related or alternative).
const regex Attachment::multiRegEx ("\\s*multipart/(mixed|"+RELATED+"|"+ALTERNATIVE+").*");
// Delimiters of an HTML image tag (searched case-insensitively by the callers).
const string Attachment::IMG_BEGIN ("<IMG");
const string Attachment::IMG_END (">");
// File-local tokens: start of the image source attribute and the content type
// of an embedded e-mail.
static const string SRC_BEGIN ("SRC=\"");
static const string RFC822 ("message/rfc822");
// ================================================================================
// Builds a replacement file name for an attachment that declares none.
// The name looks like "U-YYMMDDhhmmss-N[.subtype]" where the time stamp is
// taken once (first call) and N is a per-process counter incremented at
// every call. The sub-type is whatever follows '/' in contentType.
string
Attachment::getUnknown (const string &contentType) {
  DEF_LOG ("Attachment::getUnknown", "contentType: " << contentType);
  static time_t now (time (NULL));
  static int count (0);
  tm *ltm = localtime (&now);

  // Every field of the time stamp is printed on two zero-padded digits.
  const int stampFields [] = {
    ltm->tm_year-100,
    1 + ltm->tm_mon,
    ltm->tm_mday,
    ltm->tm_hour,
    ltm->tm_min,
    ltm->tm_sec
  };
  ostringstream nameStream;
  nameStream << "U-";
  for (const int field : stampFields)
    nameStream << std::setfill ('0') << std::setw (2) << field;
  nameStream << "-" << count++;

  const string::size_type slashPos (contentType.find ("/"));
  if (slashPos != string::npos)
    nameStream << "." << contentType.substr (slashPos+1);

  LOG ("name: " << nameStream.str ());
  return nameStream.str ();
}
// ================================================================================
// Erases from content every span that starts with beginTag and finishes with
// the next endTag (both searched case-insensitively, tags included).
// When endTag is found at the very position of beginTag, only endTag is erased
// and the scan goes on; when no endTag follows, beginTag alone is erased and
// the scan stops.
void
Attachment::removeSection (string &content, const string &beginTag, const string &endTag) {
  DEF_LOG ("Attachment::removeSection", "beginTag: " << beginTag << " endTag: " << endTag);
  string::size_type openPos (0);
  while ((openPos = caseInsensitiveFind (content, beginTag, openPos)) != string::npos) {
    const string::size_type closePos (caseInsensitiveFind (content, endTag, openPos));
    LOG_BUG (closePos == openPos, content.erase (openPos, endTag.length ()); continue, "eMailShrinker: bug A1: removeSection: no " << beginTag);
    LOG_BUG (closePos == string::npos, content.erase (openPos, beginTag.length ()); break, "eMailShrinker: bug A2: removeSection: no " << endTag);
    LOG ("KAZ start: " << openPos << " stop: " << closePos);
    // The text shrinks, so the next search restarts from the same offset.
    const string::size_type spanLength (closePos+endTag.length ()-openPos);
    content.erase (openPos, spanLength);
  }
}
// ================================================================================
// Returns the concatenation of every piece of content enclosed between
// beginTag and endTag (tags excluded), in order of appearance.
string
Attachment::getSection (const string &content, const string &beginTag, const string &endTag) {
  DEF_LOG ("Attachment::getSection", "beginTag: " << beginTag << " endTag: " << endTag << " content: " << content);
  vector<string> pieces;
  getSection (content, beginTag, endTag, pieces);

  // Compute the final size first so the result is allocated only once.
  string::size_type total (0);
  for (vector<string>::const_iterator it = pieces.begin (); it != pieces.end (); ++it)
    total += it->length ();

  string result;
  result.reserve (total);
  for (vector<string>::const_iterator it = pieces.begin (); it != pieces.end (); ++it)
    result.append (*it);

  LOG ("result: " << result);
  return result;
}
// ================================================================================
void
Attachment::getSection (const string &content, const string &beginTag, const string &endTag, vector<string> &result) {
DEF_LOG ("Attachment::getSection", "beginTag: " << beginTag << " endTag: " << endTag << " content: " << content);
for (string::size_type startPos (0);
(startPos = caseInsensitiveFind (content, beginTag, startPos)) != string::npos;
) {
LOG (beginTag << ": " << startPos);
string::size_type stopPos = caseInsensitiveFind (content, endTag, startPos);
LOG_BUG (stopPos == string::npos, break, "eMailShrinker: bug A3: " << endTag << " not found! at: " << startPos);
LOG ("start: " << startPos << " stop: " << stopPos);
LOG_BUG (startPos == stopPos, /**/, "eMailShrinker: bug A4: " << endTag << " without " << beginTag << " at: " << startPos);
if (startPos != stopPos) {
startPos += beginTag.length ();
result.push_back (content.substr (startPos, stopPos-startPos));
}
startPos = stopPos+endTag.length ();
}
}
// ================================================================================
// Returns the content-type header value truncated at its first ';'
// (i.e. without parameters), or an empty string when the header is absent.
const string
Attachment::getContentType () const {
  const auto found (env.find (contentTypeToken));
  if (found == env.end ())
    return "";
  // find () yields npos when there is no ';', and substr (0, npos) is the whole value.
  const string &fullValue (found->second);
  return fullValue.substr (0, fullValue.find (';'));
}
// Returns the file name to use for this attachment.
// The sources are tried in this order and the first non-empty one wins:
//   1. Content-Type        name="..."      (decoded with encodedWord)
//   2. Content-Type        name*=...       (decoded with charsetValue)
//   3. Content-Disposition filename="..."  (decoded with encodedWord)
//   4. Content-Disposition filename*=...   (decoded with charsetValue)
// When none is present a generated name is returned (see getUnknown).
const string
Attachment::getAttachName () const {
  DEF_LOG ("Attachment::getAttachName", "");
  string result = getProp (contentTypeToken, nameRegEx);
  if (result.length ()) {
    LOG ("name=: " << result);
    encodedWord (result);
    return result;
  }
  result = getProp (contentTypeToken, nameCharsetRegEx);
  if (result.length ()) {
    LOG ("name*=: " << result);
    charsetValue (result);
    return result;
  }
  // XXX the name must be reassembled when it is split over several filename*x= lines
  result = getProp (contentDispositionToken, nameRegEx);
  if (result.length ()) {
    LOG ("filename=: " << result);
    encodedWord (result);
    return result;
  }
  // XXX the name must be reassembled when it is split over several filename*x*= lines
  // The charset form needs nameCharsetRegEx: testing nameRegEx a second time
  // could never succeed here since the previous test already failed.
  result = getProp (contentDispositionToken, nameCharsetRegEx);
  if (result.length ()) {
    LOG ("filename*=: " << result);
    charsetValue (result);
    return result;
  }
  return getUnknown (getContentType ());
}
// Accessor on the boundary member (set by readMime; empty when the
// content type declares no boundary).
const string &
Attachment::getBoundary () const {
  return this->boundary;
}
// Size of the part: distance between its first offset (beginPos) and its
// end offset (endPos).
const streamoff
Attachment::getSize () const {
  const streamoff span (endPos - beginPos);
  return span;
}
// Extracts a property from the header named token.
// Returns the first capture group of regEx when the whole header value
// matches it, or an empty string when the header is missing or does not match.
const string
Attachment::getProp (const string &token, const regex &regEx) const {
  DEF_LOG ("Attachment::getProp", "token: " << token);
  const auto entry (env.find (token));
  if (entry == env.end ()) {
    LOG ("no token");
    return "";
  }
  const string &headerValue (entry->second);
  LOG ("val: " << headerValue);
  if (regex_match (headerValue, regEx))
    return regex_replace (headerValue, regEx, "$1");
  LOG ("no prop");
  return "";
}
// True when the content-transfer-encoding header mentions base64.
const bool
Attachment::isBase64Encoding () const {
  const bool declared (isDefProp (contentTransferEncodingToken, base64Token));
  return declared;
}
// True when the content-transfer-encoding header mentions quoted-printable.
const bool
Attachment::isQuotedPrintableEnconding () const {
  const bool declared (isDefProp (contentTransferEncodingToken, quotedPrintableToken));
  return declared;
}
// True when the part is text/plain or text/html AND is base64 encoded.
const bool
Attachment::isTextBase64 () const {
  // Not a text part: the encoding does not need to be looked at.
  if (getProp (contentTypeToken, textRegEx).empty ())
    return false;
  return isBase64Encoding ();
}
const bool
Attachment::isDefProp (const string &token, const string &val) const {
DEF_LOG ("Attachment::getProp", "getProp token: " << token << " val: " << val);
map<string, string>::const_iterator it (env.find (token));
if (it == env.end ())
return false;
// XXX case insensitive ??
return it->second.find (val) != string::npos;
}
// ================================================================================
// Builds an attachment by parsing mbox from its current read position.
//   mbox           : input stream, read line by line
//   level          : nesting depth stored as is (sub-parts are built with level+1)
//   beginInParent  : offset recorded for this part inside its parent
//   curPos         : in/out running offset; its entry value becomes beginPos and
//                    it is advanced while the headers and the sub-parts are read
// All marking flags start cleared; contentPos and endPos start at 0 and are
// filled in while parsing (endPos may also be set later by the parent).
Attachment::Attachment (ifstream &mbox, const int &level, const streamoff beginInParent, streamoff &curPos)
: level (level),
beginInParent (beginInParent),
beginPos (curPos),
contentPos (0),
endPos (0),
toExtract (false),
toUpdate (false),
toDisclaim (false),
boundaryMiddleSize (0) {
DEF_LOG ("Attachment::Attachment", "curPos: " << curPos << " level: " << level);
// First the headers of this part, then (if any) its nested parts.
readMime (mbox, curPos);
readBoundaries (mbox, curPos);
}
// ================================================================================
// Reads the MIME headers of this part from mbox, up to the first empty line.
// Every "Name: value" line is stored in env under the lower-cased name; lines
// starting with a space or a tab are appended to the previous header value.
// curPos is advanced by the length of each line read plus one for its end of
// line. On return contentPos is the offset just after the headers, cid holds
// the content-id (without angle brackets) and boundary, when declared, holds
// the closing delimiter "--<boundary>--".
void
Attachment::readMime (ifstream &mbox, streamoff &curPos) {
  DEF_LOG ("Attachment::readMime", "curPos: " << curPos);
  string lastVar;
  string line;
  while (getline (mbox, line)) {
    LOG ("pos: " << curPos << " line: " << line);
    curPos += line.length () + 1;
    if (line.empty ())
      break;
    if (line[0] == ' ' || line[0] == '\t') {
      // Folded header: continuation of the previous value.
      if (lastVar.empty ()) {
        LOG_BUG (true, /**/, "eMailShrinker: bug A5: not compliant MIME. pos: " << (curPos - (line.length () + 1)) << " line: " << line);
      } else {
        LOG ("add line to var: " << line);
        env.find (lastVar)->second += line;
        LOG ("new val: " << env.find (lastVar)->second);
      }
      continue;
    }
    string::size_type colonPos = line.find (':');
    if (colonPos != string::npos) {
      lastVar = line.substr (0, colonPos);
      toLower (lastVar);
      LOG ("find var: " << lastVar);
      // The blank after the colon is optional: skip it only when it is really
      // there, so that "Name:value" does not lose the first character of value.
      string::size_type valPos (colonPos+1);
      if (valPos < line.length () && (line[valPos] == ' ' || line[valPos] == '\t'))
        ++valPos;
      string val (line.substr (valPos));
      LOG ("new var: " << lastVar << " <=> " << val);
      env [lastVar] = val;
    }
  }
  LOG ("end of mime");
  contentPos = curPos;
  cid = getProp (contentIDToken, cidDefRegEx);
  boundary = getProp (contentTypeToken, boundaryRegEx);
  LOG ("boundary: " << boundary);
  if (boundary.length ()) {
    // Keep the closing form; the opening/middle delimiter is the same text
    // without its two last characters.
    boundary = "--"+boundary+"--";
    boundaryMiddleSize = boundary.length () - 2;
  }
  LOG ("readMime contentPos: " << contentPos << " cid: " << cid << " boundary: " << boundary);
}
// ================================================================================
// Reads the nested parts of this attachment.
// A message/rfc822 part holds exactly one sub-attachment (the embedded
// e-mail) which is parsed right away; a part with a boundary is split by
// calling nextBondary () until it reports the end; any other part has no child.
void
Attachment::readBoundaries (ifstream &mbox, streamoff &curPos) {
  DEF_LOG ("Attachment::readBoundaries", "curPos: " << curPos);
  const bool isEmbeddedMessage (caseInsensitiveFind (getContentType (), RFC822) != string::npos);
  if (isEmbeddedMessage) {
    subAttachements.push_back (Attachment (mbox, level+1, curPos, curPos));
    // The embedded message ends where its own parsing stopped.
    subAttachements.back ().endPos = curPos;
    return;
  }
  if (!boundary.empty ())
    while (nextBondary (mbox, curPos))
      ;
}
// Reads mbox line by line up to the next delimiter of this multipart.
// Returns true when a middle delimiter was found and a new sub-attachment was
// parsed after it; returns false on the closing delimiter or at end of input
// (in the latter case endPos is set to curPos).
// curPos is advanced by the length of each line read plus one.
bool
Attachment::nextBondary (ifstream &mbox, streamoff &curPos) {
DEF_LOG ("Attachment::nextBondary", "curPos: " << curPos << " boundary: " << boundary);
// Evaluated once on entry, for the sub-attachment whose body is about to be scanned.
bool isTextBase64 (subAttachements.size () && subAttachements.back ().isTextBase64 ());
LOG ("isTextBase64: " << isTextBase64 << " attach: " << *this);
for (string prev, line; getline (mbox, line); ) {
LOG ("curPos: " << curPos << " line: " << line);
// Offset of the beginning of the current line.
streamoff lastPos = curPos;
curPos += line.length () + 1;
// boundary holds "--<boundary>--": only its first boundaryMiddleSize
// characters ("--<boundary>") are searched for in the line.
string::size_type bpos = line.find (boundary.c_str (), 0, boundaryMiddleSize);
if (bpos == string::npos) {
// Ordinary body line: look for any of the stringsToUpdate entries in the
// previous and current lines joined together (after base64 decoding of the
// current line when the part is base64 text).
// NOTE(review): stringsToUpdate is declared outside this view — presumably
// the markers that make a part need rewriting; confirm in the header.
string clearLine (line);
if (isTextBase64)
base64Decode (clearLine);
string couple (prev+clearLine);
for (vector <string>::iterator it = stringsToUpdate.begin ();
it != stringsToUpdate.end ();
++it)
if (couple.find (*it) != string::npos) {
LOG ("find: "+ *it);
LOG ("size: " << subAttachements.size ());
if (subAttachements.size ())
subAttachements.back ().toUpdate = true;
else
LOG_BUG (true, continue, "eMailShrinker: bug A10: boundary format ? " << *this);
}
prev = clearLine;
continue;
}
LOG ("find: " << boundary);
LOG ("lastPos: " << lastPos << " bpos: " << bpos << " boundaryMiddleSize: " << boundaryMiddleSize);
// The delimiter line closes the sub-attachment being scanned, if any.
if (subAttachements.size ())
subAttachements.back ().endPos = lastPos;
LOG ("line: " << line << "bpos+boundaryMiddleSize: " << (bpos+boundaryMiddleSize) << " find: " << line.find ("--", bpos+boundaryMiddleSize));
bpos += boundaryMiddleSize;
// "--" right after the delimiter text marks the closing delimiter.
if (line.find ("--", bpos) == bpos) {
LOG ("end");
return false;
}
// Middle delimiter: parse the part that follows; lastPos (start of the
// delimiter line) is recorded as its position in the parent.
subAttachements.push_back (Attachment (mbox, level+1, lastPos, curPos));
return true;
}
// End of input reached without a closing delimiter.
endPos = curPos;
return false;
}
// ================================================================================
void
Attachment::markDisclaim (bool &plainMarked, bool &htmlMarked) {
if (plainMarked && htmlMarked)
return;
string multiProp = getProp (contentTypeToken, multiRegEx);
// LOG_BUG (multiProp == ALTERNATIVE && subAttachements.size () != 2, continue, "eMailShrinker: bug A6: alternative give not 1 case (" << subAttachements.size () << ").");
if (multiProp.length ())
for (Attachment &subAttach : subAttachements)
subAttach.markDisclaim (plainMarked, htmlMarked);
string textProp = getProp (contentTypeToken, textRegEx);
if (textProp.empty ())
return;
if (!plainMarked && textProp == PLAIN)
plainMarked = toUpdate = toDisclaim = true;
if (!htmlMarked && textProp == HTML)
htmlMarked = toUpdate = toDisclaim = true;
}
// ================================================================================
bool
Attachment::markSignificant (const string &parentMultiProp, const streamoff &minAttachSize, ifstream &mbox, vector<Attachment *> &allMarkedPtrs) {
DEF_LOG ("Attachment::markSignificant", "parentMultiProp: " << parentMultiProp << " minAttachSize: " << minAttachSize);
string textProp = getProp (contentTypeToken, textRegEx);
bool cantBeExtract ((parentMultiProp == ALTERNATIVE && (textProp == PLAIN || textProp == HTML)) ||
(parentMultiProp == RELATED && textProp == HTML));
string multiProp = getProp (contentTypeToken, multiRegEx);
for (Attachment &sub : subAttachements)
cantBeExtract |= sub.markSignificant (multiProp, minAttachSize, mbox, allMarkedPtrs);
if (getProp (contentTypeToken, textRegEx) == HTML) {
string content = getContent (mbox);
vector<string> imgs;
getSection (content, IMG_BEGIN, IMG_END, imgs);
EmbeddedData::fillEmbeddedData (imgs, minAttachSize, embeddedData);
if (embeddedData.size ())
toUpdate = true;
}
cantBeExtract |= toUpdate;
if (boundary.empty () && getSize () >= minAttachSize && !cantBeExtract)
cantBeExtract = toExtract = true; // XXX cantBeExtract ?
if (toExtract || toUpdate || toDisclaim)
allMarkedPtrs.push_back (this);
return cantBeExtract;
}
// ================================================================================
// Reads the body of this part (bytes from contentPos up to endPos) from mbox
// and returns it decoded according to its content-transfer-encoding.
// A part whose end was never recorded (endPos before contentPos, e.g. a
// truncated message) yields an empty body instead of a huge allocation, and
// a short read keeps only the bytes really obtained.
string
Attachment::getContent (ifstream &mbox) const {
  DEF_LOG ("Attachment::getContent", "contentPos: " << contentPos);
  const streamoff span (endPos-contentPos);
  const streamsize length (span > 0 ? static_cast<streamsize> (span) : 0);
  string content;
  content.resize (static_cast<string::size_type> (length));
  mbox.seekg (contentPos, ios::beg);
  mbox.read (&content[0], length);
  // gcount () is 0 when the stream was unusable, length on a complete read.
  content.resize (static_cast<string::size_type> (mbox.gcount ()));
  if (isBase64Encoding ())
    base64Decode (content);
  if (isQuotedPrintableEnconding ())
    quotedDecode (content);
  return content;
}
// ================================================================================
// Writes content to outbox, re-encoded with the content-transfer-encoding of
// this part, and makes sure a non-empty output finishes with an end of line.
void
Attachment::println (ofstream &outbox, string content) const {
  DEF_LOG ("Attachment::println", "content: " << content);
  if (isBase64Encoding ())
    base64Encode (content);
  if (isQuotedPrintableEnconding ())
    quotedEncode (content);
  outbox << content;
  // Nothing to add when nothing was written or when the line is already closed.
  if (content.empty () || content.back () == '\n')
    return;
  outbox << endl;
}
// ================================================================================
// Replaces, inside the HTML text content, the source of every embedded image
// listed in embeddedData by its download URL.
// embedded.imgIdx is taken as the rank of the <IMG tag in the text (0 for the
// first one); the entries are walked in order while the text is scanned
// forward only once. The function gives up (after logging) as soon as a tag,
// its SRC=" attribute or the closing quote cannot be found.
void
Attachment::replaceEmbedded (string &content) const {
  DEF_LOG ("Attachment::replaceEmbedded", "content.length: " << content.length ());
  if (!embeddedData.size ())
    return;
  int imgIdx (-1);
  string::size_type startPos (0);
  for (const EmbeddedData &embedded : embeddedData) {
    LOG ("embedded: " << embedded);
    // Skip the image tags that come before the one this entry refers to.
    for ( ; ; ) {
      startPos = caseInsensitiveFind (content, IMG_BEGIN, startPos);
      LOG_BUG (startPos == string::npos, return, "eMailShrinker: bug A7: can't find " << IMG_BEGIN);
      ++imgIdx;
      // Stop once the wanted rank is reached. The reverse test stopped on the
      // very first tag found, whatever its rank, and so patched the wrong image
      // whenever an image without embedded data came first.
      if (imgIdx >= embedded.imgIdx)
        break;
      startPos += IMG_BEGIN.length ();
    }
    startPos = caseInsensitiveFind (content, SRC_BEGIN, startPos);
    LOG_BUG (startPos == string::npos, return, "eMailShrinker: bug A8: can't find " << SRC_BEGIN );
    startPos += SRC_BEGIN.length ();
    // Named so that it does not hide the endPos member.
    const string::size_type quotePos (content.find ("\"", startPos));
    LOG_BUG (quotePos == string::npos, return, "eMailShrinker: bug A9: can't find end of " << SRC_BEGIN );
    content.replace (startPos, quotePos-startPos, embedded.downloadUrl);
  }
}
// ================================================================================
// Prints the attachment tree, one line per part: a run of '*' twice as long
// as the nesting level (modulo 20), the size, the content type, the marks set
// on the part, its id and boundary when present, then its three offsets
// (begin / content / end). Embedded data and sub-parts follow, recursively.
ostream&
kaz::operator << (ostream& os, const Attachment& attachment) {
  // Comma separated list of the marks set on this part.
  string marks;
  const auto addMark = [&marks] (const char *label) {
    if (!marks.empty ())
      marks += ", ";
    marks += label;
  };
  if (attachment.toExtract)
    addMark ("to extract");
  if (attachment.toUpdate)
    addMark ("need update");
  if (attachment.toDisclaim)
    addMark ("need diclaim");
  if (attachment.embeddedData.size ())
    addMark ("embeddedData");
  if (!marks.empty ())
    marks = " ["+marks+"]";

  const string stars (static_cast<string::size_type> ((attachment.level % 20)*2), '*');
  os << stars << setw (10) << SizeArg (attachment.getSize ()) << " " << attachment.getContentType ()
     << marks << (attachment.cid.length () ? " id: "+attachment.cid : "")
     << (attachment.boundary.length () ? " boundary: "+attachment.boundary : "")
     << " (" << attachment.beginPos << " / " << attachment.contentPos << " / " << attachment.endPos << ") " << endl;

  const int childIndent (((attachment.level+1) % 20)*2);
  for (const EmbeddedData &embedded : attachment.embeddedData)
    os << setw (childIndent) << "" << setw (10) << SizeArg (embedded.dataLength) << " embedded [to extract] " << embedded;
  for (const Attachment &child : attachment.subAttachements)
    os << child;
  return os;
}
// ================================================================================