//////////////////////////////////////////////////////////////////////////// // Copyright KAZ 2021 // // // // contact (at) kaz.bzh // // // // This software is a filter to shrink email by attachment extraction. // // // // This software is governed by the CeCILL-B license under French law and // // abiding by the rules of distribution of free software. You can use, // // modify and/or redistribute the software under the terms of the // // CeCILL-B license as circulated by CEA, CNRS and INRIA at the following // // URL "http://www.cecill.info". // // // // As a counterpart to the access to the source code and rights to copy, // // modify and redistribute granted by the license, users are provided // // only with a limited warranty and the software's author, the holder of // // the economic rights, and the successive licensors have only limited // // liability. // // // // In this respect, the user's attention is drawn to the risks associated // // with loading, using, modifying and/or developing or reproducing the // // software by the user in light of its specific status of free software, // // that may mean that it is complicated to manipulate, and that also // // therefore means that it is reserved for developers and experienced // // professionals having in-depth computer knowledge. Users are therefore // // encouraged to load and test the software's suitability as regards // // their requirements in conditions enabling the security of their // // systems and/or data to be ensured and, more generally, to use and // // operate it in the same conditions as regards security. // // // // The fact that you are presently reading this means that you have had // // knowledge of the CeCILL-B license and that you accept its terms. // //////////////////////////////////////////////////////////////////////////// #include #include #include #include #include #include #include #include #include #include "kazDebug.hpp" #include "kazMisc.hpp" #include "SizeArg.hpp" #include "Attachment.hpp" using namespace std; using namespace kaz; // ================================================================================ const string Attachment::contentTypeToken ("content-type"); const string Attachment::contentDispositionToken ("content-disposition"); const string Attachment::contentTransferEncodingToken ("content-transfer-encoding"); const string Attachment::base64Token ("base64"); const string Attachment::quotedPrintableToken ("quoted-printable"); const string Attachment::contentIDToken ("content-id"); const string Attachment::PLAIN ("plain"); const string Attachment::HTML ("html"); const string Attachment::RELATED ("related"); const string Attachment::ALTERNATIVE ("alternative"); const regex Attachment::nameCharsetRegEx (".*name\\*=(.*)"); const regex Attachment::nameRegEx (".*name=\"([^\"]*)\".*"); // boundary="----=_Part_796779_1154936629.1668080348646" // boundary="------------040709000505010508040808" // boundary="----------=_1668606031-941125-91" // boundary="_004_PAVPR10MB6792713B313048E3A259B215B2079PAVPR10MB6792EURP_"; // boundary="_000_PAVPR10MB6792713B313048E3A259B215B2079PAVPR10MB6792EURP_" // boundary=--boundary_1351_64006126-2b0e-4a3b-98ac-4797d1634188 // boundary=--boundary_1352_7e294c9a-cfab-44a0-bfb3-7310380ac7cb; const regex Attachment::boundaryRegEx (".*boundary=\"?([^\"; ]*)\"?;?.*"); const regex Attachment::cidDefRegEx (".*<([^>]*)>.*"); const regex Attachment::textRegEx (".*text/("+PLAIN+"|"+HTML+").*"); const regex Attachment::multiRegEx ("\\s*multipart/(mixed|"+RELATED+"|"+ALTERNATIVE+").*"); const string Attachment::IMG_BEGIN (""); static const string SRC_BEGIN ("SRC=\""); static const string RFC822 ("message/rfc822"); // ================================================================================ string Attachment::getUnknown (const string &contentType) { DEF_LOG ("Attachment::getUnknown", "contentType: " << contentType); static time_t now (time (NULL)); static int count (0); tm *ltm = localtime (&now); ostringstream nameStream; nameStream << "U-" << std::setfill ('0') << std::setw (2) << (ltm->tm_year-100) << std::setfill ('0') << std::setw (2) << (1 + ltm->tm_mon) << std::setfill ('0') << std::setw (2) << ltm->tm_mday << std::setfill ('0') << std::setw (2) << ltm->tm_hour << std::setfill ('0') << std::setw (2) << ltm->tm_min << std::setfill ('0') << std::setw (2) << ltm->tm_sec << "-" << count; const string::size_type subTypePos (contentType.find ("/")); if (subTypePos != string::npos) nameStream << "." << contentType.substr (subTypePos+1); ++count; LOG ("name: " << nameStream.str ()); return nameStream.str (); } // ================================================================================ void Attachment::removeSection (string &content, const string &beginTag, const string &endTag) { DEF_LOG ("Attachment::removeSection", "beginTag: " << beginTag << " endTag: " << endTag); for (string::size_type startPos (0); (startPos = caseInsensitiveFind (content, beginTag, startPos)) != string::npos; ) { string::size_type stopPos = caseInsensitiveFind (content, endTag, startPos); LOG_BUG (stopPos == startPos, content.erase (startPos, endTag.length ()); continue, "eMailShrinker: bug A1: removeSection: no " << beginTag); LOG_BUG (stopPos == string::npos, content.erase (startPos, beginTag.length ()); break, "eMailShrinker: bug A2: removeSection: no " << endTag); LOG ("KAZ start: " << startPos << " stop: " << stopPos); content.erase (startPos, stopPos+endTag.length ()-startPos); } } // ================================================================================ string Attachment::getSection (const string &content, const string &beginTag, const string &endTag) { DEF_LOG ("Attachment::getSection", "beginTag: " << beginTag << " endTag: " << endTag << " content: " << content); vector list; getSection (content, beginTag, endTag, list); size_t sum (0); for (const string &s : list) sum += s.length (); string result; result.reserve (sum); for (const string &s : list) result += s; LOG ("result: " << result); return result; } // ================================================================================ void Attachment::getSection (const string &content, const string &beginTag, const string &endTag, vector &result) { DEF_LOG ("Attachment::getSection", "beginTag: " << beginTag << " endTag: " << endTag << " content: " << content); for (string::size_type startPos (0); (startPos = caseInsensitiveFind (content, beginTag, startPos)) != string::npos; ) { LOG (beginTag << ": " << startPos); string::size_type stopPos = caseInsensitiveFind (content, endTag, startPos); LOG_BUG (stopPos == string::npos, break, "eMailShrinker: bug A3: " << endTag << " not found! at: " << startPos); LOG ("start: " << startPos << " stop: " << stopPos); LOG_BUG (startPos == stopPos, /**/, "eMailShrinker: bug A4: " << endTag << " without " << beginTag << " at: " << startPos); if (startPos != stopPos) { startPos += beginTag.length (); result.push_back (content.substr (startPos, stopPos-startPos)); } startPos = stopPos+endTag.length (); } } // ================================================================================ const string Attachment::getContentType () const { map::const_iterator it (env.find (contentTypeToken)); if (it == env.end ()) return ""; const string &contentTypeVal (it->second); const string::size_type semicolonPos = contentTypeVal.find (';'); if (semicolonPos == string::npos) return contentTypeVal; return contentTypeVal.substr (0, semicolonPos); } const string Attachment::getAttachName () const { DEF_LOG ("Attachment::getAttachName", ""); string result = getProp (contentTypeToken, nameRegEx); if (result.length ()) { LOG ("name=: " << result); encodedWord (result); return result; } result = getProp (contentTypeToken, nameCharsetRegEx); if (result.length ()) { LOG ("name*=: " << result); charsetValue (result); return result; } // XXX il faut composer s'il y a plusieurs ligne filename*x= result = getProp (contentDispositionToken, nameRegEx); if (result.length ()) { LOG ("filename=: " << result); encodedWord (result); return result; } // XXX il faut composer s'il y a plusieurs ligne filename*x*= result = getProp (contentDispositionToken, nameRegEx); if (result.length ()) { LOG ("filename*=: " << result); charsetValue (result); return result; } return getUnknown (getContentType ()); } const string & Attachment::getBoundary () const { return boundary; } const streamoff Attachment::getSize () const { return endPos-beginPos; } const string Attachment::getProp (const string &token, const regex ®Ex) const { DEF_LOG ("Attachment::getProp", "token: " << token); map::const_iterator it (env.find (token)); if (it == env.end ()) { LOG ("no token"); return ""; } const string &val (it->second); LOG ("val: " << val); if (!regex_match (val.begin (), val.end (), regEx)) { LOG ("no prop"); return ""; } return regex_replace (val, regEx, "$1"); } const bool Attachment::isBase64Encoding () const { return isDefProp (contentTransferEncodingToken, base64Token); } const bool Attachment::isQuotedPrintableEnconding () const { return isDefProp (contentTransferEncodingToken, quotedPrintableToken); } const bool Attachment::isTextBase64 () const { return !getProp (contentTypeToken, textRegEx).empty () && isBase64Encoding (); } const bool Attachment::isDefProp (const string &token, const string &val) const { DEF_LOG ("Attachment::getProp", "getProp token: " << token << " val: " << val); map::const_iterator it (env.find (token)); if (it == env.end ()) return false; // XXX case insensitive ?? return it->second.find (val) != string::npos; } // ================================================================================ Attachment::Attachment (ifstream &mbox, const int &level, const streamoff beginInParent, streamoff &curPos) : level (level), beginInParent (beginInParent), beginPos (curPos), contentPos (0), endPos (0), toExtract (false), toUpdate (false), toDisclaim (false), boundaryMiddleSize (0) { DEF_LOG ("Attachment::Attachment", "curPos: " << curPos << " level: " << level); readMime (mbox, curPos); readBoundaries (mbox, curPos); } // ================================================================================ void Attachment::readMime (ifstream &mbox, streamoff &curPos) { DEF_LOG ("Attachment::readMime", "curPos: " << curPos); string lastVar; string line; for (; getline (mbox, line); ) { LOG ("pos: " << curPos << " line: " << line); curPos += line.length () + 1; if (line.empty ()) break; if (line[0] == ' ' || line[0] == '\t') { if (lastVar.empty ()) { LOG_BUG (true, /**/, "eMailShrinker: bug A5: not compliant MIME. pos: " << (curPos - (line.length () + 1)) << " line: " << line); } else { LOG ("add line to var: " << line); env.find (lastVar)->second += line; LOG ("new val: " << env.find (lastVar)->second); } continue; } string::size_type colonPos = line.find (':'); if (colonPos != string::npos) { lastVar = line.substr (0, colonPos); toLower (lastVar); LOG ("find var: " << lastVar); string val (line.length () >= colonPos+2 ? line.substr (colonPos+2) : ""); // XXX check RFC " " after ": " LOG ("new var: " << lastVar << " <=> " << val); env [lastVar] = val; } } LOG ("end of mime"); contentPos = curPos; cid = getProp (contentIDToken, cidDefRegEx); boundary = getProp (contentTypeToken, boundaryRegEx); LOG ("boundary: " << boundary); if (boundary.length ()) { boundary = "--"+boundary+"--"; boundaryMiddleSize = boundary.length () - 2; } LOG ("readMime contentPos: " << contentPos << " cid: " << cid << " boundary: " << boundary); } // ================================================================================ void Attachment::readBoundaries (ifstream &mbox, streamoff &curPos) { DEF_LOG ("Attachment::readBoundaries", "curPos: " << curPos); if (caseInsensitiveFind (getContentType (), RFC822) != string::npos) { subAttachements.push_back (Attachment (mbox, level+1, curPos, curPos)); subAttachements.back ().endPos = curPos; return; } if (boundary.empty ()) return; for (; nextBondary (mbox, curPos); ) ; } bool Attachment::nextBondary (ifstream &mbox, streamoff &curPos) { DEF_LOG ("Attachment::nextBondary", "curPos: " << curPos << " boundary: " << boundary); bool isTextBase64 (subAttachements.size () && subAttachements.back ().isTextBase64 ()); LOG ("isTextBase64: " << isTextBase64 << " attach: " << *this); for (string prev, line; getline (mbox, line); ) { LOG ("curPos: " << curPos << " line: " << line); streamoff lastPos = curPos; curPos += line.length () + 1; string::size_type bpos = line.find (boundary.c_str (), 0, boundaryMiddleSize); if (bpos == string::npos) { string clearLine (line); if (isTextBase64) base64Decode (clearLine); string couple (prev+clearLine); for (vector ::iterator it = stringsToUpdate.begin (); it != stringsToUpdate.end (); ++it) if (couple.find (*it) != string::npos) { LOG ("find: "+ *it); LOG ("size: " << subAttachements.size ()); if (subAttachements.size ()) subAttachements.back ().toUpdate = true; else LOG_BUG (true, continue, "eMailShrinker: bug A10: boundary format ? " << *this); } prev = clearLine; continue; } LOG ("find: " << boundary); LOG ("lastPos: " << lastPos << " bpos: " << bpos << " boundaryMiddleSize: " << boundaryMiddleSize); if (subAttachements.size ()) subAttachements.back ().endPos = lastPos; LOG ("line: " << line << "bpos+boundaryMiddleSize: " << (bpos+boundaryMiddleSize) << " find: " << line.find ("--", bpos+boundaryMiddleSize)); bpos += boundaryMiddleSize; if (line.find ("--", bpos) == bpos) { LOG ("end"); return false; } subAttachements.push_back (Attachment (mbox, level+1, lastPos, curPos)); return true; } endPos = curPos; return false; } // ================================================================================ void Attachment::markDisclaim (bool &plainMarked, bool &htmlMarked) { if (plainMarked && htmlMarked) return; string multiProp = getProp (contentTypeToken, multiRegEx); // LOG_BUG (multiProp == ALTERNATIVE && subAttachements.size () != 2, continue, "eMailShrinker: bug A6: alternative give not 1 case (" << subAttachements.size () << ")."); if (multiProp.length ()) for (Attachment &subAttach : subAttachements) subAttach.markDisclaim (plainMarked, htmlMarked); string textProp = getProp (contentTypeToken, textRegEx); if (textProp.empty ()) return; if (!plainMarked && textProp == PLAIN) plainMarked = toUpdate = toDisclaim = true; if (!htmlMarked && textProp == HTML) htmlMarked = toUpdate = toDisclaim = true; } // ================================================================================ bool Attachment::markSignificant (const string &parentMultiProp, const streamoff &minAttachSize, ifstream &mbox, vector &allMarkedPtrs) { DEF_LOG ("Attachment::markSignificant", "parentMultiProp: " << parentMultiProp << " minAttachSize: " << minAttachSize); string textProp = getProp (contentTypeToken, textRegEx); bool cantBeExtract ((parentMultiProp == ALTERNATIVE && (textProp == PLAIN || textProp == HTML)) || (parentMultiProp == RELATED && textProp == HTML)); string multiProp = getProp (contentTypeToken, multiRegEx); for (Attachment &sub : subAttachements) cantBeExtract |= sub.markSignificant (multiProp, minAttachSize, mbox, allMarkedPtrs); if (getProp (contentTypeToken, textRegEx) == HTML) { string content = getContent (mbox); vector imgs; getSection (content, IMG_BEGIN, IMG_END, imgs); EmbeddedData::fillEmbeddedData (imgs, minAttachSize, embeddedData); if (embeddedData.size ()) toUpdate = true; } cantBeExtract |= toUpdate; if (boundary.empty () && getSize () >= minAttachSize && !cantBeExtract) cantBeExtract = toExtract = true; // XXX cantBeExtract ? if (toExtract || toUpdate || toDisclaim) allMarkedPtrs.push_back (this); return cantBeExtract; } // ================================================================================ string Attachment::getContent (ifstream &mbox) const { DEF_LOG ("Attachment::getContent", "contentPos: " << contentPos); string content; content.resize (endPos-contentPos); mbox.seekg (contentPos, ios::beg); mbox.read (&content[0], endPos-contentPos); if (isBase64Encoding ()) base64Decode (content); if (isQuotedPrintableEnconding ()) quotedDecode (content); return content; } // ================================================================================ void Attachment::println (ofstream &outbox, string content) const { DEF_LOG ("Attachment::println", "content: " << content); if (isBase64Encoding ()) base64Encode (content); if (isQuotedPrintableEnconding ()) quotedEncode (content); outbox << content; if (content.length () && content.back () != '\n') outbox << endl; } // ================================================================================ void Attachment::replaceEmbedded (string &content) const { DEF_LOG ("Attachment::replaceEmbedded", "content.length: " << content.length ()); if (!embeddedData.size ()) return; int imgIdx (-1); string::size_type startPos (0); for (const EmbeddedData &embedded : embeddedData) { LOG ("embedded: " << embedded); for ( ; ; ) { startPos = caseInsensitiveFind (content, IMG_BEGIN, startPos); LOG_BUG (startPos == string::npos, return, "eMailShrinker: bug A7: can't find " << IMG_BEGIN); ++imgIdx; if (embedded.imgIdx >= imgIdx) break; startPos += IMG_BEGIN.length (); } startPos = caseInsensitiveFind (content, SRC_BEGIN, startPos); LOG_BUG (startPos == string::npos, return, "eMailShrinker: bug A8: can't find " << SRC_BEGIN ); startPos += SRC_BEGIN.length (); const string::size_type endPos (content.find ("\"", startPos)); LOG_BUG (endPos == string::npos, return, "eMailShrinker: bug A9: can't find end of " << SRC_BEGIN ); content.replace (startPos, endPos-startPos, embedded.downloadUrl); } } // ================================================================================ ostream& kaz::operator << (ostream& os, const Attachment& attachment) { string prop, sep; if (attachment.toExtract) { prop = "to extract"; sep = ", "; } if (attachment.toUpdate) { prop += sep+"need update"; sep = ", "; } if (attachment.toDisclaim) { prop += sep+"need diclaim"; sep = ", "; } if (attachment.embeddedData.size ()) { prop += sep+"embeddedData"; } if (prop.length ()) prop = " ["+prop+"]"; os << ("****************************************"+40-(attachment.level % 20)*2) << setw (10) << SizeArg (attachment.getSize ()) << " " << attachment.getContentType () << prop << (attachment.cid.length () ? " id: "+attachment.cid : "") << (attachment.boundary.length () ? " boundary: "+attachment.boundary : "") << " (" << attachment.beginPos << " / " << attachment.contentPos << " / " << attachment.endPos << ") " << endl; for (const EmbeddedData &embedded : attachment.embeddedData) os << setw (((attachment.level+1) % 20)*2) << "" << setw (10) << SizeArg (embedded.dataLength) << " embedded [to extract] " << embedded; for (const Attachment &sub : attachment.subAttachements) { os << sub; } return os; } // ================================================================================