From 64f3e49c37ab1753e4366da14321aa48ef8a0311 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fran=C3=A7ois?= Date: Thu, 6 May 2021 09:58:16 +0200 Subject: [PATCH] C++ src --- LICENCE | 33 ++ src/cpp/Attachment.cpp | 504 +++++++++++++++++++++++++++ src/cpp/EmbeddedData.cpp | 97 ++++++ src/cpp/MainAttachment.cpp | 598 +++++++++++++++++++++++++++++++++ src/cpp/SizeArg.cpp | 101 ++++++ src/cpp/eMailShrinker.cpp | 232 +++++++++++++ src/cpp/jirafeauAPI.cpp | 293 ++++++++++++++++ src/cpp/kazDebug.cpp | 86 +++++ src/cpp/kazMisc.cpp | 437 ++++++++++++++++++++++++ src/include/Attachment.hpp | 154 +++++++++ src/include/EmbeddedData.hpp | 70 ++++ src/include/MainAttachment.hpp | 121 +++++++ src/include/SizeArg.hpp | 74 ++++ src/include/kazDebug.hpp | 134 ++++++++ src/include/kazMisc.hpp | 141 ++++++++ 15 files changed, 3075 insertions(+) create mode 100644 LICENCE create mode 100644 src/cpp/Attachment.cpp create mode 100644 src/cpp/EmbeddedData.cpp create mode 100644 src/cpp/MainAttachment.cpp create mode 100644 src/cpp/SizeArg.cpp create mode 100644 src/cpp/eMailShrinker.cpp create mode 100644 src/cpp/jirafeauAPI.cpp create mode 100644 src/cpp/kazDebug.cpp create mode 100644 src/cpp/kazMisc.cpp create mode 100644 src/include/Attachment.hpp create mode 100644 src/include/EmbeddedData.hpp create mode 100644 src/include/MainAttachment.hpp create mode 100644 src/include/SizeArg.hpp create mode 100644 src/include/kazDebug.hpp create mode 100644 src/include/kazMisc.hpp diff --git a/LICENCE b/LICENCE new file mode 100644 index 0000000..acfe6d1 --- /dev/null +++ b/LICENCE @@ -0,0 +1,33 @@ +//////////////////////////////////////////////////////////////////////////// +// Copyright KAZ 2021 // +// // +// contact (at) kaz.bzh // +// // +// This software is a filter to shrink email by attachment extraction. // +// // +// This software is governed by the CeCILL-B license under French law and // +// abiding by the rules of distribution of free software. 
You can use, // +// modify and/or redistribute the software under the terms of the // +// CeCILL-B license as circulated by CEA, CNRS and INRIA at the following // +// URL "http://www.cecill.info". // +// // +// As a counterpart to the access to the source code and rights to copy, // +// modify and redistribute granted by the license, users are provided // +// only with a limited warranty and the software's author, the holder of // +// the economic rights, and the successive licensors have only limited // +// liability. // +// // +// In this respect, the user's attention is drawn to the risks associated // +// with loading, using, modifying and/or developing or reproducing the // +// software by the user in light of its specific status of free software, // +// that may mean that it is complicated to manipulate, and that also // +// therefore means that it is reserved for developers and experienced // +// professionals having in-depth computer knowledge. Users are therefore // +// encouraged to load and test the software's suitability as regards // +// their requirements in conditions enabling the security of their // +// systems and/or data to be ensured and, more generally, to use and // +// operate it in the same conditions as regards security. // +// // +// The fact that you are presently reading this means that you have had // +// knowledge of the CeCILL-B license and that you accept its terms. // +//////////////////////////////////////////////////////////////////////////// diff --git a/src/cpp/Attachment.cpp b/src/cpp/Attachment.cpp new file mode 100644 index 0000000..6a3f70c --- /dev/null +++ b/src/cpp/Attachment.cpp @@ -0,0 +1,504 @@ +//////////////////////////////////////////////////////////////////////////// +// Copyright KAZ 2021 // +// // +// contact (at) kaz.bzh // +// // +// This software is a filter to shrink email by attachment extraction. 
// +// // +// This software is governed by the CeCILL-B license under French law and // +// abiding by the rules of distribution of free software. You can use, // +// modify and/or redistribute the software under the terms of the // +// CeCILL-B license as circulated by CEA, CNRS and INRIA at the following // +// URL "http://www.cecill.info". // +// // +// As a counterpart to the access to the source code and rights to copy, // +// modify and redistribute granted by the license, users are provided // +// only with a limited warranty and the software's author, the holder of // +// the economic rights, and the successive licensors have only limited // +// liability. // +// // +// In this respect, the user's attention is drawn to the risks associated // +// with loading, using, modifying and/or developing or reproducing the // +// software by the user in light of its specific status of free software, // +// that may mean that it is complicated to manipulate, and that also // +// therefore means that it is reserved for developers and experienced // +// professionals having in-depth computer knowledge. Users are therefore // +// encouraged to load and test the software's suitability as regards // +// their requirements in conditions enabling the security of their // +// systems and/or data to be ensured and, more generally, to use and // +// operate it in the same conditions as regards security. // +// // +// The fact that you are presently reading this means that you have had // +// knowledge of the CeCILL-B license and that you accept its terms. 
// +//////////////////////////////////////////////////////////////////////////// + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "kazDebug.hpp" +#include "kazMisc.hpp" +#include "SizeArg.hpp" +#include "Attachment.hpp" + +using namespace std; +using namespace kaz; + +// ================================================================================ +const string Attachment::contentTypeToken ("content-type"); +const string Attachment::contentDispositionToken ("content-disposition"); +const string Attachment::contentTransferEncodingToken ("content-transfer-encoding"); +const string Attachment::base64Token ("base64"); +const string Attachment::quotedPrintableToken ("quoted-printable"); +const string Attachment::contentIDToken ("content-id"); +const string Attachment::PLAIN ("plain"); +const string Attachment::HTML ("html"); +const string Attachment::RELATED ("related"); +const string Attachment::ALTERNATIVE ("alternative"); + + +const regex Attachment::nameCharsetRegEx (".*name\\*=(.*)"); +const regex Attachment::nameRegEx (".*name=\"([^\"]*)\".*"); +const regex Attachment::boundaryRegEx (".*boundary=\"?([^\" ]*)\"?.*"); +const regex Attachment::cidDefRegEx (".*<([^>]*)>.*"); +const regex Attachment::textRegEx (".*text/("+PLAIN+"|"+HTML+").*"); +const regex Attachment::multiRegEx ("\\s*multipart/(mixed|"+RELATED+"|"+ALTERNATIVE+").*"); + +const string Attachment::IMG_BEGIN (""); + + + +static const string SRC_BEGIN ("SRC=\""); +static const string RFC822 ("message/rfc822"); + +// ================================================================================ +string +Attachment::getUnknown (const string &contentType) { + DEF_LOG ("Attachment::getUnknown", "contentType: " << contentType); + static time_t now (time (NULL)); + static int count (0); + + tm *ltm = localtime (&now); + ostringstream nameStream; + nameStream << "U-" + << std::setfill ('0') << std::setw (2) << (ltm->tm_year-100) + << std::setfill ('0') 
<< std::setw (2) << (1 + ltm->tm_mon) + << std::setfill ('0') << std::setw (2) << ltm->tm_mday + << std::setfill ('0') << std::setw (2) << ltm->tm_hour + << std::setfill ('0') << std::setw (2) << ltm->tm_min + << std::setfill ('0') << std::setw (2) << ltm->tm_sec + << "-" << count; + const string::size_type subTypePos (contentType.find ("/")); + if (subTypePos != string::npos) + nameStream << "." << contentType.substr (subTypePos+1); + ++count; + LOG ("name: " << nameStream.str ()); + return nameStream.str (); +} + +// ================================================================================ +void +Attachment::removeSection (string &content, const string &beginTag, const string &endTag) { + DEF_LOG ("Attachment::removeSection", "beginTag: " << beginTag << " endTag: " << endTag); + for (string::size_type startPos (0); + (startPos = caseInsensitiveFind (content, beginTag, startPos)) != string::npos; + ) { + string::size_type stopPos = caseInsensitiveFind (content, endTag, startPos); + + LOG_BUG (stopPos == startPos, content.erase (startPos, endTag.length ()); continue, "eMailShrinker: bug A1: removeSection: no " << beginTag); + LOG_BUG (stopPos == string::npos, content.erase (startPos, beginTag.length ()); break, "eMailShrinker: bug A2: removeSection: no " << endTag); + LOG ("KAZ start: " << startPos << " stop: " << stopPos); + + content.erase (startPos, stopPos+endTag.length ()-startPos); + } +} + +// ================================================================================ +string +Attachment::getSection (const string &content, const string &beginTag, const string &endTag) { + DEF_LOG ("Attachment::getSection", "beginTag: " << beginTag << " endTag: " << endTag << " content: " << content); + vector list; + getSection (content, beginTag, endTag, list); + size_t sum (0); + for (const string &s : list) + sum += s.length (); + string result; + result.reserve (sum); + for (const string &s : list) + result += s; + LOG ("result: " << result); + return result; 
+} + +// ================================================================================ +void +Attachment::getSection (const string &content, const string &beginTag, const string &endTag, vector &result) { + DEF_LOG ("Attachment::getSection", "beginTag: " << beginTag << " endTag: " << endTag << " content: " << content); + for (string::size_type startPos (0); + (startPos = caseInsensitiveFind (content, beginTag, startPos)) != string::npos; + ) { + LOG (beginTag << ": " << startPos); + string::size_type stopPos = caseInsensitiveFind (content, endTag, startPos); + + LOG_BUG (stopPos == string::npos, break, "eMailShrinker: bug A3: " << endTag << " not found! at: " << startPos); + LOG ("start: " << startPos << " stop: " << stopPos); + + LOG_BUG (startPos == stopPos, /**/, "eMailShrinker: bug A4: " << endTag << " without " << beginTag << " at: " << startPos); + if (startPos != stopPos) { + startPos += beginTag.length (); + result.push_back (content.substr (startPos, stopPos-startPos)); + } + startPos = stopPos+endTag.length (); + } +} + +// ================================================================================ +const string +Attachment::getContentType () const { + map::const_iterator it (env.find (contentTypeToken)); + if (it == env.end ()) + return ""; + const string &contentTypeVal (it->second); + const string::size_type semicolonPos = contentTypeVal.find (';'); + if (semicolonPos == string::npos) + return contentTypeVal; + return contentTypeVal.substr (0, semicolonPos); +} + +const string +Attachment::getAttachName () const { + DEF_LOG ("Attachment::getAttachName", ""); + string result = getProp (contentTypeToken, nameRegEx); + if (result.length ()) { + LOG ("name=: " << result); + encodedWord (result); + return result; + } + result = getProp (contentTypeToken, nameCharsetRegEx); + if (result.length ()) { + LOG ("name*=: " << result); + charsetValue (result); + return result; + } + // XXX il faut composer s'il y a plusieurs ligne filename*x= + result = 
getProp (contentDispositionToken, nameRegEx); + if (result.length ()) { + LOG ("filename=: " << result); + encodedWord (result); + return result; + } + // XXX il faut composer s'il y a plusieurs ligne filename*x*= + result = getProp (contentDispositionToken, nameRegEx); + if (result.length ()) { + LOG ("filename*=: " << result); + charsetValue (result); + return result; + } + return getUnknown (getContentType ()); +} + +const string & +Attachment::getBoundary () const { + return boundary; +} + +const streamoff +Attachment::getSize () const { + return endPos-beginPos; +} + +const string +Attachment::getProp (const string &token, const regex ®Ex) const { + DEF_LOG ("Attachment::getProp", "token: " << token); + map::const_iterator it (env.find (token)); + if (it == env.end ()) { + LOG ("no token"); + return ""; + } + const string &val (it->second); + LOG ("val: " << val); + if (!regex_match (val.begin (), val.end (), regEx)) { + LOG ("no prop"); + return ""; + } + return regex_replace (val, regEx, "$1"); +} + +const bool +Attachment::isBase64Encoding () const { + return isDefProp (contentTransferEncodingToken, base64Token); +} + +const bool +Attachment::isQuotedPrintableEnconding () const { + return isDefProp (contentTransferEncodingToken, quotedPrintableToken); +} + +const bool +Attachment::isTextBase64 () const { + return !getProp (contentTypeToken, textRegEx).empty () && isBase64Encoding (); +} + +const bool +Attachment::isDefProp (const string &token, const string &val) const { + DEF_LOG ("Attachment::getProp", "getProp token: " << token << " val: " << val); + map::const_iterator it (env.find (token)); + if (it == env.end ()) + return false; + // XXX case insensitive ?? 
+ return it->second.find (val) != string::npos; +} + +// ================================================================================ +Attachment::Attachment (ifstream &mbox, const int &level, const streamoff beginInParent, streamoff &curPos) + : level (level), + beginInParent (beginInParent), + beginPos (curPos), + contentPos (0), + endPos (0), + toExtract (false), + toUpdate (false), + toDisclaim (false), + boundaryMiddleSize (0) { + DEF_LOG ("Attachment::Attachment", "curPos: " << curPos << " level: " << level); + readMime (mbox, curPos); + readBoundaries (mbox, curPos); +} + +// ================================================================================ +void +Attachment::readMime (ifstream &mbox, streamoff &curPos) { + DEF_LOG ("Attachment::readMime", "curPos: " << curPos); + string lastVar; + string line; + for (; getline (mbox, line); ) { + LOG ("pos: " << curPos << " line: " << line); + curPos += line.length () + 1; + if (line.empty ()) + break; + if (line[0] == ' ' || line[0] == '\t') { + if (lastVar.empty ()) { + + LOG_BUG (true, /**/, "eMailShrinker: bug A5: not compliant MIME. pos: " << (curPos - (line.length () + 1)) << " line: " << line); + } else { + LOG ("add line to var: " << line); + env.find (lastVar)->second += line; + LOG ("new val: " << env.find (lastVar)->second); + } + continue; + } + string::size_type colonPos = line.find (':'); + if (colonPos != string::npos) { + lastVar = line.substr (0, colonPos); + toLower (lastVar); + LOG ("find var: " << lastVar); + string val (line.length () >= colonPos+2 ? 
line.substr (colonPos+2) : ""); // XXX check RFC " " after ": " + LOG ("new var: " << lastVar << " <=> " << val); + env [lastVar] = val; + } + } + LOG ("end of mime"); + + contentPos = curPos; + cid = getProp (contentIDToken, cidDefRegEx); + boundary = getProp (contentTypeToken, boundaryRegEx); + LOG ("boundary: " << boundary); + if (boundary.length ()) { + boundary = "--"+boundary+"--"; + boundaryMiddleSize = boundary.length () - 2; + } + LOG ("readMime contentPos: " << contentPos << " cid: " << cid << " boundary: " << boundary); +} + +// ================================================================================ +void +Attachment::readBoundaries (ifstream &mbox, streamoff &curPos) { + DEF_LOG ("Attachment::readBoundaries", "curPos: " << curPos); + + if (caseInsensitiveFind (getContentType (), RFC822) != string::npos) { + subAttachements.push_back (Attachment (mbox, level+1, curPos, curPos)); + subAttachements.back ().endPos = curPos; + return; + } + if (boundary.empty ()) + return; + for (; nextBondary (mbox, curPos); ) + ; +} + +bool +Attachment::nextBondary (ifstream &mbox, streamoff &curPos) { + DEF_LOG ("Attachment::nextBondary", "curPos: " << curPos << " boundary: " << boundary); + bool isTextBase64 (subAttachements.size () && subAttachements.back ().isTextBase64 ()); + LOG ("isTextBase64: " << isTextBase64 << " attach: " << *this); + for (string prev, line; getline (mbox, line); ) { + LOG ("curPos: " << curPos << " line: " << line); + streamoff lastPos = curPos; + curPos += line.length () + 1; + + string::size_type bpos = line.find (boundary.c_str (), 0, boundaryMiddleSize); + if (bpos == string::npos) { + string clearLine (line); + if (isTextBase64) + base64Decode (clearLine); + string couple (prev+clearLine); + for (vector ::iterator it = stringsToUpdate.begin (); + it != stringsToUpdate.end (); + ++it) + if (couple.find (*it) != string::npos) { + LOG ("find: "+ *it); + subAttachements.back ().toUpdate = true; + } + prev = clearLine; + continue; + } 
+ LOG ("find: " << boundary); + LOG ("lastPos: " << lastPos << " bpos: " << bpos << " boundaryMiddleSize: " << boundaryMiddleSize); + if (subAttachements.size ()) + subAttachements.back ().endPos = lastPos; + LOG ("line: " << line << "bpos+boundaryMiddleSize: " << (bpos+boundaryMiddleSize) << " find: " << line.find ("--", bpos+boundaryMiddleSize)); + bpos += boundaryMiddleSize; + if (line.find ("--", bpos) == bpos) { + LOG ("end"); + return false; + } + subAttachements.push_back (Attachment (mbox, level+1, lastPos, curPos)); + return true; + } + endPos = curPos; + return false; +} + +// ================================================================================ +void +Attachment::markDisclaim (bool &plainMarked, bool &htmlMarked) { + if (plainMarked && htmlMarked) + return; + string multiProp = getProp (contentTypeToken, multiRegEx); + // LOG_BUG (multiProp == ALTERNATIVE && subAttachements.size () != 2, continue, "eMailShrinker: bug A6: alternative give not 1 case (" << subAttachements.size () << ")."); + if (multiProp.length ()) + for (Attachment &subAttach : subAttachements) + subAttach.markDisclaim (plainMarked, htmlMarked); + string textProp = getProp (contentTypeToken, textRegEx); + if (textProp.empty ()) + return; + if (!plainMarked && textProp == PLAIN) + plainMarked = toUpdate = toDisclaim = true; + if (!htmlMarked && textProp == HTML) + htmlMarked = toUpdate = toDisclaim = true; +} + +// ================================================================================ +bool +Attachment::markSignificant (const string &parentMultiProp, const streamoff &minAttachSize, ifstream &mbox, vector &allMarkedPtrs) { + DEF_LOG ("Attachment::markSignificant", "parentMultiProp: " << parentMultiProp << " minAttachSize: " << minAttachSize); + string textProp = getProp (contentTypeToken, textRegEx); + bool cantBeExtract ((parentMultiProp == ALTERNATIVE && (textProp == PLAIN || textProp == HTML)) || + (parentMultiProp == RELATED && textProp == HTML)); + string 
multiProp = getProp (contentTypeToken, multiRegEx); + for (Attachment &sub : subAttachements) + cantBeExtract |= sub.markSignificant (multiProp, minAttachSize, mbox, allMarkedPtrs); + if (getProp (contentTypeToken, textRegEx) == HTML) { + string content = getContent (mbox); + vector imgs; + getSection (content, IMG_BEGIN, IMG_END, imgs); + EmbeddedData::fillEmbeddedData (imgs, minAttachSize, embeddedData); + if (embeddedData.size ()) + toUpdate = true; + } + cantBeExtract |= toUpdate; + if (boundary.empty () && getSize () >= minAttachSize && !cantBeExtract) + cantBeExtract = toExtract = true; // XXX cantBeExtract ? + if (toExtract || toUpdate || toDisclaim) + allMarkedPtrs.push_back (this); + return cantBeExtract; +} + +// ================================================================================ +string +Attachment::getContent (ifstream &mbox) const { + DEF_LOG ("Attachment::getContent", "contentPos: " << contentPos); + string content; + content.resize (endPos-contentPos); + mbox.seekg (contentPos, ios::beg); + mbox.read (&content[0], endPos-contentPos); + if (isBase64Encoding ()) + base64Decode (content); + if (isQuotedPrintableEnconding ()) + quotedDecode (content); + return content; +} + +// ================================================================================ +void +Attachment::println (ofstream &outbox, string content) const { + DEF_LOG ("Attachment::println", "content: " << content); + if (isBase64Encoding ()) + base64Encode (content); + if (isQuotedPrintableEnconding ()) + quotedEncode (content); + outbox << content; + if (content.length () && content.back () != '\n') + outbox << endl; +} + +// ================================================================================ +void +Attachment::replaceEmbedded (string &content) const { + DEF_LOG ("Attachment::replaceEmbedded", "content.length: " << content.length ()); + if (!embeddedData.size ()) + return; + int imgIdx (-1); + string::size_type startPos (0); + for (const EmbeddedData 
&embedded : embeddedData) { + LOG ("embedded: " << embedded); + for ( ; ; ) { + startPos = caseInsensitiveFind (content, IMG_BEGIN, startPos); + LOG_BUG (startPos == string::npos, return, "eMailShrinker: bug A7: can't find " << IMG_BEGIN); + ++imgIdx; + if (embedded.imgIdx >= imgIdx) + break; + startPos += IMG_BEGIN.length (); + } + startPos = caseInsensitiveFind (content, SRC_BEGIN, startPos); + + LOG_BUG (startPos == string::npos, return, "eMailShrinker: bug A8: can't find " << SRC_BEGIN ); + startPos += SRC_BEGIN.length (); + const string::size_type endPos (content.find ("\"", startPos)); + + LOG_BUG (endPos == string::npos, return, "eMailShrinker: bug A9: can't find end of " << SRC_BEGIN ); + content.replace (startPos, endPos-startPos, embedded.downloadUrl); + } +} + +// ================================================================================ +ostream& +kaz::operator << (ostream& os, const Attachment& attachment) { + string prop, sep; + if (attachment.toExtract) { prop = "to extract"; sep = ", "; } + if (attachment.toUpdate) { prop += sep+"need update"; sep = ", "; } + if (attachment.toDisclaim) { prop += sep+"need diclaim"; sep = ", "; } + if (attachment.embeddedData.size ()) { prop += sep+"embeddedData"; } + if (prop.length ()) + prop = " ["+prop+"]"; + + os << setw ((attachment.level % 20)*2) << "" << setw (10) << SizeArg (attachment.getSize ()) << " " << attachment.getContentType () + << prop << (attachment.cid.length () ? " id: "+attachment.cid : "") + << (attachment.boundary.length () ? 
" boundary: "+attachment.boundary : "") + << " (" << attachment.beginPos << " / " << attachment.contentPos << " / " << attachment.endPos << ") " << endl; + for (const EmbeddedData &embedded : attachment.embeddedData) + os << setw (((attachment.level+1) % 20)*2) << "" << setw (10) << SizeArg (embedded.dataLength) << " embedded [to extract] " << embedded; + for (const Attachment &sub : attachment.subAttachements) { + os << sub; + } + return os; +} + +// ================================================================================ diff --git a/src/cpp/EmbeddedData.cpp b/src/cpp/EmbeddedData.cpp new file mode 100644 index 0000000..d998014 --- /dev/null +++ b/src/cpp/EmbeddedData.cpp @@ -0,0 +1,97 @@ +//////////////////////////////////////////////////////////////////////////// +// Copyright KAZ 2021 // +// // +// contact (at) kaz.bzh // +// // +// This software is a filter to shrink email by attachment extraction. // +// // +// This software is governed by the CeCILL-B license under French law and // +// abiding by the rules of distribution of free software. You can use, // +// modify and/or redistribute the software under the terms of the // +// CeCILL-B license as circulated by CEA, CNRS and INRIA at the following // +// URL "http://www.cecill.info". // +// // +// As a counterpart to the access to the source code and rights to copy, // +// modify and redistribute granted by the license, users are provided // +// only with a limited warranty and the software's author, the holder of // +// the economic rights, and the successive licensors have only limited // +// liability. 
// +// // +// In this respect, the user's attention is drawn to the risks associated // +// with loading, using, modifying and/or developing or reproducing the // +// software by the user in light of its specific status of free software, // +// that may mean that it is complicated to manipulate, and that also // +// therefore means that it is reserved for developers and experienced // +// professionals having in-depth computer knowledge. Users are therefore // +// encouraged to load and test the software's suitability as regards // +// their requirements in conditions enabling the security of their // +// systems and/or data to be ensured and, more generally, to use and // +// operate it in the same conditions as regards security. // +// // +// The fact that you are presently reading this means that you have had // +// knowledge of the CeCILL-B license and that you accept its terms. // +//////////////////////////////////////////////////////////////////////////// + +#include "kazDebug.hpp" +#include "kazMisc.hpp" +#include "EmbeddedData.hpp" +#include "Attachment.hpp" + +using namespace std; +using namespace kaz; + +// ================================================================================ +static const string EMBEDDED_TAG ("SRC=\"DATA:"); + +// ================================================================================ +EmbeddedData::EmbeddedData (const int &imgIdx, const string &contentType, const string &name, const string::size_type &startData, const string::size_type &dataLength) + : imgIdx (imgIdx), + contentType (contentType), + name (name), + startData (startData), + dataLength (dataLength) { + DEF_LOG ("EmbeddedData::EmbeddedData", "imgIdx: " << imgIdx << " contentType:" << contentType << " name:" << name << " startData:" << startData << " dataLength:" << dataLength); +} + +// ================================================================================ +void +EmbeddedData::fillEmbeddedData (const vector &imgs, const streamoff 
&minAttachSize, vector &data) { + DEF_LOG ("EmbeddedData::fillEmbeddedData", "imgs.size: " << imgs.size () << " minAttachSize:" << minAttachSize << " data.size:" << data.size ()); + + int imgIdx (-1); + for (const string &img : imgs) { + ++imgIdx; + if (streamoff (img.length ()) < minAttachSize) + continue; + string::size_type startPos (caseInsensitiveFind (img, EMBEDDED_TAG)); + if (startPos == string::npos) + continue; + startPos += EMBEDDED_TAG.length (); + // XXX check base64 ? + string::size_type endPos = img.find_first_of (";,", startPos); + + LOG_BUG (endPos == string::npos, continue, "eMailShrinker: bug E1: can't find end of contentType" ); + const string contentType (img.substr (startPos, endPos-startPos)); + const string name (Attachment::getUnknown (contentType)); + startPos = img.find (',', startPos); + + LOG_BUG (startPos == string::npos, continue, "eMailShrinker: bug E2: can't find start data" ); + + ++startPos; + endPos = img.find ('"', startPos); + data.push_back (EmbeddedData (imgIdx, contentType, name, startPos, endPos-startPos)); + } +} + +// ================================================================================ +ostream& +kaz::operator << (ostream& os, const EmbeddedData& embeddedData) { + os << embeddedData.imgIdx << ": " + << embeddedData.contentType << " - " << embeddedData.name + << " (" << embeddedData.startData << " / " << embeddedData.dataLength << ") " + << embeddedData.downloadUrl << " - " << embeddedData.downloadId + << endl; + return os; +} + +// ================================================================================ diff --git a/src/cpp/MainAttachment.cpp b/src/cpp/MainAttachment.cpp new file mode 100644 index 0000000..43fd742 --- /dev/null +++ b/src/cpp/MainAttachment.cpp @@ -0,0 +1,598 @@ +//////////////////////////////////////////////////////////////////////////// +// Copyright KAZ 2021 // +// // +// contact (at) kaz.bzh // +// // +// This software is a filter to shrink email by attachment extraction. 
// +// // +// This software is governed by the CeCILL-B license under French law and // +// abiding by the rules of distribution of free software. You can use, // +// modify and/or redistribute the software under the terms of the // +// CeCILL-B license as circulated by CEA, CNRS and INRIA at the following // +// URL "http://www.cecill.info". // +// // +// As a counterpart to the access to the source code and rights to copy, // +// modify and redistribute granted by the license, users are provided // +// only with a limited warranty and the software's author, the holder of // +// the economic rights, and the successive licensors have only limited // +// liability. // +// // +// In this respect, the user's attention is drawn to the risks associated // +// with loading, using, modifying and/or developing or reproducing the // +// software by the user in light of its specific status of free software, // +// that may mean that it is complicated to manipulate, and that also // +// therefore means that it is reserved for developers and experienced // +// professionals having in-depth computer knowledge. Users are therefore // +// encouraged to load and test the software's suitability as regards // +// their requirements in conditions enabling the security of their // +// systems and/or data to be ensured and, more generally, to use and // +// operate it in the same conditions as regards security. // +// // +// The fact that you are presently reading this means that you have had // +// knowledge of the CeCILL-B license and that you accept its terms. 
// +//////////////////////////////////////////////////////////////////////////// + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "kazDebug.hpp" +#include "kazMisc.hpp" +#include "SizeArg.hpp" +#include "Attachment.hpp" +#include "MainAttachment.hpp" + +using namespace std; +using namespace kaz; + +static const string KAZ_WEB_SITE = "https://kaz.bzh/"; +static const string TMPL_DOWNLOAD = "{{DOWNLOAD}}"; +static const string TMPL_FILENAME = "{{FILENAME}}"; +static const string CID = "cid:"; + +static const string KAZ_PLAIN_HR = "______________________________________________________________________________"; +static const string KAZ_PLAIN_START = "~~ PJ-KAZ !"; // don't end whith space +static const string KAZ_PLAIN_STOP = KAZ_PLAIN_START+" ~~"; +static const string KAZ_PLAIN_DONT_TOUCH = "(concervez cette partie intacte dans votre réponse si vous voulez transmettre les documents précédents)"; +static const string KAZ_PLAIN_WARNING = "Attention : Kaz a dépollué ce message. Les pièces jointes ont été retirées et placées dans un dépôt provisoire. Elles seront automatiquement supprimées dans 1 mois. Si elles sont importantes et que vous souhaitez les conserver, vous devez utiliser les liens ci-dessous. 
Pour mieux comprendre la politique de nos services visitez kaz.bzh"; +static const string KAZ_PLAIN_DOWLOAD_ONE = "Vos pièces jointes sont à télécharger individuellement ici :"; +static const string KAZ_PLAIN_DOWLOAD_OTHER = "(Contenu dans des messages précédents)"; +static const string KAZ_PLAIN_DOWLOAD_ALL = "Vous pouvez télécharger l'ensemble dans une archive là :"; + +static const string HEAD = ""; +static const string HEAD_END = ""; +static const string KAZ_CSS_URL = "https://kaz.bzh/m/email.css"; +static const string KAZ_CSS = ""; +static const string A_END = ""; +static const string LI_BEGIN = ""; +static const string LI_ALL = LI_BEGIN+" class=\"all\">"; +static const string LI_END = ""; +static const string HREF_ONE = "href=\""; +static const string BODY_END = ""; +static const string HTML_END = ""; + +static const string KAZ_HTML_TAG = ""; +static const string KAZ_HTML_STOP = KAZ_HTML_TAG+" STOP-->"; +// Textes précédents encodés en SGML +static const string KAZ_HTML_DONT_TOUCH = "(concervez cette partie intacte dans votre réponse si vous voulez transmettre les documents précédents)"; +static const string KAZ_HTML_DOWLOAD_ONE = "Vos pièces jointes sont à télécharger individuellement ici :"; +static const string KAZ_HTML_DOWLOAD_OTHER = "(Contenu dans des messages précédents)"; +static const string KAZ_HTML_DOWLOAD_ALL = "Vous pouvez télécharger l'ensemble dans une archive là :"; +static const string KAZ_HTML_ARCHIVE = "archive"; + +// ================================================================================ +vector +Attachment::stringsToUpdate ({KAZ_PLAIN_START, "\""+CID}); + +// ================================================================================ +const string MainAttachment::templatePlainAddLink (" - "+TMPL_DOWNLOAD+" "+TMPL_FILENAME+"\n"); +const string MainAttachment::templatePlainAllLink ("\n"+KAZ_PLAIN_DOWLOAD_ALL+"\n * "+TMPL_DOWNLOAD+"\n"); + +const string MainAttachment::templateHtmlHeader (KAZ_HTML_START+"

"+KAZ_PLAIN_START+"


\n" + "
" + "

"+KAZ_HTML_DONT_TOUCH+"

\n" + "

"+KAZ_HTML_DOWLOAD_ONE+"

    \n"); +const string MainAttachment::templateHtmlAddLink (LI_ONE+""+TMPL_FILENAME+""+LI_END+"\n"); +const string MainAttachment::templateHtmlOtherLink ("
"+KAZ_HTML_DOWLOAD_OTHER+"
    \n"); +const string MainAttachment::templateHtmlAllLink ("
    "+LI_ALL+KAZ_HTML_DOWLOAD_ALL+" "+KAZ_HTML_ARCHIVE+""+LI_END+"\n"); +const string MainAttachment::templateHtmlFooter ("

\n" + "

"+KAZ_WEB_SITE+"

\n" + "

"+KAZ_PLAIN_STOP+"

"+KAZ_HTML_STOP+"\n"); + +const regex MainAttachment::whiteSpaceRegEx ("\\s+"); + +// ================================================================================ +void +MainAttachment::copy (ifstream &mbox, ofstream &outbox, const streamoff &begin, const streamoff &end) { + DEF_LOG ("MainAttachment::copy", "begin: " << begin << " end: " << end); + mbox.seekg (begin, ios::beg); + char c; + for (streamoff pos (begin); pos < end; ++pos) { + mbox.get (c); + outbox.put (c); + } + outbox.flush (); +} + +// ================================================================================ +void +MainAttachment::fillUrlId (string &url, string &id) { + DEF_LOG ("MainAttachment::fillUrlId", ""); + url = id = ""; + string urlId; + getline (cin, urlId); + LOG ("get URL: " << urlId); + vector urlIdVect { sregex_token_iterator (urlId.begin(), urlId.end (), whiteSpaceRegEx, -1), {} }; + if (urlIdVect [0].empty ()) + return; + url = urlIdVect [0]; + if (urlIdVect.size () > 1) + id = urlIdVect [1]; +} + +// ================================================================================ +void +MainAttachment::setExtractDir (const bfs::path &extractDir) { + if (extractDir.empty ()) + throw invalid_argument ("no tmp dir"); + this->extractDir = extractDir; + if (! 
is_directory (extractDir)) + bfs::create_directory (extractDir); +} + +void +MainAttachment::setArchiveDownloadURL (const string &archiveDownloadURL) { + this->archiveDownloadURL = archiveDownloadURL; +} + +// ================================================================================ +void +MainAttachment::addLink (string &plain, string &html, const string &url, const string &name) const { + string plainNewOneLink (templatePlainAddLink); + replaceAll (plainNewOneLink, TMPL_DOWNLOAD, url); + replaceAll (plainNewOneLink, TMPL_FILENAME, name); + plain += plainNewOneLink; + string htmlNewOneLink (templateHtmlAddLink); + string codedUrl (url); + replaceAll (codedUrl, "&", "&"); + replaceAll (htmlNewOneLink, TMPL_DOWNLOAD, codedUrl); + replaceAll (htmlNewOneLink, TMPL_FILENAME, name); + html += htmlNewOneLink; +} + +// ================================================================================ +void +MainAttachment::getDisclaim (string &plain, string &html) const { + DEF_LOG ("Attachment::getDisclaim", ""); + plain = html = ""; + + int linkCount (0); + string allId; + string plainNewLinks, htmlNewLinks; + for (Attachment *attachP : allMarkedPtrs) { + if (!attachP->toExtract) + continue; + addLink (plainNewLinks, htmlNewLinks, attachP->downloadUrl, attachP->getAttachName ()); + ++linkCount; + allId += attachP->downloadId; + // if (previousLinks [attachP->downloadUrl] != previousLinks.end ()) + // // impossible puisque le lien est toujours nouveau + // previousLinks.erase (attachP->downloadUrl); + } + for (Attachment *attachP : allMarkedPtrs) { + if (!attachP->embeddedData.size ()) + continue; + for (EmbeddedData &embedded : attachP->embeddedData) { + addLink (plainNewLinks, htmlNewLinks, embedded.downloadUrl, embedded.name); + ++linkCount; + allId += embedded.downloadId; + } + } + LOG ("allId:" << allId); + + string plainOldLinks, htmlOldLinks; + for (map ::const_iterator it = previousLinks.begin (); it != previousLinks.end (); ++it) + addLink (plainOldLinks, 
htmlOldLinks, it->first, it->second); + linkCount += previousLinks.size (); + if (! linkCount) { + LOG ("no attach"); + return; + } + + plain = "\n"+KAZ_PLAIN_START+"\n"+KAZ_PLAIN_HR+"\n"+KAZ_PLAIN_DONT_TOUCH+"\n\n"+KAZ_PLAIN_WARNING+"\n\n"+KAZ_PLAIN_DOWLOAD_ONE+"\n"+plainNewLinks; + html = templateHtmlHeader+htmlNewLinks; + if (previousLinks.size ()) { + plain += KAZ_PLAIN_DOWLOAD_OTHER+"\n"+plainOldLinks; + html += templateHtmlOtherLink+htmlOldLinks; + } + if (linkCount > 1 && archiveDownloadURL.length ()) { + string allPlainLinks (templatePlainAllLink); + replaceAll (allPlainLinks, TMPL_DOWNLOAD, archiveDownloadURL+allId); + plain += allPlainLinks; + string allLinks (templateHtmlAllLink); + // allId => & => & done + replaceAll (allLinks, TMPL_DOWNLOAD, archiveDownloadURL+allId); + html += allLinks; + } + html += templateHtmlFooter; + plain += "\n\n"+KAZ_WEB_SITE+"\n"+KAZ_PLAIN_HR+"\n"+KAZ_PLAIN_STOP+"\n"; + // & => & done + LOG ("plain: " << plain); + LOG ("html: " << html); +} + +// ================================================================================ +void +MainAttachment::addPrevious (const string &href, const string &name) { + DEF_LOG ("Attachment::addPrevious", "href: " << href << " name: " << name); + const string oldVal = previousLinks [href]; + if (name.empty ()) + return; + previousLinks.erase (href); + previousLinks [href] = name; + LOG ("inserted: " << href << ": " << previousLinks[href]); +} + +void +MainAttachment::extractLinks (const string &extractedPlainKAZ) { + DEF_LOG ("Attachment::extractedPlainKAZ", "extractedPlainKAZ: " << extractedPlainKAZ); + for (string::size_type startPos (0); + (startPos = extractedPlainKAZ.find ("http", startPos)) != string::npos; + ) { + streamoff stopPos = startPos; + while (extractedPlainKAZ [stopPos] && availableURLChars.find (extractedPlainKAZ [stopPos]) != string::npos) + ++stopPos; + const string href (extractedPlainKAZ.substr (startPos, stopPos-startPos)); + LOG ("plain href: " << href); + if 
(extractedPlainKAZ [stopPos] && extractedPlainKAZ [stopPos] != '\n') + ++stopPos; + startPos = stopPos; + // get all href but KAZ_WEB_SITE + // the archive link while be skip by filter.sh + if (href == KAZ_WEB_SITE) + continue; + while (extractedPlainKAZ [stopPos] && extractedPlainKAZ [stopPos] != '\n') + ++stopPos; + const string name (extractedPlainKAZ.substr (startPos, stopPos-startPos)); + LOG ("plain name: " << name); + addPrevious (href, name); + } +} + +// ================================================================================ +void +MainAttachment::extractLinks (const vector &liOne) { + DEF_LOG ("Attachment::extractedPlainKAZ", "liOne.size: " << liOne.size ()); + for (const string &one : liOne) { + if (caseInsensitiveFind (one, CLASS_ONE) == string::npos) + continue; + string::size_type startPos = caseInsensitiveFind (one, HREF_ONE); + + LOG_BUG (startPos == string::npos, continue, "eMailShrinker: bug M1: no href KAZ link. (one: " << one << ")"); + startPos += HREF_ONE.length (); + LOG ("startPos: " << startPos); + string::size_type stopPos = one.find ("\"", startPos); + + LOG_BUG (stopPos == string::npos, break, "eMailShrinker: bug M2: no ending href KAZ link. (one: " << one << ")"); + LOG ("stopPos: " << stopPos); + string href (one.substr (startPos, stopPos-startPos)); + LOG ("html href: " << href); + stopPos = one.find (">", startPos); + + LOG_BUG (one [stopPos] != '>', break, "eMailShrinker: bug M3: no ending href KAZ link. (one: " << one << ")"); + ++stopPos; + startPos = stopPos; + LOG ("startPos: " << startPos); + stopPos = caseInsensitiveFind (one, A_END, startPos); + LOG ("stopPos: " << stopPos); + + LOG_BUG (stopPos == string::npos, break, "eMailShrinker: bug M4: no ending anchor KAZ link. 
(one: " << one << ")"); + string name (one.substr (startPos, stopPos-startPos)); + LOG ("html name: " << name); + addPrevious (href, name); + } +} + +void +MainAttachment::extractPreviousKAZ (ifstream &mbox) { + DEF_LOG ("MainAttachment::extractPreviousKAZ", ""); + string extractedPlainKAZ, extractedHtmlKAZ; + for (const Attachment *attachP : allMarkedPtrs) { + if (!attachP->toUpdate || isBase64Encoding ()) + continue; + string textProp = attachP->getProp (contentTypeToken, textRegEx); + if (textProp.empty ()) + continue; + string content (attachP->getContent (mbox)); + if (textProp == PLAIN) { + LOG (PLAIN); + extractedPlainKAZ += attachP->getSection (content, KAZ_PLAIN_START, KAZ_PLAIN_STOP); + } + if (textProp == HTML) { + LOG (HTML); + string section = attachP->getSection (content, KAZ_HTML_START, KAZ_HTML_STOP); + section += attachP->getSection (content, KAZ_PLAIN_START, KAZ_PLAIN_STOP); + // update href from HTML attachments + replaceAll (section, "&", "&"); + extractedHtmlKAZ += section; + } + } + LOG ("extractedPlainKAZ: "<< extractedPlainKAZ); + extractLinks (extractedPlainKAZ); + + LOG ("extractedHtmlKAZ: "<< extractedHtmlKAZ); + vector liOne; + getSection (extractedHtmlKAZ, LI_BEGIN, LI_END, liOne); + extractLinks (liOne); + +#ifndef DISABLE_LOG + for (map ::const_iterator it = previousLinks.begin (); it != previousLinks.end (); ++it) + LOG ("oldLink link: " << it->first << " name: " << it->second); +#endif +} + +void +MainAttachment::removePreviousArchive () { + vector toRemove; + for (map ::const_iterator it = previousLinks.begin (); it != previousLinks.end (); ++it) + if (it->first.find ("&l=/") != string::npos) + toRemove.push_back (it->first); + for (string old : toRemove) + previousLinks.erase (old); +} + +// ================================================================================ +MainAttachment::MainAttachment (ifstream &mbox) + : Attachment (mbox, initTmpLevel (), 0, initTmpPos ()) { + DEF_LOG ("MainAttachment::MainAttachment", ""); + 
string line; + for (; getline (mbox, line); ) + tmpPos += line.length () + 1; + endPos = tmpPos; +} + +// ================================================================================ +void +MainAttachment::markSignificant (const streamoff &minAttachSize, ifstream &mbox) { + DEF_LOG ("MainAttachment::markSignificant", "minAttachSize: " << minAttachSize); + bool plainMarked (false), htmlMarked (false); + markDisclaim (plainMarked, htmlMarked); + Attachment::markSignificant ("", minAttachSize, mbox, allMarkedPtrs); +} + +// ================================================================================ +void +MainAttachment::getUpdatedURL (ifstream &mbox) { + DEF_LOG ("MainAttachment::getUpdatedURL", ""); + extractPreviousKAZ (mbox); + for (map ::iterator it = previousLinks.begin (); it != previousLinks.end (); ++it) + cout << it->first << endl; +} + +void +MainAttachment::newPjEntry (const int &attachCount, const string &contentType, const string &name, string &dirName, string &mediaName) const { + DEF_LOG ("MainAttachment::newPjEntry", "attachCount: " << attachCount << " contentType: " << contentType << " name: " << name); + ostringstream dirNameStream; + dirNameStream << "PJ-" << std::setfill ('0') << std::setw (3) << int (attachCount); + dirName = dirNameStream.str (); + bfs::path dirPath (extractDir / dirName); + + bfs::create_directory (dirPath); + bfs::path metaPath (dirPath / "meta"); + + ofstream metaOut (metaPath.c_str ()); + metaOut + << "Content-Type: " << contentType << endl + << "Name: " << name << endl; + metaOut.flush (); + metaOut.close (); + + bfs::path filePath (dirPath / "media"); + mediaName = filePath.c_str (); + dirName = dirPath.c_str (); + LOG ("dirName: " << dirName << " mediaName: " << mediaName); +} + +// ================================================================================ +void +MainAttachment::extract (ifstream &mbox, const SizeArg &minSize) const { + DEF_LOG ("MainAttachment::extract", "minSize: " << minSize); + int 
attachCount (0); + string dirName, mediaName; + for (Attachment *attachP : allMarkedPtrs) { + if (!attachP->toExtract) + continue; + newPjEntry (attachCount, attachP->getContentType (), attachP->getAttachName (), dirName, mediaName); + ++attachCount; + ofstream out (mediaName); + + streamoff + start (attachP->Attachment::contentPos), + end (attachP->Attachment::endPos+1); // pour assurer le cas sans ^M + mbox.seekg (start, ios::beg); + if (attachP->isBase64Encoding ()) { + unsigned char buff[4]; + int idx = 0; + char c; + for (streamoff curPos (start); mbox.get (c) && curPos < end; ++curPos) { + if (c == '=') + break; + if (!isBase64 (c)) + continue; + buff [idx] = getBase64Val (c); + if (++idx != 4) + continue; + out.put (buff [0] << 2 | (buff [1] & 0x30) >> 4); + out.put (buff [1] << 4 | (buff [2] & 0x3c) >> 2); + out.put (buff [2] << 6 | buff [3]); + idx = 0; + } + if (idx) { + for (int j = idx; j < 4; ++j) + buff [j] = 0; + out.put (buff [0] << 2 | (buff [1] & 0x30) >> 4); + --idx; + if (idx) + out.put (buff [1] << 4 | (buff [2] & 0x3c) >> 2); + } + } else { + string line; + for (streamoff curPos (start); getline (mbox, line); ) { + curPos += line.length () + 1; + if (curPos >= end) { + out << line.substr (0, end + line.length () - curPos) << endl; + break; + } + out << line << endl; + } + } + out.flush (); + out.close (); + cout << dirName << endl; + } + for (Attachment *attachP : allMarkedPtrs) { + if (!attachP->embeddedData.size ()) + continue; + string content = attachP->getContent (mbox); + vector imgs; + getSection (content, IMG_BEGIN, IMG_END, imgs); + for (const EmbeddedData &embedded : attachP->embeddedData) { + string &img (imgs[embedded.imgIdx]); + img.erase (0, embedded.startData); + img.erase (embedded.dataLength); + base64Decode (img); + newPjEntry (attachCount, embedded.contentType, embedded.name, dirName, mediaName); + ++attachCount; + + ofstream out (mediaName); + out.write (img.c_str (), img.size ()); + out.flush (); + out.close (); + cout << 
dirName << endl; + } + } +} + +// ================================================================================ +void +MainAttachment::substitute (ifstream &mbox, ofstream &outbox, const SizeArg &minSize) { + DEF_LOG ("MainAttachment::substitute", "minSize: " << minSize); + + // preparation + extractPreviousKAZ (mbox); + removePreviousArchive (); + map translateHtml; + for (Attachment *attachP : allMarkedPtrs) + if (attachP->toExtract) { + fillUrlId (attachP->downloadUrl, attachP->downloadId); + if (attachP->downloadUrl.empty ()) { + LOG ("no change"); + attachP->toExtract = false; + continue; + } + if (attachP->cid.length ()) { + string tmp (attachP->downloadUrl); + replaceAll (tmp, "&", "&"); + translateHtml.insert (pair (CID+attachP->cid, tmp)); + } + } + for (Attachment *attachP : allMarkedPtrs) { + if (!attachP->embeddedData.size ()) + continue; + for (EmbeddedData &embedded : attachP->embeddedData) + fillUrlId (embedded.downloadUrl, embedded.downloadId); + } + string plainDisclaim, htmlDisclaim; + getDisclaim (plainDisclaim, htmlDisclaim); + // copy email + streamoff curPos = 0; + for (Attachment *attachP : allMarkedPtrs) { + copy (mbox, outbox, curPos, attachP->beginInParent); + + LOG_BUG (attachP->toUpdate && attachP->toExtract, /**/, "eMailShrinker: bug M5: update and extract. pos: " << attachP->beginPos); + + if (attachP->toExtract) { + LOG ("skip Extracted"); + + } else if (attachP->toUpdate) { + string textProp = attachP->getProp (contentTypeToken, textRegEx); + bool isPlain = textProp == PLAIN; + bool isHtml = textProp == HTML; + bool isDisclaimer = attachP->toDisclaim; + + LOG_BUG (isPlain && isHtml, /**/, "eMailShrinker: bug M6: plain and html: " << attachP->getContentType ()); + LOG_BUG (! 
(isPlain || isHtml), /**/, "eMailShrinker: bug M7: not plain or html: " << attachP->getContentType ()); + LOG ("toUpdate: isPlain: " << isPlain << " isHtml: " << isHtml << " isDisclaimer: " << isDisclaimer); + copy (mbox, outbox, attachP->beginInParent, attachP->contentPos); + + string content = attachP->getContent (mbox); + if (isHtml) { + string::size_type headStart (caseInsensitiveFind (content, HEAD)); + LOG ("HEAD start: " << headStart); + if (headStart != string::npos) { + headStart += HEAD.length (); + string::size_type headStop (caseInsensitiveFind (content, HEAD_END, headStart)); + if (headStop != string::npos) { + // to reduce the scoop of search + string oldHead (content.substr (headStart, headStop-headStart)); + LOG ("HEAD start: " << headStart << " stop: " << headStop << " old: " << oldHead); + string::size_type oldCssPos (oldHead.find (KAZ_CSS_URL)); + if (oldCssPos != string::npos) { + string::size_type oldStart (oldHead.rfind ('<', oldCssPos)); + string::size_type oldStop (oldHead.find ('>', oldCssPos)); + if (oldStart != string::npos && oldStop != string::npos) { + ++oldStop; + if (oldStop < oldHead.length () && oldHead [oldStop] == '\n') + ++oldStop; + content.erase (headStart+oldStart, oldStop-oldStart); + } + } + content.insert (headStart, "\n"+KAZ_CSS); + } + // else XXX pas de /head (if faut en ajouter un (avec ?)) + } + removeSection (content, KAZ_HTML_START, KAZ_HTML_STOP); + removeSection (content, KAZ_PLAIN_START, KAZ_PLAIN_STOP); + // XXX case insensitive ?? 
+ if (content.find (CID) != string::npos) + replaceAll (content, translateHtml); + attachP->replaceEmbedded (content); + } + if (isPlain) + removeSection (content, KAZ_PLAIN_START, KAZ_PLAIN_STOP); + if (isDisclaimer) { + if (isHtml) { + for (string endTag : {BODY_END, HTML_END}) { + LOG ("try tag: " << endTag); + string::size_type endTagStart = caseInsensitiveRFind (content, endTag); + if (endTagStart != string::npos) { + content = content.substr (0, endTagStart); + LOG ("remove tag: " << endTag << " content: " << content); + } + } + content += htmlDisclaim+BODY_END+HTML_END; + LOG ("content: " << content); + } + if (isPlain) + content += plainDisclaim; + } + attachP->println (outbox, content); + } else { + LOG_BUG (true, continue, "eMailShrinker: bug M8: can't change" << *attachP); + } + outbox.flush (); + curPos = attachP->endPos; + } + copy (mbox, outbox, curPos, endPos); + outbox.close (); +} + +// ================================================================================ diff --git a/src/cpp/SizeArg.cpp b/src/cpp/SizeArg.cpp new file mode 100644 index 0000000..9993090 --- /dev/null +++ b/src/cpp/SizeArg.cpp @@ -0,0 +1,101 @@ +//////////////////////////////////////////////////////////////////////////// +// Copyright KAZ 2021 // +// // +// contact (at) kaz.bzh // +// // +// This software is a filter to shrink email by attachment extraction. // +// // +// This software is governed by the CeCILL-B license under French law and // +// abiding by the rules of distribution of free software. You can use, // +// modify and/or redistribute the software under the terms of the // +// CeCILL-B license as circulated by CEA, CNRS and INRIA at the following // +// URL "http://www.cecill.info". 
// +// // +// As a counterpart to the access to the source code and rights to copy, // +// modify and redistribute granted by the license, users are provided // +// only with a limited warranty and the software's author, the holder of // +// the economic rights, and the successive licensors have only limited // +// liability. // +// // +// In this respect, the user's attention is drawn to the risks associated // +// with loading, using, modifying and/or developing or reproducing the // +// software by the user in light of its specific status of free software, // +// that may mean that it is complicated to manipulate, and that also // +// therefore means that it is reserved for developers and experienced // +// professionals having in-depth computer knowledge. Users are therefore // +// encouraged to load and test the software's suitability as regards // +// their requirements in conditions enabling the security of their // +// systems and/or data to be ensured and, more generally, to use and // +// operate it in the same conditions as regards security. // +// // +// The fact that you are presently reading this means that you have had // +// knowledge of the CeCILL-B license and that you accept its terms. 
// +//////////////////////////////////////////////////////////////////////////// + +#include + +#include +#include + +#include "kazDebug.hpp" +#include "SizeArg.hpp" + +using namespace std; +using namespace kaz; + +// ================================================================================ +SizeArg::SizeArg (const size_t &bytes) + : bytes (bytes) { +} + +SizeArg::SizeArg (const string &option) + : bytes (0) { + init (option); +} + +void +SizeArg::init (const string &token) { + DEF_LOG ("SizeArg::init", "token: " << token); + static const string prefix ("KMGTPEZY"); + static const regex formatRegEx ("([0-9]+) *([k"+prefix+"]?)(i?)"); + + if (!regex_match (token.begin (), token.end (), formatRegEx)) + throw invalid_argument ("Bad size"); + bytes = boost::lexical_cast (regex_replace (token, formatRegEx, "$1")); + const string v2 (regex_replace (token, formatRegEx, "$2")); + size_t index = prefix.find (v2); + if (v2.length ()) { + if (index == string::npos) + index = 0; // "k" case + ++index; + } + bytes *= pow (regex_replace (token, formatRegEx, "$3").empty () ? 1000 : 1024, index); + LOG ("token:" << token << " index:" << index << " v2:<" << v2 << ">" << " b:" << bytes); +} + + +// ================================================================================ +ostream & +kaz::operator << (ostream &out, const SizeArg &sizeArg) { + static string sizes [] = {"", "Ki", "Mi", "Gi", "Ti", "Pi", "Ei", "Zi", "Yi"}; + + if (!sizeArg.bytes) + return out << "0 byte"; + int nbBytes = (int) floor (log (sizeArg.bytes) / log (1024)); + double val ((sizeArg.bytes / pow (1024, nbBytes))); + return out << boost::str (boost::format(nbBytes ? "%.2f " : val == 1 ? "%.0f byte" : + "%.0f bytes") % val) + sizes [nbBytes]; +} + +istream & +kaz::operator >> (istream &in, SizeArg &sizeArg) { + string token; + in >> token; + try { + sizeArg.init (token); + } catch (...) 
{ + in.setstate (ios_base::failbit); + } + return in; +} + +// ================================================================================ diff --git a/src/cpp/eMailShrinker.cpp b/src/cpp/eMailShrinker.cpp new file mode 100644 index 0000000..d7f0f62 --- /dev/null +++ b/src/cpp/eMailShrinker.cpp @@ -0,0 +1,232 @@ +//////////////////////////////////////////////////////////////////////////// +// Copyright KAZ 2021 // +// // +// contact (at) kaz.bzh // +// // +// This software is a filter to shrink email by attachment extraction. // +// // +// This software is governed by the CeCILL-B license under French law and // +// abiding by the rules of distribution of free software. You can use, // +// modify and/or redistribute the software under the terms of the // +// CeCILL-B license as circulated by CEA, CNRS and INRIA at the following // +// URL "http://www.cecill.info". // +// // +// As a counterpart to the access to the source code and rights to copy, // +// modify and redistribute granted by the license, users are provided // +// only with a limited warranty and the software's author, the holder of // +// the economic rights, and the successive licensors have only limited // +// liability. // +// // +// In this respect, the user's attention is drawn to the risks associated // +// with loading, using, modifying and/or developing or reproducing the // +// software by the user in light of its specific status of free software, // +// that may mean that it is complicated to manipulate, and that also // +// therefore means that it is reserved for developers and experienced // +// professionals having in-depth computer knowledge. Users are therefore // +// encouraged to load and test the software's suitability as regards // +// their requirements in conditions enabling the security of their // +// systems and/or data to be ensured and, more generally, to use and // +// operate it in the same conditions as regards security. 
// +// // +// The fact that you are presently reading this means that you have had // +// knowledge of the CeCILL-B license and that you accept its terms. // +//////////////////////////////////////////////////////////////////////////// + +#define LAST_VERSION "eMailShrinker 1.3 2021-04-04" + +#include +#include +#include +#include +#include +#include + +#include "kazDebug.hpp" +#include "kazMisc.hpp" +#include "SizeArg.hpp" +#include "MainAttachment.hpp" + +using namespace std; +using namespace boost; +using namespace boost::program_options; +using namespace kaz; + +// ================================================================================ +static options_description mainDescription ("Main options", getCols ()); +static options_description hide ("Hidded options", getCols ()); +static char *prog = NULL; + +// ================================================================================ +void +usage (const string &msg = "", const bool &hidden = false) { + if (!msg.empty ()) { + cout << msg << endl; + exit (1); + } + cout << endl + << "Usage: " << endl + << " A) " << prog << " -u mbox > url-list" << endl + << " B) " << prog << " [-s size] [-d dirName}] mbox > file-list" << endl + << " C) " << prog << " [-s size] [-a url] mbox altered-mbox < url-list" << endl + << endl << " filter attachments" << endl << endl + << " A: list previous embded url need to be refresh (no added option)" << endl + << " => downloadURL list" << endl + << " B: attachment extraction (options : s, d)" << endl + << " => list of (filename)" << endl + << " C: attachment replace with url (options : s) " << endl + << " <= list of (downloadURL [id])" << endl + << endl << mainDescription + << endl; + if (hidden) + cout << hide << endl; + exit (0); +} + +void +version () { + cout << LAST_VERSION << " KAZ team production (https://kaz.bzh/)" << endl; + exit (0); +} + +static auto startPrg = std::chrono::high_resolution_clock::now (); +void +showTime (string msg) { + using namespace std::chrono; 
+ static auto stopPrg = high_resolution_clock::now (); + + cerr << msg << " done in " << ns2string (duration_cast > (stopPrg-startPrg).count ()) << endl; +} + +// ================================================================================ +static const string inputFile = "input-file"; +static const char *const inputFileC = inputFile.c_str (); + +int +main (int argc, char** argv) { + // XXX debug before parse options + // Log::debug = true; + DEF_LOG ("main:", ""); + prog = argv [0]; + bool + debugFlag (false), + helpFlag (false), + versionFlag (false), + updateListFlag (false), + useTheForceLuke (false), + listFlag (false); + string inputName, outputName, archiveDownloadURL; + bfs::path extractDir (bfs::temp_directory_path ()); + SizeArg minAttachSize ("48 Ki"); + + try { + mainDescription.add_options () + ("help,h", bool_switch (&helpFlag), "produce this help message") + ("version,v", bool_switch (&versionFlag), "display version information") + ("size,s", value (&minAttachSize)->default_value (minAttachSize), "minimum size for extration") + ("updateList,u", bool_switch (&updateListFlag), "list URL need refresh") + ("extractDir,d", value (&extractDir)->default_value (extractDir), "set tmp directory name for extraction") + ("archiveDownloadURL,a", value (&archiveDownloadURL)->default_value (archiveDownloadURL), "set url root web site to get bundle (like https://file.kaz.bzh/t.php?)") + ; + + hide.add_options () + ("useTheForceLuke", bool_switch (&useTheForceLuke), "display hidded options") + ("list,l", bool_switch (&listFlag), "get attachment list") + ("debug,g", bool_switch (&debugFlag), "debug mode") + ; + + options_description cmd ("All options"); + cmd.add (mainDescription).add (hide).add_options () + (inputFileC, value > (), "input") + ; + positional_options_description p; + p.add (inputFileC, -1); + variables_map vm; + basic_parsed_options parsed = command_line_parser (argc, argv).options (cmd).positional (p).run (); + store (parsed, vm); + notify (vm); + 
+ if (debugFlag) { +#ifdef DISABLE_LOG + cerr << "No debug option available (was compiled with -DDISABLE_LOG)" << endl; +#endif + } + Log::debug = debugFlag; + + if (useTheForceLuke) + usage ("", true); + if (versionFlag) + version (); + if (helpFlag) + usage (); + + if (vm.count (inputFileC)) { + vector var = vm[inputFileC].as > (); + int nbArgs = vm[inputFileC].as > ().size (); + if (!nbArgs) + usage ("No input file(s)"); + inputName = var [0]; + if (nbArgs > 1) + outputName = var [1]; + if (nbArgs > 2) + usage ("Too much arguments"); + } + } catch (std::exception &e) { + cerr << "error: " << e.what() << endl; + usage (); + return 1; + } catch (...) { + cerr << "Exception of unknown type!" << endl; + return 1; + } + + LOG ("minAttachSize: " << minAttachSize); + + if (inputName.empty ()) + usage ("no input file"); + + // input mbox file + ifstream mbox (inputName); + MainAttachment attachment (mbox); + mbox.close (); + + if (attachment.getBoundary ().empty ()) { + cerr << "no attachment" << endl; + return 1; + } + // parse structure + mbox.open (inputName); + attachment.markSignificant (minAttachSize, mbox); + mbox.close (); + + if (listFlag) + // debug + cerr << attachment; + + if (updateListFlag) { + // update + mbox.open (inputName); + attachment.getUpdatedURL (mbox); + showTime ("Find old links"); + return 0; + } + + if (outputName.empty ()) { + // extract + attachment.setExtractDir (extractDir); + mbox.open (inputName); + attachment.extract (mbox, minAttachSize); + showTime ("Extraction"); + return 0; + } + + // substitute + if (archiveDownloadURL.length ()) + attachment.setArchiveDownloadURL (archiveDownloadURL); + mbox.open (inputName); + ofstream outbox (outputName); + attachment.substitute (mbox, outbox, minAttachSize); + showTime ("Substitution"); + return 0; +} + +// ================================================================================ diff --git a/src/cpp/jirafeauAPI.cpp b/src/cpp/jirafeauAPI.cpp new file mode 100644 index 0000000..a938a2a 
--- /dev/null +++ b/src/cpp/jirafeauAPI.cpp @@ -0,0 +1,293 @@ +//////////////////////////////////////////////////////////////////////////// +// Copyright KAZ 2021 // +// // +// contact (at) kaz.bzh // +// // +// This software is a filter to shrink email by attachment extraction. // +// // +// This software is governed by the CeCILL-B license under French law and // +// abiding by the rules of distribution of free software. You can use, // +// modify and/or redistribute the software under the terms of the // +// CeCILL-B license as circulated by CEA, CNRS and INRIA at the following // +// URL "http://www.cecill.info". // +// // +// As a counterpart to the access to the source code and rights to copy, // +// modify and redistribute granted by the license, users are provided // +// only with a limited warranty and the software's author, the holder of // +// the economic rights, and the successive licensors have only limited // +// liability. // +// // +// In this respect, the user's attention is drawn to the risks associated // +// with loading, using, modifying and/or developing or reproducing the // +// software by the user in light of its specific status of free software, // +// that may mean that it is complicated to manipulate, and that also // +// therefore means that it is reserved for developers and experienced // +// professionals having in-depth computer knowledge. Users are therefore // +// encouraged to load and test the software's suitability as regards // +// their requirements in conditions enabling the security of their // +// systems and/or data to be ensured and, more generally, to use and // +// operate it in the same conditions as regards security. // +// // +// The fact that you are presently reading this means that you have had // +// knowledge of the CeCILL-B license and that you accept its terms. 
// +//////////////////////////////////////////////////////////////////////////// + +#define LAST_VERSION "1.0 2021-02-21 jirafeauAPI" + +#include +#include +#include +#include +#include +#include + +#include "kazDebug.hpp" +#include "kazMisc.hpp" +#include "SizeArg.hpp" + +using namespace std; +using namespace boost; +using namespace boost::program_options; +using namespace kaz; + +namespace bfs = boost::filesystem; + +// ================================================================================ +static options_description mainDescription ("Main options", getCols ()); +static options_description hide ("Hidded options", getCols ()); +static char *prog = NULL; + +// ================================================================================ +void +usage (const string &msg = "", const bool &hidden = false) { + if (!msg.empty ()) { + cout << msg << endl; + exit (1); + } + cout << endl + << "Usage: " << endl + << " A) " << prog << " [-s size] [-t period] [-c content-type] [-n attachName] [-f server] send file [password] > url,delCode" << endl + << " B) " << prog << " [-t period] [-f server] update ref > dealine" << endl + << endl << " store ficle" << endl << endl + << " A: send file (options : s, t)" << endl + << " B: update deadline (options : t) " << endl + << endl << mainDescription + << endl; + if (hidden) + cout << hide << endl; + exit (0); +} + +void +version () { + cout << LAST_VERSION << " KAZ team production (https://kaz.bzh/)" << endl; + exit (0); +} + +static auto startPrg = std::chrono::high_resolution_clock::now (); +void +showTime (string msg) { + using namespace std::chrono; + static auto stopPrg = high_resolution_clock::now (); + + cerr << msg << " done in " << ns2string (duration_cast > (stopPrg-startPrg).count ()) << endl; +} + +// ================================================================================ +static size_t +WriteCallback (void *contents, size_t size, size_t nmemb, void *userp) { + ((std::string*) userp)->append ((char*) 
contents, size * nmemb); + return size * nmemb; +} + +// ================================================================================ +static const string inputFile = "input-file"; +static const char *const inputFileC = inputFile.c_str (); + +int +main (int argc, char** argv) { + // XXX debug before parse options + // Log::debug = true; + DEF_LOG ("main:", ""); + prog = argv [0]; + bool + debugFlag (false), + helpFlag (false), + versionFlag (false), + useTheForceLuke (false); + enum JirCmd { SEND, UPDATE } jirCmd; + string + inputFileName, + password, + contentType, + attachName, + urlBase ("http://file.kaz.bzh"), + apiPage ("/script.php"), + downloadPage ("/f.php"), + minimumAvailability ("month"), + proxy; + + SizeArg maxUploadSize ("100 Mi"); + + try { + mainDescription.add_options () + ("help,h", bool_switch (&helpFlag), "produce this help message") + ("version,v", bool_switch (&versionFlag), "display version information") + ("contentType,c", value (&contentType)->default_value (contentType), "content-type of the sended file") + ("attachName,n", value (&attachName)->default_value (attachName), "force attachment name") + ("minimumAvailability,t", value (&minimumAvailability)->default_value (minimumAvailability), "minimum period of available download") + ("maxUploadSize,s", value (&maxUploadSize)->default_value (maxUploadSize), "maximum upload size") + ("file server registery,f", value (&urlBase)->default_value (urlBase), "server where file are temporary stored") + ; + + hide.add_options () + ("useTheForceLuke", bool_switch (&useTheForceLuke), "display hidded options") + ("debug,g", bool_switch (&debugFlag), "debug mode") + ("proxy,p", value (&proxy)->default_value (proxy), "set proxy (proxy-host.org:8080)") + ("uploadPage,u", value (&apiPage)->default_value (apiPage), "upload page") + ("downloadPage,d", value (&downloadPage)->default_value (downloadPage), "download page") + ; + + options_description cmd ("All options"); + cmd.add (mainDescription).add 
(hide).add_options () + (inputFileC, value > (), "input") + ; + positional_options_description p; + p.add (inputFileC, -1); + variables_map vm; + basic_parsed_options parsed = command_line_parser (argc, argv).options (cmd).positional (p).run (); + store (parsed, vm); + notify (vm); + + if (debugFlag) { +#ifdef DISABLE_LOG + cerr << "No debug option available (was compiled with -DDISABLE_LOG)" << endl; +#endif + } + Log::debug = debugFlag; + + if (useTheForceLuke) + usage ("", true); + if (versionFlag) + version (); + if (helpFlag) + usage (); + + if (vm.count (inputFileC)) { + vector var = vm[inputFileC].as > (); + int nbArgs = vm[inputFileC].as > ().size (); + if (!nbArgs) + usage ("No command"); + if (var [0].compare ("send") == 0) + jirCmd = SEND; + else if (var [0].compare ("update") == 0) + jirCmd = UPDATE; + else + usage ("Unknown command ("+var [0]+")"); + if (nbArgs < 2) + usage ("no input file"); + inputFileName = var [1]; + if (nbArgs == 3) + password = var [2]; + if (nbArgs > 3) + usage ("Too much arguments"); + } + } catch (std::exception &e) { + cerr << "error: " << e.what() << endl; + usage (); + return 1; + } catch (...) { + cerr << "Exception of unknown type!" << endl; + return 1; + } + + if (inputFileName.empty ()) + usage ("no input"); + + CURL *easyhandle = curl_easy_init (); + if (! 
easyhandle) { + cerr << "no curl" << endl; + return 1; + } + + string readBuffer; + if (proxy.length ()) + curl_easy_setopt(easyhandle, CURLOPT_PROXY, proxy.c_str ()); + curl_easy_setopt (easyhandle, CURLOPT_WRITEFUNCTION, WriteCallback); + curl_easy_setopt (easyhandle, CURLOPT_WRITEDATA, &readBuffer); + curl_mime *multipart = curl_mime_init (easyhandle); + curl_mimepart *part = nullptr; + + switch (jirCmd) { + case SEND: { + LOG ("SEND: " << (urlBase+apiPage)); + curl_easy_setopt (easyhandle, CURLOPT_URL, (urlBase+apiPage).c_str ()); + + LOG ("maxUploadSize: " << maxUploadSize); + long uploadsize = (size_t) maxUploadSize; + curl_easy_setopt (easyhandle, CURLOPT_INFILESIZE, uploadsize); + + LOG ("time: " << minimumAvailability); + part = curl_mime_addpart (multipart); + curl_mime_name (part, "time"); + curl_mime_data (part, minimumAvailability.c_str (), CURL_ZERO_TERMINATED); + + if (password.size ()) { + LOG ("key: " << password); + part = curl_mime_addpart (multipart); + curl_mime_name (part, "key"); + curl_mime_data (part, password.c_str (), CURL_ZERO_TERMINATED); + } + + LOG ("inputFileName: " << bfs::path (inputFileName).filename ()); + part = curl_mime_addpart (multipart); + curl_mime_name (part, "file"); + if (contentType.length ()) { + LOG ("contentType: " << contentType); + curl_mime_type (part, contentType.c_str ()); + } + if (attachName.empty ()) { + attachName = bfs::path (inputFileName).filename ().c_str (); + LOG ("attachName: " << attachName); + } + curl_mime_filename (part, attachName.c_str ()); + FILE *fp = fopen (inputFileName.c_str (), "r"); + fseek (fp, 0L, SEEK_END); + long int fsize (ftell (fp)); + fseek (fp, 0L, SEEK_SET); + curl_mime_data_cb (part, fsize, + (curl_read_callback) fread, + (curl_seek_callback) fseek, + NULL, //(curl_seek_callback) fclose, + fp); + } + break; + + case UPDATE: { + LOG ("UPDATE: " << (urlBase+downloadPage)); + curl_easy_setopt (easyhandle, CURLOPT_URL, (urlBase+downloadPage).c_str ()); + + LOG ("h: " << 
inputFileName); + part = curl_mime_addpart (multipart); + curl_mime_name (part, "h"); + curl_mime_data (part, inputFileName.c_str (), CURL_ZERO_TERMINATED); + + LOG ("u: " << minimumAvailability); + part = curl_mime_addpart (multipart); + curl_mime_name (part, "u"); + curl_mime_data (part, minimumAvailability.c_str (), CURL_ZERO_TERMINATED); + } + break; + } + + curl_easy_setopt (easyhandle, CURLOPT_MIMEPOST, multipart); + curl_easy_perform (easyhandle); + curl_easy_cleanup (easyhandle); + cout << readBuffer << endl; + + showTime ("Upload"); + + return 0; +} + +// ================================================================================ diff --git a/src/cpp/kazDebug.cpp b/src/cpp/kazDebug.cpp new file mode 100644 index 0000000..9bed63c --- /dev/null +++ b/src/cpp/kazDebug.cpp @@ -0,0 +1,86 @@ +//////////////////////////////////////////////////////////////////////////// +// Copyright KAZ 2021 // +// // +// contact (at) kaz.bzh // +// // +// This software is a filter to shrink email by attachment extraction. // +// // +// This software is governed by the CeCILL-B license under French law and // +// abiding by the rules of distribution of free software. You can use, // +// modify and/or redistribute the software under the terms of the // +// CeCILL-B license as circulated by CEA, CNRS and INRIA at the following // +// URL "http://www.cecill.info". // +// // +// As a counterpart to the access to the source code and rights to copy, // +// modify and redistribute granted by the license, users are provided // +// only with a limited warranty and the software's author, the holder of // +// the economic rights, and the successive licensors have only limited // +// liability. 
// +// // +// In this respect, the user's attention is drawn to the risks associated // +// with loading, using, modifying and/or developing or reproducing the // +// software by the user in light of its specific status of free software, // +// that may mean that it is complicated to manipulate, and that also // +// therefore means that it is reserved for developers and experienced // +// professionals having in-depth computer knowledge. Users are therefore // +// encouraged to load and test the software's suitability as regards // +// their requirements in conditions enabling the security of their // +// systems and/or data to be ensured and, more generally, to use and // +// operate it in the same conditions as regards security. // +// // +// The fact that you are presently reading this means that you have had // +// knowledge of the CeCILL-B license and that you accept its terms. // +//////////////////////////////////////////////////////////////////////////// + +#include +#include +#include +#include + +#include "kazDebug.hpp" + + +using namespace std; +using namespace kaz; + +bool +Log::debug = false; + +size_t +Log::indent = 0; + +// ================================================================================ +string +Log::getLocalTimeStr () { + using namespace boost::posix_time; + using namespace std; + ptime now = second_clock::second_clock::local_time (); + stringstream ss; + auto date = now.date (); + auto time = now.time_of_day (); + ss << setfill ('0') << "[" + << setw (2) << static_cast (date.month ()) << "/" << setw (2) << date.day () + << "] " << setw (2) + << time.hours () << ":" << setw (2) << time.minutes (); + return ss.str(); +} + +Log::Log (const string &functName) + : functName (functName) { + ++indent; + if (debug) + cerr << *this << "> "; +} + +Log::~Log () { + if (debug) + cerr << *this << "<" << endl << flush; + --indent; +} + +ostream & +kaz::operator << (ostream &out, const Log &log) { + return out << Log::getLocalTimeStr () << setw 
(3) << setw ((log.indent % 20)*2) << "" << log.functName; +} + +// ================================================================================ diff --git a/src/cpp/kazMisc.cpp b/src/cpp/kazMisc.cpp new file mode 100644 index 0000000..288cb7f --- /dev/null +++ b/src/cpp/kazMisc.cpp @@ -0,0 +1,437 @@ +//////////////////////////////////////////////////////////////////////////// +// Copyright KAZ 2021 // +// // +// contact (at) kaz.bzh // +// // +// This software is a filter to shrink email by attachment extraction. // +// // +// This software is governed by the CeCILL-B license under French law and // +// abiding by the rules of distribution of free software. You can use, // +// modify and/or redistribute the software under the terms of the // +// CeCILL-B license as circulated by CEA, CNRS and INRIA at the following // +// URL "http://www.cecill.info". // +// // +// As a counterpart to the access to the source code and rights to copy, // +// modify and redistribute granted by the license, users are provided // +// only with a limited warranty and the software's author, the holder of // +// the economic rights, and the successive licensors have only limited // +// liability. // +// // +// In this respect, the user's attention is drawn to the risks associated // +// with loading, using, modifying and/or developing or reproducing the // +// software by the user in light of its specific status of free software, // +// that may mean that it is complicated to manipulate, and that also // +// therefore means that it is reserved for developers and experienced // +// professionals having in-depth computer knowledge. Users are therefore // +// encouraged to load and test the software's suitability as regards // +// their requirements in conditions enabling the security of their // +// systems and/or data to be ensured and, more generally, to use and // +// operate it in the same conditions as regards security. 
// +// // +// The fact that you are presently reading this means that you have had // +// knowledge of the CeCILL-B license and that you accept its terms. // +//////////////////////////////////////////////////////////////////////////// + +#include +#include +#include +#include +#include +#include + +#include "kazDebug.hpp" +#include "kazMisc.hpp" + +using namespace std; +using namespace kaz; + +//template void kaz::quotedDecoded<'='> (string &content); +//template void kaz::quotedDecoded<'%'> (string &content); + +static const string::size_type MAX_QUOTED_PRINTABLE_SIZE (78); + +const char *const kaz::base64Chars = + "ABCDEFGHIJKLMNOPQRSTUVWXYZ" + "abcdefghijklmnopqrstuvwxyz" + "0123456789" + "+/"; + +const string kaz::availableURLChars = + "!#$%&'()*+,-./" + "0123456789" + ":;=?" + "@ABCDEFGHIJKLMNOPQRSTUVWXYZ" + "[]_" + "abcdefghijklmnopqrstuvwxyz" + "~"; + + +// ================================================================================ +uint16_t +kaz::getCols () { + struct winsize w; + ioctl (0, TIOCGWINSZ, &w); + return w.ws_col; +} + +// ================================================================================ +string +kaz::ns2string (const double &delta) { + using namespace std::chrono; + + ostringstream oss; + duration ns (delta); + oss.fill ('0'); + // typedef duration > days; + // auto d = duration_cast(ns); + // ns -= d; + auto h = duration_cast (ns); + ns -= h; + auto m = duration_cast (ns); + ns -= m; + oss << setw (2) << h.count () << ":" + << setw (2) << m.count () << ":" + << setw (9) << fixed << setprecision (6) << ns.count (); + return oss.str (); +} +// ================================================================================ +void +kaz::replaceAll (string& str, const string &from, const string &to) { + DEF_LOG ("kaz::replaceAll", "form: " << from << " to: " << to); + if (str.empty () || from.empty ()) + return; + for (string::size_type startPos (0); + (startPos = str.find (from, startPos)) != string::npos; + startPos += 
to.length ()) + str.replace (startPos, from.size (), to); +} + +void +kaz::replaceAll (string& str, const map &subst) { + DEF_LOG ("kaz::replaceAll", "str: " << str); + for (map::const_iterator it = subst.begin (); it != subst.end (); ++it) + replaceAll (str, it->first, it->second); +} + +// ================================================================================ +void +kaz::toLower (string &content) { + DEF_LOG ("kaz::toLower", "content: " << content); + static locale loc; + for (string::size_type i = 0; i < content.length (); ++i) + content [i] = tolower (content[i], loc); + LOG ("content: " << content); +} + +const string & +kaz::toUpperIfNeed (const string &src, string &tmp) { + DEF_LOG ("kaz::toUpperIfNeed", "src: " << src); + for (string::const_iterator it = src.begin (); it != src.end (); ++it) + if (*it != toupper (*it)) { + tmp.reserve (); + for (it = src.begin (); it != src.end (); ++it) + tmp.push_back (toupper (*it)); + return tmp; + } + return src; +} + +inline bool +caseInsensitiveCharCompare (char a, char b) { + return (toupper (a) == b); +} + +string::size_type +kaz::caseInsensitiveFind (const string& s, const string& pattern, const string::size_type &pos) { + DEF_LOG ("kaz::caseInsensitiveFind", "pattern: " << pattern << " pos: " << pos << " s: " << s); + string tmp; + const string &upperPattern (toUpperIfNeed (pattern, tmp)); + LOG ("pattern: " << upperPattern); + string::const_iterator it (search (s.begin ()+pos, s.end (), upperPattern.begin (), upperPattern.end (), caseInsensitiveCharCompare)); + if (it == s.end ()) + return string::npos; + LOG ("find: " << (it - s.begin ())); + return it - s.begin (); +} + +string::size_type +kaz::caseInsensitiveRFind (const string& s, const string& pattern, const string::size_type &pos) { + DEF_LOG ("kaz::caseInsensitiveRFind", "pattern: " << pattern << " pos: " << pos << " s: " << s); + string tmp; + const string &upperPattern (toUpperIfNeed (pattern, tmp)); + LOG ("pattern: " << upperPattern); + 
string::const_reverse_iterator it (search (s.rbegin (), s.rend ()+pos, upperPattern.rbegin (), upperPattern.rend (), caseInsensitiveCharCompare)); + if (it == s.rend ()) + return string::npos; + LOG ("find: " << (s.rend () - it - pattern.length ())); + return s.rend () - it - pattern.length (); +} + +// ================================================================================ +template +void +kaz::quotedDecode (string &content) { + DEF_LOG ("kaz::quotedDecode", "delim: " << delim << " content: " << content); + string::size_type len (content.length ()); + if (!len) + return; + LOG ("len: " << len); + string::iterator p (content.begin ()), q (p); + for ( ; + p < content.end (); + ++p, ++q) { + if (*p != delim) { + *q = *p; + continue; + } + if (p+1 < content.end () && *(p+1) == '\n') { + + LOG_BUG (q == content.begin (), ++p;continue, "kazMisc::quotedDecode bug: bad quoted-printable format. (start with '=', content: " << content << ")"); + ++p; + --q; + continue; + } + + LOG_BUG (p+3 > content.end () || !isxdigit (p[1]) || !isxdigit (p[2]), return, "kazMisc::quotedDecode bug: bad quoted-printable format. 
(content: " << content << ")"); + *q = (char) ((getHexaVal (p[1]) << 4) + getHexaVal (p[2])); + p += 2; + } + content.resize (q-content.begin ()); + LOG ("content: " << content); +} + +// ================================================================================ +void +kaz::quotedEncode (string &content) { + DEF_LOG ("kaz::quotedDecode", "content: " << content); + string::size_type nbQuoted (0); + for (string::const_iterator it = content.begin (); it != content.end (); ++it) + if (isQuotedPrintable (*it)) + ++nbQuoted; + if (!nbQuoted) + return; + string::size_type estimate (content.length ()+nbQuoted*3); + estimate += (estimate/MAX_QUOTED_PRINTABLE_SIZE)*2; + string result; + result.reserve (estimate); + string::size_type cols (0); + char upper, lower; + for (string::const_iterator it = content.begin (); it != content.end (); ++it) { + const char &c (*it); + if (c == '\n') { + result.push_back ('\n'); + cols = 0; + continue; + } + if (cols >= MAX_QUOTED_PRINTABLE_SIZE) { + result.push_back ('='); + result.push_back ('\n'); + cols = 0; + } + if (!isQuotedPrintable (c) || + ((c == ' ' || c =='\t') && (it+1 == content.end () || *(it+1) == '\n'))) { + if (cols > MAX_QUOTED_PRINTABLE_SIZE-3) { + result.push_back ('='); + result.push_back ('\n'); + cols = 0; + } + getHexa (c, upper, lower); + result.push_back ('='); + result.push_back (upper); + result.push_back (lower); + cols += 3; + continue; + } + result.push_back (c); + ++cols; + } + content.swap (result); + LOG ("content: " << content); +} + +// ================================================================================ +void +kaz::base64Decode (string &content) { + DEF_LOG ("kaz::base64Decode", "content: " << content); + string::size_type len (content.length ()); + if (!len) + return; + LOG ("len: " << len); + unsigned char buff[4]; + int idx = 0; + string::iterator p (content.begin ()), q (p); + for (; + p < content.end (); + ++p) { + char c = *p; + if (c == '=') + break; + if (c == '\n') + continue; 
+ + LOG_BUG (!isBase64 (c), return, "kazMisc::base64Decode bug: bad base64 format. (content: " << content << ")"); + buff [idx] = getBase64Val (c); + if (++idx != 4) + continue; + *q = buff [0] << 2 | (buff [1] & 0x30) >> 4; + *++q = buff [1] << 4 | (buff [2] & 0x3c) >> 2; + *++q = buff [2] << 6 | buff [3]; + ++q; + idx = 0; + } + if (idx) { + for (int j = idx; j < 4; ++j) + buff [j] = 0; + *q = buff [0] << 2 | (buff [1] & 0x30) >> 4; + ++q; + --idx; + if (idx) { + *q = buff [1] << 4 | (buff [2] & 0x3c) >> 2; + ++q; + } + } + content.resize (q-content.begin ()); + LOG ("content: " << content); +} + +// ================================================================================ +void +kaz::base64Encode (string &content) { + DEF_LOG ("kaz::base64Encode", "content: " << content); + string::size_type length (content.length ()); + std::string result; + result.reserve ((length + 2) / 3 * 4 + length / MAX_QUOTED_PRINTABLE_SIZE + 1); + for (string::size_type pos (0), cols (0); pos < length; ) { + result.push_back (base64Chars [(content [pos + 0] & 0xfc) >> 2]); + if (pos == length-1) { + result.push_back (base64Chars [(content [pos + 0] & 0x03) << 4]); + result.push_back ('='); + result.push_back ('='); + break; + } + result.push_back (base64Chars [((content [pos + 0] & 0x03) << 4) + + ((content [pos + 1] & 0xF0) >> 4)]); + if (pos == length-2) { + result.push_back (base64Chars [(content [pos + 1] & 0x0F) << 2]); + result.push_back ('='); + break; + } + result.push_back (base64Chars [((content [pos + 1] & 0x0F) << 2) + + ((content [pos + 2] & 0xC0) >> 6)]); + result.push_back (base64Chars [content [pos + 2] & 0x3F]); + pos += 3; + cols += 4; + if (cols >= MAX_QUOTED_PRINTABLE_SIZE) { + result.push_back ('\n'); + cols = 0; + } + } + content = result; + LOG ("content: " << content); +} + +// ================================================================================ +void +kaz::iso2utf (string &content) { + DEF_LOG ("kaz::iso2utf", "content: " << content); + 
string::size_type len (content.length ()); + if (!len) + return; + LOG ("len: " << len); + string::size_type charCount (0); + for (string::iterator it = content.begin (); it != content.end (); ++it) + if ((uint8_t) *it >= 0x80) + ++charCount; + if (!charCount) + return; + LOG ("charCount: " << charCount); + content.resize (len+charCount); + string::iterator p (content.end ()-1), q (p+charCount); + for ( ; ; --p, --q) { + uint8_t ch = *p; + if (ch < 0x80) + *q = ch; + else { + *q = 0x80 | (ch & 0x3F); + *--q = 0xc0 | ch >> 6; + LOG ("ch: " << (char) ch); + } + if (p == q) + break; + } + LOG ("content: " << content); +} + +// ================================================================================ +void +kaz::encodedWord (string &content) { + // rfc2047 + DEF_LOG ("kaz::extendedWord", "content: " << content); + string::size_type charsetPos = content.find ("=?"); + if (charsetPos == string::npos) + return; + LOG ("charsetPos: " << charsetPos); + + LOG_BUG (charsetPos != 0, return, "kazMisc::extendedWord bug: =? not at begin pos. (content: " << content << ")"); + string result; + for ( ; + (charsetPos = content.find ("=?", charsetPos)) != string::npos; + ) { + string::size_type modePos = content.find ("?", charsetPos+2); + + LOG_BUG (modePos == string::npos, return, "kazMisc::extendedWord bug: no end chartset. (content: " << content << ")"); + string::size_type contentPos = content.find ("?", modePos+1); + + LOG_BUG (contentPos != modePos+2, return, "kazMisc::extendedWord bug: no end chartset. (content: " << content << ")"); + string::size_type endPos = content.find ("?=", contentPos+1); + + LOG_BUG (endPos == string::npos, return, "kazMisc::extendedWord bug: no end chartset. 
(content: " << content << ")"); + string tmp (content.substr (contentPos+1, endPos-contentPos-1)); + switch (content [modePos+1]) { + case 'B': + case 'b': + base64Decode (tmp); + break; + case 'Q': + case 'q': + quotedDecode (tmp); + break; + default: + + LOG_BUG (true, return, "kazMisc::extendedWord bug: unknown mode. (mode: " << content [modePos+1] << ")"); + } + LOG ("tmp: " << tmp); + string charset (content.substr (charsetPos, modePos-charsetPos-2)); + toLower (charset); + if (! caseInsensitiveFind (charset, "ISO")) + iso2utf (tmp); + result += tmp; + charsetPos = endPos+2; + } + content = result; + LOG ("content: " << content); +} + +// ================================================================================ +void +kaz::charsetValue (string &content) { + // rfc2184 + DEF_LOG ("kaz::charsetValue", "content: " << content); + string::size_type langPos = content.find ("'"); + + LOG_BUG (langPos == string::npos, return, "kazMisc::charsetValue bug: no '. (content: " << content << ")"); + string::size_type contentPos = content.find ("'", langPos+1); + + LOG_BUG (contentPos == string::npos, return, "kazMisc::charsetValue bug: no double '. (content: " << content << ")"); + string tmp (content.substr (contentPos+1)); + quotedDecode<'%'> (tmp); + LOG ("tmp: " << tmp); + string charset (content.substr (0, langPos)); + toLower (charset); + if (! caseInsensitiveFind (charset, "ISO")) + iso2utf (tmp); + content = tmp; + LOG ("content: " << content); +} + +// ================================================================================ diff --git a/src/include/Attachment.hpp b/src/include/Attachment.hpp new file mode 100644 index 0000000..4b4577b --- /dev/null +++ b/src/include/Attachment.hpp @@ -0,0 +1,154 @@ +//////////////////////////////////////////////////////////////////////////// +// Copyright KAZ 2021 // +// // +// contact (at) kaz.bzh // +// // +// This software is a filter to shrink email by attachment extraction. 
// +// // +// This software is governed by the CeCILL-B license under French law and // +// abiding by the rules of distribution of free software. You can use, // +// modify and/or redistribute the software under the terms of the // +// CeCILL-B license as circulated by CEA, CNRS and INRIA at the following // +// URL "http://www.cecill.info". // +// // +// As a counterpart to the access to the source code and rights to copy, // +// modify and redistribute granted by the license, users are provided // +// only with a limited warranty and the software's author, the holder of // +// the economic rights, and the successive licensors have only limited // +// liability. // +// // +// In this respect, the user's attention is drawn to the risks associated // +// with loading, using, modifying and/or developing or reproducing the // +// software by the user in light of its specific status of free software, // +// that may mean that it is complicated to manipulate, and that also // +// therefore means that it is reserved for developers and experienced // +// professionals having in-depth computer knowledge. Users are therefore // +// encouraged to load and test the software's suitability as regards // +// their requirements in conditions enabling the security of their // +// systems and/or data to be ensured and, more generally, to use and // +// operate it in the same conditions as regards security. // +// // +// The fact that you are presently reading this means that you have had // +// knowledge of the CeCILL-B license and that you accept its terms. // +//////////////////////////////////////////////////////////////////////////// + +#ifndef _kaz_Attachment_hpp +#define _kaz_Attachment_hpp + +#include +#include +#include +#include + +#include "EmbeddedData.hpp" + +namespace kaz { + + using namespace std; + + // ================================================================================ + /*! e-mail structure */ + class Attachment { + public: + /*! 
tokens indicat candidats to be updated by remove attachment */ + //static const vector stringsToUpdate; + static vector stringsToUpdate; + /*! mime tokens */ + static const string contentTypeToken, contentDispositionToken, contentTransferEncodingToken, base64Token, quotedPrintableToken, contentIDToken, PLAIN, HTML, RELATED, ALTERNATIVE; + /*! pattern to extract mime values */ + static const regex nameRegEx, nameCharsetRegEx, boundaryRegEx, cidDefRegEx, textRegEx, multiRegEx; + + /*! get uniq filename */ + static string getUnknown (const string &ext = ""); + /*! remove all sections in content given by boundary tags */ + static void removeSection (string &content, const string &beginTag, const string &endTag); + /*! catenates all sections in content given by boundary tags (use temporary vector) */ + static string getSection (const string &content, const string &beginTag, const string &endTag); + /*! get all sections in content given by boundary marks and put them in result */ + static void getSection (const string &content, const string &beginTag, const string &endTag, vector &result); + + /*! return the content-type */ + const string getContentType () const; + /*! return the filename in mime (or uniq name if missing) */ + const string getAttachName () const; + /*! return reference to the saved boundary. Empty value if attachment is not a multipart */ + const string &getBoundary () const; + /*! return the size of the content */ + const streamoff getSize () const; + /*! get a part of a mime header value */ + const string getProp (const string &token, const regex ®Ex) const; + + /*! return if base64 encoded */ + const bool isBase64Encoding () const; + /*! return if quoted-printable encoded */ + const bool isQuotedPrintableEnconding () const; + /*! return if text (plain or html) and base64 encoded */ + const bool isTextBase64 () const; + /*! 
return check if value exists in mime header */ + const bool isDefProp (const string &token, const string &val) const; + + protected: + /*! HTML image tag*/ + static const string IMG_BEGIN, IMG_END; + + /*! Attachment level (0 is main) */ + const int level; + /*! char position in the mbox of the boundary before this attachment */ + const streamoff beginInParent; + /*! char position of attachment including mime */ + const streamoff beginPos; + /*! char position of attachment content */ + streamoff contentPos, endPos; + /*! properties of the attachment */ + bool toExtract, toUpdate, toDisclaim; + /*! id of an image embedded in mbox */ + string cid; + /*! url to replace the attachment and its short id */ + string downloadUrl, downloadId; + /*! properties of embedded image (self encoded with base64)*/ + vector embeddedData; + + /*! mime values of the attachment */ + map env; + /*! boundary if the attachment is a multipart including previous and next "--" */ + string boundary; + /*! size of boundary before the last "--" */ + streamoff boundaryMiddleSize; + + /*! sub attachment if the attachment is a multipart */ + vector subAttachements; + + /*! called during the parse process */ + Attachment (ifstream &mbox, const int &level, const streamoff beginInParent, streamoff &curPos); + + /*! called one time by the constructor */ + void readMime (ifstream &mbox, streamoff &curPos); + /*! called one time by the constructor */ + void readBoundaries (ifstream &mbox, streamoff &curPos); + /*! called for each part during the parse process add add a subAttachement. Return false when found last boundary */ + bool nextBondary (ifstream &mbox, streamoff &curPos); + + /*! recursively marks alternative attachments to be disclaim */ + void markDisclaim (bool &plainMarked, bool &htmlMarked); + /*! recursively marks big attachments to be removed and upated (including disclaim). return true when part need to be updated (can't be extracted). 
*/ + bool markSignificant (const string &parentMultiProp, const streamoff &minAttachSize, ifstream &mbox, vector &allMarkedPtrs); + /*! get a copy of the content. Base64 is decoded. Quoted-Printable is unwarp and unquoted */ + string getContent (ifstream &mbox) const; + /*! write the content, encoded if necessary (base64 and quoted-printable) */ + void println (ofstream &outbox, string content) const; + + /*! replace embedded image */ + void replaceEmbedded (string &content) const; + + public: + friend class MainAttachment; + friend ostream& operator << (ostream& os, const Attachment& attachment); + }; + + /*! for debug pupose */ + ostream& operator << (ostream& os, const Attachment& attachment); + + // ================================================================================ +} + +#endif // _kaz_Attachment_hpp diff --git a/src/include/EmbeddedData.hpp b/src/include/EmbeddedData.hpp new file mode 100644 index 0000000..8bc8e3f --- /dev/null +++ b/src/include/EmbeddedData.hpp @@ -0,0 +1,70 @@ +//////////////////////////////////////////////////////////////////////////// +// Copyright KAZ 2021 // +// // +// contact (at) kaz.bzh // +// // +// This software is a filter to shrink email by attachment extraction. // +// // +// This software is governed by the CeCILL-B license under French law and // +// abiding by the rules of distribution of free software. You can use, // +// modify and/or redistribute the software under the terms of the // +// CeCILL-B license as circulated by CEA, CNRS and INRIA at the following // +// URL "http://www.cecill.info". // +// // +// As a counterpart to the access to the source code and rights to copy, // +// modify and redistribute granted by the license, users are provided // +// only with a limited warranty and the software's author, the holder of // +// the economic rights, and the successive licensors have only limited // +// liability. 
// +// // +// In this respect, the user's attention is drawn to the risks associated // +// with loading, using, modifying and/or developing or reproducing the // +// software by the user in light of its specific status of free software, // +// that may mean that it is complicated to manipulate, and that also // +// therefore means that it is reserved for developers and experienced // +// professionals having in-depth computer knowledge. Users are therefore // +// encouraged to load and test the software's suitability as regards // +// their requirements in conditions enabling the security of their // +// systems and/or data to be ensured and, more generally, to use and // +// operate it in the same conditions as regards security. // +// // +// The fact that you are presently reading this means that you have had // +// knowledge of the CeCILL-B license and that you accept its terms. // +//////////////////////////////////////////////////////////////////////////// + +#ifndef _kaz_EmbeddedData_hpp +#define _kaz_EmbeddedData_hpp + +#include +#include + +namespace kaz { + + using namespace std; + + // ================================================================================ + /*! properties of an image embedded in an html part (rfc2397) */ + class EmbeddedData { + public: + /*! rank of this image tag */ + int imgIdx; + /*! content type and name extracted in the first pass; downloadUrl and downloadId are presumably the replacement url and its short id, as for the members of the same name in Attachment - TODO confirm */ + string contentType, name; + string downloadUrl, downloadId; + /*! start and length of the base64 area, relative to the image section */ + string::size_type startData, dataLength; + + /*! initialization in the first pass */ + EmbeddedData (const int &imgIdx, const string &contentType, const string &name, const string::size_type &startData, const string::size_type &dataLength); + + /*!
records the properties of the image tags given in imgs into data; minAttachSize is presumably the size threshold for an image to be recorded - TODO confirm against the implementation */ + static void fillEmbeddedData (const vector &imgs, const streamoff &minAttachSize, vector &data); + + // friend ostream& operator << (ostream& os, const EmbeddedData& embeddedData); + }; + ostream& operator << (ostream& os, const EmbeddedData& embeddedData); + + // ================================================================================ +} + +#endif // _kaz_EmbeddedData_hpp diff --git a/src/include/MainAttachment.hpp b/src/include/MainAttachment.hpp new file mode 100644 index 0000000..5b2e9c4 --- /dev/null +++ b/src/include/MainAttachment.hpp @@ -0,0 +1,121 @@ +//////////////////////////////////////////////////////////////////////////// +// Copyright KAZ 2021 // +// // +// contact (at) kaz.bzh // +// // +// This software is a filter to shrink email by attachment extraction. // +// // +// This software is governed by the CeCILL-B license under French law and // +// abiding by the rules of distribution of free software. You can use, // +// modify and/or redistribute the software under the terms of the // +// CeCILL-B license as circulated by CEA, CNRS and INRIA at the following // +// URL "http://www.cecill.info". // +// // +// As a counterpart to the access to the source code and rights to copy, // +// modify and redistribute granted by the license, users are provided // +// only with a limited warranty and the software's author, the holder of // +// the economic rights, and the successive licensors have only limited // +// liability. // +// // +// In this respect, the user's attention is drawn to the risks associated // +// with loading, using, modifying and/or developing or reproducing the // +// software by the user in light of its specific status of free software, // +// that may mean that it is complicated to manipulate, and that also // +// therefore means that it is reserved for developers and experienced // +// professionals having in-depth computer knowledge.
Users are therefore // +// encouraged to load and test the software's suitability as regards // +// their requirements in conditions enabling the security of their // +// systems and/or data to be ensured and, more generally, to use and // +// operate it in the same conditions as regards security. // +// // +// The fact that you are presently reading this means that you have had // +// knowledge of the CeCILL-B license and that you accept its terms. // +//////////////////////////////////////////////////////////////////////////// + +#ifndef _kaz_MainAttachment_hpp +#define _kaz_MainAttachment_hpp + +#include +#include "Attachment.hpp" + +namespace kaz { + + using namespace std; + namespace bfs = boost::filesystem; + + // ================================================================================ + /*! root level of e-mail structure */ + class MainAttachment : public Attachment { + public: + /*! text templates to add in the disclaim */ + static const string templatePlainAddLink, templatePlainAllLink, templateHtmlHeader, templateHtmlAddLink, templateHtmlOtherLink, templateHtmlAllLink, templateHtmlFooter; + + /*! white space to split a text */ + static const regex whiteSpaceRegEx; + + /*! copy a slice of mbox (from begin to end) to outbox */ + static void copy (ifstream &mbox, ofstream &outbox, const streamoff &begin, const streamoff &end); + + /*! get url and id (space separated) from stdin */ + void fillUrlId (string &url, string &id); + + /*! location of extracted files */ + void setExtractDir (const bfs::path &extractDir); + /*! URL base for archive download of all extracted files */ + void setArchiveDownloadURL (const string &archiveDownloadURL); + /*! add a single link in the disclaim (plain and html forms) */ + void addLink (string &plain, string &html, const string &url, const string &name) const; + /*! get the disclaim (plain and html forms) according to all links (retrieved or created) */ + void getDisclaim (string &plain, string &html) const; + + private: + /*!
to bootstrap the attachment constructor: resets tmpPos to 0 and returns a reference to it */ + streamoff &initTmpPos () { return tmpPos = 0; } + /*! to bootstrap the attachment constructor: resets tmpLevel to 0 and returns a reference to it */ + int &initTmpLevel () { return tmpLevel = 0; } + + /*! temporary values handed out by initTmpPos and initTmpLevel */ + streamoff tmpPos; + int tmpLevel; + + /*! dir path for extraction */ + bfs::path extractDir; + /*! URL base for download archives */ + string archiveDownloadURL; + + /*! subset in the tree of all attachments to be considered for extraction or modification */ + vector allMarkedPtrs; + /*! previous links found in mbox */ + map previousLinks; + /*! add link only if no significant value already exists. */ + void addPrevious (const string &href, const string &name); + + /*! extract previous links from plain text. Used by extractPreviousKAZ */ + void extractLinks (const string &extractedPlainKAZ); + /*! extract previous links from html-li list. Used by extractPreviousKAZ */ + void extractLinks (const vector &liOne); + /*! extract previous links in mbox. Used by getUpdatedURL and substitute */ + void extractPreviousKAZ (ifstream &mbox); + /*! remove previous links to archive. Used by substitute */ + void removePreviousArchive (); + + public: + /*! the main attachment in mbox */ + MainAttachment (ifstream &mbox); + + /*! mark disclaim, update and extract attachments. Must be called before: getUpdatedURL, extract or substitute */ + void markSignificant (const streamoff &minAttachSize, ifstream &mbox); + /*! write to stdout the list of previous links in mbox */ + void getUpdatedURL (ifstream &mbox); + /*! create record for extraction */ + void newPjEntry (const int &attachCount, const string &contentType, const string &name, string &dirName, string &mediaName) const; + /*! extract big attachments in mbox to extractDir and write to stdout the dirname of each extraction */ + void extract (ifstream &mbox, const SizeArg &minSize) const; + /*!
substitute big attachments by the url given on stdin, writing the result to outbox */ + void substitute (ifstream &mbox, ofstream &outbox, const SizeArg &minSize); + }; + + // ================================================================================ +} + +#endif // _kaz_MainAttachment_hpp diff --git a/src/include/SizeArg.hpp b/src/include/SizeArg.hpp new file mode 100644 index 0000000..80c1f0d --- /dev/null +++ b/src/include/SizeArg.hpp @@ -0,0 +1,74 @@ +//////////////////////////////////////////////////////////////////////////// +// Copyright KAZ 2021 // +// // +// contact (at) kaz.bzh // +// // +// This software is a filter to shrink email by attachment extraction. // +// // +// This software is governed by the CeCILL-B license under French law and // +// abiding by the rules of distribution of free software. You can use, // +// modify and/or redistribute the software under the terms of the // +// CeCILL-B license as circulated by CEA, CNRS and INRIA at the following // +// URL "http://www.cecill.info". // +// // +// As a counterpart to the access to the source code and rights to copy, // +// modify and redistribute granted by the license, users are provided // +// only with a limited warranty and the software's author, the holder of // +// the economic rights, and the successive licensors have only limited // +// liability. // +// // +// In this respect, the user's attention is drawn to the risks associated // +// with loading, using, modifying and/or developing or reproducing the // +// software by the user in light of its specific status of free software, // +// that may mean that it is complicated to manipulate, and that also // +// therefore means that it is reserved for developers and experienced // +// professionals having in-depth computer knowledge.
Users are therefore // +// encouraged to load and test the software's suitability as regards // +// their requirements in conditions enabling the security of their // +// systems and/or data to be ensured and, more generally, to use and // +// operate it in the same conditions as regards security. // +// // +// The fact that you are presently reading this means that you have had // +// knowledge of the CeCILL-B license and that you accept its terms. // +//////////////////////////////////////////////////////////////////////////// + +#ifndef _kaz_SizeArg_hpp +#define _kaz_SizeArg_hpp + +#include +#include +#include + +namespace kaz { + + using namespace std; + + // ================================================================================ + /*! size value with a human readable representation */ + class SizeArg { + private: + /*! the size */ + size_t bytes; + + /*! human readable conversion */ + void init (const string &option); + public: + /*! scalar conversion */ + operator size_t () const { return bytes; } + + /*! initialization from scalar value */ + SizeArg (const size_t &bytes = 0); + /*! initialization from human readable value */ + SizeArg (const string &option); + + friend ostream &operator << (ostream &out, const SizeArg &sizeArg); + friend istream &operator >> (istream &in, SizeArg &sizeArg); + }; + + // ================================================================================ + /*! human readable conversion */ + ostream &operator << (ostream &out, const SizeArg &sizeArg); + istream &operator >> (istream &in, SizeArg &sizeArg); +} + +#endif // _kaz_SizeArg_hpp diff --git a/src/include/kazDebug.hpp b/src/include/kazDebug.hpp new file mode 100644 index 0000000..f53a7e6 --- /dev/null +++ b/src/include/kazDebug.hpp @@ -0,0 +1,134 @@ +//////////////////////////////////////////////////////////////////////////// +// Copyright KAZ 2021 // +// // +// contact (at) kaz.bzh // +// // +// This software is a filter to shrink email by attachment extraction.
// +// // +// This software is governed by the CeCILL-B license under French law and // +// abiding by the rules of distribution of free software. You can use, // +// modify and/or redistribute the software under the terms of the // +// CeCILL-B license as circulated by CEA, CNRS and INRIA at the following // +// URL "http://www.cecill.info". // +// // +// As a counterpart to the access to the source code and rights to copy, // +// modify and redistribute granted by the license, users are provided // +// only with a limited warranty and the software's author, the holder of // +// the economic rights, and the successive licensors have only limited // +// liability. // +// // +// In this respect, the user's attention is drawn to the risks associated // +// with loading, using, modifying and/or developing or reproducing the // +// software by the user in light of its specific status of free software, // +// that may mean that it is complicated to manipulate, and that also // +// therefore means that it is reserved for developers and experienced // +// professionals having in-depth computer knowledge. Users are therefore // +// encouraged to load and test the software's suitability as regards // +// their requirements in conditions enabling the security of their // +// systems and/or data to be ensured and, more generally, to use and // +// operate it in the same conditions as regards security. // +// // +// The fact that you are presently reading this means that you have had // +// knowledge of the CeCILL-B license and that you accept its terms. // +//////////////////////////////////////////////////////////////////////////// + +#ifndef _Kaz_Debug_hpp +#define _Kaz_Debug_hpp + +#include +#include + +/*!
log an error: when cond is true, print expr on std::cerr then run action. Stream manipulators are fully qualified so the macro does not depend on a using-directive at the call site */ +#define LOG_BUG(cond, action, expr) {if (cond) {std::cerr << std::endl << expr << std::endl << std::flush; action; }} + +#ifdef ENABLE_SMART_LOG + +#ifndef SMART_DEF_LOG +#define SMART_DEF_LOG(name, expr) DEF_LOG (name, expr) +#endif + +#ifndef SMART_LOG +#define SMART_LOG(expr) LOG (expr) +#endif + +#ifndef SMART_LOG_EXPR +#define SMART_LOG_EXPR(expr) {if (::kaz::Log::debug) {expr;} } +#endif + +#else + +#ifndef SMART_DEF_LOG +#define SMART_DEF_LOG(name, expr) +#endif + +#ifndef SMART_LOG +#define SMART_LOG(expr) +#endif + +#ifndef SMART_LOG_EXPR +#define SMART_LOG_EXPR(expr) +#endif +#endif + +#ifdef DISABLE_LOG + +#ifndef DEF_LOG +#define DEF_LOG(name, expr) +#endif +#ifndef LOG +#define LOG(expr) {} +#endif + +#ifndef DEBUG +#define DEBUG(expr) {} +#endif + +#else + +#ifndef DEF_LOG +/*! to be placed as the first instruction of a method to log its entry and return */ +#define DEF_LOG(name, expr) ::kaz::Log log (name); { if (::kaz::Log::debug) std::cerr << expr << std::endl << std::flush; } +#endif + +#ifndef LOG +/*! to be placed in a method where DEF_LOG was called previously (it uses the log variable declared by DEF_LOG) */ +// _______________________________________________________ Don't forget DEF_LOG +#define LOG(expr) { if (::kaz::Log::debug) std::cerr << log << "| " << expr << std::endl << std::flush; } +#endif + +#ifndef DEBUG +/*! log without format */ +#define DEBUG(expr) { if (::kaz::Log::debug) std::cerr << expr << std::endl << std::flush; } +#endif + +#endif + +namespace kaz { + // ================================================================================ + using namespace std; + + /*! manages pretty-printed log */ + class Log { + /*! visual indentation of call */ + static size_t indent; + /*! name recalled in log */ + string functName; + public: + /*! switch on the log */ + static bool debug; + + /*! log entry of a method */ + Log (const string &functName); + /*! log return of a method */ + ~Log (); + + /*!
timestamp of the log */ + static string getLocalTimeStr (); + friend ostream &operator << (ostream &out, const Log &log); + }; + ostream &operator << (ostream &out, const Log &log); + + // ================================================================================ +} // kaz + +#endif //_Kaz_Debug_hpp diff --git a/src/include/kazMisc.hpp b/src/include/kazMisc.hpp new file mode 100644 index 0000000..55f317f --- /dev/null +++ b/src/include/kazMisc.hpp @@ -0,0 +1,141 @@ +//////////////////////////////////////////////////////////////////////////// +// Copyright KAZ 2021 // +// // +// contact (at) kaz.bzh // +// // +// This software is a filter to shrink email by attachment extraction. // +// // +// This software is governed by the CeCILL-B license under French law and // +// abiding by the rules of distribution of free software. You can use, // +// modify and/or redistribute the software under the terms of the // +// CeCILL-B license as circulated by CEA, CNRS and INRIA at the following // +// URL "http://www.cecill.info". // +// // +// As a counterpart to the access to the source code and rights to copy, // +// modify and redistribute granted by the license, users are provided // +// only with a limited warranty and the software's author, the holder of // +// the economic rights, and the successive licensors have only limited // +// liability. // +// // +// In this respect, the user's attention is drawn to the risks associated // +// with loading, using, modifying and/or developing or reproducing the // +// software by the user in light of its specific status of free software, // +// that may mean that it is complicated to manipulate, and that also // +// therefore means that it is reserved for developers and experienced // +// professionals having in-depth computer knowledge.
Users are therefore // +// encouraged to load and test the software's suitability as regards // +// their requirements in conditions enabling the security of their // +// systems and/or data to be ensured and, more generally, to use and // +// operate it in the same conditions as regards security. // +// // +// The fact that you are presently reading this means that you have had // +// knowledge of the CeCILL-B license and that you accept its terms. // +//////////////////////////////////////////////////////////////////////////// + +#ifndef _kaz_misc_hpp +#define _kaz_misc_hpp + +#include +#include +#include + +namespace kaz { + using namespace std; + + // ======================================================================= + /*! ordered base64 chars */ + extern const char * const base64Chars; + /*! set of chars available in URL */ + extern const string availableURLChars; + + // ======================================================================= + /*! get the width of the terminal */ + uint16_t getCols (); + + /*! display time. */ + string ns2string (const double &delta); + + // ======================================================================= + /*! side effect on str to replace "from" by "to" */ + void replaceAll (string& str, const string &from, const string &to); + /*! side effect on str to replace a set of "from" by a set of "to" */ + void replaceAll (string& str, const map &subst); + + // ======================================================================= + /*! side effect to lower case a string (in mime section) */ + void toLower (string &content); + + /*! string comparisons are done in uppercase to avoid accents. Giving the token already in uppercase speeds up the process */ + const string &toUpperIfNeed (const string &src, string &tmp); + /*! find upper case of p in upper case of s */ + string::size_type caseInsensitiveFind (const string& s, const string& p, const string::size_type &pos = 0); + /*!
reverse find upper case of p in upper case of s. NOTE(review): the default pos = 0 mirrors caseInsensitiveFind; if pos is forwarded to string::rfind, 0 restricts the match to the beginning of s - confirm against the implementation */ + string::size_type caseInsensitiveRFind (const string& s, const string& p, const string::size_type &pos = 0); + /*! side effect to replace =XX by the char with the hexa value XX. It could be %XX in rfc2184 */ + template + void quotedDecode (string &content); + /*! side effect to quoted-printable content rfc2045 */ + void quotedEncode (string &content); + /*! side effect to decode base64 */ + void base64Decode (string &content); + /*! side effect to encode base64 */ + void base64Encode (string &content); + /*! side effect to change charset of content */ + void iso2utf (string &content); + /*! side effect to get the encodedWord according rfc2047 */ + void encodedWord (string &content); + /*! side effect to get the charsetValue according rfc2184 */ + void charsetValue (string &content); + + // ======================================================================= + /*! return true if c needs no quoting: space, tab, or a char from 33 to 126 other than '=' and '.' */ + inline bool + isQuotedPrintable (const char &c) { + return + c == ' ' || c == '\t' || (c >= 33 && c <= 126 && c != '=' && c != '.'); + // '.' is available in rfc2184 but quoting it avoids checking for '.' alone in a line :-) + } + + /*! return true if c is in the available base64 chars. c is converted to unsigned char first because isalnum is undefined for negative values */ + inline bool + isBase64 (const char &c) { + return (isalnum (static_cast<unsigned char> (c)) || (c == '+') || (c == '/')); + } + + /*! get the order of c in the base64 set of values (no check is done: c must be a base64 char) */ + inline unsigned char + getBase64Val (const char &c) { + if (c == '+') + return 62; + if (c == '/') + return 63; + if (c <= '9') + return (c-'0')+52; + if (c <= 'Z') + return (c-'A'); + return (c-'a')+26; + } + + /*! get the nibble value of c, the representation of an hexa digit (no check is done on c) */ + inline unsigned char + getHexaVal (const char &c) { + if (c <= '9') + return c-'0'; + if (c <= 'F') + return (c-'A')+10; + return (c-'a')+10; + } + + /*!
split the byte c into two upper case hexa digit chars: upper for its high nibble, lower for its low nibble */ + inline void + getHexa (const char &c, char &upper, char &lower) { + upper = c >> 4 & 0xF; + upper += upper > 9 ? ('A'-10) : '0'; + lower = c & 0xF; + lower += lower > 9 ? ('A'-10) : '0'; + } + + // ======================================================================= +} + +#endif // _kaz_misc_hpp