fix rfc2047 / filter log / filterTest options

2022-12-25 06:57:44 +01:00
parent 596ae82fe4
commit 61bdc3a4ba
9 changed files with 193 additions and 101 deletions
--- a/src/cpp/Attachment.cpp
+++ b/src/cpp/Attachment.cpp
@@ -65,17 +65,9 @@ const string Attachment::ALTERNATIVE			("alternative");
 const string Attachment::KAZ_ATTACH_NAME		(".---KazAttachment---.html");
 const string Attachment::MULTIPART			("multipart/");

-
-const regex Attachment::nameCharsetRegEx		(".*name\\*=[ \t]*(.*)");
-const regex Attachment::nameRegEx			(".*name=[ \t]*((\"(\\\\.|[^\\\\\r])*\")|[^\r; ]*);?.*");
-// boundary="----=_Part_796779_1154936629.1668080348646"
-// boundary="------------040709000505010508040808"
-// boundary="----------=_1668606031-941125-91"
-// boundary="_004_PAVPR10MB6792713B313048E3A259B215B2079PAVPR10MB6792EURP_";
-// boundary="_000_PAVPR10MB6792713B313048E3A259B215B2079PAVPR10MB6792EURP_"
-// boundary=--boundary_1351_64006126-2b0e-4a3b-98ac-4797d1634188
-// boundary=--boundary_1352_7e294c9a-cfab-44a0-bfb3-7310380ac7cb;
-const regex Attachment::boundaryRegEx			(".*boundary=[ \t]*((\"(\\\\.|[^\\\\])*\")|[^; ]*);?.*");
+const regex Attachment::nameCharsetRegEx		(".*name\\*=\\s*([^; \t]*)"); // group 1: name*= value, up to the next ';', space or tab
+const regex Attachment::nameRegEx			(    ".*name=\\s*((\"(\\\\.|[^\\\\])*\")|[^; \t]*).*"); // group 1: quoted string or bare token after name=
+const regex Attachment::boundaryRegEx			(".*boundary=\\s*((\"(\\\\.|[^\\\\])*\")|[^; \t]*).*"); // group 1: quoted string or bare token after boundary=
 const regex Attachment::cidDefRegEx			(".*<([^>]*)>.*");
 const regex Attachment::textRegEx			(".*text/("+PLAIN+"|"+HTML+").*");
 const regex Attachment::multiRegEx			("\\s*"+MULTIPART+"(mixed|"+RELATED+"|"+ALTERNATIVE+"|"+SIGNED+").*");
@@ -188,23 +180,49 @@ Attachment::getAttachName () const {
  static string tokens [] = {contentTypeToken, contentDispositionToken};
  DEF_LOG ("Attachment::getAttachName", "");
  for (string token : tokens) {
+    // name=
    string result = getProp (token, nameRegEx);
    removeQuote (result);
    if (result.length ()) {
      LOG ("name=: " << result);
-      encodedWord (result);
+      encodedWordDecode (result);
      return result;
    }
+    // name*x=
+    for (int id = 0; ; ++id) {
+      string item = getProp (token, regex (".*name\\*"+to_string (id)+"=\\s*((\"(\\\\.|[^\\\\])*\")|[; \t]*).*"));
+      if (item.empty ())
+	break;
+      result += item;
+    }
+    removeQuote (result);
+    if (result.length ()) {
+      LOG ("name*x=: " << result);
+      encodedWordDecode (result);
+      return result;
+    }
+    // name*=
    result = getProp (token, nameCharsetRegEx);
    removeQuote (result);
    if (result.length ()) {
      LOG ("name*=: " << result);
-      charsetValue (result);
+      charsetValueDecode (result);
+      return result;
+    }
+    // name*x*=
+    for (int id = 0; ; ++id) {
+      string item = getProp (token, regex (".*name\\*"+to_string (id)+"\\*=\\s*([^; ]*)"));
+      if (item.empty ())
+	break;
+      result += item;
+    }
+    removeQuote (result);
+    if (result.length ()) {
+      LOG ("name*x*=: " << result);
+      encodedWordDecode (result);
      return result;
    }
  }
-  // XXX il faut composer s'il y a plusieurs ligne filename*x=
-  // XXX il faut composer s'il y a plusieurs ligne filename*x*=
  return getUnknown (getContentType ());
 }

@@ -257,7 +275,7 @@ Attachment::isDefProp (const string &token, const string &val) const {
  if (it == env.end ())
    return false;
  // XXX case insensitive ??
-  return it->second.find (val) != string::npos;
+  return caseInsensitiveFind (it->second, val) != string::npos;
 }

 // ================================================================================
@@ -312,7 +330,8 @@ Attachment::readMime (ifstream &mbox, streamoff &curPos) {
      lastVar = line.substr (0, colonPos);
      toLower (lastVar);
      LOG ("find var: " << lastVar);
-      string val (cleanString (line.length () >= colonPos+2 ? line.substr (colonPos+2) : "")); // XXX check RFC " " after ": "
+      // XXX check in RFC if " " after ": " (=> +2 or +1)
+      string val (cleanString (line.length () >= colonPos+2 ? line.substr (colonPos+2) : ""));
      LOG ("new var: <" << lastVar << " <=> " << val << ">");
      env [lastVar] = val;
    }
--- a/src/cpp/MainAttachment.cpp
+++ b/src/cpp/MainAttachment.cpp
@@ -61,7 +61,7 @@ static const string TMPL_FILENAME		("{{FILENAME}}");
 static const string CID				("cid:");

 // "l=/" => v1 compatibility
-static const regex archiveURLSignature		(".*(([&?]g=)|([&?]l=/)).*");
+static const regex archiveURLRegex		(".*(([&?]g=)|([&?]l=/)).*");

 static const string KAZ_PLAIN_HR		("______________________________________________________________________________");
 static const string KAZ_PLAIN_START		("~~ PJ-KAZ !"); // don't end whith space
@@ -423,7 +423,7 @@ MainAttachment::removePreviousArchive () {
  vector<string> toRemove;
  for (map <string, string>::const_iterator it = previousLinks.begin (); it != previousLinks.end (); ++it) {
    const string key (it->first);
-    if (regex_match (key, archiveURLSignature))
+    if (regex_match (key, archiveURLRegex))
      toRemove.push_back (key);
  }
  for (string old : toRemove)
@@ -567,7 +567,7 @@ MainAttachment::extract (ifstream &mbox, const SizeArg &minSize) const {

 // ================================================================================
 void
-MainAttachment::substitute (ifstream &mbox, ofstream &outbox, const SizeArg &minSize, const AttachMode &attachMode) {
+MainAttachment::substitute (ifstream &mbox, ofstream &outbox, const SizeArg &minSize, AttachMode attachMode) {
  DEF_LOG ("MainAttachment::substitute", "minSize: " << minSize << " AttachMode: " << attachMode);

  // preparation
@@ -601,30 +601,34 @@ MainAttachment::substitute (ifstream &mbox, ofstream &outbox, const SizeArg &min
  getDisclaim (plainDisclaim, htmlDisclaim);

  // copy email
-  if (plainDisclaim.size () && emptyEMail && boundary.empty ()) {
-    // only one attachment must be replace
-    cerr << "eMailShrinker: force one attachment" << endl;
-    string mime (getMime (mbox));
-    string::size_type startPos = (0);
-    for (string token : {string ("Content-Transfer-Encoding"), Attachment::contentTypeToken}) {
-      startPos = caseInsensitiveFind (mime, "Content-Transfer-Encoding");
-      for (string::size_type stopPos (startPos);
-	   (stopPos = mime.find ("\n", stopPos)) != string::npos;
-	   ) {
-	if (string (" \t").find (mime [stopPos+1]) == string::npos) {
-	  mime.erase (startPos, stopPos-startPos);
-	  break;
+  if (!boundary.size () && plainDisclaim.size ()) {
+    if (attachMode & ATTACHMENT)
+      attachMode = FOOTER;
+    if (emptyEMail) {
+      // only one attachment must be replaced
+      cerr << "eMailShrinker: force one attachment" << endl;
+      string mime (getMime (mbox));
+      string::size_type startPos = (0);
+      for (string token : {string ("Content-Transfer-Encoding"), Attachment::contentTypeToken}) {
+	startPos = caseInsensitiveFind (mime, "Content-Transfer-Encoding");
+	for (string::size_type stopPos (startPos);
+	     (stopPos = mime.find ("\n", stopPos)) != string::npos;
+	     ) {
+	  if (string (" \t").find (mime [stopPos+1]) == string::npos) {
+	    mime.erase (startPos, stopPos-startPos);
+	    break;
+	  }
 	}
      }
+      mime.insert (startPos, KAZ_EMPTY_TEXT_PLAIN);
+      string content (plainDisclaim);
+      base64Encode (content);
+      outbox << mime
+	     << content << endl;
+      outbox.flush ();
+      outbox.close ();
+      return;
    }
-    mime.insert (startPos, KAZ_EMPTY_TEXT_PLAIN);
-    string content (plainDisclaim);
-    base64Encode (content);
-    outbox << mime
-	   << content << endl;
-    outbox.flush ();
-    outbox.close ();
-    return;
  }
  streamoff curPos = 0;
  copy (mbox, outbox, curPos, contentPos);
@@ -694,8 +698,7 @@ MainAttachment::substitute (ifstream &mbox, ofstream &outbox, const SizeArg &min
 	}
 	removeSection (content, KAZ_HTML_START, KAZ_HTML_STOP);
 	removeSection (content, KAZ_PLAIN_START, KAZ_PLAIN_STOP);
-	// XXX case insensitive ??
-	if (content.find (CID) != string::npos)
+	if (caseInsensitiveFind (content, CID) != string::npos)
 	  replaceAll (content, translateHtml);
 	attachP->replaceEmbedded (content);
      }
@@ -733,12 +736,9 @@ MainAttachment::substitute (ifstream &mbox, ofstream &outbox, const SizeArg &min
    string content (KAZ_HTML_CONTENT+htmlDisclaim+BODY_END+HTML_END);
    base64Encode (content);

-    if (boundary.size ()) 
-      outbox << boundary.substr (0, boundary.length () -2) << endl
-	     << KAZ_ATTACHMENT_TEXT_HTML << endl
-	     << content << endl;
-    else
-      outbox << "coucou No multipart" << endl;
+    outbox << boundary.substr (0, boundary.length () -2) << endl
+	   << KAZ_ATTACHMENT_TEXT_HTML << endl
+	   << content << endl;
    outbox.flush ();
  }
  copy (mbox, outbox, curPos, endPos);
--- a/src/cpp/eMailShrinker.cpp
+++ b/src/cpp/eMailShrinker.cpp
@@ -33,8 +33,8 @@
 ////////////////////////////////////////////////////////////////////////////

 #include "version.hpp"
-const std::string kaz::LAST_VERSION_NUM ("2.8");
-const std::string kaz::LAST_VERSION_DATE ("2022-12-23");
+const std::string kaz::LAST_VERSION_NUM ("2.9");
+const std::string kaz::LAST_VERSION_DATE ("2022-12-24");
 const std::string kaz::LAST_VERSION (LAST_VERSION_NUM+" "+LAST_VERSION_DATE+" eMailShrinker");

 #include <iostream>
@@ -106,7 +106,7 @@ static const char *const inputFileC = inputFile.c_str ();

 int
 main (int argc, char** argv) {
-  // XXX debug before parse options
+  // uncomment the next line to debug option parsing
  // Log::debug = true;
  DEF_LOG ("main:", "");
  prog = argv [0];
--- a/src/cpp/jirafeauAPI.cpp
+++ b/src/cpp/jirafeauAPI.cpp
@@ -106,7 +106,7 @@ static const char *const inputFileC = inputFile.c_str ();

 int
 main (int argc, char** argv) {
-  // XXX debug before parse options
+  // uncomment the next line to debug option parsing
  // Log::debug = true;
  DEF_LOG ("main:", "");
  prog = argv [0];
--- a/src/cpp/kazMisc.cpp
+++ b/src/cpp/kazMisc.cpp
@@ -65,6 +65,14 @@ const string kaz::availableURLChars =
  "abcdefghijklmnopqrstuvwxyz"
  "~";

+const regex kaz::encodedWordRegex ("\\s*=\\?"				// flag begin "=?" (leading blanks are part of the match)
+				   "([0-9A-Za-z!#$%&'+^_`{}~-]+)"	// group 1: charset
+				   "\\?"				// flag sep
+				   "([QqBb])"				// group 2: mode, Q/q (quoted) or B/b (base64)
+				   "\\?"				// flag sep
+				   "([^ ?]+)"				// group 3: encoded string
+				   "\\?=\\s*");			// flag end "?=" (trailing blanks are part of the match)
+

 // ================================================================================
 uint16_t
@@ -369,65 +377,66 @@ kaz::iso2utf (string &content) {

 // ================================================================================
 void
-kaz::encodedWord (string &content) {
+kaz::encodedWordDecode (string &content) {
  // rfc2047
-  DEF_LOG ("kaz::extendedWord", "content: " << content);
+  DEF_LOG ("kaz::encodedWordDecode", "content: " << content);
  string::size_type charsetPos = content.find ("=?");
  if (charsetPos == string::npos)
    return;
  LOG ("charsetPos: " << charsetPos);

-  LOG_BUG (charsetPos != 0, return, "kazMisc::extendedWord bug: =? not at begin pos. (content: " << content << ")");
  string result;
-  for ( ;
-	(charsetPos = content.find ("=?", charsetPos)) != string::npos;
-	) {
-    string::size_type modePos = content.find ("?", charsetPos+2);
+  auto pos (0);
+  sregex_iterator ewItEnd;
+  for (sregex_iterator ewIt (content.begin (), content.end (), encodedWordRegex);
+       ewIt != ewItEnd;
+       ++ewIt) {
+    smatch m = *ewIt;
+    if (pos != m.position ()) {
+      result += content.substr (pos, m.position () - pos);
+      LOG ("stantad " << content.substr (pos, m.position () - pos));
+    }
+    string encoded (m[3]);
+    replace (encoded.begin (), encoded.end (), '_', ' ');

-    LOG_BUG (modePos == string::npos, return, "kazMisc::extendedWord bug: no end chartset. (content: " << content << ")");
-    string::size_type contentPos = content.find ("?", modePos+1);
+    LOG ("charset: " << m[1] << " mode: " << m[2] << " string: " << encoded);

-    LOG_BUG (contentPos != modePos+2, return, "kazMisc::extendedWord bug: no end chartset. (content: " << content << ")");
-    string::size_type endPos = content.find ("?=", contentPos+1);
-
-    LOG_BUG (endPos == string::npos, return, "kazMisc::extendedWord bug: no end chartset. (content: " << content << ")");
-    string tmp (content.substr (contentPos+1, endPos-contentPos-1));
-    switch (content [modePos+1]) {
+    switch (m[2].str ()[0]) {
    case 'B':
    case 'b':
-      base64Decode (tmp);
+      base64Decode (encoded);
      break;
    case 'Q':
    case 'q':
-      quotedDecode (tmp);
+      quotedDecode (encoded);
      break;
    default:

-      LOG_BUG (true, return, "kazMisc::extendedWord bug: unknown mode. (mode: " << content [modePos+1] << ")");
+      LOG_BUG (true, return, "kazMisc::encodedWordDecode bug: unknown mode. (mode: " << m[2] << ")");
    }
-    LOG ("tmp: " << tmp);
-    string charset (content.substr (charsetPos, modePos-charsetPos-2));
+    LOG ("decoded: " << encoded);
+    string charset (m[1]);
    toLower (charset);
    if (! caseInsensitiveFind (charset, "ISO"))
-      iso2utf (tmp);
-    result += tmp;
-    charsetPos = endPos+2;
+      iso2utf (encoded);
+    result += encoded;
+    pos = m.position () + m.str ().length ();
  }
-  content = result;
+  content = result + content.substr (pos);
  LOG ("content: " << content);
 }

 // ================================================================================
 void
-kaz::charsetValue (string &content) {
+kaz::charsetValueDecode (string &content) {
  // rfc2184
-  DEF_LOG ("kaz::charsetValue", "content: " << content);
+  DEF_LOG ("kaz::charsetValueDecode", "content: " << content);
  string::size_type langPos = content.find ("'");

-  LOG_BUG (langPos == string::npos, return, "kazMisc::charsetValue bug: no '. (content: " << content << ")");
+  LOG_BUG (langPos == string::npos, return, "kazMisc::charsetValueDecode bug: no '. (content: " << content << ")");
  string::size_type contentPos = content.find ("'", langPos+1);

-  LOG_BUG (contentPos == string::npos, return, "kazMisc::charsetValue bug: no double '. (content: " << content << ")");
+  LOG_BUG (contentPos == string::npos, return, "kazMisc::charsetValueDecode bug: no double '. (content: " << content << ")");
  string tmp (content.substr (contentPos+1));
  quotedDecode<'%'> (tmp);
  LOG ("tmp: " << tmp);