depollueur/src/cpp/kazMisc.cpp

////////////////////////////////////////////////////////////////////////////
// Copyright KAZ 2021							  //
// 									  //
// contact (at) kaz.bzh							  //
// 									  //
// This software is a filter to shrink email by attachment extraction.	  //
// 									  //
// This software is governed by the CeCILL-B license under French law and //
// abiding by  the rules of distribution  of free software. You  can use, //
// modify  and/or  redistribute  the  software under  the  terms  of  the //
// CeCILL-B license as circulated by CEA, CNRS and INRIA at the following //
// URL "http://www.cecill.info".					  //
// 									  //
// As a counterpart to the access to  the source code and rights to copy, //
// modify and  redistribute granted  by the  license, users  are provided //
// only with a limited warranty and  the software's author, the holder of //
// the economic  rights, and the  successive licensors have  only limited //
// liability.								  //
// 									  //
// In this respect, the user's attention is drawn to the risks associated //
// with loading,  using, modifying  and/or developing or  reproducing the //
// software by the user in light of its specific status of free software, //
// that may  mean that  it is  complicated to  manipulate, and  that also //
// therefore means  that it  is reserved  for developers  and experienced //
// professionals having in-depth computer  knowledge. Users are therefore //
// encouraged  to load  and test  the software's  suitability as  regards //
// their  requirements  in  conditions  enabling the  security  of  their //
// systems and/or  data to  be ensured  and, more  generally, to  use and //
// operate it in the same conditions as regards security.		  //
// 									  //
// The fact that  you are presently reading this means  that you have had //
// knowledge of the CeCILL-B license and that you accept its terms.	  //
////////////////////////////////////////////////////////////////////////////

#include <iostream>
#include <sys/ioctl.h>
#include <algorithm>
#include <chrono>
#include <sstream>
#include <iomanip>

#include "kazDebug.hpp"
#include "kazMisc.hpp"

using namespace std;
using namespace kaz;

//template void kaz::quotedDecoded<'='> (string &content);
//template void kaz::quotedDecoded<'%'> (string &content);

// Column threshold shared by quotedEncode and base64Encode in this file: both
// compare their running column counter against it to decide when to wrap the
// encoded output onto a new line.
static const string::size_type MAX_QUOTED_PRINTABLE_SIZE (78);

// The 64 symbols of the base64 alphabet, stored so that the symbol for a 6-bit
// value v is base64Chars [v] (this is how base64Encode indexes the table).
const char *const kaz::base64Chars =
  "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
  "abcdefghijklmnopqrstuvwxyz"
  "0123456789"
  "+/";

// Set of characters the project accepts inside a URL, listed in ascending ASCII
// order.  It is only defined here; nothing in this file reads it.
const string kaz::availableURLChars =
  "!#$%&'()*+,-./"
  "0123456789"
  ":;=?"
  "@ABCDEFGHIJKLMNOPQRSTUVWXYZ"
  "[]_"
  "abcdefghijklmnopqrstuvwxyz"
  "~";

// Matches one RFC 2047 encoded word, "=?charset?mode?text?=", together with any
// whitespace found immediately before and after it.
// Capture groups: 1 = charset, 2 = mode letter (Q/q or B/b), 3 = encoded text.
const regex kaz::encodedWordRegex ("\\s*=\\?"				// flag begin
				   "([0-9A-Za-z!#$%&'+^_`{}~-]+)"	// charset
				   "\\?"				// flag sep
				   "([QqBb])"				// quoted or base64
				   "\\?"				// flag sep
				   "([^ ?]+)"				// encoded string
				   "\\?=\\s*");			// flag end


// ================================================================================
// Returns the width, in columns, of the terminal attached to file descriptor 0
// (standard input), as reported by the TIOCGWINSZ ioctl.
// When the query fails (descriptor 0 is not a terminal, e.g. input is piped) or
// the terminal reports a width of 0, a conventional width of 80 columns is
// returned instead of an indeterminate value.
uint16_t
kaz::getCols () {
  static const uint16_t defaultCols (80);
  struct winsize w = {};	// zeroed so nothing indeterminate is ever read
  if (ioctl (0, TIOCGWINSZ, &w) == -1 || !w.ws_col)
    return defaultCols;
  return w.ws_col;
}

// ================================================================================
// Formats a time span as "HH:MM:SS.ssssss".
// delta is loaded into a std::chrono::duration<double>, whose default period is
// the second, so it is read as a number of seconds.  Hours are not folded into
// days: the first field simply grows (it is zero-padded to at least 2 digits).
string
kaz::ns2string (const double &delta) {
  using namespace std::chrono;

  // Peel off whole hours, then whole minutes; what is left is the seconds part.
  duration<double> remaining (delta);
  const hours wholeHours (duration_cast<hours> (remaining));
  remaining -= wholeHours;
  const minutes wholeMinutes (duration_cast<minutes> (remaining));
  remaining -= wholeMinutes;

  ostringstream out;
  out << setfill ('0')
      << setw (2) << wholeHours.count () << ":"
      << setw (2) << wholeMinutes.count () << ":"
      << setw (9) << fixed << setprecision (6) << remaining.count ();
  return out.str ();
}
// ================================================================================
// Replaces, in place, every occurrence of from in str by to.
// The scan resumes right after each inserted copy of to, so text introduced by
// a replacement is never searched again (no endless loop when to contains from).
// Nothing happens when str or from is empty.
void
kaz::replaceAll (string& str, const string &from, const string &to) {
  DEF_LOG ("kaz::replaceAll", "form: " << from << " to: " << to);
  if (str.empty () || from.empty ())
    return;
  string::size_type cursor (str.find (from));
  while (cursor != string::npos) {
    str.replace (cursor, from.size (), to);
    cursor = str.find (from, cursor + to.length ());
  }
}

// Applies every (from, to) pair of subst to str, one after the other, in the
// iteration order of the map (i.e. sorted by key).  Each pair is handled by the
// three-argument replaceAll, so a later pair also sees the text produced by the
// earlier ones.
void
kaz::replaceAll (string& str, const map<const string, const string> &subst) {
  DEF_LOG ("kaz::replaceAll", "str: " << str);
  for (const auto &entry : subst)
    replaceAll (str, entry.first, entry.second);
}

// ================================================================================
// Converts content to lower case, in place, one char at a time, using the
// std::tolower (char, locale) overload.
// The locale object is a function-local static: it is default-constructed once,
// on the first call, and reused by every later call.
void
kaz::toLower (string &content) {
  DEF_LOG ("kaz::toLower", "content: " << content);
  static locale loc;
  for (char &c : content)
    c = tolower (c, loc);
  LOG ("content: " << content);
}

// Returns an upper-case view of src while avoiding a copy when possible.
//   src : text to convert.
//   tmp : scratch string owned by the caller; must not be the same object as src.
// If src contains no character changed by toupper, src itself is returned and
// tmp is left untouched.  Otherwise tmp is overwritten with the upper-case copy
// of src and a reference to tmp is returned.
// Characters go through unsigned char before toupper, because passing a negative
// char value to toupper is undefined; the result is narrowed back to char so
// that bytes >= 0x80 compare equal to themselves when toupper leaves them alone.
const string &
kaz::toUpperIfNeed (const string &src, string &tmp) {
  DEF_LOG ("kaz::toUpperIfNeed", "src: " << src);
  for (string::const_iterator it = src.begin (); it != src.end (); ++it) {
    if (*it == static_cast<char> (toupper (static_cast<unsigned char> (*it))))
      continue;
    // First character that needs converting: build the full copy once.
    tmp.clear ();
    tmp.reserve (src.size ());
    for (it = src.begin (); it != src.end (); ++it)
      tmp.push_back (static_cast<char> (toupper (static_cast<unsigned char> (*it))));
    return tmp;
  }
  return src;
}

// Predicate for std::search: true when the upper-case form of a equals b.
// Only a is converted, so b is expected to be upper case already (the callers
// in this file upper-case the pattern beforehand with toUpperIfNeed).
// a goes through unsigned char because toupper is undefined for negative char
// values; the result is narrowed back to char so that a byte >= 0x80 can still
// compare equal to the same byte in b.
inline bool
caseInsensitiveCharCompare (char a, char b) {
  return static_cast<char> (toupper (static_cast<unsigned char> (a))) == b;
}

// Case-insensitive counterpart of string::find.
//   s       : text searched.
//   pattern : text looked for; it is upper-cased before the comparison.
//   pos     : index in s at which the search starts.
// Returns the index of the first match at or after pos, or string::npos when
// there is none.  A pos beyond the end of s yields string::npos (forming
// s.begin ()+pos would otherwise be undefined).
string::size_type
kaz::caseInsensitiveFind (const string& s, const string& pattern, const string::size_type &pos) {
  DEF_LOG ("kaz::caseInsensitiveFind", "pattern: " << pattern << " pos: " << pos <<  " s: " << s);
  if (pos > s.length ())
    return string::npos;
  string tmp;
  const string &upperPattern (toUpperIfNeed (pattern, tmp));
  LOG ("pattern: " << upperPattern);
  string::const_iterator it (search (s.begin ()+pos, s.end (), upperPattern.begin (), upperPattern.end (), caseInsensitiveCharCompare));
  if (it == s.end ())
    return string::npos;
  LOG ("find: " << (it - s.begin ()));
  return it - s.begin ();
}

// Case-insensitive search for the LAST occurrence of pattern in s.
//   s       : text searched.
//   pattern : text looked for; it is upper-cased before the comparison.
//   pos     : number of leading characters of s excluded from the search, i.e.
//             only matches lying entirely inside [pos, s.length ()) are found.
// Returns the index (from the start of s) of the first character of the match,
// or string::npos when there is none or when pos is beyond the end of s.
// NOTE(review): pos is read as a lower bound, mirroring caseInsensitiveFind;
// with pos == 0 the whole string is searched.  Confirm against the callers.
string::size_type
kaz::caseInsensitiveRFind (const string& s, const string& pattern, const string::size_type &pos) {
  DEF_LOG ("kaz::caseInsensitiveRFind", "pattern: " << pattern << " pos: " << pos <<  " s: " << s);
  if (pos > s.length ())
    return string::npos;
  string tmp;
  const string &upperPattern (toUpperIfNeed (pattern, tmp));
  LOG ("pattern: " << upperPattern);
  // Walking s backwards, the search must stop pos characters before its start.
  const string::const_reverse_iterator stop (s.rend ()-pos);
  string::const_reverse_iterator it (search (s.rbegin (), stop, upperPattern.rbegin (), upperPattern.rend (), caseInsensitiveCharCompare));
  if (it == stop)
    return string::npos;
  // it designates the last character of the match; step back to its first one.
  LOG ("find: " << (s.rend () - it - pattern.length ()));
  return s.rend () - it - pattern.length ();
}

// Builds a string of size characters drawn from [0-9A-Za-z] with rand ().
// A size of 0 or less yields an empty string (a negative value must never reach
// reserve, where it would be converted to a huge unsigned length).
// The sequence depends on the state of the C random generator; this function
// does not seed it.
string
kaz::boundaryGen (const int &size) {
  static const char alphanum[] =
    "0123456789"
    "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    "abcdefghijklmnopqrstuvwxyz";
  string result;
  if (size <= 0)
    return result;
  result.reserve (size);
  // sizeof - 1: the terminating '\0' of the literal is not a candidate.
  for (int i = 0; i < size; ++i)
    result += alphanum[rand() % (sizeof (alphanum) - 1)];
  return result;
}

// ================================================================================
// Decodes, in place, a text whose bytes are escaped as <delim> followed by two
// hexadecimal digits (delim is '=' for quoted-printable, '%' for the values
// handled by charsetValueDecode).
//   - <delim> immediately followed by '\n' is a soft line break: both characters
//     are dropped, wherever they occur (including at the very start of content);
//   - <delim> followed by two hexadecimal digits becomes the byte they denote;
//   - any other <delim> is reported through LOG_BUG and copied unchanged, and
//     decoding goes on, so content always ends up resized to exactly what was
//     produced.
// The output is never longer than the input, which is why the write cursor q
// can trail the read cursor p inside the same buffer.
template<char delim>
void
kaz::quotedDecode (string &content) {
  DEF_LOG ("kaz::quotedDecode", "delim: " << delim << " content: " << content);
  string::size_type len (content.length ());
  if (!len)
    return;
  LOG ("len: " << len);
  const string::iterator end (content.end ());
  string::iterator p (content.begin ()), q (p);
  while (p != end) {
    if (*p != delim) {
      *q++ = *p++;
      continue;
    }
    if (end - p >= 2 && p[1] == '\n') {
      // Soft line break: skip the delimiter and the newline, emit nothing.
      p += 2;
      continue;
    }
    // isxdigit is undefined for negative char values, hence the unsigned char.
    const bool badEscape (end - p < 3 ||
			  !isxdigit (static_cast<unsigned char> (p[1])) ||
			  !isxdigit (static_cast<unsigned char> (p[2])));
    LOG_BUG (badEscape, *q++ = *p++; continue, "kazMisc::quotedDecode bug: bad quoted-printable format. (delim: " << int (delim)  << " content: " << content << ")");
    *q++ = (char) ((getHexaVal (p[1]) << 4) + getHexaVal (p[2]));
    p += 3;
  }
  content.resize (q-content.begin ());
  LOG ("content: " << content);
}

// ================================================================================
// Encodes content, in place, as quoted-printable.
//   - '\n' is copied as is and starts a new output line;
//   - a character for which isQuotedPrintable is false, and a space or tab that
//     ends a line (or the whole text), are written as '=' plus two hexadecimal
//     digits obtained from getHexa;
//   - every other character is copied as is;
//   - a soft line break ("=\n") is inserted when the current output line reaches
//     MAX_QUOTED_PRINTABLE_SIZE columns, or before an escape that would not fit.
// isQuotedPrintable is declared outside this file; it is used here with the
// meaning "may be emitted literally".
// Empty content is left untouched.
void
kaz::quotedEncode (string &content) {
  DEF_LOG ("kaz::quotedEncode", "content: " << content);
  if (content.empty ())
    return;
  // Size estimate only (used for reserve): each escaped char grows by 2.
  string::size_type nbQuoted (0);
  for (string::const_iterator it = content.begin (); it != content.end (); ++it)
    if (!isQuotedPrintable (*it))
      ++nbQuoted;
  string::size_type estimate (content.length ()+nbQuoted*2);
  estimate += (estimate/MAX_QUOTED_PRINTABLE_SIZE)*2;
  string result;
  result.reserve (estimate);
  string::size_type cols (0);
  char upper, lower;
  for (string::const_iterator it = content.begin (); it != content.end (); ++it) {
    const char &c (*it);
    if (c == '\n') {
      result.push_back ('\n');
      cols = 0;
      continue;
    }
    if (cols >= MAX_QUOTED_PRINTABLE_SIZE) {
      result.push_back ('=');
      result.push_back ('\n');
      cols = 0;
    }
    if (!isQuotedPrintable (c) ||
	((c == ' ' || c =='\t') && (it+1 == content.end () || *(it+1) == '\n'))) {
      // An escape takes 3 columns and must not be split across two lines.
      if (cols > MAX_QUOTED_PRINTABLE_SIZE-3) {
	result.push_back ('=');
	result.push_back ('\n');
	cols = 0;
      }
      getHexa (c, upper, lower);
      result.push_back ('=');
      result.push_back (upper);
      result.push_back (lower);
      cols += 3;
      continue;
    }
    result.push_back (c);
    ++cols;
  }
  content.swap (result);
  LOG ("content: " << content);
}

// ================================================================================
// Decodes base64 text, in place.
//   - '\n' and '\r' are skipped;
//   - decoding stops at the first '=' (padding);
//   - decoding also stops, after a LOG, at the first character rejected by
//     isBase64; whatever was decoded before it is kept;
//   - a trailing group of 2 symbols yields 1 byte, a trailing group of 3 symbols
//     yields 2 bytes, and a lone trailing symbol (6 bits, not a full byte)
//     yields nothing.
// Four input symbols give three output bytes, so the write cursor q never
// overtakes the read cursor p and the same buffer can be reused.
void
kaz::base64Decode (string &content) {
  DEF_LOG ("kaz::base64Decode", "content: " << content);
  string::size_type len (content.length ());
  if (!len)
    return;
  LOG ("len: " << len);
  unsigned char buff[4];
  int idx = 0;
  const string::iterator end (content.end ());
  string::iterator p (content.begin ()), q (p);
  for (; p != end; ++p) {
    const char c = *p;
    if (c == '=')
      break;
    if (c == '\n' || c == '\r')
      continue;
    if (!isBase64 (c)) {
      LOG ("kazMisc::base64Decode bug: bad base64 format. (content: " << content << ")");
      break;
    }
    buff [idx] = getBase64Val (c);
    if (++idx != 4)
      continue;
    *q++ = buff [0] << 2 | (buff [1] & 0x30) >> 4;
    *q++ = buff [1] << 4 | (buff [2] & 0x3c) >> 2;
    *q++ = buff [2] << 6 | buff [3];
    idx = 0;
  }
  if (idx > 1) {
    // Incomplete last group: idx symbols carry idx-1 complete bytes.
    for (int j = idx; j < 4; ++j)
      buff [j] = 0;
    *q++ = buff [0] << 2 | (buff [1] & 0x30) >> 4;
    if (idx > 2)
      *q++ = buff [1] << 4 | (buff [2] & 0x3c) >> 2;
  }
  content.resize (q-content.begin ());
  LOG ("content: " << content);
}

// ================================================================================
// Encodes content as base64, in place, using the base64Chars alphabet and '='
// padding.
// The output is cut with '\n' into lines whose length is the largest multiple
// of 4 that does not exceed MAX_QUOTED_PRINTABLE_SIZE (76 for the current value
// of 78), so that no line goes past that limit and no 4-symbol group is split.
// A line break is only written when more symbols follow: the result never ends
// with '\n'.  Empty content gives an empty result.
void
kaz::base64Encode (string &content) {
  DEF_LOG ("kaz::base64Encode", "content: " << content);
  const string::size_type length (content.length ());
  const string::size_type lineLength (MAX_QUOTED_PRINTABLE_SIZE / 4 * 4);
  const string::size_type encodedLength ((length + 2) / 3 * 4);
  string result;
  result.reserve (encodedLength + encodedLength / lineLength);
  for (string::size_type pos (0), cols (0); pos < length; pos += 3, cols += 4) {
    if (cols >= lineLength) {
      result.push_back ('\n');
      cols = 0;
    }
    // Bytes are read as unsigned so that shifts never see a negative value.
    const unsigned char b0 (content [pos]);
    result.push_back (base64Chars [b0 >> 2]);
    if (pos + 1 == length) {
      // One byte left: two symbols and two padding characters.
      result.push_back (base64Chars [(b0 & 0x03) << 4]);
      result.push_back ('=');
      result.push_back ('=');
      break;
    }
    const unsigned char b1 (content [pos + 1]);
    result.push_back (base64Chars [((b0 & 0x03) << 4) | (b1 >> 4)]);
    if (pos + 2 == length) {
      // Two bytes left: three symbols and one padding character.
      result.push_back (base64Chars [(b1 & 0x0F) << 2]);
      result.push_back ('=');
      break;
    }
    const unsigned char b2 (content [pos + 2]);
    result.push_back (base64Chars [((b1 & 0x0F) << 2) | (b2 >> 6)]);
    result.push_back (base64Chars [b2 & 0x3F]);
  }
  content.swap (result);
  LOG ("content: " << content);
}

// ================================================================================
// Converts content, in place, from a one-byte-per-character encoding to UTF-8:
// every byte below 0x80 is kept, every byte ch >= 0x80 becomes the two bytes
// 0xC0 | (ch >> 6) and 0x80 | (ch & 0x3F), i.e. the UTF-8 form of code point ch
// (which is what ISO-8859-1 needs).
// The string is first grown by one byte per character to expand, then filled
// from the back: p reads the original bytes from the last one, q writes from
// the last byte of the grown string.  q - p is always the number of bytes still
// to expand, so both cursors meet exactly when the first such byte is done and
// everything before it is already in place.
void
kaz::iso2utf (string &content) {
  DEF_LOG ("kaz::iso2utf", "content: " << content);
  string::size_type len (content.length ());
  if (!len)
    return;
  LOG ("len: " << len);
  string::size_type charCount (0);
  for (string::iterator it = content.begin (); it != content.end (); ++it)
    if ((uint8_t) *it >= 0x80)
      ++charCount;
  if (!charCount)
    return;
  LOG ("charCount: " << charCount);
  content.resize (len+charCount);
  // p: last ORIGINAL byte; q: last byte of the grown string.
  string::iterator p (content.begin ()+(len-1)), q (content.end ()-1);
  for ( ; ; --p, --q) {
    uint8_t ch = *p;
    if (ch < 0x80)
      *q = ch;
    else {
      *q = 0x80 | (ch & 0x3F);
      *--q = 0xc0 | ch >> 6;
      LOG ("ch: " << (char) ch);
    }
    if (p == q)
      break;
  }
  LOG ("content: " << content);
}

// ================================================================================
// Decodes, in place, the RFC 2047 encoded words ("=?charset?mode?text?=") found
// in content.
// Text lying between two matches of encodedWordRegex is copied unchanged.  For
// each match, '_' is turned into a space in the encoded text, which is then
// decoded with base64Decode (mode B/b) or quotedDecode (mode Q/q).  When the
// lower-cased charset name starts with "iso", the decoded bytes additionally go
// through iso2utf.  Content without any "=?" is returned untouched.
// NOTE(review): encodedWordRegex also matches the whitespace around an encoded
// word, so blanks separating an encoded word from plain text are dropped too;
// confirm this is intended.
void
kaz::encodedWordDecode (string &content) {
  // rfc2047
  DEF_LOG ("kaz::encodedWordDecode", "content: " << content);
  string::size_type charsetPos = content.find ("=?");
  if (charsetPos == string::npos)
    return;
  LOG ("charsetPos: " << charsetPos);

  string result;
  int pos = 0;			// end of the previous match inside content
  const sregex_iterator ewItEnd;
  sregex_iterator ewIt (content.begin (), content.end (), encodedWordRegex);
  while (ewIt != ewItEnd) {
    const smatch &m (*ewIt);
    if (pos != m.position ()) {
      // Plain text between the previous encoded word and this one.
      result.append (content, pos, m.position () - pos);
      LOG ("stantad " << content.substr (pos, m.position () - pos));
    }
    string encoded (m.str (3));
    replace (encoded.begin (), encoded.end (), '_', ' ');

    LOG ("charset: " << m[1] << " mode: " << m[2] << " string: " << encoded);

    const char mode (m[2].str ()[0]);
    if (mode == 'B' || mode == 'b')
      base64Decode (encoded);
    else if (mode == 'Q' || mode == 'q')
      quotedDecode (encoded);
    else {

      LOG_BUG (true, return, "kazMisc::encodedWordDecode bug: unknown mode. (mode: " << m[2] << ")");
    }
    LOG ("decoded: " << encoded);
    string charset (m.str (1));
    toLower (charset);
    // caseInsensitiveFind returns 0 when the charset name starts with "iso".
    if (! caseInsensitiveFind (charset, "ISO"))
      iso2utf (encoded);
    result += encoded;
    pos = m.position () + m.length ();
    ++ewIt;
  }
  // Whatever follows the last encoded word is kept as is.
  result.append (content, pos, string::npos);
  content.swap (result);
  LOG ("content: " << content);
}

// ================================================================================
// Decodes, in place, an extended parameter value of the form
// charset'language'percent-encoded-text (see the rfc2184 tag below).
// The text after the second apostrophe is decoded with quotedDecode<'%'>; when
// the lower-cased charset (the part before the first apostrophe) starts with
// "iso", the result additionally goes through iso2utf.  content is replaced by
// the decoded text.  If either apostrophe is missing, the problem is reported
// through LOG_BUG and content is left as it was.
void
kaz::charsetValueDecode (string &content) {
  // rfc2184
  DEF_LOG ("kaz::charsetValueDecode", "content: " << content);
  const string::size_type langPos (content.find ('\''));

  LOG_BUG (langPos == string::npos, return, "kazMisc::charsetValueDecode bug: no '. (content: " << content << ")");
  const string::size_type contentPos (content.find ('\'', langPos+1));

  LOG_BUG (contentPos == string::npos, return, "kazMisc::charsetValueDecode bug: no double '. (content: " << content << ")");
  string decoded (content, contentPos+1);
  quotedDecode<'%'> (decoded);
  LOG ("tmp: " << decoded);
  string charset (content, 0, langPos);
  toLower (charset);
  // caseInsensitiveFind returns 0 when the charset name starts with "iso".
  if (! caseInsensitiveFind (charset, "ISO"))
    iso2utf (decoded);
  content.swap (decoded);
  LOG ("content: " << content);
}

// ================================================================================
// Strips the double quotes around a quoted string, in place.
// Nothing is done unless content starts with '"'.  The closing quote is the
// first '"' after the opening one that is not directly preceded by a backslash;
// content becomes the text strictly between the two quotes (anything after the
// closing quote is discarded).  If no closing quote exists, only the opening
// one is removed.  Backslashes are kept as they are.
void
kaz::removeQuote (string &content) {
  if (content.empty () || content [0] != '"')
    return;
  string::size_type closing (content.find ('"', 1));
  // Skip quotes escaped by a backslash.
  while (closing != string::npos && content [closing-1] == '\\')
    closing = content.find ('"', closing+1);
  if (closing == string::npos)
    content = content.substr (1);
  else
    content = content.substr (1, closing-1);
}

// ================================================================================