Spam scanning investigation
[gnucomo.git] / src / spam / scanspam.cpp
diff --git a/src/spam/scanspam.cpp b/src/spam/scanspam.cpp
new file mode 100644 (file)
index 0000000..c551091
--- /dev/null
@@ -0,0 +1,134 @@
+#include "object.h"
+
+int LevenshteinDistance(const String &source, const String &target)
+{
+    if (~source > ~target)
+    {
+        return LevenshteinDistance(target, source);
+    }
+
+    const int min_size = ~source, max_size = ~target;
+    std::vector<int> lev_dist(min_size + 1);
+
+    for (int i = 0; i <= min_size; ++i)
+    {
+        lev_dist[i] = i;
+    }
+
+    for (int j = 1; j <= max_size; ++j)
+    {
+        int previous_diagonal = lev_dist[0], previous_diagonal_save;
+        ++lev_dist[0];
+
+        for (int i = 1; i <= min_size; ++i)
+        {
+            previous_diagonal_save = lev_dist[i];
+            if (source[i - 1] == target[j - 1])
+            {
+                lev_dist[i] = previous_diagonal;
+            }
+            else
+            {
+                lev_dist[i] = std::min(std::min(lev_dist[i - 1], lev_dist[i]), previous_diagonal) + 1;
+            }
+            previous_diagonal = previous_diagonal_save;
+        }
+    }
+
+    return lev_dist[min_size];
+}
+
+bool begin_filtered(String s, SuperString &filter)
+{
+   int i;
+   bool found;
+
+   found = false;
+   for (i = 0; i < ~filter; i++)
+   {
+      if (!found && ~s > ~filter[i])
+      {
+         String part_to_check = s(0, ~filter[i]);
+
+         found = part_to_check == filter[i];
+      }
+   }
+
+   return found;
+}
+
+std::ostream *Log = &std::cerr;   // Log stream, pointed at standard error. NOTE(review): not referenced in the visible code; presumably required by the gnucomo library - TODO confirm
+
+int main(int argc, char *argv[])
+{
+   String hostname("skiathos.andromeda.nl");
+   gnucomo_config    cfg;
+   String           config_name("gnucomo");
+
+   /*  Get the configuration file */
+
+   if (!cfg.read(config_name))
+   {
+      std::cerr << "Can not read Gnucomo configuration file for " << config_name << ".\n";
+      exit(1);
+   }
+   gnucomo_database db(&cfg);
+   Object host(db, hostname);
+
+   std::list<ObjectLog> spam_logs;
+   SuperString          spam_headers;
+   SuperString          pre_filter;
+
+   // Logs starting with these strings are irrelevant
+
+   pre_filter += "MIME-Version:";
+   pre_filter += "Content-Type: text";
+   pre_filter += "To:";
+   pre_filter += "X-Original-To:";
+   pre_filter += "Delivered-To:";
+   pre_filter += "Date:";
+   pre_filter += "X-Greylist:";
+   pre_filter += "Importance:";
+   pre_filter += "X-Priority:";
+   pre_filter += "X-AntiAbuse:";
+
+   UTC start_date;
+   date last_week;
+
+   last_week = today() - 7;
+   start_date = UTC(last_week, now());
+
+   std::cout << "Scanning spam from " << start_date << "\n";
+
+   spam_logs = host.select_logs(start_date, Now(), "gnucomo");
+
+   std::list<ObjectLog>::iterator spam_i;
+   for (spam_i = spam_logs.begin(); spam_i != spam_logs.end(); spam_i++)
+   {
+      String log_string = spam_i->raw() << 34;
+
+      if (!begin_filtered(log_string, pre_filter))
+      {
+         spam_headers += log_string;
+      }
+   }
+
+   int i, j;
+
+   for (i = 0; i < ~spam_headers; i++)
+   {
+      std::cout << "\n====================================================================\n";
+      std::cout << "[" << ~spam_headers[i] << "] " << spam_headers[i] << "\n";
+      std::cout << "====================================================================\n";
+      for (j = i + 1; j < ~spam_headers; j++)
+      {
+         int d = LevenshteinDistance(spam_headers[i], spam_headers[j]);
+         if (d * 3 < ~spam_headers[i])
+         {
+            std::cout << "    " << d << " - " << spam_headers[j] << "\n";
+         }
+      }
+   }
+
+   std::cout << "FINISH.\n";
+}