Spam scanning investigation
[gnucomo.git] / src / spam / scanspam.cpp
1 #include "object.h"
2
3 int LevenshteinDistance(const String &source, const String &target)
4 {
5     if (~source > ~target)
6     {
7         return LevenshteinDistance(target, source);
8     }
9
10     const int min_size = ~source, max_size = ~target;
11     std::vector<int> lev_dist(min_size + 1);
12
13     for (int i = 0; i <= min_size; ++i)
14     {
15         lev_dist[i] = i;
16     }
17
18     for (int j = 1; j <= max_size; ++j)
19     {
20         int previous_diagonal = lev_dist[0], previous_diagonal_save;
21         ++lev_dist[0];
22
23         for (int i = 1; i <= min_size; ++i)
24         {
25             previous_diagonal_save = lev_dist[i];
26             if (source[i - 1] == target[j - 1])
27             {
28                 lev_dist[i] = previous_diagonal;
29             }
30             else
31             {
32                 lev_dist[i] = std::min(std::min(lev_dist[i - 1], lev_dist[i]), previous_diagonal) + 1;
33             }
34             previous_diagonal = previous_diagonal_save;
35         }
36     }
37
38     return lev_dist[min_size];
39 }
40
41 bool begin_filtered(String s, SuperString &filter)
42 {
43    int i;
44    bool found;
45
46    found = false;
47    for (i = 0; i < ~filter; i++)
48    {
49       if (!found && ~s > ~filter[i])
50       {
51          String part_to_check = s(0, ~filter[i]);
52
53          found = part_to_check == filter[i];
54       }
55    }
56
57    return found;
58 }
59
// Global log stream pointer, aimed at standard error.
// NOTE(review): not referenced in this file; presumably used by other linked code - confirm.
std::ostream *Log{&std::cerr};
61
62 int main(int argc, char *argv[])
63 {
64    String hostname("skiathos.andromeda.nl");
65    gnucomo_config    cfg;
66    String           config_name("gnucomo");
67
68    /*  Get the configuration file */
69
70    if (!cfg.read(config_name))
71    {
72       std::cerr << "Can not read Gnucomo configuration file for " << config_name << ".\n";
73       exit(1);
74    }
75    gnucomo_database db(&cfg);
76    Object host(db, hostname);
77
78    std::list<ObjectLog> spam_logs;
79    SuperString          spam_headers;
80    SuperString          pre_filter;
81
82    // Logs starting with these strings are irrelevant
83
84    pre_filter += "MIME-Version:";
85    pre_filter += "Content-Type: text";
86    pre_filter += "To:";
87    pre_filter += "X-Original-To:";
88    pre_filter += "Delivered-To:";
89    pre_filter += "Date:";
90    pre_filter += "X-Greylist:";
91    pre_filter += "Importance:";
92    pre_filter += "X-Priority:";
93    pre_filter += "X-AntiAbuse:";
94
95    UTC start_date;
96    date last_week;
97
98    last_week = today() - 7;
99    start_date = UTC(last_week, now());
100
101    std::cout << "Scanning spam from " << start_date << "\n";
102
103    spam_logs = host.select_logs(start_date, Now(), "gnucomo");
104
105    std::list<ObjectLog>::iterator spam_i;
106    for (spam_i = spam_logs.begin(); spam_i != spam_logs.end(); spam_i++)
107    {
108       String log_string = spam_i->raw() << 34;
109
110       if (!begin_filtered(log_string, pre_filter))
111       {
112          spam_headers += log_string;
113       }
114    }
115
116    int i, j;
117
118    for (i = 0; i < ~spam_headers; i++)
119    {
120       std::cout << "\n====================================================================\n";
121       std::cout << "[" << ~spam_headers[i] << "] " << spam_headers[i] << "\n";
122       std::cout << "====================================================================\n";
123       for (j = i + 1; j < ~spam_headers; j++)
124       {
125          int d = LevenshteinDistance(spam_headers[i], spam_headers[j]);
126          if (d * 3 < ~spam_headers[i])
127          {
128             std::cout << "    " << d << " - " << spam_headers[j] << "\n";
129          }
130       }
131    }
132
133    std::cout << "FINISH.\n";
134 }