]>
Commit | Line | Data |
---|---|---|
37ef5775 SI |
1 | #!/usr/bin/perl -w -T |
2 | # <@LICENSE> | |
3 | # Licensed to the Apache Software Foundation (ASF) under one or more | |
4 | # contributor license agreements. See the NOTICE file distributed with | |
5 | # this work for additional information regarding copyright ownership. | |
6 | # The ASF licenses this file to you under the Apache License, Version 2.0 | |
7 | # (the "License"); you may not use this file except in compliance with | |
8 | # the License. You may obtain a copy of the License at: | |
9 | # | |
10 | # http://www.apache.org/licenses/LICENSE-2.0 | |
11 | # | |
12 | # Unless required by applicable law or agreed to in writing, software | |
13 | # distributed under the License is distributed on an "AS IS" BASIS, | |
14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
15 | # See the License for the specific language governing permissions and | |
16 | # limitations under the License. | |
17 | # </@LICENSE> | |
18 | ||
19 | use strict; | |
20 | use warnings; | |
21 | # use bytes; | |
22 | ||
23 | use Errno qw(EBADF); | |
24 | use Getopt::Long; | |
25 | use Pod::Usage; | |
26 | use File::Spec; | |
27 | use POSIX qw(locale_h setsid sigprocmask _exit); | |
28 | ||
29 | POSIX::setlocale(LC_TIME,'C'); | |
30 | ||
# Script-wide state shared between the main program and the subs below.
our (
  $spamtest,              # the Mail::SpamAssassin instance
  %opt,                   # parsed command-line options
  $isspam,                # 1 after --spam, 0 after --ham, undef otherwise
  $forget,                # set by --forget
  $messagecount,          # messages examined during the run
  $learnedcount,          # messages actually learned (or forgotten)
  $messagelimit,          # value of --stopafter, if given
  $progress,              # progress bar object when --progress is in use
  $total_messages,        # message total taken from ArchiveIterator
  $init_results,          # true once init_results() has been called
  $start_time,            # time() when the learning run started
  $synconly,              # set by --sync, --rebuild and --force-expire
  $learnprob,             # value of --learnprob, if given
  @targets,               # "class:format:location" entries for ArchiveIterator
  $bayes_override_path    # value of --db / --dbpath, if given
);

my $PREFIX          = '@@PREFIX@@';             # substituted at 'make' time
my $DEF_RULES_DIR   = '@@DEF_RULES_DIR@@';      # substituted at 'make' time
my $LOCAL_RULES_DIR = '@@LOCAL_RULES_DIR@@';    # substituted at 'make' time
37 | ||
38 | use lib '@@INSTALLSITELIB@@'; # substituted at 'make' time | |
39 | ||
# Locate Mail::SpamAssassin relative to the script when running from a source
# tree or a non-standard install, and put that directory first in @INC.
BEGIN {    # see comments in "spamassassin.raw" for doco
  # directory part of $0 (with volume, where there is one); falls back to
  # the current directory when $0 has no directory part at all
  my ($volume, $directories) = File::Spec->splitpath($0);
  my $bin = ($volume ? File::Spec->catpath($volume, $directories, '')
                     : $directories)
    || File::Spec->curdir;

  # only search when a lib/ copy sits next to the script, or when the
  # configured site library does not contain the module
  if (-e "$bin/lib/Mail/SpamAssassin.pm"
    || !-e '@@INSTALLSITELIB@@/Mail/SpamAssassin.pm')
  {
    my $searchrelative;
    $searchrelative = 1;    # disabled during "make install": REMOVEFORINST
    if ($searchrelative
      && $bin eq '../'
      && -e '../blib/lib/Mail/SpamAssassin.pm')
    {
      unshift(@INC, '../blib/lib');
    }
    else {
      # first candidate directory that really holds the module wins
      my @candidates = qw(
        lib
        ../lib/site_perl
        ../lib/spamassassin
        ../share/spamassassin/lib
      );
      foreach my $relative (@candidates) {
        my $dir = File::Spec->catdir($bin, split('/', $relative));
        next unless -f File::Spec->catfile($dir, "Mail", "SpamAssassin.pm");
        unshift(@INC, $dir);
        last;
      }
    }
  }
}
64 | ||
65 | use Mail::SpamAssassin; | |
66 | use Mail::SpamAssassin::ArchiveIterator; | |
67 | use Mail::SpamAssassin::Message; | |
68 | use Mail::SpamAssassin::PerMsgLearner; | |
69 | use Mail::SpamAssassin::Util::Progress; | |
70 | use Mail::SpamAssassin::Logger; | |
71 | ||
72 | ########################################################################### | |
73 | ||
# Ignore SIGPIPE for the whole run.
$SIG{PIPE} = 'IGNORE';

# used to be CmdLearn::cmd_run() ...

# Option defaults; every option not listed here starts out undefined.
%opt = (
  'force-expire' => 0,
  'use-ignores'  => 0,
  'nosync'       => 0,
  'quiet'        => 0,
  'cf'           => [],
);

Getopt::Long::Configure(
  'bundling', 'no_getopt_compat', 'permute', 'no_auto_abbrev',
  'no_ignore_case'
);

GetOptions(
  # what to do with the messages
  'forget'      => \$forget,
  'ham|nonspam' => sub { $isspam = 0 },
  'spam'        => sub { $isspam = 1 },
  'sync'        => \$synconly,
  'rebuild'     => sub {
    $synconly = 1;
    warn "The --rebuild option has been deprecated.  Please use --sync instead.\n";
  },

  # configuration sources
  'q|quiet'                                 => \$opt{'quiet'},
  'username|u=s'                            => \$opt{'username'},
  'configpath|config-file|config-dir|c|C=s' => \$opt{'configpath'},
  'prefspath|prefs-file|p=s'                => \$opt{'prefspath'},
  'siteconfigpath=s'                        => \$opt{'siteconfigpath'},
  'cf=s'                                    => \@{ $opt{'cf'} },

  # run-time behaviour
  'folders|f=s'          => \$opt{'folders'},
  'force-expire|expire'  => \$opt{'force-expire'},
  'local|L'              => \$opt{'local'},
  'no-sync|nosync'       => \$opt{'nosync'},
  'showdots'             => \$opt{'showdots'},
  'progress'             => \$opt{'progress'},
  'use-ignores'          => \$opt{'use-ignores'},
  'no-rebuild|norebuild' => sub {
    $opt{'nosync'} = 1;
    warn "The --no-rebuild option has been deprecated.  Please use --no-sync instead.\n";
  },

  # sampling and limits
  'learnprob=f' => \$opt{'learnprob'},
  'randseed=i'  => \$opt{'randseed'},
  'stopafter=i' => \$opt{'stopafter'},
  'max-size=i'  => \$opt{'max-size'},

  # informational
  'debug|debug-level|D:s' => \$opt{'debug'},
  'help|h|?'              => \$opt{'help'},
  'version|V'             => \$opt{'version'},

  # database maintenance
  'dump:s'    => \$opt{'dump'},
  'import'    => \$opt{'import'},
  'backup'    => \$opt{'backup'},
  'clear'     => \$opt{'clear'},
  'restore=s' => \$opt{'restore'},

  # input format selection (partly the old 2.5x spellings)
  'dir'    => sub { $opt{'old_format'} = 'dir' },
  'file'   => sub { $opt{'old_format'} = 'file' },
  'mbox'   => sub { $opt{'format'}     = 'mbox' },
  'mbx'    => sub { $opt{'format'}     = 'mbx' },
  'single' => sub { $opt{'old_format'} = 'single' },

  'db|dbpath=s' => \$bayes_override_path,
  're|regexp=s' => \$opt{'regexp'},

  # every non-option argument becomes a learning target
  '<>' => \&target,
) or usage(0, "Unknown option!");
142 | ||
usage(0, "For more information read the manual page")
  if defined $opt{'help'};

if (defined $opt{'version'}) {
  print "SpamAssassin version " . Mail::SpamAssassin::Version() . "\n";
  exit 0;
}

# set debug areas, if any specified (only useful for command-line tools);
# a bare -D with no area list means every area
$opt{'debug'} ||= 'all' if defined $opt{'debug'};

# a forced expiry run is a sync-only run
$synconly = 1 if $opt{'force-expire'};

if ($opt{'showdots'} && $opt{'progress'}) {
  print "--showdots and --progress may not be used together, please select just one\n";
  exit 0;
}

# at least one mode of operation has to be selected
my @mode_selectors = (
  $isspam, $synconly, $forget,
  @opt{qw(dump import clear backup restore folders)}
);
unless (grep { defined $_ } @mode_selectors) {
  usage( 0,
"Please select either --spam, --ham, --folders, --forget, --sync, --import,\n--dump, --clear, --backup or --restore"
  );
}

# We need to make sure the journal syncs pre-forget...
if (defined $forget && $opt{'nosync'}) {
  $opt{'nosync'} = 0;
  warn
"sa-learn warning: --forget requires read/write access to the database, and is incompatible with --no-sync\n";
}

# Format specified in the 2.5x form of --dir, --file, --mbox, --mbx or
# --single.  Only --single needs converting to the new behavior: it means
# "read one message from standard input".
if (defined $opt{'old_format'} && $opt{'old_format'} eq 'single') {
  push(@ARGV, '-');
}
195 | ||
# Configuration text handed to Mail::SpamAssassin as post_config_text.
my $post_config = '';

# kluge to support old check_bayes_db operation
# bug 3799: init() will go r/o with the configured DB, and then dbpath needs
# to override.  Just access the dbpath version via post_config_text.
if (defined $bayes_override_path) {
  # Add a default prefix if the path is a directory
  $bayes_override_path = File::Spec->catfile($bayes_override_path, 'bayes')
    if -d $bayes_override_path;

  $post_config .= "bayes_path $bayes_override_path\n";
}

# These options require bayes_scanner, which requires "use_bayes 1", but
# that's not necessary for these commands.
my @maintenance_opts = @opt{qw(dump import clear backup restore)};
if (grep { defined $_ } @maintenance_opts) {
  $post_config .= "use_bayes 1\n";
}

# --cf lines go last, one configuration line each
$post_config .= join("\n", @{ $opt{'cf'} }) . "\n";

# create the tester factory
$spamtest = Mail::SpamAssassin->new(
  {
    rules_filename      => $opt{'configpath'},
    site_rules_filename => $opt{'siteconfigpath'},
    userprefs_filename  => $opt{'prefspath'},
    username            => $opt{'username'},
    debug               => $opt{'debug'},
    local_tests_only    => $opt{'local'},
    dont_copy_prefs     => 1,
    PREFIX              => $PREFIX,
    DEF_RULES_DIR       => $DEF_RULES_DIR,
    LOCAL_RULES_DIR     => $LOCAL_RULES_DIR,
    post_config_text    => $post_config,
  }
);

$spamtest->init(1);
dbg("sa-learn: spamtest initialized");

# Bug 6228 hack: bridge the transition gap of moving Bayes.pm into a plugin;
# To be resolved more cleanly!!!
if (my $scanner = $spamtest->{bayes_scanner}) {
  for my $plugin (@{ $spamtest->{plugins}->{plugins} }) {
    next unless $plugin->isa('Mail::SpamAssassin::Plugin::Bayes');

    # copy plugin's "store" object ref one level up!
    $scanner->{store} = $plugin->{store};
  }
}
249 | ||
if (Mail::SpamAssassin::Util::am_running_on_windows()) {
  binmode(STDIN)  or die "cannot set binmode on STDIN: $!";    # bug 4363
  binmode(STDOUT) or die "cannot set binmode on STDOUT: $!";
}

# --dump: print the database contents and exit
if (defined $opt{'dump'}) {
  # accepted --dump arguments => [ show magic tokens?, show data tokens? ];
  # a bare --dump (empty string) behaves like "all"
  my %dump_selection = (
    ''      => [ 1, 1 ],
    'all'   => [ 1, 1 ],
    'magic' => [ 1, 0 ],
    'data'  => [ 0, 1 ],
  );

  my $selection = $dump_selection{ $opt{'dump'} };
  if (!$selection) {    # unknown option
    warn "Unknown dump option '" . $opt{'dump'} . "'\n";
    $spamtest->finish_learner();
    exit 1;
  }
  my ($magic, $toks) = @{$selection};

  if (!$spamtest->dump_bayes_db($magic, $toks, $opt{'regexp'})) {
    $spamtest->finish_learner();
    die "ERROR: Bayes dump returned an error, please re-run with -D for more information\n";
  }

  $spamtest->finish_learner();
  # make sure we notice any write errors while flushing output buffer
  close STDOUT or die "error closing STDOUT: $!";
  close STDIN  or die "error closing STDIN: $!";
  exit 0;
}

# --import: upgrade an older database; exit status is 0 when it succeeded
if (defined $opt{'import'}) {
  my $ret = $spamtest->{bayes_scanner}->{store}->perform_upgrade();
  $spamtest->finish_learner();
  # make sure we notice any write errors while flushing output buffer
  close STDOUT or die "error closing STDOUT: $!";
  close STDIN  or die "error closing STDIN: $!";
  exit(!$ret);
}

# --clear: wipe the database and exit
if (defined $opt{'clear'}) {
  if (!$spamtest->{bayes_scanner}->{store}->clear_database()) {
    $spamtest->finish_learner();
    die "ERROR: Bayes clear returned an error, please re-run with -D for more information\n";
  }

  $spamtest->finish_learner();
  # make sure we notice any write errors while flushing output buffer
  close STDOUT or die "error closing STDOUT: $!";
  close STDIN  or die "error closing STDIN: $!";
  exit 0;
}

# --backup: write a backup of the database and exit
if (defined $opt{'backup'}) {
  if (!$spamtest->{bayes_scanner}->{store}->backup_database()) {
    $spamtest->finish_learner();
    die "ERROR: Bayes backup returned an error, please re-run with -D for more information\n";
  }

  $spamtest->finish_learner();
  # make sure we notice any write errors while flushing output buffer
  close STDOUT or die "error closing STDOUT: $!";
  close STDIN  or die "error closing STDIN: $!";
  exit 0;
}

# --restore: load the database from the named backup file and exit
if (defined $opt{'restore'}) {
  my $filename = $opt{'restore'};

  if (!$filename) {
    $spamtest->finish_learner();
    die "ERROR: You must specify a filename to restore.\n";
  }

  my $store = $spamtest->{bayes_scanner}->{store};
  if (!$store->restore_database($filename, $opt{'showdots'})) {
    $spamtest->finish_learner();
    die "ERROR: Bayes restore returned an error, please re-run with -D for more information\n";
  }

  $spamtest->finish_learner();
  # make sure we notice any write errors while flushing output buffer
  close STDOUT or die "error closing STDOUT: $!";
  close STDIN  or die "error closing STDIN: $!";
  exit 0;
}
340 | ||
# Learning proper needs Bayes to be enabled in the configuration.
unless ($spamtest->{conf}->{use_bayes}) {
  warn "ERROR: configuration specifies 'use_bayes 0', sa-learn disabled\n";
  exit 1;
}

my %learner_settings = (
  force_expire      => $opt{'force-expire'},
  learn_to_journal  => $opt{'nosync'},
  wait_for_lock     => 1,
  caller_will_untie => 1,
);
$spamtest->init_learner(\%learner_settings);

$spamtest->{bayes_scanner}{use_ignores} = $opt{'use-ignores'};

# --sync / --force-expire: rebuild the caches and exit without learning
if ($synconly) {
  my %rebuild_settings = (
    verbose  => !$opt{'quiet'},
    showdots => $opt{'showdots'},
  );
  $spamtest->rebuild_learner_caches(\%rebuild_settings);
  $spamtest->finish_learner();
  # make sure we notice any write errors while flushing output buffer
  close STDOUT or die "error closing STDOUT: $!";
  close STDIN  or die "error closing STDIN: $!";
  exit 0;
}

$messagelimit = $opt{'stopafter'};
$learnprob    = $opt{'learnprob'};

# a fixed seed makes the --learnprob sampling repeatable
srand($opt{'randseed'}) if defined $opt{'randseed'};

# sync the journal first if we're going to go r/w so we make sure to
# learn everything before doing anything else.
#
$spamtest->rebuild_learner_caches() unless $opt{nosync};
384 | ||
# what is the result of the run?  will end up being the exit code.
my $exit_status = 0;

# run this lot in an eval block, so we can catch die's (including the one
# raised by the signal handlers) and clear up the dbs.
eval {
  foreach my $signame (qw(HUP INT TERM)) {
    $SIG{$signame} = \&killed;
  }

  # --folders: read further targets from a list file, one per line
  if ( $opt{folders} ) {
    # Three-argument open on a lexical handle: the user-supplied name is
    # never interpreted as a mode/pipe specification and surrounding
    # whitespace in it is preserved.  Two-argument open treated "-" as
    # standard input, so that one case is kept explicitly.
    my $folders_fh;
    if ( $opt{folders} eq '-' ) {
      open( $folders_fh, '<&', \*STDIN )
        or die "cannot open $opt{folders}: $!";
    }
    else {
      open( $folders_fh, '<', $opt{folders} )
        or die "cannot open $opt{folders}: $!";
    }

    # $! is cleared before every read so that, once readline returns undef,
    # a non-zero $! can be told apart from plain end-of-file
    my $line;
    for ($! = 0; defined($line = <$folders_fh>); $! = 0) {
      chomp $line;
      next if $line =~ /^\s*$/;
      if ($line =~ /^(ham|spam):(\w*):(.*)/) {
        # already carries a class; the format defaults to "detect"
        my $class  = $1;
        my $format = $2 || "detect";
        my $target = $3;
        push ( @targets, "$class:$format:$target" );
      }
      else {
        # bare location: class and format come from the command line
        target($line);
      }
    }
    if ($! != 0) {
      # EBADF is only logged; any other read error is fatal
      $! == EBADF ? dbg("error reading from $opt{folders}: $!")
                  : die "error reading from $opt{folders}: $!";
    }
    close($folders_fh) or die "error closing $opt{folders}: $!";
  }

  ###########################################################################
  # Deal with the target listing, and STDIN -> tempfile

  my $tempfile; # will be defined if stdin -> tempfile
  push(@targets, @ARGV);
  @targets = ('-') unless @targets || $opt{folders};

  # index loop on purpose: entries are removed and re-appended while walking
  for (my $elem = 0; $elem <= $#targets; $elem++) {
    # ArchiveIterator doesn't really like STDIN, so if "-" is specified
    # as a target, make it a temp file instead.
    if ( $targets[$elem] =~ /(?:^|:)-$/ ) {
      if (defined $tempfile) {
        # uh-oh, stdin specified multiple times?
        warn "skipping extra stdin target (".$targets[$elem].")\n";
        splice @targets, $elem, 1;
        $elem--; # go back to this element again
        next;
      }
      else {
        my $handle;
        ( $tempfile, $handle ) = Mail::SpamAssassin::Util::secure_tmpfile();
        binmode $handle or die "cannot set binmode on file $tempfile: $!";

        # avoid slurping the whole file into memory, copy chunk by chunk
        my($inbuf,$nread);
        while ( $nread=sysread(STDIN,$inbuf,16384) )
          { print {$handle} $inbuf  or die "error writing to $tempfile: $!" }
        defined $nread  or die "error reading from STDIN: $!";
        close $handle  or die "error closing $tempfile: $!";

        # re-aim the targets at the tempfile instead of STDIN
        $targets[$elem] =~ s/-$/$tempfile/;
      }
    }

    # make sure the target list is in the normal AI format
    if ($targets[$elem] !~ /^[^:]*:[a-z]+:/) {
      my $item = splice @targets, $elem, 1;
      target($item);    # add back to the list
      $elem--;          # go back to this element again
      next;
    }
  }

  ###########################################################################

  my $iter = Mail::SpamAssassin::ArchiveIterator->new(
    {
      # skip messages larger than max-size bytes,
      # 0 for no limit, undef defaults to 256 KB
      'opt_max_size'   => $opt{'max-size'},
      'opt_want_date'  => 0,
      'opt_from_regex' => $spamtest->{conf}->{mbox_format_from_regex},
    }
  );

  $iter->set_functions(\&wanted, \&result);
  $messagecount = 0;
  $learnedcount = 0;

  $init_results = 0;
  $start_time = time;

  # if exit_status isn't already set to non-zero, set it to the reverse of the
  # run result (0 is bad, 1+ is good -- the opposite of exit status codes)
  my $run_ok = eval { $exit_status ||= ! $iter->run(@targets); 1 };
  # keep the error now: $@ is a global and the calls below may reset it
  # before it is examined
  my $run_error = $@;

  print STDERR "\n" if ($opt{showdots});
  $progress->final() if ($opt{progress} && $progress);

  my $phrase = defined $forget ? "Forgot" : "Learned";
  print "$phrase tokens from $learnedcount message(s) ($messagecount message(s) examined)\n"
    if !$opt{'quiet'};

  # If we needed to make a tempfile, go delete it.
  if (defined $tempfile) {
    unlink $tempfile  or die "cannot unlink temporary file $tempfile: $!";
    undef $tempfile;
  }

  # hitting the --stopafter limit is a normal way to end the run; anything
  # else that aborted it is passed on to the outer handler
  if (!$run_ok && $run_error !~ /HITLIMIT/) { die $run_error }
  1;
} or do {
  my $eval_stat = $@ ne '' ? $@ : "errno=$!";  chomp $eval_stat;
  $spamtest->finish_learner();
  die $eval_stat;
};

$spamtest->finish_learner();
# make sure we notice any write errors while flushing output buffer
close STDOUT or die "error closing STDOUT: $!";
close STDIN  or die "error closing STDIN: $!";
exit $exit_status;
509 | ||
510 | ########################################################################### | |
511 | ||
# Signal handler installed for HUP, INT and TERM: tells the learner to finish
# and then aborts the run with die().
sub killed {
  $spamtest->finish_learner;
  die("interrupted");
}
516 | ||
# Append one location to @targets in "class:format:location" form.
# The class is "spam" when $isspam is true and "ham" otherwise; the format is
# the one chosen with --mbox/--mbx, or "detect" when none was given.
sub target {
  my ($target) = @_;

  my $class = "ham";
  $class = "spam" if $isspam;

  my $format = "detect";
  $format = $opt{'format'} if defined $opt{'format'};

  push(@targets, "${class}:${format}:${target}");
}
525 | ||
526 | ########################################################################### | |
527 | ||
# One-time setup done when the first result arrives: records that it has run
# and, with --progress, creates the progress bar sized to the message total
# published by ArchiveIterator.
sub init_results {
  $init_results = 1;

  return if !$opt{'progress'};

  $total_messages = $Mail::SpamAssassin::ArchiveIterator::MESSAGES;

  my %settings = (total => $total_messages);
  $progress = Mail::SpamAssassin::Util::Progress->new(\%settings);
}
537 | ||
538 | ########################################################################### | |
539 | ||
# Result callback registered with ArchiveIterator via set_functions().
# The arguments are accepted but not used; the sub only drives the progress
# display.
sub result {
  my ($class, $result, $time) = @_;

  # don't open results files until we get here to avoid overwriting files
  init_results() unless $init_results;

  if ($opt{progress} && $progress) {
    $progress->update($messagecount);
  }
}
548 | ||
549 | ########################################################################### | |
550 | ||
# Per-message callback registered with ArchiveIterator via set_functions().
#
#   $class   - message class; "s" is treated as spam, anything else as ham
#   $id      - message identifier (unused here)
#   $time    - message time (unused here)
#   $dataref - message content, handed to $spamtest->parse()
#
# Returns 1 after learning or skipping the message.  Dies with 'HITLIMIT'
# once --stopafter messages have been learned, and with an error message
# when the learner reports that learning is unavailable.
sub wanted {
  my ( $class, $id, $time, $dataref ) = @_;

  my $spam = $class eq "s" ? 1 : 0;

  # --learnprob: learn only a random sample of the messages
  if ( defined($learnprob) ) {
    # a probability of zero can never select a message; checking for it
    # first also keeps the division below from dying on a zero divisor
    if ( $learnprob == 0 || int( rand( 1 / $learnprob ) ) != 0 ) {
      print STDERR '_' if ( $opt{showdots} );
      return 1;
    }
  }

  # --stopafter: stop as soon as the requested number of messages has been
  # learned (">=", so that the limit itself is not exceeded by one)
  if ( defined($messagelimit) && $learnedcount >= $messagelimit ) {
    $progress->final() if ($opt{progress} && $progress);
    die 'HITLIMIT';
  }

  $messagecount++;
  my $ma = $spamtest->parse($dataref);

  # message already carries SpamAssassin markup: learn from a copy that has
  # the markup removed
  if ( $ma->get_header("X-Spam-Checker-Version") ) {
    my $new_ma = $spamtest->parse($spamtest->remove_spamassassin_markup($ma), 1);
    $ma->finish();
    $ma = $new_ma;
  }

  my $status = $spamtest->learn( $ma, undef, $spam, $forget );
  my $learned = $status->did_learn();

  if ( !defined $learned ) {    # undef=learning unavailable
    die "ERROR: the Bayes learn function returned an error, please re-run with -D for more information\n";
  }
  elsif ( $learned == 1 ) {   # 1=message was learned.  0=message wasn't learned
    $learnedcount++;
  }

  # Do cleanup ...
  $status->finish();
  undef $status;

  $ma->finish();
  undef $ma;

  print STDERR '.' if ( $opt{showdots} );
  return 1;
}
597 | ||
598 | ########################################################################### | |
599 | ||
# Print the SpamAssassin version followed by the POD usage text.
#
#   $verbose - passed to pod2usage as -verbose
#   $message - passed to pod2usage as -message
#
# pod2usage is called with -exitval 64.
sub usage {
  my ($verbose, $message) = @_;

  print "SpamAssassin version " . Mail::SpamAssassin::Version() . "\n";

  pod2usage(
    -verbose => $verbose,
    -message => $message,
    -exitval => 64,
  );
}
606 | ||
607 | # --------------------------------------------------------------------------- | |
608 | ||
609 | =head1 NAME | |
610 | ||
611 | sa-learn - train SpamAssassin's Bayesian classifier | |
612 | ||
613 | =head1 SYNOPSIS | |
614 | ||
615 | B<sa-learn> [options] [file]... | |
616 | ||
617 | B<sa-learn> [options] --dump [ all | data | magic ] | |
618 | ||
 Options:

  --ham                 Learn messages as ham (non-spam)
  --spam                Learn messages as spam
  --forget              Forget a message
  --use-ignores         Use bayes_ignore_from and bayes_ignore_to
  --sync                Synchronize the database and the journal if needed
  --force-expire        Force a database sync and expiry run
  --dbpath <path>       Allows commandline override (in bayes_path form)
                        for where to read the Bayes DB from
  --dump [all|data|magic]  Display the contents of the Bayes database
                        Takes optional argument for what to display
  --regexp <re>         For dump only, specifies which tokens to
                        dump based on a regular expression.
  -f file, --folders=file  Read list of files/directories from file
  --dir                 Ignored; historical compatibility
  --file                Ignored; historical compatibility
  --mbox                Input sources are in mbox format
  --mbx                 Input sources are in mbx format
  --max-size <b>        Skip messages larger than b bytes;
                        defaults to 256 KB, 0 implies no limit
  --showdots            Show progress using dots
  --progress            Show progress using progress bar
  --no-sync             Skip synchronizing the database and journal
                        after learning
  -L, --local           Operate locally, no network accesses
  --import              Migrate data from older version/non DB_File
                        based databases
  --clear               Wipe out existing database
  --backup              Backup, to STDOUT, existing database
  --restore <filename>  Restore a database from filename
  -u username, --username=username
                        Override username taken from the runtime
                        environment, used with SQL
  -C path, --configpath=path, --config-file=path
                        Path to standard configuration dir
  -p prefs, --prefspath=file, --prefs-file=file
                        Set user preferences file
  --siteconfigpath=path Path for site configs
                        (default: @@PREFIX@@/etc/mail/spamassassin)
  --cf='config line'    Additional line of configuration
  -D, --debug [area=n,...]  Print debugging messages
  -V, --version         Print version
  -h, --help            Print usage message
663 | ||
664 | =head1 DESCRIPTION | |
665 | ||
666 | Given a typical selection of your incoming mail classified as spam or ham | |
667 | (non-spam), this tool will feed each mail to SpamAssassin, allowing it | |
668 | to 'learn' what signs are likely to mean spam, and which are likely to | |
669 | mean ham. | |
670 | ||
Simply run this command once for each of your mail folders, and it will
'learn' from the mail therein.
673 | ||
674 | Note that csh-style I<globbing> in the mail folder names is supported; | |
675 | in other words, listing a folder name as C<*> will scan every folder | |
676 | that matches. See C<Mail::SpamAssassin::ArchiveIterator> for more details. | |
677 | ||
If you are using mailboxes in a format other than maildir, you should use
the B<--mbox> or B<--mbx> parameters.
680 | ||
SpamAssassin remembers which mail messages it has learnt already, and will not
re-learn those messages again, unless you use the B<--forget> option. Messages
that have already been filtered through SpamAssassin will have the SpamAssassin
markup removed on the fly before learning, whether they are learnt as spam or
as ham.
684 | ||
685 | If you make a mistake and scan a mail as ham when it is spam, or vice | |
686 | versa, simply rerun this command with the correct classification, and the | |
687 | mistake will be corrected. SpamAssassin will automatically 'forget' the | |
688 | previous indications. | |
689 | ||
690 | Users of C<spamd> who wish to perform training remotely, over a network, | |
691 | should investigate the C<spamc -L> switch. | |
692 | ||
693 | =head1 OPTIONS | |
694 | ||
695 | =over 4 | |
696 | ||
697 | =item B<--ham> | |
698 | ||
699 | Learn the input message(s) as ham. If you have previously learnt any of the | |
700 | messages as spam, SpamAssassin will forget them first, then re-learn them as | |
701 | ham. Alternatively, if you have previously learnt them as ham, it'll skip them | |
702 | this time around. If the messages have already been filtered through | |
703 | SpamAssassin, the learner will ignore any modifications SpamAssassin may have | |
704 | made. | |
705 | ||
706 | =item B<--spam> | |
707 | ||
708 | Learn the input message(s) as spam. If you have previously learnt any of the | |
709 | messages as ham, SpamAssassin will forget them first, then re-learn them as | |
710 | spam. Alternatively, if you have previously learnt them as spam, it'll skip | |
711 | them this time around. If the messages have already been filtered through | |
712 | SpamAssassin, the learner will ignore any modifications SpamAssassin may have | |
713 | made. | |
714 | ||
715 | =item B<--folders>=I<filename>, B<-f> I<filename> | |
716 | ||
717 | sa-learn will read in the list of folders from the specified file, one folder | |
718 | per line in the file. If the folder is prefixed with C<ham:type:> or C<spam:type:>, | |
719 | sa-learn will learn that folder appropriately, otherwise the folders will be | |
720 | assumed to be of the type specified by B<--ham> or B<--spam>. | |
721 | ||
722 | C<type> above is optional, but is the same as the standard for | |
723 | ArchiveIterator: mbox, mbx, dir, file, or detect (the default if not | |
724 | specified). | |
725 | ||
726 | =item B<--mbox> | |
727 | ||
728 | sa-learn will read in the file(s) containing the emails to be learned, | |
729 | and will process them in mbox format (one or more emails per file). | |
730 | ||
731 | =item B<--mbx> | |
732 | ||
733 | sa-learn will read in the file(s) containing the emails to be learned, | |
734 | and will process them in mbx format (one or more emails per file). | |
735 | ||
736 | =item B<--use-ignores> | |
737 | ||
738 | Don't learn the message if a from address matches configuration file | |
739 | item C<bayes_ignore_from> or a to address matches C<bayes_ignore_to>. | |
740 | The option might be used when learning from a large file of messages | |
741 | from which the hammy spam messages or spammy ham messages have not | |
742 | been removed. | |
743 | ||
744 | =item B<--sync> | |
745 | ||
746 | Synchronize the journal and databases. Upon successfully syncing the | |
747 | database with the entries in the journal, the journal file is removed. | |
748 | ||
749 | =item B<--force-expire> | |
750 | ||
751 | Forces an expiry attempt, regardless of whether it may be necessary | |
752 | or not. Note: This doesn't mean any tokens will actually expire. | |
753 | Please see the EXPIRATION section below. | |
754 | ||
755 | Note: C<--force-expire> also causes the journal data to be synchronized | |
756 | into the Bayes databases. | |
757 | ||
758 | =item B<--forget> | |
759 | ||
760 | Forget a given message previously learnt. | |
761 | ||
762 | =item B<--dbpath> | |
763 | ||
764 | Allows a commandline override of the I<bayes_path> configuration option. | |
765 | ||
766 | =item B<--dump> I<option> | |
767 | ||
768 | Display the contents of the Bayes database. Without an option or with | |
769 | the I<all> option, all magic tokens and data tokens will be displayed. | |
770 | I<magic> will only display magic tokens, and I<data> will only display | |
771 | the data tokens. | |
772 | ||
773 | Can also use the B<--regexp> I<RE> option to specify which tokens to | |
774 | display based on a regular expression. | |
775 | ||
776 | =item B<--clear> | |
777 | ||
778 | Clear an existing Bayes database by removing all traces of the database. | |
779 | ||
780 | WARNING: This is destructive and should be used with care. | |
781 | ||
782 | =item B<--backup> | |
783 | ||
784 | Performs a dump of the Bayes database in machine/human readable format. | |
785 | ||
786 | The dump will include token and seen data. It is suitable for input back | |
787 | into the --restore command. | |
788 | ||
789 | =item B<--restore>=I<filename> | |
790 | ||
791 | Performs a restore of the Bayes database defined by I<filename>. | |
792 | ||
793 | WARNING: This is a destructive operation, previous Bayes data will be wiped out. | |
794 | ||
795 | =item B<-h>, B<--help> | |
796 | ||
797 | Print help message and exit. | |
798 | ||
799 | =item B<-u> I<username>, B<--username>=I<username> | |
800 | ||
801 | If specified this username will override the username taken from the runtime | |
802 | environment. You can use this option to specify users in a virtual user | |
803 | configuration when using SQL as the Bayes backend. | |
804 | ||
NOTE: This option will not change to the given I<username>; it will only attempt
to act on behalf of that user. Because of this, you will need to have proper
permissions to be able to change files owned by I<username>. In the case of SQL
this generally is not a problem.
809 | ||
810 | =item B<-C> I<path>, B<--configpath>=I<path>, B<--config-file>=I<path> | |
811 | ||
812 | Use the specified path for locating the distributed configuration files. | |
813 | Ignore the default directories (usually C</usr/share/spamassassin> or similar). | |
814 | ||
815 | =item B<--siteconfigpath>=I<path> | |
816 | ||
817 | Use the specified path for locating site-specific configuration files. Ignore | |
818 | the default directories (usually C</etc/mail/spamassassin> or similar). | |
819 | ||
820 | =item B<--cf='config line'> | |
821 | ||
822 | Add additional lines of configuration directly from the command-line, parsed | |
823 | after the configuration files are read. Multiple B<--cf> arguments can be | |
824 | used, and each will be considered a separate line of configuration. | |
825 | ||
826 | =item B<-p> I<prefs>, B<--prefspath>=I<prefs>, B<--prefs-file>=I<prefs> | |
827 | ||
828 | Read user score preferences from I<prefs> (usually C<$HOME/.spamassassin/user_prefs>). | |
829 | ||
830 | =item B<--progress> | |
831 | ||
832 | Prints a progress bar (to STDERR) showing the current progress. In the case | |
833 | where no valid terminal is found this option will behave very much like the | |
834 | --showdots option. | |
835 | ||
836 | =item B<-D> [I<area,...>], B<--debug> [I<area,...>] | |
837 | ||
838 | Produce debugging output. If no areas are listed, all debugging information is | |
839 | printed. Diagnostic output can also be enabled for each area individually; | |
840 | I<area> is the area of the code to instrument. For example, to produce | |
841 | diagnostic output on bayes, learn, and dns, use: | |
842 | ||
843 | spamassassin -D bayes,learn,dns | |
844 | ||
845 | For more information about which areas (also known as channels) are available, | |
846 | please see the documentation at: | |
847 | ||
848 | C<http://wiki.apache.org/spamassassin/DebugChannels> | |
849 | ||
850 | Higher priority informational messages that are suitable for logging in normal | |
851 | circumstances are available with an area of "info". | |
852 | ||
853 | =item B<--no-sync> | |
854 | ||
855 | Skip the slow synchronization step which normally takes place after | |
856 | changing database entries. If you plan to learn from many folders in | |
857 | a batch, or to learn many individual messages one-by-one, it is faster | |
858 | to use this switch and run C<sa-learn --sync> once all the folders have | |
859 | been scanned. | |
860 | ||
861 | Clarification: The state of I<--no-sync> overrides the | |
862 | I<bayes_learn_to_journal> configuration option. If not specified, | |
863 | sa-learn will learn to the database directly. If specified, sa-learn | |
864 | will learn to the journal file. | |
865 | ||
866 | Note: I<--sync> and I<--no-sync> can be specified on the same commandline, | |
867 | which is slightly confusing. In this case, the I<--no-sync> option is | |
868 | ignored since there is no learn operation. | |
869 | ||
870 | =item B<-L>, B<--local> | |
871 | ||
872 | Do not perform any network accesses while learning details about the mail | |
873 | messages. This will speed up the learning process, but may result in a | |
874 | slightly lower accuracy. | |
875 | ||
876 | Note that this is currently ignored, as current versions of SpamAssassin will | |
877 | not perform network access while learning; but future versions may. | |
878 | ||
879 | =item B<--import> | |
880 | ||
881 | If you previously used SpamAssassin's Bayesian learner without the C<DB_File> | |
882 | module installed, it will have created files in other formats, such as | |
883 | C<GDBM_File>, C<NDBM_File>, or C<SDBM_File>. This switch allows you to migrate | |
884 | that old data into the C<DB_File> format. It will overwrite any data currently | |
885 | in the C<DB_File>. | |
886 | ||
887 | Can also be used with the B<--dbpath> I<path> option to specify the location of | |
888 | the Bayes files to use. | |
889 | ||
890 | =back | |
891 | ||
892 | =head1 MIGRATION | |
893 | ||
894 | There are now multiple backend storage modules available for storing | |
895 | users' Bayesian data. As such you might want to migrate from one | |
896 | backend to another. Here is a simple procedure for migrating from one | |
897 | backend to another. | |
898 | ||
899 | Note that if you have individual user databases you will have to | |
900 | perform a similar procedure for each one of them. | |
901 | ||
902 | =over 4 | |
903 | ||
904 | =item sa-learn --sync | |
905 | ||
906 | This will sync any outstanding journal entries. | |
907 | ||
908 | =item sa-learn --backup > backup.txt | |
909 | ||
910 | This will save all your Bayes data to a plain text file. | |
911 | ||
912 | =item sa-learn --clear | |
913 | ||
914 | This is optional, but good to do to clear out the old database. | |
915 | ||
916 | =item Repeat! | |
917 | ||
918 | At this point, if you have multiple databases, you should perform the | |
919 | procedure above for each of them. (i.e. each user's database needs to | |
920 | be backed up before continuing.) | |
921 | ||
922 | =item Switch backends | |
923 | ||
924 | Once you have backed up all databases you can update your | |
925 | configuration for the new database backend. This will involve at least | |
926 | the bayes_store_module config option and may involve some additional | |
927 | config options depending on what is required by the module. (For | |
928 | example, you may need to configure an SQL database.) | |
929 | ||
930 | =item sa-learn --restore backup.txt | |
931 | ||
932 | Again, you need to do this for every database. | |
933 | ||
934 | =back | |
935 | ||
936 | If you are migrating to SQL you can make use of the -u <username> | |
937 | option in sa-learn to populate each user's database. Otherwise, you | |
938 | must run sa-learn as the user whose database you are restoring. | |
939 | ||
940 | ||
941 | =head1 INTRODUCTION TO BAYESIAN FILTERING | |
942 | ||
943 | (Thanks to Michael Bell for this section!) | |
944 | ||
945 | For a more lengthy description of how this works, go to | |
946 | http://www.paulgraham.com/ and see "A Plan for Spam". It's reasonably | |
947 | readable, even if statistics make me break out in hives. | |
948 | ||
949 | The short semi-inaccurate version: Given training, a spam heuristics engine | |
950 | can take the most "spammy" and "hammy" words and apply probabilistic | |
951 | analysis. Furthermore, once given a basis for the analysis, the engine can | |
952 | continue to learn iteratively by applying both the non-Bayesian and Bayesian | |
953 | rulesets together to create evolving "intelligence". | |
954 | ||
955 | SpamAssassin 2.50 and later supports Bayesian spam analysis, in | |
956 | the form of the BAYES rules. This is a new feature, quite powerful, | |
957 | and is disabled until enough messages have been learnt. | |
958 | ||
959 | The pros of Bayesian spam analysis: | |
960 | ||
961 | =over 4 | |
962 | ||
963 | =item Can greatly reduce false positives and false negatives. | |
964 | ||
965 | It learns from your mail, so it is tailored to your unique e-mail flow. | |
966 | ||
967 | =item Once it starts learning, it can continue to learn from SpamAssassin | |
968 | and improve over time. | |
969 | ||
970 | =back | |
971 | ||
972 | And the cons: | |
973 | ||
974 | =over 4 | |
975 | ||
976 | =item A decent number of messages are required before results are useful | |
977 | for ham/spam determination. | |
978 | ||
979 | =item It's hard to explain why a message is or isn't marked as spam. | |
980 | ||
981 | i.e.: a straightforward rule, that matches, say, "VIAGRA" is | |
982 | easy to understand. If it generates a false positive or false negative, | |
983 | it is fairly easy to understand why. | |
984 | ||
985 | With Bayesian analysis, it's all probabilities - "because the past says | |
986 | it is likely as this falls into a probabilistic distribution common to past | |
987 | spam in your systems". Tell that to your users! Tell that to the client | |
988 | when he asks "what can I do to change this". (By the way, the answer in | |
989 | this case is "use whitelisting".) | |
990 | ||
991 | =item It will take disk space and memory. | |
992 | ||
993 | The databases it maintains take quite a lot of resources to store and use. | |
994 | ||
995 | =back | |
996 | ||
997 | =head1 GETTING STARTED | |
998 | ||
999 | Still interested? Ok, here are the guidelines for getting this working. | |
1000 | ||
1001 | First a high-level overview: | |
1002 | ||
1003 | =over 4 | |
1004 | ||
1005 | =item Build a significant sample of both ham and spam. | |
1006 | ||
1007 | I suggest several thousand of each, placed in SPAM and HAM directories or | |
1008 | mailboxes. Yes, you MUST hand-sort this - otherwise the results won't be much | |
1009 | better than SpamAssassin on its own. Verify the spamminess/hamminess of EVERY | |
1010 | message. You're urged to avoid using a publicly available corpus (sample) - | |
1011 | this must be taken from YOUR mail server, if it is to be statistically useful. | |
1012 | Otherwise, the results may be pretty skewed. | |
1013 | ||
1014 | =item Use this tool to teach SpamAssassin about these samples, like so: | |
1015 | ||
1016 | sa-learn --spam /path/to/spam/folder | |
1017 | sa-learn --ham /path/to/ham/folder | |
1018 | ... | |
1019 | ||
1020 | Let SpamAssassin proceed, learning stuff. When it finds ham and spam | |
1021 | it will add the "interesting tokens" to the database. | |
1022 | ||
1023 | =item If you need SpamAssassin to forget about specific messages, use | |
1024 | the B<--forget> option. | |
1025 | ||
1026 | This can be applied to either ham or spam that has run through the | |
1027 | B<sa-learn> processes. It's a bit of a hammer, really, lowering the | |
1028 | weighting of the specific tokens in that message (only if that message has | |
1029 | been processed before). | |
1030 | ||
1031 | =item Learning from single messages uses a command like this: | |
1032 | ||
1033 | sa-learn --ham --no-sync mailmessage | |
1034 | ||
1035 | This is handy for binding to a key in your mail user agent. It's very fast, as | |
1036 | all the time-consuming stuff is deferred until you run with the C<--sync> | |
1037 | option. | |
1038 | ||
1039 | =item Autolearning is enabled by default | |
1040 | ||
1041 | If you don't have a corpus of mail saved to learn, you can let | |
1042 | SpamAssassin automatically learn the mail that you receive. If you are | |
1043 | autolearning from scratch, the amount of mail you receive will determine | |
1044 | how long until the BAYES_* rules are activated. | |
1045 | ||
1046 | =back | |
1047 | ||
1048 | =head1 EFFECTIVE TRAINING | |
1049 | ||
1050 | Learning filters require training to be effective. If you don't train | |
1051 | them, they won't work. In addition, you need to train them with new | |
1052 | messages regularly to keep them up-to-date, or their data will become | |
1053 | stale and impact accuracy. | |
1054 | ||
1055 | You need to train with both spam I<and> ham mails. One type of mail | |
1056 | alone will not have any effect. | |
1057 | ||
1058 | Note that if your mail folders contain things like forwarded spam, | |
1059 | discussions of spam-catching rules, etc., this will cause trouble. You | |
1060 | should avoid scanning those messages if possible. (An easy way to do this | |
1061 | is to move them aside, into a folder which is not scanned.) | |
1062 | ||
1063 | If the messages you are learning from have already been filtered through | |
1064 | SpamAssassin, the learner will compensate for this. In effect, it learns what | |
1065 | each message would look like if you had run C<spamassassin -d> over it in | |
1066 | advance. | |
1067 | ||
1068 | Another thing to be aware of, is that typically you should aim to train | |
1069 | with at least 1000 messages of spam, and 1000 ham messages, if | |
1070 | possible. More is better, but anything over about 5000 messages does not | |
1071 | improve accuracy significantly in our tests. | |
1072 | ||
1073 | Be careful that you train from the same source -- for example, if you train | |
1074 | on old spam, but new ham mail, then the classifier will think that | |
1075 | a mail with an old date stamp is likely to be spam. | |
1076 | ||
1077 | It's also worth noting that training with a very small quantity of | |
1078 | ham, will produce atrocious results. You should aim to train with at | |
1079 | least the same amount (or more if possible!) of ham data as spam. | |
1080 | ||
1081 | On an on-going basis, it is best to keep training the filter to make | |
1082 | sure it has fresh data to work from. There are various ways to do | |
1083 | this: | |
1084 | ||
1085 | =over 4 | |
1086 | ||
1087 | =item 1. Supervised learning | |
1088 | ||
1089 | This means keeping a copy of all or most of your mail, separated into spam | |
1090 | and ham piles, and periodically re-training using those. It produces | |
1091 | the best results, but requires more work from you, the user. | |
1092 | ||
1093 | (An easy way to do this, by the way, is to create a new folder for | |
1094 | 'deleted' messages, and instead of deleting them from other folders, | |
1095 | simply move them in there instead. Then keep all spam in a separate | |
1096 | folder and never delete it. As long as you remember to move misclassified | |
1097 | mails into the correct folder set, it is easy enough to keep up to date.) | |
1098 | ||
1099 | =item 2. Unsupervised learning from Bayesian classification | |
1100 | ||
1101 | Another way to train is to chain the results of the Bayesian classifier | |
1102 | back into the training, so it reinforces its own decisions. This is only | |
1103 | safe if you then retrain it based on any errors you discover. | |
1104 | ||
1105 | SpamAssassin does not support this method, due to experimental results | |
1106 | which strongly indicate that it does not work well, and since Bayes is | |
1107 | only one part of the resulting score presented to the user (while Bayes | |
1108 | may have made the wrong decision about a mail, it may have been overridden | |
1109 | by another system). | |
1110 | ||
1111 | =item 3. Unsupervised learning from SpamAssassin rules | |
1112 | ||
1113 | Also called 'auto-learning' in SpamAssassin. Based on statistical | |
1114 | analysis of the SpamAssassin success rates, we can automatically train the | |
1115 | Bayesian database with a certain degree of confidence that our training | |
1116 | data is accurate. | |
1117 | ||
1118 | It should be supplemented with some supervised training in addition, if | |
1119 | possible. | |
1120 | ||
1121 | This is the default, but can be turned off by setting the SpamAssassin | |
1122 | configuration parameter C<bayes_auto_learn> to 0. | |
1123 | ||
1124 | =item 4. Mistake-based training | |
1125 | ||
1126 | This means training on a small number of mails, then only training on | |
1127 | messages that SpamAssassin classifies incorrectly. This works, but it | |
1128 | takes longer to get it right than a full training session would. | |
1129 | ||
1130 | =back | |
1131 | ||
1132 | =head1 FILES | |
1133 | ||
1134 | B<sa-learn> and the other parts of SpamAssassin's Bayesian learner, | |
1135 | use a set of persistent database files to store the learnt tokens, as follows. | |
1136 | ||
1137 | =over 4 | |
1138 | ||
1139 | =item bayes_toks | |
1140 | ||
1141 | The database of tokens, containing the tokens learnt, their count of | |
1142 | occurrences in ham and spam, and the timestamp when the token was last | |
1143 | seen in a message. | |
1144 | ||
1145 | This database also contains some 'magic' tokens, as follows: the version | |
1146 | number of the database, the number of ham and spam messages learnt, the | |
1147 | number of tokens in the database, and timestamps of: the last journal | |
1148 | sync, the last expiry run, the last expiry token reduction count, the | |
1149 | last expiry timestamp delta, the oldest token timestamp in the database, | |
1150 | and the newest token timestamp in the database. | |
1151 | ||
1152 | This is a database file, using C<DB_File>. The database 'version | |
1153 | number' is 0 for databases from 2.5x, 1 for databases from certain 2.6x | |
1154 | development releases, 2 for 2.6x, and 3 for 3.0 and later releases. | |
1155 | ||
1156 | =item bayes_seen | |
1157 | ||
1158 | A map of Message-Id and some data from headers and body to what that | |
1159 | message was learnt as. This is used so that SpamAssassin can avoid | |
1160 | re-learning a message it has already seen, and so it can reverse the | |
1161 | training if you later decide that message was learnt incorrectly. | |
1162 | ||
1163 | This is a database file, using C<DB_File>. | |
1164 | ||
1165 | =item bayes_journal | |
1166 | ||
1167 | While SpamAssassin is scanning mails, it needs to track which tokens | |
1168 | it uses in its calculations. To avoid the contention of having each | |
1169 | SpamAssassin process attempting to gain write access to the Bayes DB, | |
1170 | the token timestamps are written to a 'journal' file which will later | |
1171 | (either automatically or via C<sa-learn --sync>) be used to synchronize | |
1172 | the Bayes DB. | |
1173 | ||
1174 | Also, through the use of C<bayes_learn_to_journal>, or when using the | |
1175 | C<--no-sync> option with sa-learn, the actual learning data will | |
1176 | be placed into the journal for later synchronization. This is typically | |
1177 | useful for high-traffic sites to avoid the same contention as stated | |
1178 | above. | |
1179 | ||
1180 | =back | |
1181 | ||
1182 | =head1 EXPIRATION | |
1183 | ||
1184 | Since SpamAssassin can auto-learn messages, the Bayes database files | |
1185 | could increase perpetually until they fill your disk. To control this, | |
1186 | SpamAssassin performs journal synchronization and bayes expiration | |
1187 | periodically when certain criteria (listed below) are met. | |
1188 | ||
1189 | SpamAssassin can sync the journal and expire the DB tokens either | |
1190 | manually or opportunistically. A journal sync is due if I<--sync> | |
1191 | is passed to sa-learn (manual), or if the following is true | |
1192 | (opportunistic): | |
1193 | ||
1194 | =over 4 | |
1195 | ||
1196 | =item - bayes_journal_max_size does not equal 0 (means don't sync) | |
1197 | ||
1198 | =item - the journal file exists | |
1199 | ||
1200 | =back | |
1201 | ||
1202 | and either: | |
1203 | ||
1204 | =over 4 | |
1205 | ||
1206 | =item - the journal file has a size greater than bayes_journal_max_size | |
1207 | ||
1208 | =back | |
1209 | ||
1210 | or | |
1211 | ||
1212 | =over 4 | |
1213 | ||
1214 | =item - a journal sync has previously occurred, and at least 1 day has | |
1215 | passed since that sync | |
1216 | ||
1217 | =back | |
1218 | ||
1219 | Expiry is due if I<--force-expire> is passed to sa-learn (manual), | |
1220 | or if all of the following are true (opportunistic): | |
1221 | ||
1222 | =over 4 | |
1223 | ||
1224 | =item - the last expire was attempted at least 12hrs ago | |
1225 | ||
1226 | =item - bayes_auto_expire does not equal 0 | |
1227 | ||
1228 | =item - the number of tokens in the DB is > 100,000 | |
1229 | ||
1230 | =item - the number of tokens in the DB is > bayes_expiry_max_db_size | |
1231 | ||
1232 | =item - there is at least a 12 hr difference between the oldest and newest token atimes | |
1233 | ||
1234 | =back | |
1235 | ||
1236 | =head2 EXPIRE LOGIC | |
1237 | ||
1238 | If either the manual or opportunistic method causes an expire run | |
1239 | to start, here is the logic that is used: | |
1240 | ||
1241 | =over 4 | |
1242 | ||
1243 | =item - figure out how many tokens to keep. take the larger of | |
1244 | either bayes_expiry_max_db_size * 75% or 100,000 tokens. therefore, the goal | |
1245 | reduction is number of tokens - number of tokens to keep. | |
1246 | ||
1247 | =item - if the reduction number is < 1000 tokens, abort (not worth the effort). | |
1248 | ||
1249 | =item - if an expire has been done before, guesstimate the new | |
1250 | atime delta based on the old atime delta. (new_atime_delta = | |
1251 | old_atime_delta * old_reduction_count / goal) | |
1252 | ||
1253 | =item - if no expire has been done before, or the last expire looks | |
1254 | "weird", do an estimation pass. The definition of "weird" is: | |
1255 | ||
1256 | =over 8 | |
1257 | ||
1258 | =item - last expire over 30 days ago | |
1259 | ||
1260 | =item - last atime delta was < 12 hrs | |
1261 | ||
1262 | =item - last reduction count was < 1000 tokens | |
1263 | ||
1264 | =item - estimated new atime delta is < 12 hrs | |
1265 | ||
1266 | =item - the difference between the last reduction count and the goal reduction count is > 50% | |
1267 | ||
1268 | =back | |
1269 | ||
1270 | =back | |
1271 | ||
1272 | =head2 ESTIMATION PASS LOGIC | |
1273 | ||
1274 | Go through each of the DB's tokens. Starting at 12hrs, calculate | |
1275 | whether or not the token would be expired (based on the difference | |
1276 | between the token's atime and the db's newest token atime) and keep | |
1277 | the count. Work out from 12hrs exponentially by powers of 2. ie: | |
1278 | 12hrs * 1, 12hrs * 2, 12hrs * 4, 12hrs * 8, and so on, up to 12hrs | |
1279 | * 512 (6144hrs, or 256 days). | |
1280 | ||
1281 | The larger the delta, the smaller the number of tokens that will | |
1282 | be expired. Conversely, the number of tokens goes up as the delta | |
1283 | gets smaller. So starting at the largest atime delta, figure out | |
1284 | which delta will expire the most tokens without going above the | |
1285 | goal expiration count. Use this to choose the atime delta to use, | |
1286 | unless one of the following occurs: | |
1287 | ||
1288 | =over 8 | |
1289 | ||
1290 | =item - the largest atime (smallest reduction count) would expire | |
1291 | too many tokens. this means the learned tokens are mostly old and | |
1292 | new tokens need to be learned before an expire can | |
1293 | occur. | |
1294 | ||
1295 | =item - all of the atime choices result in 0 tokens being removed. | |
1296 | this means the tokens are all newer than 12 hours and new tokens | |
1297 | need to be learned before an expire can occur. | |
1298 | ||
1299 | =item - the number of tokens that would be removed is < 1000. the | |
1300 | benefit isn't worth the effort. more tokens need to be learned. | |
1301 | ||
1302 | =back | |
1303 | ||
1304 | If the expire run gets past this point, it will continue to the end. | |
1305 | A new DB is created since the majority of DB libraries don't shrink the | |
1306 | DB file when tokens are removed. So we do the "create new, migrate old | |
1307 | to new, remove old, rename new" shuffle. | |
1308 | ||
1309 | =head2 EXPIRY RELATED CONFIGURATION SETTINGS | |
1310 | ||
1311 | =over 4 | |
1312 | ||
1313 | =item C<bayes_auto_expire> is used to specify whether or not SpamAssassin | |
1314 | ought to opportunistically attempt to expire the Bayes database. | |
1315 | The default is 1 (yes). | |
1316 | ||
1317 | =item C<bayes_expiry_max_db_size> specifies both the auto-expire token | |
1318 | count point, as well as the resulting number of tokens after expiry | |
1319 | as described above. The default value is 150,000, which is roughly | |
1320 | equivalent to a 6Mb database file if you're using DB_File. | |
1321 | ||
1322 | =item C<bayes_journal_max_size> specifies how large the Bayes | |
1323 | journal will grow before it is opportunistically synced. The | |
1324 | default value is 102400. | |
1325 | ||
1326 | =back | |
1327 | ||
1328 | =head1 INSTALLATION | |
1329 | ||
1330 | The B<sa-learn> command is part of the B<Mail::SpamAssassin> Perl module. | |
1331 | Install this as a normal Perl module, using C<perl -MCPAN -e shell>, | |
1332 | or by hand. | |
1333 | ||
1334 | =head1 SEE ALSO | |
1335 | ||
1336 | spamassassin(1) | |
1337 | spamc(1) | |
1338 | Mail::SpamAssassin(3) | |
1339 | Mail::SpamAssassin::ArchiveIterator(3) | |
1340 | ||
1341 | E<lt>http://www.paulgraham.com/E<gt> | |
1342 | Paul Graham's "A Plan For Spam" paper | |
1343 | ||
1344 | E<lt>http://www.linuxjournal.com/article/6467E<gt> | |
1345 | Gary Robinson's f(x) and combining algorithms, as used in SpamAssassin | |
1346 | ||
1347 | E<lt>http://www.bgl.nu/~glouis/bogofilter/E<gt> | |
1348 | 'Training on error' page. A discussion of various Bayes training regimes, | |
1349 | including 'train on error' and unsupervised training. | |
1350 | ||
1351 | =head1 PREREQUISITES | |
1352 | ||
1353 | C<Mail::SpamAssassin> | |
1354 | ||
1355 | =head1 AUTHORS | |
1356 | ||
1357 | The SpamAssassin(tm) Project E<lt>https://spamassassin.apache.org/E<gt> | |
1358 | ||
1359 | =cut | |
1360 |