]> git.proxmox.com Git - proxmox-spamassassin.git/blame - upstream/sa-learn.raw
buildsys: track debug package
[proxmox-spamassassin.git] / upstream / sa-learn.raw
CommitLineData
37ef5775
SI
1#!/usr/bin/perl -w -T
2# <@LICENSE>
3# Licensed to the Apache Software Foundation (ASF) under one or more
4# contributor license agreements. See the NOTICE file distributed with
5# this work for additional information regarding copyright ownership.
6# The ASF licenses this file to you under the Apache License, Version 2.0
7# (the "License"); you may not use this file except in compliance with
8# the License. You may obtain a copy of the License at:
9#
10# http://www.apache.org/licenses/LICENSE-2.0
11#
12# Unless required by applicable law or agreed to in writing, software
13# distributed under the License is distributed on an "AS IS" BASIS,
14# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15# See the License for the specific language governing permissions and
16# limitations under the License.
17# </@LICENSE>
18
19use strict;
20use warnings;
21# use bytes;
22
23use Errno qw(EBADF);
24use Getopt::Long;
25use Pod::Usage;
26use File::Spec;
27use POSIX qw(locale_h setsid sigprocmask _exit);
28
29POSIX::setlocale(LC_TIME,'C');
30
# Package globals shared between the main program and the callbacks below.
our (
  # the Mail::SpamAssassin instance and the parsed command-line options
  $spamtest, %opt,

  # what to do with each message
  $isspam, $forget, $synconly, $learnprob, $messagelimit,

  # run statistics and progress reporting
  $messagecount, $learnedcount, $total_messages, $init_results,
  $start_time, $progress,

  # what to read, and where the Bayes database lives
  @targets, $bayes_override_path,
);

# The @@...@@ placeholders below are substituted at 'make' time.
my $PREFIX          = '@@PREFIX@@';
my $DEF_RULES_DIR   = '@@DEF_RULES_DIR@@';
my $LOCAL_RULES_DIR = '@@LOCAL_RULES_DIR@@';

use lib '@@INSTALLSITELIB@@';           # substituted at 'make' time
39
# Locate Mail::SpamAssassin relative to this script when running from a
# source or build tree, or when the installed site library lacks it.
# See comments in "spamassassin.raw" for doco.
BEGIN {
  my @script_parts = File::Spec->splitpath($0);
  my $script_dir =
    ( $script_parts[0]
      ? File::Spec->catpath( @script_parts[0..1], '' )
      : $script_parts[1] )
    || File::Spec->curdir;

  my $have_local_lib   = -e $script_dir.'/lib/Mail/SpamAssassin.pm';
  my $missing_site_lib = !-e '@@INSTALLSITELIB@@/Mail/SpamAssassin.pm';

  if ( $have_local_lib || $missing_site_lib ) {
    my $searchrelative;
    $searchrelative = 1;    # disabled during "make install": REMOVEFORINST

    if (   $searchrelative
        && $script_dir eq '../'
        && -e '../blib/lib/Mail/SpamAssassin.pm' )
    {
      unshift @INC, '../blib/lib';
    }
    else {
      # first candidate directory that actually holds the module wins
      my @candidates = qw(
        lib
        ../lib/site_perl
        ../lib/spamassassin
        ../share/spamassassin/lib
      );
      for my $relative (@candidates) {
        my $dir = File::Spec->catdir( $script_dir, split( '/', $relative ) );
        next unless -f File::Spec->catfile( $dir, "Mail", "SpamAssassin.pm" );
        unshift @INC, $dir;
        last;
      }
    }
  }
}
64
65use Mail::SpamAssassin;
66use Mail::SpamAssassin::ArchiveIterator;
67use Mail::SpamAssassin::Message;
68use Mail::SpamAssassin::PerMsgLearner;
69use Mail::SpamAssassin::Util::Progress;
70use Mail::SpamAssassin::Logger;
71
72###########################################################################
73
# Ignore SIGPIPE; output errors are checked when the handles are closed.
$SIG{PIPE} = 'IGNORE';

# used to be CmdLearn::cmd_run() ...

# Defaults for options that are read even when not given on the command line.
%opt = (
  'cf'           => [],
  'quiet'        => 0,
  'nosync'       => 0,
  'use-ignores'  => 0,
  'force-expire' => 0,
);

Getopt::Long::Configure(
  'bundling',
  'no_getopt_compat',
  'permute',
  'no_auto_abbrev',
  'no_ignore_case',
);

GetOptions(
  # --- what to do with the messages -------------------------------------
  'forget'      => \$forget,
  'ham|nonspam' => sub { $isspam = 0; },
  'spam'        => sub { $isspam = 1; },
  'sync'        => \$synconly,
  'rebuild'     => sub {
    $synconly = 1;
    warn "The --rebuild option has been deprecated.  Please use --sync instead.\n";
  },

  # --- configuration ------------------------------------------------------
  'q|quiet'                                 => \$opt{'quiet'},
  'username|u=s'                            => \$opt{'username'},
  'configpath|config-file|config-dir|c|C=s' => \$opt{'configpath'},
  'prefspath|prefs-file|p=s'                => \$opt{'prefspath'},
  'siteconfigpath=s'                        => \$opt{'siteconfigpath'},
  'cf=s'                                    => $opt{'cf'},

  # --- learning behaviour -------------------------------------------------
  'folders|f=s'          => \$opt{'folders'},
  'force-expire|expire'  => \$opt{'force-expire'},
  'local|L'              => \$opt{'local'},
  'no-sync|nosync'       => \$opt{'nosync'},
  'showdots'             => \$opt{'showdots'},
  'progress'             => \$opt{'progress'},
  'use-ignores'          => \$opt{'use-ignores'},
  'no-rebuild|norebuild' => sub {
    $opt{'nosync'} = 1;
    warn "The --no-rebuild option has been deprecated.  Please use --no-sync instead.\n";
  },

  'learnprob=f' => \$opt{'learnprob'},
  'randseed=i'  => \$opt{'randseed'},
  'stopafter=i' => \$opt{'stopafter'},
  'max-size=i'  => \$opt{'max-size'},

  # --- informational ------------------------------------------------------
  'debug|debug-level|D:s' => \$opt{'debug'},
  'help|h|?'              => \$opt{'help'},
  'version|V'             => \$opt{'version'},

  # --- database maintenance -----------------------------------------------
  'dump:s'    => \$opt{'dump'},
  'import'    => \$opt{'import'},
  'backup'    => \$opt{'backup'},
  'clear'     => \$opt{'clear'},
  'restore=s' => \$opt{'restore'},

  # --- input format (the 'old_format' ones are 2.5x-style switches) --------
  'dir'    => sub { $opt{'old_format'} = 'dir'; },
  'file'   => sub { $opt{'old_format'} = 'file'; },
  'mbox'   => sub { $opt{'format'}     = 'mbox'; },
  'mbx'    => sub { $opt{'format'}     = 'mbx'; },
  'single' => sub { $opt{'old_format'} = 'single'; },

  'db|dbpath=s' => \$bayes_override_path,
  're|regexp=s' => \$opt{'regexp'},

  # every non-option argument is queued as a learning target
  '<>' => \&target,
) or usage( 0, "Unknown option!" );
142
# --help and --version short-circuit everything else.
if ( defined $opt{'help'} ) {
  usage( 0, "For more information read the manual page" );
}
if ( defined $opt{'version'} ) {
  print "SpamAssassin version " . Mail::SpamAssassin::Version() . "\n";
  exit 0;
}

# set debug areas, if any specified (only useful for command-line tools);
# a bare -D (empty string) means all areas
if ( defined $opt{'debug'} ) {
  $opt{'debug'} ||= 'all';
}

# a forced expiry is carried out by the sync-only code path
$synconly = 1 if $opt{'force-expire'};

# Conflicting progress options are a usage error: report it on STDERR and
# exit non-zero (64, the same code usage() uses) so calling scripts do not
# mistake "nothing was learned" for success.
if ( $opt{'showdots'} && $opt{'progress'} ) {
  warn "--showdots and --progress may not be used together, please select just one\n";
  exit 64;
}

# At least one operating mode has to be selected.
my @mode_selectors = (
  $isspam, $synconly, $forget,
  @opt{qw(dump import clear backup restore folders)},
);
if ( !grep { defined } @mode_selectors ) {
  usage( 0,
"Please select either --spam, --ham, --folders, --forget, --sync, --import,\n--dump, --clear, --backup or --restore"
  );
}

# We need to make sure the journal syncs pre-forget...
if ( defined $forget && $opt{'nosync'} ) {
  $opt{'nosync'} = 0;
  warn
"sa-learn warning: --forget requires read/write access to the database, and is incompatible with --no-sync\n";
}

# Format specified in the 2.5x form of --dir, --file, --mbox, --mbx or
# --single.  Convert it to the new behavior: only --single needs action,
# it means "read one message from STDIN".
if ( defined $opt{'old_format'} && $opt{'old_format'} eq 'single' ) {
  push( @ARGV, '-' );
}
195
# Configuration text handed to Mail::SpamAssassin as post_config_text.
my $post_config = '';

# kluge to support old check_bayes_db operation
# bug 3799: init() will go r/o with the configured DB, and then dbpath needs
# to override.  Just access the dbpath version via post_config_text.
if ( defined $bayes_override_path ) {
  # Add a default prefix if the path is a directory
  $bayes_override_path = File::Spec->catfile( $bayes_override_path, 'bayes' )
    if -d $bayes_override_path;

  $post_config .= "bayes_path $bayes_override_path\n";
}

# These options require bayes_scanner, which requires "use_bayes 1", but
# that's not necessary for these commands.
my @maintenance_opts = qw(dump import clear backup restore);
if ( grep { defined $opt{$_} } @maintenance_opts ) {
  $post_config .= "use_bayes 1\n";
}

# --cf lines go last
$post_config .= join( "\n", @{ $opt{'cf'} } ) . "\n";

# create the tester factory
my %factory_args = (
  rules_filename      => $opt{'configpath'},
  site_rules_filename => $opt{'siteconfigpath'},
  userprefs_filename  => $opt{'prefspath'},
  username            => $opt{'username'},
  debug               => $opt{'debug'},
  local_tests_only    => $opt{'local'},
  dont_copy_prefs     => 1,
  PREFIX              => $PREFIX,
  DEF_RULES_DIR       => $DEF_RULES_DIR,
  LOCAL_RULES_DIR     => $LOCAL_RULES_DIR,
  post_config_text    => $post_config,
);
$spamtest = Mail::SpamAssassin->new( \%factory_args );

$spamtest->init(1);
dbg("sa-learn: spamtest initialized");

# Bug 6228 hack: bridge the transition gap of moving Bayes.pm into a plugin;
# To be resolved more cleanly!!!
if ( $spamtest->{bayes_scanner} ) {
  for my $plugin ( @{ $spamtest->{plugins}->{plugins} } ) {
    next unless $plugin->isa('Mail::SpamAssassin::Plugin::Bayes');

    # copy plugin's "store" object ref one level up!
    $spamtest->{bayes_scanner}->{store} = $plugin->{store};
  }
}

if ( Mail::SpamAssassin::Util::am_running_on_windows() ) {
  binmode(STDIN)  or die "cannot set binmode on STDIN: $!";    # bug 4363
  binmode(STDOUT) or die "cannot set binmode on STDOUT: $!";
}
254
# Database maintenance commands (--dump, --import, --clear, --backup,
# --restore).  Each one runs on its own and then exits.

# Release the learner, close the standard handles and exit with $code.
my $finish_and_exit = sub {
  my ($code) = @_;
  $spamtest->finish_learner();
  # make sure we notice any write errors while flushing output buffer
  close STDOUT or die "error closing STDOUT: $!";
  close STDIN  or die "error closing STDIN: $!";
  exit $code;
};

# Release the learner, then die with $message.
my $abort_with = sub {
  my ($message) = @_;
  $spamtest->finish_learner();
  die $message;
};

if ( defined $opt{'dump'} ) {
  # --dump argument => [ show magic tokens?, show data tokens? ]
  my %tokens_for = (
    ''      => [ 1, 1 ],    # no argument: show us all tokens!
    'all'   => [ 1, 1 ],
    'magic' => [ 1, 0 ],    # show us magic tokens only
    'data'  => [ 0, 1 ],    # show us data tokens only
  );

  my $selection = $tokens_for{ $opt{'dump'} };
  if ( !$selection ) {      # unknown option
    warn "Unknown dump option '" . $opt{'dump'} . "'\n";
    $spamtest->finish_learner();
    exit 1;
  }

  my ( $magic, $toks ) = @{$selection};
  $spamtest->dump_bayes_db( $magic, $toks, $opt{'regexp'} )
    or $abort_with->("ERROR: Bayes dump returned an error, please re-run with -D for more information\n");

  $finish_and_exit->(0);
}

if ( defined $opt{'import'} ) {
  my $ret = $spamtest->{bayes_scanner}->{store}->perform_upgrade();

  # a true result from perform_upgrade() becomes exit status 0
  $finish_and_exit->( !$ret );
}

if ( defined $opt{'clear'} ) {
  $spamtest->{bayes_scanner}->{store}->clear_database()
    or $abort_with->("ERROR: Bayes clear returned an error, please re-run with -D for more information\n");

  $finish_and_exit->(0);
}

if ( defined $opt{'backup'} ) {
  $spamtest->{bayes_scanner}->{store}->backup_database()
    or $abort_with->("ERROR: Bayes backup returned an error, please re-run with -D for more information\n");

  $finish_and_exit->(0);
}

if ( defined $opt{'restore'} ) {
  my $filename = $opt{'restore'};

  $abort_with->("ERROR: You must specify a filename to restore.\n")
    if !$filename;

  $spamtest->{bayes_scanner}->{store}->restore_database( $filename, $opt{'showdots'} )
    or $abort_with->("ERROR: Bayes restore returned an error, please re-run with -D for more information\n");

  $finish_and_exit->(0);
}
340
# Everything from here on needs Bayes enabled in the configuration.
unless ( $spamtest->{conf}->{use_bayes} ) {
  warn "ERROR: configuration specifies 'use_bayes 0', sa-learn disabled\n";
  exit 1;
}

my %learner_args = (
  force_expire      => $opt{'force-expire'},
  learn_to_journal  => $opt{'nosync'},
  wait_for_lock     => 1,
  caller_will_untie => 1,
);
$spamtest->init_learner( \%learner_args );

$spamtest->{bayes_scanner}{use_ignores} = $opt{'use-ignores'};

# --sync / --rebuild / --force-expire: rebuild the caches and stop here.
if ($synconly) {
  my %rebuild_args = (
    verbose  => !$opt{'quiet'},
    showdots => $opt{'showdots'},
  );
  $spamtest->rebuild_learner_caches( \%rebuild_args );
  $spamtest->finish_learner();

  # make sure we notice any write errors while flushing output buffer
  close STDOUT or die "error closing STDOUT: $!";
  close STDIN  or die "error closing STDIN: $!";
  exit 0;
}

# limits consulted by wanted() for every message
$messagelimit = $opt{'stopafter'};
$learnprob    = $opt{'learnprob'};

# seed the random generator when --randseed is given
srand( $opt{'randseed'} ) if defined $opt{'randseed'};

# sync the journal first if we're going to go r/w so we make sure to
# learn everything before doing anything else.
#
$spamtest->rebuild_learner_caches() unless $opt{nosync};
384
# what is the result of the run?  will end up being the exit code.
my $exit_status = 0;

# run this lot in an eval block, so we can catch die's and clear
# up the dbs.
eval {
  $SIG{HUP}  = \&killed;
  $SIG{INT}  = \&killed;
  $SIG{TERM} = \&killed;

  # --folders: read one target per line.  Lines of the form
  # "ham:FORMAT:PATH" / "spam:FORMAT:PATH" carry their own class; anything
  # else gets the class and format given on the command line.
  if ( $opt{folders} ) {
    # Three-argument open with a lexical handle: the user-supplied file
    # name is always treated as a plain path, never as a mode or a pipe.
    open( my $folders_fh, '<', $opt{folders} )
      or die "cannot open $opt{folders}: $!";

    # $! is cleared before every read so that, once readline returns
    # undef, a non-zero $! tells a read error apart from end-of-file.
    $! = 0;
    while ( defined( my $line = <$folders_fh> ) ) {
      chomp $line;
      next if $line =~ /^\s*$/;
      if ( $line =~ /^(ham|spam):(\w*):(.*)/ ) {
        my $class  = $1;
        my $format = $2 || "detect";
        my $target = $3;
        push( @targets, "$class:$format:$target" );
      }
      else {
        target($line);
      }
    }
    continue {
      $! = 0;
    }

    if ( $! != 0 ) {
      # EBADF is only logged; any other read error is fatal
      if ( $! == EBADF ) {
        dbg("error reading from $opt{folders}: $!");
      }
      else {
        die "error reading from $opt{folders}: $!";
      }
    }
    close($folders_fh) or die "error closing $opt{folders}: $!";
  }

  ###########################################################################
  # Deal with the target listing, and STDIN -> tempfile

  my $tempfile;    # will be defined if stdin -> tempfile
  push( @targets, @ARGV );
  @targets = ('-') unless @targets || $opt{folders};

  # Index loop on purpose: entries are spliced out and re-appended while
  # the list is being walked.
  for ( my $elem = 0 ; $elem <= $#targets ; $elem++ ) {
    # ArchiveIterator doesn't really like STDIN, so if "-" is specified
    # as a target, make it a temp file instead.
    if ( $targets[$elem] =~ /(?:^|:)-$/ ) {
      if ( defined $tempfile ) {
        # uh-oh, stdin specified multiple times?
        warn "skipping extra stdin target (" . $targets[$elem] . ")\n";
        splice @targets, $elem, 1;
        $elem--;    # go back to this element again
        next;
      }
      else {
        my $handle;
        ( $tempfile, $handle ) = Mail::SpamAssassin::Util::secure_tmpfile();
        binmode $handle or die "cannot set binmode on file $tempfile: $!";

        # avoid slurping the whole file into memory, copy chunk by chunk
        my ( $inbuf, $nread );
        while ( $nread = sysread( STDIN, $inbuf, 16384 ) ) {
          print {$handle} $inbuf or die "error writing to $tempfile: $!";
        }
        defined $nread or die "error reading from STDIN: $!";
        close $handle  or die "error closing $tempfile: $!";

        # re-aim the targets at the tempfile instead of STDIN
        $targets[$elem] =~ s/-$/$tempfile/;
      }
    }

    # make sure the target list is in the normal AI format
    if ( $targets[$elem] !~ /^[^:]*:[a-z]+:/ ) {
      my $item = splice @targets, $elem, 1;
      target($item);    # add back to the list
      $elem--;          # go back to this element again
      next;
    }
  }

  ###########################################################################

  my $iter = Mail::SpamAssassin::ArchiveIterator->new(
    {
      # skip messages larger than max-size bytes,
      # 0 for no limit, undef defaults to 256 KB
      'opt_max_size'   => $opt{'max-size'},
      'opt_want_date'  => 0,
      'opt_from_regex' => $spamtest->{conf}->{mbox_format_from_regex},
    }
  );

  $iter->set_functions( \&wanted, \&result );
  $messagecount = 0;
  $learnedcount = 0;

  $init_results = 0;
  $start_time   = time;

  # if exit_status isn't already set to non-zero, set it to the reverse of the
  # run result (0 is bad, 1+ is good -- the opposite of exit status codes)
  my $run_ok = eval { $exit_status ||= !$iter->run(@targets); 1 };

  # Save the error right away: $@ is a global, and the reporting and
  # cleanup calls below could overwrite it before it is examined.
  my $run_error = $run_ok ? '' : $@;

  print STDERR "\n" if ( $opt{showdots} );
  $progress->final() if ( $opt{progress} && $progress );

  my $phrase = defined $forget ? "Forgot" : "Learned";
  print "$phrase tokens from $learnedcount message(s) ($messagecount message(s) examined)\n"
    if !$opt{'quiet'};

  # If we needed to make a tempfile, go delete it.
  if ( defined $tempfile ) {
    unlink $tempfile or die "cannot unlink temporary file $tempfile: $!";
    undef $tempfile;
  }

  # HITLIMIT is how wanted() stops the run for --stopafter; it is not an
  # error.  Anything else is re-thrown to the handler below.
  if ( !$run_ok && $run_error !~ /HITLIMIT/ ) { die $run_error }
  1;
} or do {
  my $eval_stat = $@ ne '' ? $@ : "errno=$!";
  chomp $eval_stat;
  $spamtest->finish_learner();
  die $eval_stat;
};

$spamtest->finish_learner();
# make sure we notice any write errors while flushing output buffer
close STDOUT or die "error closing STDOUT: $!";
close STDIN  or die "error closing STDIN: $!";
exit $exit_status;
509
510###########################################################################
511
# Handler installed for HUP, INT and TERM during the learning run: release
# the learner, then die so the surrounding eval can take over.
sub killed {
  $spamtest->finish_learner();

  die "interrupted";
}
516
# Queue one learning target in "class:format:location" form.  The class
# comes from --spam/--ham (ham when neither was given) and the format from
# --mbox/--mbx ("detect" otherwise).  Also used as the Getopt::Long handler
# for non-option arguments.
sub target {
  my ($target) = @_;

  my $class = "ham";
  $class = "spam" if $isspam;

  my $format = "detect";
  $format = $opt{'format'} if defined $opt{'format'};

  push @targets, "$class:$format:$target";
}
525
526###########################################################################
527
# One-time setup performed on the first call to result(): mark the results
# as initialized and, with --progress, create the progress display sized to
# the message total published by ArchiveIterator.
sub init_results {
  $init_results = 1;

  # nothing more to set up without --progress
  return unless $opt{'progress'};

  $total_messages = $Mail::SpamAssassin::ArchiveIterator::MESSAGES;

  my %progress_args = ( total => $total_messages );
  $progress = Mail::SpamAssassin::Util::Progress->new( \%progress_args );
}
537
538###########################################################################
539
# Result callback registered with ArchiveIterator via set_functions().
# Its arguments are not used here; it only drives the progress display.
sub result {
  my ($class, $result, $time) = @_;

  # don't open results files until we get here to avoid overwriting files
  init_results() unless $init_results;

  $progress->update($messagecount) if ($opt{progress} && $progress);
}
548
549###########################################################################
550
# Per-message callback registered with ArchiveIterator via set_functions().
#
#   $class   - message class; "s" means spam, anything else is ham
#   $id      - unused here
#   $time    - unused here
#   $dataref - message data handed to $spamtest->parse()
#
# Returns 1 after the message has been learned or skipped.  Dies with
# 'HITLIMIT' once the --stopafter limit is exceeded, and with an ERROR
# message when the learner reports that learning is unavailable.
sub wanted {
  my ( $class, $id, $time, $dataref ) = @_;

  my $spam = $class eq "s" ? 1 : 0;

  # --learnprob: learn only a random sample of the messages.  A probability
  # of zero or less means "never learn"; testing for it first avoids the
  # "Illegal division by zero" that 1 / $learnprob would raise.
  if ( defined($learnprob) ) {
    if ( $learnprob <= 0 || int( rand( 1 / $learnprob ) ) != 0 ) {
      print STDERR '_' if ( $opt{showdots} );
      return 1;
    }
  }

  # NOTE(review): with '>' the run stops only after one more message than
  # --stopafter has been learned; confirm whether '>=' was intended.
  if ( defined($messagelimit) && $learnedcount > $messagelimit ) {
    $progress->final() if ( $opt{progress} && $progress );
    die 'HITLIMIT';
  }

  $messagecount++;
  my $ma = $spamtest->parse($dataref);

  # The message carries an X-Spam-Checker-Version header: re-parse it with
  # the SpamAssassin markup removed.
  if ( $ma->get_header("X-Spam-Checker-Version") ) {
    my $new_ma = $spamtest->parse( $spamtest->remove_spamassassin_markup($ma), 1 );
    $ma->finish();
    $ma = $new_ma;
  }

  my $status  = $spamtest->learn( $ma, undef, $spam, $forget );
  my $learned = $status->did_learn();

  if ( !defined $learned ) {    # undef=learning unavailable
    die "ERROR: the Bayes learn function returned an error, please re-run with -D for more information\n";
  }
  elsif ( $learned == 1 ) {     # 1=message was learned.  0=message wasn't learned
    $learnedcount++;
  }

  # Do cleanup ...
  $status->finish();
  undef $status;

  $ma->finish();
  undef $ma;

  print STDERR '.' if ( $opt{showdots} );
  return 1;
}
597
598###########################################################################
599
# Print the SpamAssassin version banner, then pass $verbose and $message
# to pod2usage() with an exit value of 64.
sub usage {
  my ( $verbose, $message ) = @_;

  print "SpamAssassin version " . Mail::SpamAssassin::Version() . "\n";

  my @pod_args = (
    -verbose => $verbose,
    -message => $message,
    -exitval => 64,
  );
  pod2usage(@pod_args);
}
606
607# ---------------------------------------------------------------------------
608
609=head1 NAME
610
611sa-learn - train SpamAssassin's Bayesian classifier
612
613=head1 SYNOPSIS
614
615B<sa-learn> [options] [file]...
616
617B<sa-learn> [options] --dump [ all | data | magic ]
618
619Options:
620
621 --ham Learn messages as ham (non-spam)
622 --spam Learn messages as spam
623 --forget Forget a message
624 --use-ignores Use bayes_ignore_from and bayes_ignore_to
625 --sync Synchronize the database and the journal if needed
626 --force-expire Force a database sync and expiry run
627 --dbpath <path> Allows commandline override (in bayes_path form)
628 for where to read the Bayes DB from
629 --dump [all|data|magic] Display the contents of the Bayes database
630 Takes optional argument for what to display
631 --regexp <re> For dump only, specifies which tokens to
632 dump based on a regular expression.
633 -f file, --folders=file Read list of files/directories from file
634 --dir Ignored; historical compatibility
635 --file Ignored; historical compatibility
636 --mbox Input sources are in mbox format
637 --mbx Input sources are in mbx format
638 --max-size <b> Skip messages larger than b bytes;
639 defaults to 256 KB, 0 implies no limit
640 --showdots Show progress using dots
641 --progress Show progress using progress bar
642 --no-sync Skip synchronizing the database and journal
643 after learning
644 -L, --local Operate locally, no network accesses
645 --import Migrate data from older version/non DB_File
646 based databases
647 --clear Wipe out existing database
648 --backup Backup, to STDOUT, existing database
649 --restore <filename> Restore a database from filename
650 -u username, --username=username
651 Override username taken from the runtime
652 environment, used with SQL
653 -C path, --configpath=path, --config-file=path
654 Path to standard configuration dir
655 -p prefs, --prefspath=file, --prefs-file=file
656 Set user preferences file
657 --siteconfigpath=path Path for site configs
658 (default: @@PREFIX@@/etc/mail/spamassassin)
659 --cf='config line' Additional line of configuration
660 -D, --debug [area=n,...] Print debugging messages
661 -V, --version Print version
662 -h, --help Print usage message
663
664=head1 DESCRIPTION
665
666Given a typical selection of your incoming mail classified as spam or ham
667(non-spam), this tool will feed each mail to SpamAssassin, allowing it
668to 'learn' what signs are likely to mean spam, and which are likely to
669mean ham.
670
671Simply run this command once for each of your mail folders, and it will
672''learn'' from the mail therein.
673
674Note that csh-style I<globbing> in the mail folder names is supported;
675in other words, listing a folder name as C<*> will scan every folder
676that matches. See C<Mail::SpamAssassin::ArchiveIterator> for more details.
677
If you are using mailboxes in a format other than maildir, you should use
the B<--mbox> or B<--mbx> parameters.
680
681SpamAssassin remembers which mail messages it has learnt already, and will not
682re-learn those messages again, unless you use the B<--forget> option. Messages
683learnt as spam will have SpamAssassin markup removed, on the fly.
684
685If you make a mistake and scan a mail as ham when it is spam, or vice
686versa, simply rerun this command with the correct classification, and the
687mistake will be corrected. SpamAssassin will automatically 'forget' the
688previous indications.
689
690Users of C<spamd> who wish to perform training remotely, over a network,
691should investigate the C<spamc -L> switch.
692
693=head1 OPTIONS
694
695=over 4
696
697=item B<--ham>
698
699Learn the input message(s) as ham. If you have previously learnt any of the
700messages as spam, SpamAssassin will forget them first, then re-learn them as
701ham. Alternatively, if you have previously learnt them as ham, it'll skip them
702this time around. If the messages have already been filtered through
703SpamAssassin, the learner will ignore any modifications SpamAssassin may have
704made.
705
706=item B<--spam>
707
708Learn the input message(s) as spam. If you have previously learnt any of the
709messages as ham, SpamAssassin will forget them first, then re-learn them as
710spam. Alternatively, if you have previously learnt them as spam, it'll skip
711them this time around. If the messages have already been filtered through
712SpamAssassin, the learner will ignore any modifications SpamAssassin may have
713made.
714
715=item B<--folders>=I<filename>, B<-f> I<filename>
716
717sa-learn will read in the list of folders from the specified file, one folder
718per line in the file. If the folder is prefixed with C<ham:type:> or C<spam:type:>,
719sa-learn will learn that folder appropriately, otherwise the folders will be
720assumed to be of the type specified by B<--ham> or B<--spam>.
721
722C<type> above is optional, but is the same as the standard for
723ArchiveIterator: mbox, mbx, dir, file, or detect (the default if not
724specified).
725
726=item B<--mbox>
727
728sa-learn will read in the file(s) containing the emails to be learned,
729and will process them in mbox format (one or more emails per file).
730
731=item B<--mbx>
732
733sa-learn will read in the file(s) containing the emails to be learned,
734and will process them in mbx format (one or more emails per file).
735
736=item B<--use-ignores>
737
738Don't learn the message if a from address matches configuration file
739item C<bayes_ignore_from> or a to address matches C<bayes_ignore_to>.
740The option might be used when learning from a large file of messages
741from which the hammy spam messages or spammy ham messages have not
742been removed.
743
744=item B<--sync>
745
746Synchronize the journal and databases. Upon successfully syncing the
747database with the entries in the journal, the journal file is removed.
748
749=item B<--force-expire>
750
751Forces an expiry attempt, regardless of whether it may be necessary
752or not. Note: This doesn't mean any tokens will actually expire.
753Please see the EXPIRATION section below.
754
755Note: C<--force-expire> also causes the journal data to be synchronized
756into the Bayes databases.
757
758=item B<--forget>
759
760Forget a given message previously learnt.
761
762=item B<--dbpath>
763
764Allows a commandline override of the I<bayes_path> configuration option.
765
766=item B<--dump> I<option>
767
768Display the contents of the Bayes database. Without an option or with
769the I<all> option, all magic tokens and data tokens will be displayed.
770I<magic> will only display magic tokens, and I<data> will only display
771the data tokens.
772
773Can also use the B<--regexp> I<RE> option to specify which tokens to
774display based on a regular expression.
775
776=item B<--clear>
777
778Clear an existing Bayes database by removing all traces of the database.
779
780WARNING: This is destructive and should be used with care.
781
782=item B<--backup>
783
784Performs a dump of the Bayes database in machine/human readable format.
785
786The dump will include token and seen data. It is suitable for input back
787into the --restore command.
788
789=item B<--restore>=I<filename>
790
791Performs a restore of the Bayes database defined by I<filename>.
792
793WARNING: This is a destructive operation, previous Bayes data will be wiped out.
794
795=item B<-h>, B<--help>
796
797Print help message and exit.
798
799=item B<-u> I<username>, B<--username>=I<username>
800
801If specified this username will override the username taken from the runtime
802environment. You can use this option to specify users in a virtual user
803configuration when using SQL as the Bayes backend.
804
805NOTE: This option will not change to the given I<username>, it will only attempt
806to act on behalf of that user. Because of this you will need to have proper
807permissions to be able to change files owned by I<username>. In the case of SQL
808this generally is not a problem.
809
810=item B<-C> I<path>, B<--configpath>=I<path>, B<--config-file>=I<path>
811
812Use the specified path for locating the distributed configuration files.
813Ignore the default directories (usually C</usr/share/spamassassin> or similar).
814
815=item B<--siteconfigpath>=I<path>
816
817Use the specified path for locating site-specific configuration files. Ignore
818the default directories (usually C</etc/mail/spamassassin> or similar).
819
820=item B<--cf='config line'>
821
822Add additional lines of configuration directly from the command-line, parsed
823after the configuration files are read. Multiple B<--cf> arguments can be
824used, and each will be considered a separate line of configuration.
825
826=item B<-p> I<prefs>, B<--prefspath>=I<prefs>, B<--prefs-file>=I<prefs>
827
828Read user score preferences from I<prefs> (usually C<$HOME/.spamassassin/user_prefs>).
829
830=item B<--progress>
831
832Prints a progress bar (to STDERR) showing the current progress. In the case
833where no valid terminal is found this option will behave very much like the
834--showdots option.
835
836=item B<-D> [I<area,...>], B<--debug> [I<area,...>]
837
838Produce debugging output. If no areas are listed, all debugging information is
839printed. Diagnostic output can also be enabled for each area individually;
840I<area> is the area of the code to instrument. For example, to produce
841diagnostic output on bayes, learn, and dns, use:
842
843 spamassassin -D bayes,learn,dns
844
845For more information about which areas (also known as channels) are available,
846please see the documentation at:
847
848 C<http://wiki.apache.org/spamassassin/DebugChannels>
849
850Higher priority informational messages that are suitable for logging in normal
851circumstances are available with an area of "info".
852
853=item B<--no-sync>
854
855Skip the slow synchronization step which normally takes place after
856changing database entries. If you plan to learn from many folders in
857a batch, or to learn many individual messages one-by-one, it is faster
858to use this switch and run C<sa-learn --sync> once all the folders have
859been scanned.
860
861Clarification: The state of I<--no-sync> overrides the
862I<bayes_learn_to_journal> configuration option. If not specified,
863sa-learn will learn to the database directly. If specified, sa-learn
864will learn to the journal file.
865
866Note: I<--sync> and I<--no-sync> can be specified on the same commandline,
867which is slightly confusing. In this case, the I<--no-sync> option is
868ignored since there is no learn operation.
869
870=item B<-L>, B<--local>
871
872Do not perform any network accesses while learning details about the mail
873messages. This will speed up the learning process, but may result in a
874slightly lower accuracy.
875
876Note that this is currently ignored, as current versions of SpamAssassin will
877not perform network access while learning; but future versions may.
878
879=item B<--import>
880
881If you previously used SpamAssassin's Bayesian learner without the C<DB_File>
882module installed, it will have created files in other formats, such as
883C<GDBM_File>, C<NDBM_File>, or C<SDBM_File>. This switch allows you to migrate
884that old data into the C<DB_File> format. It will overwrite any data currently
885in the C<DB_File>.
886
887Can also be used with the B<--dbpath> I<path> option to specify the location of
888the Bayes files to use.
889
890=back
891
892=head1 MIGRATION
893
894There are now multiple backend storage modules available for storing
895user's bayesian data. As such you might want to migrate from one
896backend to another. Here is a simple procedure for migrating from one
897backend to another.
898
899Note that if you have individual user databases you will have to
900perform a similar procedure for each one of them.
901
902=over 4
903
904=item sa-learn --sync
905
906This will sync any outstanding journal entries
907
908=item sa-learn --backup > backup.txt
909
910This will save all your Bayes data to a plain text file.
911
912=item sa-learn --clear
913
914This is optional, but good to do to clear out the old database.
915
916=item Repeat!
917
918At this point, if you have multiple databases, you should perform the
919procedure above for each of them. (i.e. each user's database needs to
920be backed up before continuing.)
921
922=item Switch backends
923
924Once you have backed up all databases you can update your
925configuration for the new database backend. This will involve at least
926the bayes_store_module config option and may involve some additional
927config options depending on what is required by the module. (For
928example, you may need to configure an SQL database.)
929
930=item sa-learn --restore backup.txt
931
932Again, you need to do this for every database.
933
934=back
935
936If you are migrating to SQL you can make use of the -u <username>
937option in sa-learn to populate each user's database. Otherwise, you
938must run sa-learn as the user whose database you are restoring.
939
940
941=head1 INTRODUCTION TO BAYESIAN FILTERING
942
943(Thanks to Michael Bell for this section!)
944
945For a more lengthy description of how this works, go to
946http://www.paulgraham.com/ and see "A Plan for Spam". It's reasonably
947readable, even if statistics make me break out in hives.
948
949The short semi-inaccurate version: Given training, a spam heuristics engine
950can take the most "spammy" and "hammy" words and apply probabilistic
951analysis. Furthermore, once given a basis for the analysis, the engine can
952continue to learn iteratively by applying both the non-Bayesian and Bayesian
953rulesets together to create evolving "intelligence".
954
955SpamAssassin 2.50 and later supports Bayesian spam analysis, in
956the form of the BAYES rules. This is a new feature, quite powerful,
957and is disabled until enough messages have been learnt.
958
959The pros of Bayesian spam analysis:
960
961=over 4
962
963=item Can greatly reduce false positives and false negatives.
964
965It learns from your mail, so it is tailored to your unique e-mail flow.
966
967=item Once it starts learning, it can continue to learn from SpamAssassin
968and improve over time.
969
970=back
971
972And the cons:
973
974=over 4
975
976=item A decent number of messages are required before results are useful
977for ham/spam determination.
978
979=item It's hard to explain why a message is or isn't marked as spam.
980
981i.e.: a straightforward rule, that matches, say, "VIAGRA" is
982easy to understand. If it generates a false positive or false negative,
983it is fairly easy to understand why.
984
985With Bayesian analysis, it's all probabilities - "because the past says
986it is likely as this falls into a probabilistic distribution common to past
987spam in your systems". Tell that to your users! Tell that to the client
988when he asks "what can I do to change this". (By the way, the answer in
989this case is "use whitelisting".)
990
991=item It will take disk space and memory.
992
993The databases it maintains take quite a lot of resources to store and use.
994
995=back
996
997=head1 GETTING STARTED
998
999Still interested? Ok, here are the guidelines for getting this working.
1000
1001First a high-level overview:
1002
1003=over 4
1004
1005=item Build a significant sample of both ham and spam.
1006
1007I suggest several thousand of each, placed in SPAM and HAM directories or
1008mailboxes. Yes, you MUST hand-sort this - otherwise the results won't be much
1009better than SpamAssassin on its own. Verify the spamminess/haminess of EVERY
1010message. You're urged to avoid using a publicly available corpus (sample) -
1011this must be taken from YOUR mail server, if it is to be statistically useful.
1012Otherwise, the results may be pretty skewed.
1013
1014=item Use this tool to teach SpamAssassin about these samples, like so:
1015
1016 sa-learn --spam /path/to/spam/folder
1017 sa-learn --ham /path/to/ham/folder
1018 ...
1019
1020Let SpamAssassin proceed, learning stuff. When it finds ham and spam
1021it will add the "interesting tokens" to the database.
1022
1023=item If you need SpamAssassin to forget about specific messages, use
1024the B<--forget> option.
1025
1026This can be applied to either ham or spam that has run through the
1027B<sa-learn> processes. It's a bit of a hammer, really, lowering the
1028weighting of the specific tokens in that message (only if that message has
1029been processed before).
1030
1031=item Learning from single messages uses a command like this:
1032
1033 sa-learn --ham --no-sync mailmessage
1034
1035This is handy for binding to a key in your mail user agent. It's very fast, as
1036all the time-consuming stuff is deferred until you run with the C<--sync>
1037option.
1038
1039=item Autolearning is enabled by default
1040
1041If you don't have a corpus of mail saved to learn, you can let
1042SpamAssassin automatically learn the mail that you receive. If you are
1043autolearning from scratch, the amount of mail you receive will determine
1044how long until the BAYES_* rules are activated.
1045
1046=back
1047
1048=head1 EFFECTIVE TRAINING
1049
1050Learning filters require training to be effective. If you don't train
1051them, they won't work. In addition, you need to train them with new
1052messages regularly to keep them up-to-date, or their data will become
1053stale and impact accuracy.
1054
1055You need to train with both spam I<and> ham mails. One type of mail
1056alone will not have any effect.
1057
1058Note that if your mail folders contain things like forwarded spam,
1059discussions of spam-catching rules, etc., this will cause trouble. You
1060should avoid scanning those messages if possible. (An easy way to do this
1061is to move them aside, into a folder which is not scanned.)
1062
1063If the messages you are learning from have already been filtered through
1064SpamAssassin, the learner will compensate for this. In effect, it learns what
1065each message would look like if you had run C<spamassassin -d> over it in
1066advance.
1067
1068Another thing to be aware of is that typically you should aim to train
1069with at least 1000 messages of spam, and 1000 ham messages, if
1070possible. More is better, but anything over about 5000 messages does not
1071improve accuracy significantly in our tests.
1072
1073Be careful that you train from the same source -- for example, if you train
1074on old spam, but new ham mail, then the classifier will think that
1075a mail with an old date stamp is likely to be spam.
1076
1077It's also worth noting that training with a very small quantity of
1078ham will produce atrocious results. You should aim to train with at
1079least the same amount (or more if possible!) of ham data as spam.
1080
1081On an on-going basis, it is best to keep training the filter to make
1082sure it has fresh data to work from. There are various ways to do
1083this:
1084
1085=over 4
1086
1087=item 1. Supervised learning
1088
1089This means keeping a copy of all or most of your mail, separated into spam
1090and ham piles, and periodically re-training using those. It produces
1091the best results, but requires more work from you, the user.
1092
1093(An easy way to do this, by the way, is to create a new folder for
1094'deleted' messages, and instead of deleting them from other folders,
1095simply move them in there instead. Then keep all spam in a separate
1096folder and never delete it. As long as you remember to move misclassified
1097mails into the correct folder set, it is easy enough to keep up to date.)
1098
1099=item 2. Unsupervised learning from Bayesian classification
1100
1101Another way to train is to chain the results of the Bayesian classifier
1102back into the training, so it reinforces its own decisions. This is only
1103safe if you then retrain it based on any errors you discover.
1104
1105SpamAssassin does not support this method, due to experimental results
1106which strongly indicate that it does not work well, and since Bayes is
1107only one part of the resulting score presented to the user (while Bayes
1108may have made the wrong decision about a mail, it may have been overridden
1109by another system).
1110
1111=item 3. Unsupervised learning from SpamAssassin rules
1112
1113Also called 'auto-learning' in SpamAssassin. Based on statistical
1114analysis of the SpamAssassin success rates, we can automatically train the
1115Bayesian database with a certain degree of confidence that our training
1116data is accurate.
1117
1118It should be supplemented with some supervised training in addition, if
1119possible.
1120
1121This is the default, but can be turned off by setting the SpamAssassin
1122configuration parameter C<bayes_auto_learn> to 0.
1123
1124=item 4. Mistake-based training
1125
1126This means training on a small number of mails, then only training on
1127messages that SpamAssassin classifies incorrectly. This works, but it
1128takes longer to get it right than a full training session would.
1129
1130=back
1131
1132=head1 FILES
1133
1134B<sa-learn> and the other parts of SpamAssassin's Bayesian learner
1135use a set of persistent database files to store the learnt tokens, as follows.
1136
1137=over 4
1138
1139=item bayes_toks
1140
1141The database of tokens, containing the tokens learnt, their count of
1142occurrences in ham and spam, and the timestamp when the token was last
1143seen in a message.
1144
1145This database also contains some 'magic' tokens, as follows: the version
1146number of the database, the number of ham and spam messages learnt, the
1147number of tokens in the database, and timestamps of: the last journal
1148sync, the last expiry run, the last expiry token reduction count, the
1149last expiry timestamp delta, the oldest token timestamp in the database,
1150and the newest token timestamp in the database.
1151
1152This is a database file, using C<DB_File>. The database 'version
1153number' is 0 for databases from 2.5x, 1 for databases from certain 2.6x
1154development releases, 2 for 2.6x, and 3 for 3.0 and later releases.
1155
1156=item bayes_seen
1157
1158A map of Message-Id and some data from headers and body to what that
1159message was learnt as. This is used so that SpamAssassin can avoid
1160re-learning a message it has already seen, and so it can reverse the
1161training if you later decide that message was learnt incorrectly.
1162
1163This is a database file, using C<DB_File>.
1164
1165=item bayes_journal
1166
1167While SpamAssassin is scanning mails, it needs to track which tokens
1168it uses in its calculations. To avoid the contention of having each
1169SpamAssassin process attempting to gain write access to the Bayes DB,
1170the token timestamps are written to a 'journal' file which will later
1171(either automatically or via C<sa-learn --sync>) be used to synchronize
1172the Bayes DB.
1173
1174Also, through the use of C<bayes_learn_to_journal>, or when using the
1175C<--no-sync> option with sa-learn, the actual learning data will
1176be placed into the journal for later synchronization. This is typically
1177useful for high-traffic sites to avoid the same contention as stated
1178above.
1179
1180=back
1181
1182=head1 EXPIRATION
1183
1184Since SpamAssassin can auto-learn messages, the Bayes database files
1185could increase perpetually until they fill your disk. To control this,
1186SpamAssassin performs journal synchronization and bayes expiration
1187periodically when certain criteria (listed below) are met.
1188
1189SpamAssassin can sync the journal and expire the DB tokens either
1190manually or opportunistically. A journal sync is due if I<--sync>
1191is passed to sa-learn (manual), or if the following is true
1192(opportunistic):
1193
1194=over 4
1195
1196=item - bayes_journal_max_size does not equal 0 (a value of 0 means don't sync)
1197
1198=item - the journal file exists
1199
1200=back
1201
1202and either:
1203
1204=over 4
1205
1206=item - the journal file has a size greater than bayes_journal_max_size
1207
1208=back
1209
1210or
1211
1212=over 4
1213
1214=item - a journal sync has previously occurred, and at least 1 day has
1215passed since that sync
1216
1217=back
1218
1219Expiry is due if I<--force-expire> is passed to sa-learn (manual),
1220or if all of the following are true (opportunistic):
1221
1222=over 4
1223
1224=item - the last expire was attempted at least 12hrs ago
1225
1226=item - bayes_auto_expire does not equal 0
1227
1228=item - the number of tokens in the DB is > 100,000
1229
1230=item - the number of tokens in the DB is > bayes_expiry_max_db_size
1231
1232=item - there is at least a 12 hr difference between the oldest and newest token atimes
1233
1234=back
1235
1236=head2 EXPIRE LOGIC
1237
1238If either the manual or opportunistic method causes an expire run
1239to start, here is the logic that is used:
1240
1241=over 4
1242
1243=item - figure out how many tokens to keep. take the larger of
1244either bayes_expiry_max_db_size * 75% or 100,000 tokens. therefore, the goal
1245reduction is number of tokens - number of tokens to keep.
1246
1247=item - if the reduction number is < 1000 tokens, abort (not worth the effort).
1248
1249=item - if an expire has been done before, guesstimate the new
1250atime delta based on the old atime delta. (new_atime_delta =
1251old_atime_delta * old_reduction_count / goal)
1252
1253=item - if no expire has been done before, or the last expire looks
1254"weird", do an estimation pass. The definition of "weird" is:
1255
1256=over 8
1257
1258=item - last expire over 30 days ago
1259
1260=item - last atime delta was < 12 hrs
1261
1262=item - last reduction count was < 1000 tokens
1263
1264=item - estimated new atime delta is < 12 hrs
1265
1266=item - the difference between the last reduction count and the goal reduction count is > 50%
1267
1268=back
1269
1270=back
1271
1272=head2 ESTIMATION PASS LOGIC
1273
1274Go through each of the DB's tokens. Starting at 12hrs, calculate
1275whether or not the token would be expired (based on the difference
1276between the token's atime and the db's newest token atime) and keep
1277the count. Work out from 12hrs exponentially by powers of 2. ie:
127812hrs * 1, 12hrs * 2, 12hrs * 4, 12hrs * 8, and so on, up to 12hrs
1279* 512 (6144hrs, or 256 days).
1280
1281The larger the delta, the smaller the number of tokens that will
1282be expired. Conversely, the number of tokens goes up as the delta
1283gets smaller. So starting at the largest atime delta, figure out
1284which delta will expire the most tokens without going above the
1285goal expiration count. Use this to choose the atime delta to use,
1286unless one of the following occurs:
1287
1288=over 8
1289
1290=item - the largest atime (smallest reduction count) would expire
1291too many tokens. this means the learned tokens are mostly old and
1292there needs to be new tokens learned before an expire can
1293occur.
1294
1295=item - all of the atime choices result in 0 tokens being removed.
1296this means the tokens are all newer than 12 hours and there needs
1297to be new tokens learned before an expire can occur.
1298
1299=item - the number of tokens that would be removed is < 1000. the
1300benefit isn't worth the effort. more tokens need to be learned.
1301
1302=back
1303
1304If the expire run gets past this point, it will continue to the end.
1305A new DB is created since the majority of DB libraries don't shrink the
1306DB file when tokens are removed. So we do the "create new, migrate old
1307to new, remove old, rename new" shuffle.
1308
1309=head2 EXPIRY RELATED CONFIGURATION SETTINGS
1310
1311=over 4
1312
1313=item C<bayes_auto_expire> is used to specify whether or not SpamAssassin
1314ought to opportunistically attempt to expire the Bayes database.
1315The default is 1 (yes).
1316
1317=item C<bayes_expiry_max_db_size> specifies both the auto-expire token
1318count point, as well as the resulting number of tokens after expiry
1319as described above. The default value is 150,000, which is roughly
1320equivalent to a 6MB database file if you're using DB_File.
1321
1322=item C<bayes_journal_max_size> specifies how large the Bayes
1323journal will grow before it is opportunistically synced. The
1324default value is 102400.
1325
1326=back
1327
1328=head1 INSTALLATION
1329
1330The B<sa-learn> command is part of the B<Mail::SpamAssassin> Perl module.
1331Install this as a normal Perl module, using C<perl -MCPAN -e shell>,
1332or by hand.
1333
1334=head1 SEE ALSO
1335
1336spamassassin(1)
1337spamc(1)
1338Mail::SpamAssassin(3)
1339Mail::SpamAssassin::ArchiveIterator(3)
1340
1341E<lt>http://www.paulgraham.com/E<gt>
1342Paul Graham's "A Plan For Spam" paper
1343
1344E<lt>http://www.linuxjournal.com/article/6467E<gt>
1345Gary Robinson's f(x) and combining algorithms, as used in SpamAssassin
1346
1347E<lt>http://www.bgl.nu/~glouis/bogofilter/E<gt>
1348'Training on error' page. A discussion of various Bayes training regimes,
1349including 'train on error' and unsupervised training.
1350
1351=head1 PREREQUISITES
1352
1353C<Mail::SpamAssassin>
1354
1355=head1 AUTHORS
1356
1357The SpamAssassin(tm) Project E<lt>https://spamassassin.apache.org/E<gt>
1358
1359=cut
1360