root/tags/FuzzyOcr-3.4.1/FuzzyOcr.pm

Revision 30, 40.1 kB (checked in by decoder, 2 years ago)

Added Development releases 3.4.0 and 3.4.1

Line 
1 # FuzzyOcr plugin, version 3.4
2 #
3 # written by Christian Holler (decoder_at_own-hero_dot_net)
4 #   and Jorge Valdes (jorge_at_joval_dot_info)
5
6 package FuzzyOcr;
7
8 use strict;
9 use warnings;
10 use Mail::SpamAssassin;
11 use Mail::SpamAssassin::Logger;
12 use Mail::SpamAssassin::Util;
13 use Mail::SpamAssassin::Timeout;
14 use Mail::SpamAssassin::Plugin;
15
16 use String::Approx 'adistr';
17 use MLDBM qw(DB_File Storable);
18 use FileHandle;
19 use Fcntl ':flock';
20
21 our @ISA = qw (Mail::SpamAssassin::Plugin);
22
23 our %App = ();
24 our %Option = ();
25 our %Score = ();
26 our %Threshold = ();
27 our %words = ();
28 our $self;
29 our $pms;
30 our @scansets;
31
32 our @bin_utils = qw/gifsicle giffix giftext gifinter giftopnm
33     jpegtopnm pngtopnm bmptopnm tifftopnm ppmhist pamfile gocr ocrad/;
34
35 our @pgm_scores = qw/base add corrupt corrupt_unfixable wrongctype
36     autodisable/;
37
38 our @pgm_opts = qw/personal_wordlist global_wordlist logfile
39     threshold counts_required verbose timeout
40     db_hash db_safe db_max_days path_bin scansets keep_bad_images
41     score_ham enable_image_hashing digest_db hashing_learn_scanned/;
42
43 our @paths = qw(/usr/local/netpbm/bin /usr/local/bin /usr/bin);
44
45 # Default values
46 $Option{threshold} = 0.3;
47 $Option{counts_required} = 2;
48 $Option{verbose} = 1;
49 $Option{timeout} = 10;
50 $Option{logfile} = undef;
51 $Option{enable_image_hashing} = 0;
52 $Option{hashing_learn_scanned} = 1;
53 $Option{digest_db} = "/etc/mail/spamassassin/FuzzyOcr.hashdb";
54 $Option{global_wordlist} = "/etc/mail/spamassassin/FuzzyOcr.words";
55 $Option{personal_wordlist} = ".spamassassin/fuzzyocr.words";
56 $Option{db_hash} = "/etc/mail/spamassassin/FuzzyOcr.db";
57 $Option{db_safe} = "/etc/mail/spamassassin/FuzzyOcr.safe.db";
58 $Option{db_max_days} = 35;
59 $Option{keep_bad_images} = 0;
60 $Option{score_ham} = 0;
61
62 # Default scores
63 $Score{base} = 4;
64 $Score{add} = 1;
65 $Score{corrupt} = 2.5;
66 $Score{corrupt_unfixable} = 5;
67 $Score{wrongctype} = 1.5;
68 $Score{autodisable} = 10;
69
70 # Default thresolds
71 $Threshold{s}  =
72 $Threshold{h}  =
73 $Threshold{w}  =
74 $Threshold{cn} = 0.01;
75 $Threshold{c}  = 5;
76 $Threshold{max_hash} = 5;
77
78 # constructor: register the eval rule
79 sub new {
80     my ( $class, $mailsa ) = @_;
81     $class = ref($class) || $class;
82     my $self = $class->SUPER::new($mailsa);
83     bless( $self, $class );
84     $self->register_eval_rule("fuzzyocr_check");
85     $self->register_eval_rule("dummy_check");
86     return $self;
87 }
88
89 sub parse_config {
90     my ( $self, $opts ) = @_;
91     if ( $opts->{key} =~ /^focr_bin_/i ) {
92         my $p = lc $opts->{key};
93         $p =~ s/focr_bin_//;
94         if (grep {m/$p/} @bin_utils) {
95             $App{$p} = $opts->{value};
96             debuglog("App{$p} => $App{$p}");
97         } else {
98             debuglog("unknown App: $opts->{key}");
99         }
100     }
101     elsif ( $opts->{key} =~ m/_score$/i ) {
102         my $o = lc $opts->{key};
103         $o =~ s/focr_//;
104         $o =~ s/_score//;
105         if (grep {m/$o/} @pgm_scores) {
106             $Score{$o} = $opts->{value};
107             debuglog("Score{$o} = $Score{$o}");
108         } else {
109             debuglog("unknown Score: $opts->{key}");
110         }
111     }
112     else {
113         my $o = lc $opts->{key};
114         $o =~ s/focr_//;
115         if (grep {m/$o/} @pgm_opts) {
116             if ($o eq 'scansets') {
117                 @scansets = (); # remove
118                 foreach my $s (split(',',$opts->{value})) {
119                     $s =~ s/^\s*//; $s =~ s/\s*$//;
120                     push @scansets,$s;
121                     debuglog("Found scan: $s");
122                 }
123             } elsif ($o eq 'path_bin') {
124                 @paths = (); # remove
125                 foreach my $p (split(':',$opts->{value})) {
126                     next unless -d $p;
127                     push @paths,$p;
128                     debuglog("Valid search path: $p");
129                 }
130             } else {
131                 $Option{$o} = $opts->{value};
132                 debuglog("Option $o = $Option{$o}");
133             }
134         } else {
135             debuglog("unknown Option: $opts->{key}");
136         }
137     }
138     1;
139 }
140
141 sub finish_parsing_end {
142     unless (@paths) {
143         foreach my $p ( '/usr/local/netpbm/bin', '/usr/local/bin', '/usr/bin' ) {
144             if (-d $p) {
145                 push @paths, $p;
146                 debuglog("Searching in: $p");
147             }
148         }
149     }
150     foreach my $a (@bin_utils) {
151         if (defined $App{$a} and ! -x $App{$a}) {
152             debuglog("cannot exec $a, removing...");
153             delete $App{$a};
154         }
155         foreach my $p (@paths) {
156             my $f = "$p/$a";
157             if (! defined $App{$a} and -x $f) {
158                 $App{$a} = $f;
159                 last;
160             }
161         }
162         if (defined $App{$a}) {
163             debuglog("Using $a => $App{$a}");
164         } else {
165             debuglog("Cannot find executable for $a");
166         }
167     }
168     if ($Option{enable_image_hashing} == 2 and -r $Option{digest_db}) {
169         my %DB; my $dbm; my $err = 0;
170         my $now = time - ($Option{db_max_days}*86400);
171         tie %DB, 'MLDBM', $Option{db_hash} or $err++;
172         if ($err) {
173             debuglog("Could not open \"$Option{db_hash}\"");
174         } else {
175             my $hash = 0;
176             debuglog("Expiring records prior to: ".scalar(localtime($now)));
177             foreach my $k (keys %DB) {
178                 my $db = $DB{$k};
179                 if ($db->{check} < $now) {
180                     debuglog("Expire: <$k> Reason: $db->{check} < $now");
181                     delete $DB{$k}; $hash++;
182                 }
183             }
184             debuglog("Expired <$hash> Image Hashes after $Option{db_max_days} day(s)")
185                 if ($hash>0);
186             $hash = 0;
187             open HASH, $Option{digest_db};
188             while (<HASH>) {
189                 chomp;
190                 my($score,$basic,$key) = split('::',$_,3);
191                 next if (defined $DB{$key});
192                 $dbm = $DB{$key};
193                 $dbm->{score} = $score;
194                 $dbm->{basic} = $basic;
195                 $dbm->{input} =
196                 $dbm->{check} = time;
197                 $dbm->{match} = 1;
198                 $DB{$key} = $dbm;
199                 $hash++;
200             }
201             close HASH;
202             debuglog("Imported <$hash> Image Hashes from \"$Option{digest_db}\"")
203                 if ($hash>0);
204             $hash = scalar(keys %DB);
205             debuglog("<$hash> Known BAD Image Hashes Available");
206         }
207         untie %DB; $err = 0;
208         tie %DB, 'MLDBM', $Option{db_safe} or $err++;
209         if ($err) {
210             debuglog("Could not open \"$Option{db_safe}\"");
211         } else {
212             my $hash = 0;
213             foreach my $k (keys %DB) {
214                 my $db = $DB{$k};
215                 if ($db->{check} < $now) {
216                     debuglog("Expire: <$k> Reason: $db->{check} < $now");
217                     delete $DB{$k}; $hash++;
218                 }
219             }
220             debuglog("Expired <$hash> Image Hashes after $Option{db_max_days} day(s)")
221                 if ($hash>0);
222             $hash = scalar(keys %DB);
223             debuglog("<$hash> Known GOOD Image Hashes Available");
224         }
225         untie %DB;
226     }
227     load_global_words( $Option{global_wordlist} );
228
229     unless (@scansets) {
230         @scansets = ( '$gocr -i $pfile', '$gocr -l 180 -d 2 -i $pfile');
231     }
232     foreach (@scansets) {
233         debuglog("Using scan: $_");
234     }
235 }
236
237 sub dummy_check {
238     return 0;
239 }
240
241 sub load_global_words {
242     unless ( -r $_[0] ) {
243         debuglog("Cannot read Global wordlist: \"$_[0]\"\n Please check file path and permissions are correct.");
244         return;
245     }
246     my $cnt = 0;
247     open WORDLIST, "<$_[0]";
248     while(my $w = <WORDLIST>) {
249         chomp($w);
250         $w =~ s/\s*//;
251         $w =~ s/#(.*)//;
252         next unless $w;
253         my $wt = $Option{threshold};
254         if ($w =~ /^(.*?)::(0(\.\d+){0,1})/) {
255             ($w, $wt) = (lc($1), $2);
256             $wt = $Option{threshold} unless ($wt =~ m/[\d\.]+/);
257         }
258         $words{$w} = $wt; $cnt++;
259     }
260     close WORDLIST;
261     debuglog("Loaded <$cnt> words from \"$_[0]\"");
262 }
263
264 sub load_personal_words {
265     unless ( -e $_[0] ) {
266         #debuglog("Personal wordlist <$_[0]> not found, skipping...");
267         return;
268     }
269     unless ( -r $_[0] ) {
270         debuglog("Cannot read from wordlist \"$_[0]\"\n Please make sure that permissions are correct."
271         );
272         return;
273     }
274     my $cnt = 0;
275     open WORDLIST, "<$_[0]";
276     while(my $w = <WORDLIST>) {
277         chomp($w);
278         $w =~ s/\s*//;
279         $w =~ s/#(.*)//;
280         next unless $w;
281         my $wt = $Option{threshold};
282         if ($w =~ /^(.*?)::(0(\.\d+){0,1})/) {
283             ($w, $wt) = ($1, $2);
284             $wt = $Option{threshold} unless ($wt =~ m/[\d\.]+/);
285         }
286         $words{$w} = $wt; $cnt++;
287     }
288     close WORDLIST;
289     debuglog("Updated Word List with $cnt words from $_[0]");
290 }
291
292 sub max {
293     unless ( defined( $_[0] ) and defined( $_[1] ) ) { return 0 }
294     unless ( defined( $_[0] ) ) { return $_[1] }
295     unless ( defined( $_[1] ) ) { return $_[0] }
296     if     ( $_[0] < $_[1] )    { return $_[1] }
297     else                        { return $_[0] }
298 }
299
300 sub within_threshold {
301     my $digest = shift;
302     my $record = shift;
303
304     my ($dimg,$dkey) = split('::',$digest);
305     my ($rimg,$rkey) = split('::',$record);
306     my ($ds, $dh, $dw, $dcn) = split(':',$dimg);
307     my ($rs, $rh, $rw, $rcn) = split(':',$rimg);
308     return(0) unless $rs;
309     return(0) unless $rh;
310     return(0) unless $rw;
311     return(0) unless $rcn;
312     return(0) unless $rkey;
313     return(0) if ((abs($ds  - $rs ) / $rs ) > $Threshold{s});
314     return(0) if ((abs($dh  - $rh ) / $rh ) > $Threshold{h});
315     return(0) if ((abs($dw  - $rw ) / $rw ) > $Threshold{w});
316     return(0) if ((abs($dcn - $rcn) / $rcn) > $Threshold{cn});
317            
318     my @rcf = split('::',$rkey);
319     my @dcf = split('::',$dkey);
320
321     my (@dcfs, @rcfs);
322     foreach (@dcf) { push @dcfs,split(':',$_); }
323     foreach (@rcf) { push @rcfs,split(':',$_); }
324
325     my $total = scalar(@rcfs);
326     if ($total == scalar(@dcfs)) {
327         my $match = 0;
328         foreach (0 .. ($total-1)) {
329             $match++ if (abs($dcfs[$_] - $rcfs[$_]) <= $Threshold{c});
330         }
331         debuglog("image matched <$match> of <$total> colors");
332         return(1) if ($match == $total);
333     }
334     return(0);
335 }
336
337 sub fmt_time {
338     my $when = time - $_[0];
339     my $ret;
340
341     if ($when>86400) {
342         my $d = int($when/86400);
343         $when -= $d*86400;
344         $ret = "$d days,";
345     }
346     if ($when>3600) {
347         my $h = int($when/3600);
348         $when -= $h*3600;
349         $ret .= " $h hours";
350     }
351     if ($when>60) {
352         my $m = int($when/60);
353         $when -= $m*60;
354         $ret .= " $m minutes";
355     }
356     if ($when>0) {
357         $ret .= " $when seconds";
358     }
359     $ret .= " ago";
360     return $ret;
361 }
362
363 sub gif_info {
364     my ($giffile, $t) = @_;
365    
366     my $fd = new IO::Handle;
367    
368     my @stdout_data;
369     my @stderr_data;
370
371     my $retcode = $t->run_and_catch(sub {
372         @stdout_data = qx($App{gifsicle} --info $giffile);
373     });
374
375     if ($retcode) {
376         #FIXME: WARN HERE
377     }
378
379     my $output = join("", @stdout_data);
380    
381     my ($globalinfo, @frameinfo)
382         = split /^ \s+ \+ \s+ (?=image \s+ \#\d+)/mx, $output;
383
384     my %info = ( 'loop' => 0,
385                  'loop_count' => 0,
386                  'delays' => [],
387                  'has_local_color_table' => 0 );
388
389     if ($globalinfo =~ /^ \s* loop \s+ (forever|count \s+ (\d+))/mx) {
390         $info{'loop'} = 1;
391         $info{'loop_count'} = $2 ? ($2 + 0) : 0;
392     }
393
394     my $frameno = 0;
395     foreach my $info (@frameinfo) {
396         # We could just match the delays, but we'll also check the image#'s
397         # as a sanity check.
398         my ($n, $delay) = $info =~ m{ image \s+ \#(\d+)
399                                       (?: .* \b delay \s+ (\d+(?:\.\d+)?) s)?
400                                       }sx;
401         #FIXME: Handle this error
402         $n == $frameno
403             or debuglog ( "Trouble parsing 'gifsicle --info' output.\n"
404                      . "  Expected 'image \#$frameno', found 'image \#$n'" );
405
406         $info{'delays'}->[$frameno++] = $delay ? $delay + 0.0 : 0.0;
407         $info{'has_local_color_table'} ||= $output =~ /local\s+color\s+table/;
408     }
409
410     return \%info;
411 }
412
413 sub find_dominant_image ($) {
414     my ($info) = @_;
415     my ($loop, $loop_count, $delays) = @$info{qw(loop loop_count delays)};
416
417     # Pick out the frame with the longest delay.
418     my $maxdelay = -1e6;
419     my $maxn = @$delays - 1;
420     for (my $n = 0; $n < @$delays; $n++) {
421         $delays->[$n] > $maxdelay
422             and ($maxn, $maxdelay) = ($n, $delays->[$n]);
423     }
424
425     if ($maxdelay < 15.0 && !$loop) {
426         # In non-looped (or finitely-looped) images, the last frame
427         # gets displayed forever at the end of the animation.
428         # Therefore the last frame is the dominant frame.
429         return @$delays - 1;
430     }
431     return $maxn;
432 }
433
434
435
436 sub check_image_hash_db {
437     my $digest = $_[0];
438     my $dbfile = $_[1] || $Option{db_hash};
439     my $fname  = $_[2];
440     my $ctype  = $_[3];
441     my ($img, $key) = split('::', $digest,2);
442     return (0,'') unless defined $key;
443     my $hash = $digest;
444     my $ret = 0; my $txt = 'Exact';
445     my $dinfo;
446     my %DB = (); my $dbm;
447
448     if ($Option{enable_image_hashing} == 2) {
449         tie %DB, 'MLDBM', $dbfile, O_RDWR or $ret++;
450         if ($ret>0) {
451             debuglog("No Image Hash database found at \"$dbfile\", or permissions wrong.");
452             return (0,'');
453         }
454         if (defined $DB{$key}) {
455             $dbm = $DB{$key};
456             if ($img eq $dbm->{basic}) {
457                 $ret = $dbm->{score} || 0.001;
458                 $dinfo = $dbm->{dinfo} || '';
459                 $dbm->{fname} = $fname;
460                 $dbm->{ctype} = $ctype;
461                 debuglog("Updating $txt info File:'$fname' Type:'$ctype'");
462                 $DB{$key} = $dbm;
463             }
464         }
465         if ($ret == 0) {
466             my $now = time - ($Option{db_max_days}*86400);
467             foreach my $k (keys %DB) {
468                 $dbm  = $DB{$k};
469                 $hash = $dbm->{basic} ? $dbm->{basic} : "0:0:0:0::$k";
470                 if (within_threshold($digest,$hash)) {
471                     $ret  = $dbfile eq $Option{db_hash} ? $dbm->{score} : $dbm->{match};
472                     $txt  = 'Approx'; $dinfo = $dbm->{dinfo} || '';
473                     debuglog("Found in: <$dbfile>");
474                     last;
475                 }
476                 # Has the record expired??
477                 $dbm->{check} = $now - 1 unless defined $dbm->{check};
478                 if ($dbm->{check} < $now) {
479                     debuglog("Expiring <$k> older than $Option{db_max_days} days");
480                     delete $DB{$k};
481                 }
482             }
483         }
484         if ($ret>0) {
485             $dbm->{match}++;
486             if ($dbfile eq $Option{db_hash}) {
487                 $ret = sprintf("%0.3f",$dbm->{score});
488                 debuglog("Found Score <$ret> for $txt Image Hash");
489             }
490             debuglog("Matched [$dbm->{match}] time(s). Prev match: ".fmt_time($dbm->{check}));
491             $dbm->{check} = time;
492             $DB{$key} = $dbm;
493         }
494         untie %DB;
495         return ($ret,$dinfo);
496     } elsif ($Option{enable_image_hashing} == 1) {
497         $ret = open HASH, $Option{digest_db};
498         unless($ret) {
499             debuglog("No Image Hash database found at \"$Option{digest_db}\", or permissions wrong.");
500             return (0,'');
501         }
502         while (<HASH>) {
503             chomp;
504             ($ret,$hash) = split('::',$_,2);
505             if (within_threshold($digest,$hash)) {
506                 debuglog("Found Score <$ret> for Hash <$hash>");
507                 return ($ret,'');
508             }
509         }
510         close HASH;
511         return (0,'');
512     }
513 }
514            
515 sub add_image_hash_db {
516     my $digest = $_[0];
517     my $score  = $_[1];
518     my $ret = 0;
519
520     if ($Option{enable_image_hashing} == 2) {
521         my $dbfile = $_[2] || $Option{db_hash};
522         my %DB = ();
523         tie %DB, 'MLDBM', $dbfile or $ret++;
524         if ($ret>0) {
525             debuglog("Unable to open/create Image Hash database at \"$dbfile\", check permissions.");
526             return;
527         }
528         debuglog("Adding Hash to \"$dbfile\"");
529
530         my ($img,$key) = split('::',$digest,2);
531         if (defined $key) {
532             my $dbm = $DB{$key};
533             $dbm->{fname} = $_[3];
534             $dbm->{ctype} = $_[4];
535             $dbm->{dinfo} = $_[5];
536             $dbm->{basic} = $img;
537             $dbm->{score} = $score;
538             $dbm->{input} =
539             $dbm->{check} = time;
540             $dbm->{match} = $dbfile eq $Option{db_hash} ? 0 : 1;
541             $DB{$key} = $dbm;
542         }
543         untie %DB;
544     } elsif ($Option{enable_image_hashing} == 1) {
545         if (-e $Option{digest_db}) {
546             $ret = open DB, ">>$Option{digest_db}";
547         } else {
548             $ret = open DB,  ">$Option{digest_db}";
549         }
550         unless ($ret) {
551             debuglog("Unable to open/create Image Hash database at \"$Option{digest_db}\", check permissions.");
552             return;
553         }
554         debuglog("Adding Hash to \"$Option{digest_db}\"");
555         flock( DB, LOCK_EX );
556         seek( DB, 0, 2 );
557         print DB "${score}::${digest}\n";
558         flock( DB, LOCK_UN );
559         close(DB);
560     }
561     debuglog("Digest: $digest");
562 }
563
564 sub calc_image_hash {
565     my $pfile = $_[0];
566     my ($rcode, $hash);
567
568     foreach my $a (qw/pamfile ppmhist/) {
569         unless (defined $App{$a}) {
570             info("FuzzyOcr: calc_image_hash cannot exec $a");
571             return (1, '');
572         }
573     }
574    
575     unless (-r $pfile) {
576         debuglog("Cannot read $pfile");
577         return(1,'');
578     }
579
580     my @stdout_data;
581     my ($w,$h,$s,$t) = (0,0,0,0);
582     $t = Mail::SpamAssassin::Timeout->new({ secs => $Option{timeout} });
583     $rcode = $t->run_and_catch(sub {
584         @stdout_data = qx($App{pamfile} $pfile 2>/dev/null);
585     });
586     if ($rcode) {
587         chomp $rcode;
588         debuglog("$App{pamfile}: Timed out [$rcode], skipping...");
589         return (1, '');
590     }
591     if ($stdout_data[0] =~ m/(\d+) by (\d+)/) {
592         $w = $1; $h = $2;
593         $s = (stat($pfile))[7];
594     }
595
596     $rcode = $t->run_and_catch(sub {
597         @stdout_data = qx($App{ppmhist} -noheader $pfile 2>/dev/null);
598     });
599     if ($rcode) {
600         chomp $rcode;
601         debuglog("$App{ppmhist}: Timed out [$rcode], skipping...");
602         return (1, '');
603     }
604     my $cnt = 0;
605     my $c = scalar(@stdout_data);
606     $hash = sprintf "%d:%d:%d:%d",$s,$h,$w,$c;
607     if ($Threshold{max_hash}) {
608         foreach (@stdout_data) {
609             $_ =~ s/ +/ /g;
610             my(@d) = split(' ', $_);
611             $hash .= sprintf("::%d:%d:%d:%d:%d",@d);
612             if ($cnt++ ge $Threshold{max_hash}) {
613                 last;
614             }
615         }
616     }
617     debuglog("Got: <$hash>");
618     return(0, $hash);
619 }
620
621 sub logfile {
622     my $logtext = $_[0];
623     my ( $sec, $min, $hour, $mday, $mon, $year, $wday, $yday, $isdst ) = localtime(time);
624     my $time = sprintf(
625         "%4d-%02d-%02d %02d:%02d:%02d",
626         $year + 1900,
627         $mon + 1, $mday, $hour, $min, $sec
628     );
629     $logtext =~ s/\n/\n                      /g;
630
631     unless ( open LOGFILE, ">>", $Option{logfile} ) {
632         warn "Can't open $Option{logfile} for writing, check permissions";
633     }
634     flock( LOGFILE, LOCK_EX );
635     seek( LOGFILE, 0, 2 );
636     print LOGFILE "$time [$$] $logtext\n";
637     flock( LOGFILE, LOCK_UN );
638     close LOGFILE;
639 }
640
641 sub debuglog {
642     my @lines = split('\n',$_[0]);
643     my $limit = $_[1];
644     $limit = 1 unless defined $limit;
645     foreach (@lines) { dbg ("FuzzyOcr: $_"); }
646     if ( $Option{verbose} > $limit ) {
647         logfile($_[0]) if defined $Option{logfile};
648     }
649 }
650
651 sub wrong_ctype {
652     my ( $format, $ctype ) = @_;
653     if ($Score{wrongctype}) {
654         my $debuginfo = "";
655         if ( $Option{verbose} > 0 ) {
656             $debuginfo =
657               ("Image has format \"$format\" but content-type is \"$ctype\"");
658         }
659         debuglog($debuginfo);
660         for my $set ( 0 .. 3 ) {
661             $pms->{conf}->{scoreset}->[$set]->{"FUZZY_OCR_WRONG_CTYPE"} =
662               sprintf( "%0.3f", $Score{wrongctype} );
663         }
664         $pms->_handle_hit( "FUZZY_OCR_WRONG_CTYPE", $Score{wrongctype}, "BODY: ",
665             $pms->{conf}->{descriptions}->{FUZZY_OCR_WRONG_CTYPE} . "\n$debuginfo" );
666     }
667 }
668
669 sub corrupt_img {
670     my ($score, $err) = @_;
671     if ($score>0) {
672         my $debuginfo = "";
673         if ( $Option{verbose} > 0 ) {
674             chomp($err);
675             $debuginfo = ("Corrupt image: $err");
676         }
677         debuglog($debuginfo);
678         for my $set ( 0 .. 3 ) {
679             $pms->{conf}->{scoreset}->[$set]->{"FUZZY_OCR_CORRUPT_IMG"} =
680               sprintf( "%0.3f", $score );
681         }
682         $pms->_handle_hit( "FUZZY_OCR_CORRUPT_IMG", $score, "BODY: ",
683             $pms->{conf}->{descriptions}->{FUZZY_OCR_CORRUPT_IMG} . "\n$debuginfo" );
684     }
685 }
686
687 sub known_img_hash {
688     my $score = $_[0] || $Score{base};
689     my $dinfo = $_[1] ? "\n$_[1]" : '';
690     for my $set ( 0 .. 3 ) {
691         $pms->{conf}->{scoreset}->[$set]->{"FUZZY_OCR_KNOWN_HASH"} =
692         sprintf( "%0.3f", $score );
693     }
694     $pms->_handle_hit( "FUZZY_OCR_KNOWN_HASH", $score, "BODY: ",
695         $pms->{conf}->{descriptions}->{FUZZY_OCR_KNOWN_HASH} . $dinfo );
696 }
697
698 sub removedir {
699     my $dir = $_[0];
700     return unless -d $dir;
701     opendir D, $dir;
702     my @files = readdir D;
703     closedir D;
704     foreach my $f (@files) {
705         next if $f eq '.';
706         next if $f eq '..';
707         my $ff = Mail::SpamAssassin::Util::untaint_file_path("$dir/$f");
708         unless (unlink $ff) {
709             debuglog("Cannot remove: $ff");
710         }
711     }
712     debuglog("Remove DIR: $dir");
713     unless(rmdir $dir) {
714         debuglog("Cannot remove DIR: $dir");
715     }
716 }
717
718 sub fuzzyocr_check {
719     ( $self, $pms ) = @_;
720     if ( $pms->get_score() > $Score{autodisable} ) {
721         debuglog("Scan canceled, message has already more than $Score{autodisable} points.");
722         return 0;
723     }
724
725     my $imgdir;
726     my %imgfiles = ();
727     my @found    = ();
728     my @hashes   = ();
729     my $cnt      = 0;
730     my $imgerr   = 0;
731
732     #debuglog("Starting FuzzyOcr...");
733     #debuglog("Attempting to load personal wordlist...");
734     if ($Option{personal_wordlist} =~ m/^\//) {
735         load_personal_words( $Option{personal_wordlist} );
736     } else {
737         my $homedir  = (getpwuid($<))[7];
738         if ($homedir) {
739             load_personal_words( $homedir . "/$Option{personal_wordlist}" );
740         } elsif (defined($ENV{HOME})) {
741             load_personal_words( $ENV{HOME} . "/$Option{personal_wordlist}" );
742         } else {
743             debuglog("Variable \$ENV{HOME} not defined and getpwuid failed, personal wordlist function not available...");
744         }
745     }
746
747     foreach my $p (
748         $pms->{msg}->find_parts(qr(^image\b)i),
749         $pms->{msg}->find_parts(qr(Application/Octet-Stream)i)
750     ) {
751         my $ctype = $p->{'type'};
752         my $fname = $p->{'name'} || 'unknown';
753         if (($fname eq 'unknown') and
754             (defined $p->{'headers'}->{'content-id'})
755             ){
756             $fname = join('',@{$p->{'headers'}->{'content-id'}});
757             $fname =~ s/[<>]//g;
758             $fname =~ tr/\@\$\%\&/_/s;
759         }
760         my $test  = 0;
761         $test++ if ($ctype =~ /image/i);
762         $test++ if ($fname =~ /(gif|jpg|jpeg|png|bmp|tiff)$/i);
763
764         if ($test == 0) {
765             debuglog("Skipping file with content-type=\"$ctype\" name=\"$fname\"");
766             next;
767         }
768
769         $imgdir = Mail::SpamAssassin::Util::secure_tmpdir() unless ($imgdir);
770         unless ($imgdir) {
771             debuglog("Scan canceled, cannot create Image TMPDIR.");
772             return 0;
773         }
774         #keep raw email for debugging later
775         my $imgfilename = $imgdir . "/raw.eml";
776         unless (-e $imgfilename) {
777             if (open RAW, ">$imgfilename") {
778                 print RAW $pms->{msg}->get_pristine();
779                 close RAW;
780                 debuglog("Saved: $imgfilename");
781             }
782         }
783
784         $fname =~ tr{a-zA-Z0-9\.}{_}cs;
785         $imgfilename = Mail::SpamAssassin::Util::untaint_file_path(
786             $imgdir . "/" . $fname
787         );
788         my $unique = 0;
789         while (-e $imgfilename) {
790             $imgfilename = Mail::SpamAssassin::Util::untaint_file_path(
791                 $imgdir . "/" . chr(65+$unique) . <