root/tags/FuzzyOcr-2.3j/FuzzyOcr.pm

Revision 3, 39.9 kB (checked in by decoder, 2 years ago)

Added current stable and testing release
Added samples
Added patches to external toolchain

Line 
1 # FuzzyOcr plugin, version 2.3j
2 # Changelog:
3 #    version 2.0
4 #    Replaced imagemagick with netpbm
5 #    Invoke giffix to fix broken gifs before conversion
6 #    Support png images
7 #    Analyze the file to detect the format without content-type
8 #    Added several configuration parameters
9 #    version 2.1
10 #       Added scoring for wrong content-type
11 #       Added scoring for broken gif images
12 #       Added configuration for helper applications
13 #       Added autodisable_score feature to disable the OCR engine if the message has already enough points
14 #    version 2.1b
15 #       Rule bugfix to avoid warnings
16 #    version 2.1c
17 #       Applied patch provided by Howard Kash to fix problems with spamassassin + Mailscanner + FuzzyOcr
18 #       Removed '-' from jpegtopnm arguments to provide backwards compatibility for older netpbm versions
19 #       Fixed typo (treshold -> threshold)
20 #    version 2.2
21 #       Small bugfix in content-type check for jpeg (jpg was not matching), thanks to Matthias Keller
22 #       Added more error handling
23 #       Removed debug files, added logfile instead
24 #       More messages with verbose = 2
25 #    version 2.3
26 #       Multiple scans with different pnm preprocessing and gocr arguments possible
27 #       Support for interlaced gifs
28 #       Support for animated gifs
29 #       Temporary file handling reorganized
30 #       External wordlist support
31 #       Personalized wordlist support
32 #       Spaces are now stripped from wordlist words and OCR results before matching
33 #       Experimental MD5 Database feature
34 #    version 2.3b
35 #       MD5 Database replaced by different feature database
36 #       Corrupted images are now handled better
37 #       Added a timeout function to avoid lockups
38 #       Added threshold overriding on word basis in wordlist
39 #       Various bugfixes
40 #
41 # written by Christian Holler decoder_at_own-hero_dot_net
42 #   patch by Jorge Valdes jorge_at_joval_info
43
44 package FuzzyOcr;
45
46 use strict;
47 use warnings;
48 use Mail::SpamAssassin;
49 use Mail::SpamAssassin::Logger;
50 use Mail::SpamAssassin::Util;
51 use Mail::SpamAssassin::Timeout;
52 use Mail::SpamAssassin::Plugin;
53
54 use String::Approx 'adistr';
55 use Image::Magick;
56 use MLDBM qw(DB_File Storable);
57 use FileHandle;
58 use Fcntl ':flock';
59
60 our @ISA = qw (Mail::SpamAssassin::Plugin);
61
62 our %App = ();
63 our %Option = ();
64 our %Score = ();
65 our %Threshold = ();
66 our %words = ();
67 our $self;
68 our $pms;
69 our @scansets;
70
71 our @bin_utils = qw/giffix giftext gifinter giftopnm
72     jpegtopnm pngtopnm bmptopnm ppmhist gocr/;
73
74 our @pgm_scores = qw/base add corrupt corrupt_unfixable wrongctype
75     autodisable/;
76
77 our @pgm_opts = qw/personal_wordlist global_wordlist logfile
78     threshold counts_required verbose timeout gif_max_frames
79     db_hash db_safe db_max_days path_bin scansets keep_bad_images
80     anim_delay anim_max_frames score_ham
81     enable_image_hashing digest_db hashing_learn_scanned/;
82
83 our @paths = qw(/usr/local/netpbm/bin /usr/local/bin /usr/bin);
84
85 # Default values
86 $Option{threshold} = 0.3;
87 $Option{counts_required} = 2;
88 $Option{verbose} = 1;
89 $Option{timeout} = 10;
90 $Option{gif_max_frames} = 5;
91 $Option{logfile} = "stderr";
92 $Option{enable_image_hashing} = 0;
93 $Option{hashing_learn_scanned} = 1;
94 $Option{digest_db} = "/etc/mail/spamassassin/FuzzyOcr.hashdb";
95 $Option{global_wordlist} = "/etc/mail/spamassassin/FuzzyOcr.words";
96 $Option{personal_wordlist} = ".spamassassin/fuzzyocr.words";
97 $Option{db_hash} = "/etc/mail/spamassassin/FuzzyOcr.db";
98 $Option{db_safe} = "/etc/mail/spamassassin/FuzzyOcr.safe.db";
99 $Option{db_max_days} = 35;
100 $Option{keep_bad_images} = 0;
101 $Option{anim_delay} = 100;
102 $Option{anim_max_frames} = 2;
103 $Option{score_ham} = 0;
104
105 # Default scores
106 $Score{base} = 4;
107 $Score{add} = 1;
108 $Score{corrupt} = 2.5;
109 $Score{corrupt_unfixable} = 5;
110 $Score{wrongctype} = 1.5;
111 $Score{autodisable} = 10;
112
113 # Default thresolds
114 $Threshold{s}  =
115 $Threshold{h}  =
116 $Threshold{w}  =
117 $Threshold{cn} = 0.01;
118 $Threshold{c}  = 5;
119 $Threshold{max_hash} = 5;
120
121 # constructor: register the eval rule
122 sub new {
123     my ( $class, $mailsa ) = @_;
124     $class = ref($class) || $class;
125     my $self = $class->SUPER::new($mailsa);
126     bless( $self, $class );
127     $self->register_eval_rule("fuzzyocr_check");
128     $self->register_eval_rule("dummy_check");
129     return $self;
130 }
131
132 sub parse_config {
133     my ( $self, $opts ) = @_;
134     if ( $opts->{key} =~ /^focr_bin_/i ) {
135         my $p = lc $opts->{key};
136         $p =~ s/focr_bin_//;
137         if (grep {m/$p/} @bin_utils) {
138             $App{$p} = $opts->{value};
139             debuglog("App{$p} => $App{$p}");
140         } else {
141             debuglog("unknown App: $opts->{key}");
142         }
143     }
144     elsif ( $opts->{key} =~ m/_score$/i ) {
145         my $o = lc $opts->{key};
146         $o =~ s/focr_//;
147         $o =~ s/_score//;
148         if (grep {m/$o/} @pgm_scores) {
149             $Score{$o} = $opts->{value};
150             debuglog("Score{$o} = $Score{$o}");
151         } else {
152             debuglog("unknown Score: $opts->{key}");
153         }
154     }
155     else {
156         my $o = lc $opts->{key};
157         $o =~ s/focr_//;
158         if (grep {m/$o/} @pgm_opts) {
159             if ($o eq 'scansets') {
160                 @scansets = (); # remove
161                 foreach my $s (split(',',$opts->{value})) {
162                     $s =~ s/^\s*//; $s =~ s/\s*$//;
163                     push @scansets,$s;
164                     debuglog("Found scan: $s");
165                 }
166             } elsif ($o eq 'path_bin') {
167                 @paths = (); # remove
168                 foreach my $p (split(':',$opts->{value})) {
169                     next unless -d $p;
170                     push @paths,$p;
171                     debuglog("Valid search path: $p");
172                 }
173             } else {
174                 $Option{$o} = $opts->{value};
175                 debuglog("Option $o = $Option{$o}");
176             }
177         } else {
178             debuglog("unknown Option: $opts->{key}");
179         }
180     }
181     1;
182 }
183
184 sub finish_parsing_end {
185     if ($Option{logfile} !~ m/stderr/i) {
186         if(!Mail::SpamAssassin::Logger::add(method=>'file', filename=>$Option{logfile})) {
187             debuglog("Could not log into $Option{logfile}");
188         } else {
189             Mail::SpamAssassin::Logger::add_facilities('file');
190             debuglog("Now logging to \"$Option{logfile}\"");
191         }
192     }
193     unless (@paths) {
194         foreach my $p ( '/usr/local/netpbm/bin', '/usr/local/bin', '/usr/bin' ) {
195             if (-d $p) {
196                 push @paths, $p;
197                 debuglog("Searching in: $p");
198             }
199         }
200     }
201     foreach my $a (@bin_utils) {
202         if (defined $App{$a} and ! -x $App{$a}) {
203             debuglog("cannot exec $a, removing...");
204             delete $App{$a};
205         }
206         foreach my $p (@paths) {
207             my $f = "$p/$a";
208             if (! defined $App{$a} and -x $f) {
209                 $App{$a} = $f;
210                 last;
211             }
212         }
213         if (defined $App{$a}) {
214             debuglog("Using $a => $App{$a}");
215         } else {
216             debuglog("Cannot find executable for $a");
217         }
218     }
219     if ($Option{enable_image_hashing} == 2 and -r $Option{digest_db}) {
220         my %DB; my $dbm; my $err = 0;
221         my $now = time - ($Option{db_max_days}*86400);
222         tie %DB, 'MLDBM', $Option{db_hash} or $err++;
223         if ($err) {
224             debuglog("Could not open \"$Option{db_hash}\"");
225         } else {
226             my $hash = 0;
227             debuglog("Expiring records prior to: ".scalar(localtime($now)));
228             foreach my $k (keys %DB) {
229                 my $db = $DB{$k};
230                 if ($db->{check} < $now) {
231                     debuglog("Expire: <$k> Reason: $db->{check} < $now");
232                     delete $DB{$k}; $hash++;
233                 }
234             }
235             debuglog("Expired <$hash> Image Hashes after $Option{db_max_days} day(s)")
236                 if ($hash>0);
237             $hash = 0;
238             open HASH, $Option{digest_db};
239             while (<HASH>) {
240                 chomp;
241                 my($score,$basic,$key) = split('::',$_,3);
242                 next if (defined $DB{$key});
243                 $dbm = $DB{$key};
244                 $dbm->{score} = $score;
245                 $dbm->{basic} = $basic;
246                 $dbm->{input} =
247                 $dbm->{check} = time;
248                 $dbm->{match} = 1;
249                 $DB{$key} = $dbm;
250                 $hash++;
251             }
252             close HASH;
253             debuglog("Imported <$hash> Image Hashes from \"$Option{digest_db}\"")
254                 if ($hash>0);
255             $hash = scalar(keys %DB);
256             debuglog("<$hash> Known BAD Image Hashes Available");
257         }
258         untie %DB; $err = 0;
259         tie %DB, 'MLDBM', $Option{db_safe} or $err++;
260         if ($err) {
261             debuglog("Could not open \"$Option{db_safe}\"");
262         } else {
263             my $hash = 0;
264             foreach my $k (keys %DB) {
265                 my $db = $DB{$k};
266                 if ($db->{check} < $now) {
267                     debuglog("Expire: <$k> Reason: $db->{check} < $now");
268                     delete $DB{$k}; $hash++;
269                 }
270             }
271             debuglog("Expired <$hash> Image Hashes after $Option{db_max_days} day(s)")
272                 if ($hash>0);
273             $hash = scalar(keys %DB);
274             debuglog("<$hash> Known GOOD Image Hashes Available");
275         }
276         untie %DB;
277     }
278     load_global_words( $Option{global_wordlist} );
279
280     unless (@scansets) {
281         @scansets = ( '$gocr -i $pfile', '$gocr -l 180 -d 2 -i $pfile');
282     }
283     foreach (@scansets) {
284         debuglog("Using scan: $_");
285     }
286 }
287
288 sub dummy_check {
289     return 0;
290 }
291
292 sub load_global_words {
293     unless ( -r $_[0] ) {
294         debuglog("Cannot read Global wordlist: \"$_[0]\"\n Please check file path and permissions are correct.");
295         return;
296     }
297     my $cnt = 0;
298     open WORDLIST, "<$_[0]";
299     while(my $w = <WORDLIST>) {
300         chomp($w);
301         $w =~ s/\s*//;
302         $w =~ s/#(.*)//;
303         next unless $w;
304         my $wt = $Option{threshold};
305         if ($w =~ /^(.*?)::(0(\.\d+){0,1})/) {
306             ($w, $wt) = (lc($1), $2);
307             $wt = $Option{threshold} unless ($wt =~ m/[\d\.]+/);
308         }
309         $words{$w} = $wt; $cnt++;
310     }
311     close WORDLIST;
312     debuglog("Loaded <$cnt> words from \"$_[0]\"");
313 }
314
315 sub load_personal_words {
316     unless ( -e $_[0] ) {
317         #debuglog("Personal wordlist <$_[0]> not found, skipping...");
318         return;
319     }
320     unless ( -r $_[0] ) {
321         debuglog("Cannot read from wordlist \"$_[0]\"\n Please make sure that permissions are correct."
322         );
323         return;
324     }
325     my $cnt = 0;
326     open WORDLIST, "<$_[0]";
327     while(my $w = <WORDLIST>) {
328         chomp($w);
329         $w =~ s/\s*//;
330         $w =~ s/#(.*)//;
331         next unless $w;
332         my $wt = $Option{threshold};
333         if ($w =~ /^(.*?)::(0(\.\d+){0,1})/) {
334             ($w, $wt) = ($1, $2);
335             $wt = $Option{threshold} unless ($wt =~ m/[\d\.]+/);
336         }
337         $words{$w} = $wt; $cnt++;
338     }
339     close WORDLIST;
340     debuglog("Updated Word List with <$cnt> words from \"$_[0]\"");
341 }
342
343 sub max {
344     unless ( defined( $_[0] ) and defined( $_[1] ) ) { return 0 }
345     unless ( defined( $_[0] ) ) { return $_[1] }
346     unless ( defined( $_[1] ) ) { return $_[0] }
347     if     ( $_[0] < $_[1] )    { return $_[1] }
348     else                        { return $_[0] }
349 }
350
351 sub within_threshold {
352     my $digest = shift;
353     my $record = shift;
354
355     my ($dimg,$dkey) = split('::',$digest);
356     my ($rimg,$rkey) = split('::',$record);
357     my ($ds, $dh, $dw, $dcn) = split(':',$dimg);
358     my ($rs, $rh, $rw, $rcn) = split(':',$rimg);
359     return(0) unless $rs;
360     return(0) unless $rh;
361     return(0) unless $rw;
362     return(0) unless $rcn;
363     return(0) unless $rkey;
364     return(0) if ((abs($ds  - $rs ) / $rs ) > $Threshold{s});
365     return(0) if ((abs($dh  - $rh ) / $rh ) > $Threshold{h});
366     return(0) if ((abs($dw  - $rw ) / $rw ) > $Threshold{w});
367     return(0) if ((abs($dcn - $rcn) / $rcn) > $Threshold{cn});
368            
369     my @rcf = split('::',$rkey);
370     my @dcf = split('::',$dkey);
371
372     my (@dcfs, @rcfs);
373     foreach (@dcf) { push @dcfs,split(':',$_); }
374     foreach (@rcf) { push @rcfs,split(':',$_); }
375
376     my $total = scalar(@rcfs);
377     if ($total == scalar(@dcfs)) {
378         my $match = 0;
379         foreach (0 .. ($total-1)) {
380             $match++ if (abs($dcfs[$_] - $rcfs[$_]) <= $Threshold{c});
381         }
382         debuglog("image matched <$match> of <$total> colors");
383         return(1) if ($match == $total);
384     }
385     return(0);
386 }
387
388 sub fmt_time {
389     my $when = time - $_[0];
390     my $ret;
391
392     if ($when>86400) {
393         my $d = int($when/86400);
394         $when -= $d*86400;
395         $ret = "$d days,";
396     }
397     if ($when>3600) {
398         my $h = int($when/3600);
399         $when -= $h*3600;
400         $ret .= " $h hours";
401     }
402     if ($when>60) {
403         my $m = int($when/60);
404         $when -= $m*60;
405         $ret .= " $m minutes";
406     }
407     if ($when>0) {
408         $ret .= " $when seconds";
409     }
410     $ret .= " ago";
411     return $ret;
412 }
413    
414 sub check_image_hash_db {
415     my $digest = $_[0];
416     my $dbfile = $_[1] || $Option{db_hash};
417     my $fname  = $_[2];
418     my $ctype  = $_[3];
419     my ($img, $key) = split('::', $digest,2);
420     my $hash = $digest;
421     my $ret = 0; my $txt = 'Exact';
422     my $dinfo;
423     my %DB = (); my $dbm; my $new = $key;
424
425     if ($Option{enable_image_hashing} == 2) {
426         tie %DB, 'MLDBM', $dbfile, O_RDWR or $ret++;
427         if ($ret>0) {
428             debuglog("No Image Hash database found at \"$dbfile\", or permissions wrong.");
429             return (0,'');
430         }
431         if (defined $DB{$key}) {
432             $dbm = $DB{$key};
433             if ($img eq $dbm->{basic}) {
434                 $ret = $dbm->{score} || 0.001;
435                 $dinfo = $dbm->{dinfo} || '';
436                 $dbm->{fname} = $fname;
437                 $dbm->{ctype} = $ctype;
438                 debuglog("Updating $txt info File:'$fname' Type:'$ctype'");
439                 $DB{$key} = $dbm;
440             }
441         }
442         if ($ret == 0) {
443             my $now = time - ($Option{db_max_days}*86400);
444             foreach my $k (keys %DB) {
445                 $dbm  = $DB{$k};
446                 $hash = $dbm->{basic} ? $dbm->{basic} : '0:0:0:0' . '::' . $k;
447                 if (within_threshold($digest,$hash)) {
448                     $ret  = $dbfile eq $Option{db_hash} ? $dbm->{score} : $dbm->{match};
449                     $txt  = 'Approx'; $new = $k; $dinfo = $dbm->{dinfo} || '';
450                     debuglog("Found in: <$dbfile>");
451                     last;
452                 }
453                 # Has the record expired??
454                 $dbm->{check} = $now - 1 unless defined $dbm->{check};
455                 if ($dbm->{check} < $now) {
456                     debuglog("Expiring <$k> older than $Option{db_max_days} days");
457                     delete $DB{$k};
458                 }
459             }
460         }
461         if ($ret>0) {
462             $dbm->{match}++;
463             if ($dbfile eq $Option{db_hash}) {
464                 $ret = sprintf("%0.3f",$dbm->{score});
465                 debuglog("Found Score <$ret> for $txt Image Hash");
466             }
467             debuglog("Matched [$dbm->{match}] time(s). Prev match: ".fmt_time($dbm->{check}));
468             $dbm->{check} = time;
469             $DB{$new} = $dbm;
470         }
471         untie %DB;
472         return ($ret,$dinfo);
473     } elsif ($Option{enable_image_hashing} == 1) {
474         $ret = open HASH, $Option{digest_db};
475         unless($ret) {
476             debuglog("No Image Hash database found at \"$Option{digest_db}\", or permissions wrong.");
477             return (0,'');
478         }
479         while (<HASH>) {
480             chomp;
481             ($ret,$hash) = split('::',$_,2);
482             if (within_threshold($digest,$hash)) {
483                 debuglog("Found Score <$ret> for Hash <$hash>");
484                 return ($ret,'');
485             }
486         }
487         close HASH;
488         return (0,'');
489     }
490 }
491            
492 sub add_image_hash_db {
493     my $digest = $_[0];
494     my $score  = $_[1];
495     my $ret = 0;
496
497     if ($Option{enable_image_hashing} == 2) {
498         my $dbfile = $_[2] || $Option{db_hash};
499         my %DB = ();
500         tie %DB, 'MLDBM', $dbfile or $ret++;
501         if ($ret>0) {
502             debuglog("Unable to open/create Image Hash database at \"$dbfile\", check permissions.");
503             return;
504         }
505         debuglog("Adding Hash to \"$dbfile\"");
506
507         my ($img,$key) = split('::',$digest,2);
508         my $dbm = $DB{$key};
509         $dbm->{fname} = $_[3];
510         $dbm->{ctype} = $_[4];
511         $dbm->{dinfo} = $_[5];
512         $dbm->{basic} = $img;
513         $dbm->{score} = $score;
514         $dbm->{input} =
515         $dbm->{check} = time;
516         $dbm->{match} = $dbfile eq $Option{db_hash} ? 0 : 1;
517         $DB{$key} = $dbm;
518         untie %DB;
519     } elsif ($Option{enable_image_hashing} == 1) {
520         if (-e $Option{digest_db}) {
521             $ret = open DB, ">>$Option{digest_db}";
522         } else {
523             $ret = open DB,  ">$Option{digest_db}";
524         }
525         unless ($ret) {
526             debuglog("Unable to open/create Image Hash database at \"$Option{digest_db}\", check permissions.");
527             return;
528         }
529         debuglog("Adding Hash to \"$Option{digest_db}\"");
530         flock( DB, LOCK_EX );
531         seek( DB, 0, 2 );
532         print DB "${score}::${digest}\n";
533         flock( DB, LOCK_UN );
534         close(DB);
535     }
536     debuglog("Digest: $digest");
537 }
538
539 sub calc_image_hash {
540     my $pfile = $_[0];
541     my ($rcode, $hash);
542
543     foreach my $a (qw/ppmhist/) {
544         unless (defined $App{$a}) {
545             info("FuzzyOcr: calc_image_hash cannot exec $a");
546             return (1, '');
547         }
548     }
549    
550     my $img = new Image::Magick;
551     my ($w,$h,$s,$t) = $img->ping($pfile);
552
553     $t = Mail::SpamAssassin::Timeout->new({ secs => $Option{timeout} });
554     my @stdout_data;
555     $rcode = $t->run_and_catch(sub {
556         @stdout_data = qx($App{ppmhist} -noheader $pfile 2>/dev/null);
557     });
558     if ($rcode) {
559         chomp $rcode;
560         debuglog("$App{ppmhist}: Timed out [$rcode], skipping...");
561         return (1, '');
562     }
563     my $cnt = 0;
564     my $c = scalar(@stdout_data);
565     $hash = sprintf "%d:%d:%d:%d",$s,$h,$w,$c;
566     if ($Threshold{max_hash}) {
567         foreach (@stdout_data) {
568             $_ =~ s/ +/ /g;
569             my(@d) = split(' ', $_);
570             $hash .= sprintf("::%d:%d:%d:%d:%d",@d);
571             if ($cnt++ ge $Threshold{max_hash}) {
572                 last;
573             }
574         }
575     }
576     debuglog("Got: <$hash>");
577     return(0, $hash);
578 }
579
580 sub debuglog {
581     my @lines = split('\n',$_[0]);
582     my $limit = $_[1] || 1;
583     if ( $Option{verbose} > $limit ) {
584         foreach (@lines) { info("FuzzyOcr: $_"); }
585     } else {
586         foreach (@lines) { dbg ("FuzzyOcr: $_"); }
587     }
588 }
589
590 sub wrong_ctype {
591     my ( $format, $ctype ) = @_;
592     if ($Score{wrongctype}) {
593         my $debuginfo = "";
594         if ( $Option{verbose} > 0 ) {
595             $debuginfo =
596               ("Image has format \"$format\" but content-type is \"$ctype\"");
597             debuglog($debuginfo);
598         }
599         for my $set ( 0 .. 3 ) {
600             $pms->{conf}->{scoreset}->[$set]->{"FUZZY_OCR_WRONG_CTYPE"} =
601               sprintf( "%0.3f", $Score{wrongctype} );
602         }
603         $pms->_handle_hit( "FUZZY_OCR_WRONG_CTYPE", $Score{wrongctype}, "BODY: ",
604             $pms->{conf}->{descriptions}->{FUZZY_OCR_WRONG_CTYPE} . "\n$debuginfo" );
605     }
606 }
607
608 sub corrupt_img {
609     my ($score, $err) = @_;
610     if ($score>0) {
611         my $debuginfo = "";
612         if ( $Option{verbose} > 0 ) {
613             chomp($err);
614             $debuginfo = ("Corrupt image: $err");
615             debuglog($debuginfo);
616         }
617         for my $set ( 0 .. 3 ) {
618             $pms->{conf}->{scoreset}->[$set]->{"FUZZY_OCR_CORRUPT_IMG"} =
619               sprintf( "%0.3f", $score );
620         }
621         $pms->_handle_hit( "FUZZY_OCR_CORRUPT_IMG", $score, "BODY: ",
622             $pms->{conf}->{descriptions}->{FUZZY_OCR_CORRUPT_IMG} . "\n$debuginfo" );
623     }
624 }
625
626 sub known_img_hash {
627     my $score = $_[0] || $Score{base};
628     my $dinfo = $_[1] ? "\n$_[1]" : '';
629     for my $set ( 0 .. 3 ) {
630         $pms->{conf}->{scoreset}->[$set]->{"FUZZY_OCR_KNOWN_HASH"} =
631         sprintf( "%0.3f", $score );
632     }
633     $pms->_handle_hit( "FUZZY_OCR_KNOWN_HASH", $score, "BODY: ",
634         $pms->{conf}->{descriptions}->{FUZZY_OCR_KNOWN_HASH} . $dinfo );
635 }
636
637 sub removedir {
638     my $dir = $_[0];
639     return unless -d $dir;
640     opendir D, $dir;
641     my @files = readdir D;
642     closedir D;
643     foreach my $f (@files) {
644         next if $f eq '.';
645         next if $f eq '..';
646         my $ff = Mail::SpamAssassin::Util::untaint_file_path("$dir/$f");
647         unless (unlink $ff) {
648             debuglog("Cannot remove: $ff");
649         }
650     }
651     debuglog("Remove DIR: $dir");
652     unless(rmdir $dir) {
653         debuglog("Cannot remove DIR: $dir");
654     }
655 }
656
657 sub fuzzyocr_check {
658     ( $self, $pms ) = @_;
659     if ( $pms->get_score() > $Score{autodisable} ) {
660         debuglog("Scan canceled, message has already more than $Score{autodisable} points.");
661         return 0;
662     }
663
664     my $imgdir;
665     my %imgfiles = ();
666     my @found    = ();
667     my @hashes   = ();
668     my $cnt      = 0;
669     my $imgerr   = 0;
670
671     #debuglog("Starting FuzzyOcr...");
672     #debuglog("Attempting to load personal wordlist...");
673     if ($Option{personal_wordlist} =~ m/^\//) {
674         load_personal_words( $Option{personal_wordlist} );
675     } else {
676         my $homedir  = (getpwuid($<))[7];
677         if ($homedir) {
678             load_personal_words( $homedir . "/$Option{personal_wordlist}" );
679         } elsif (defined($ENV{HOME})) {
680             load_personal_words( $ENV{HOME} . "/$Option{personal_wordlist}" );
681         } else {
682             debuglog("Variable \$ENV{HOME} not defined and getpwuid failed, personal wordlist function not available...");
683         }
684     }
685
686     foreach my $p (
687         $pms->{msg}->find_parts(qr(^image\b)i),
688         $pms->{msg}->find_parts(qr(Application/Octet-Stream)i)
689     ) {
690         my $ctype = $p->{'type'};
691         my $fname = $p->{'name'} || 'unknown';
692         if (($fname eq 'unknown') and
693             (defined $p->{'headers'}->{'content-id'})
694             ){
695             $fname = join('',@{$p->{'headers'}->{'content-id'}});
696             $fname =~ s/[<>]//g;
697             $fname =~ tr/\@\$\%\&/_/s;
698         }
699         my $test  = 0;
700         $test++ if ($ctype =~ /image/i);
701         $test++ if ($fname =~ /(gif|jpg|jpeg|png|bmp|tiff)$/i);
702
703         if ($test == 0) {
704             debuglog("Skipping file with content-type=\"$ctype\" name=\"$fname\"");
705             next;
706         }
707
708         $imgdir = Mail::SpamAssassin::Util::secure_tmpdir() unless ($imgdir);
709         unless ($imgdir) {
710             debuglog("Scan canceled, cannot create Image TMPDIR.");
711             return 0;
712         }
713         #keep raw email for debugging later
714         my $imgfilename = $imgdir . "/raw.eml";
715         unless (-e $imgfilename) {
716             if (open RAW, ">$imgfilename") {
717                 print RAW $pms->{msg}->get_pristine();
718                 close RAW;
719                 debuglog("Saved: $imgfilename");
720             }
721         }
722
723         $fname =~ tr{a-zA-Z0-9\.}{_}cs;
724         $imgfilename = Mail::SpamAssassin::Util::untaint_file_path(
725             $imgdir . "/" . $fname
726         );
727         my $unique = 0;
728         while (-e $imgfilename) {
729             $imgfilename = Mail::SpamAssassin::Util::untaint_file_path(
730                 $imgdir . "/" . chr(65+$unique) . "." . $fname
731             );
732             $unique++;
733         }
734
735         unless (open PICT, ">$imgfilename") {
736             debuglog("Cannot write \"$imgfilename\", skipping...");
737             next;
738         }
739         my $pdata = $p->decode();
740         binmode PICT;
741         print PICT $pdata;
742         close PICT;
743         debuglog("Wrote: $imgfilename"); $cnt++;
744         $imgfiles{$imgfilename}{header} = substr($pdata,0,6);
745         $imgfiles{$imgfilename}{ctype} = $ctype;
746         $imgfiles{$imgfilename}{fname} = $fname;
747     }
748
749     if ($cnt == 0) {
750         #debuglog("Skipping OCR, no image files found...");
751         removedir($imgdir) if (defined($imgdir) and ($Option{keep_bad_images}<2));
752         return 0;
753     }
754     debuglog("Found: $cnt images"); $cnt = 0;
755     my $t = Mail::SpamAssassin::Timeout->new({ secs => $Option{timeout} });
756     my $retcode;
757
758     my $haserr = open RAWERR, ">$imgdir/raw.err";
759     debuglog("Errors to: $imgdir/raw.err") if ($haserr>0);
760
761     IMAGE:
762     foreach my $file (keys %imgfiles) {
763         my $pic = $imgfiles{$file};
764         debuglog("Analyzing file with content-type=\"$$pic{ctype}\"");
765         my @used_scansets = ();
766         my $corrupt = 0;
767         my $digest;
768         my $ptype = 0;
769         my $tfile = $file;
770         my $pfile = $file . ".pnm";
771         my $efile = $file . ".err";
772         debuglog("pfile => $pfile");
773         debuglog("efile => $efile");
774
775         if ( substr($$pic{header},0,3) eq "\x47\x49\x46" ) {
776             debuglog("Found GIF header name=\"$$pic{fname}\"");