root/trunk/devel/patchset2.patch
| Revision 116, 17.8 kB (checked in by decoder, 2 years ago) |
|---|
-
FuzzyOcr.cf
old new 33 33 # Default value: 1 34 34 #focr_verbose 3 35 35 36 # Log Message-Id, From, To 37 # Default: 1 38 #focr_log_pmsinfo 0 39 36 40 # Send logging output to stderr. 37 41 # Default value: 1 38 42 #focr_log_stderr 0 … … 163 167 # Default value: 0 164 168 #focr_global_timeout 1 165 169 170 # Minimum image size to scan. Images with dimensions smaller than the 171 # ones specified here will be skipped: 172 # Default: Height:4 Width:4 173 #focr_min_height 4 174 #focr_min_width 4 175 166 176 # Maximum file size for different formats in byte, bigger pictures 167 177 # will not be scanned 168 178 # Default values: Unlimited) -
FuzzyOcr.pm
old new 124 124 my $imgerr = 0; 125 125 my $main = $self->{main}; 126 126 127 my $from = $pms->get('From') ? $pms->get('From') : "<no sender>"; 128 my $to = $pms->get('To') ? $pms->get('To') : "<no receipients>"; 129 my $msgid = $pms->get('Message-Id') ? $pms->get('Message-Id') : "<no messageid>"; 127 debuglog("Starting FuzzyOcr..."); 128 129 #Show PMS info if asked to 130 if ($conf->{focr_log_pmsinfo}) { 131 my $msgid = $pms->get('Message-Id') ? $pms->get('Message-Id') : "<no messageid>"; 132 my $from = $pms->get('From') ? $pms->get('From') : "<no sender>"; 133 my $to = $pms->get('To') ? $pms->get('To') : "<no receipients>"; 134 chomp($from, $to, $msgid); 135 infolog("Processing Message with ID \"$msgid\" ($from -> $to)"); 136 } 130 137 131 chomp($from, $to, $msgid);132 133 debuglog("Starting FuzzyOcr...");134 infolog("Processing Message with ID \"$msgid\" ($from -> $to)");135 138 foreach my $p ( 136 139 $pms->{msg}->find_parts(qr(^image\b)i), 137 140 $pms->{msg}->find_parts(qr(Application/Octet-Stream)i) … … 146 149 $fname =~ tr/\@\$\%\&/_/s; 147 150 } 148 151 149 my $filename = $fname; $filename =~ tr{a-zA-Z0-9\.}{_}cs; 152 my $filename = $fname; $filename =~ tr{a-zA-Z0-9\-.}{_}cs; 153 debuglog("fname: \"$fname\" => \"$filename\""); 150 154 my $pdata = $p->decode(); 151 155 my $pdatalen = length($pdata); 152 156 my $w = 0; my $h = 0; 153 157 154 my $blah = substr($pdata,0,3);155 156 158 if ( substr($pdata,0,3) eq "\x47\x49\x46" ) { 157 159 ## GIF File 158 160 $imgfiles{$filename}{ftype} = 1; 159 161 ($w,$h) = unpack("vv",substr($pdata,6,4)); 160 infolog("GIF: [${h}x${w}] $filename ");162 infolog("GIF: [${h}x${w}] $filename ($pdatalen)"); 161 163 $imgfiles{$filename}{width} = $w; 162 164 $imgfiles{$filename}{height} = $h; 163 165 } elsif ( substr($pdata,0,2) eq "\xff\xd8" ) { … … 184 186 errorlog("Cannot find image dimensions"); 185 187 } else { 186 188 ($h,$w) = unpack("nn",substr($pdata,$pos+3,4)); 187 infolog("JPEG: [${h}x${w}] $filename ");189 infolog("JPEG: [${h}x${w}] $filename ($pdatalen)"); 188 190 $imgfiles{$filename}{ftype} = 2; 189 191 $imgfiles{$filename}{height} = $h; 190 192 $imgfiles{$filename}{width} = $w; … … 195 197 $imgfiles{$filename}{ftype} = 3; 196 198 $imgfiles{$filename}{width} = $w; 197 199 $imgfiles{$filename}{height} = $h; 198 infolog("PNG: [${h}x${w}] $filename ");200 infolog("PNG: [${h}x${w}] $filename ($pdatalen)"); 199 201 } elsif ( substr($pdata,0,2) eq "BM" ) { 200 202 ## BMP File 201 ($w,$h) = unpack(" NN",substr($pdata,18,8));203 ($w,$h) = unpack("VV",substr($pdata,18,8)); 202 204 $imgfiles{$filename}{ftype} = 4; 203 205 $imgfiles{$filename}{width} = $w; 204 206 $imgfiles{$filename}{height} = $h; 205 infolog("BMP: [${h}x${w}] $filename ");207 infolog("BMP: [${h}x${w}] $filename ($pdatalen)"); 206 208 } elsif ( 207 209 ## TIFF File 208 210 (substr($pdata,0,4) eq "\x4d\x4d\x00\x2a") or … … 218 220 $w = $val if ($id == 257); 219 221 last if ($h != 0 and $w != 0); 220 222 } 221 infolog("TIFF: [${h}x${w}] $filename ($ worder)");223 infolog("TIFF: [${h}x${w}] $filename ($pdatalen) ($worder)"); 222 224 infolog("Cannot determine size of TIFF image, setting to '1x1'") if ($h == 0 and $w == 0); 223 225 $imgfiles{$filename}{ftype} = 5; 224 226 $imgfiles{$filename}{width} = $w ? $w : 1; … … 230 232 infolog("Skipping file with content-type=\"$ctype\" name=\"$fname\""); 231 233 delete $imgfiles{$filename}; 232 234 next; 235 } 233 236 237 #Skip images that cannot contain text 238 if ($imgfiles{$filename}{height} < $conf->{focr_min_height}) { 239 infolog("Skipping image: height < $conf->{focr_min_height}"); 240 delete $imgfiles{$filename}; 241 next; 234 242 } 235 243 244 #Skip images that cannot contain text 245 if ($imgfiles{$filename}{width} < $conf->{focr_min_width}) { 246 infolog("Skipping image: width < $conf->{focr_min_width}"); 247 delete $imgfiles{$filename}; 248 next; 249 } 250 236 251 #Found Image!! Get a temporary dir to save image 237 252 $imgdir = Mail::SpamAssassin::Util::secure_tmpdir(); 238 253 unless ($imgdir) { … … 243 258 244 259 #Generete unique filename to store image 245 260 my $imgfilename = Mail::SpamAssassin::Util::untaint_file_path( 246 $imgdir . "/" . $f name261 $imgdir . "/" . $filename 247 262 ); 248 263 my $unique = 0; 249 264 while (-e $imgfilename) { 250 265 $imgfilename = Mail::SpamAssassin::Util::untaint_file_path( 251 $imgdir . "/" . chr(65+$unique) . "." . $f name266 $imgdir . "/" . chr(65+$unique) . "." . $filename 252 267 ); 253 268 $unique++; 254 269 } … … 308 323 } 309 324 } 310 325 311 IMAGE:312 326 my $haserr; 313 327 foreach my $filename (keys %imgfiles) { 314 328 my $pic = $imgfiles{$filename}; … … 351 365 infolog("Found GIF header name=\"$$pic{fname}\""); 352 366 if ($conf->{focr_skip_gif}) { 353 367 infolog("Skipping image check"); 354 next IMAGE;368 next; 355 369 } 356 370 if (defined($conf->{focr_max_size_gif}) and ($$pic{fsize} > $conf->{focr_max_size_gif})) { 357 371 infolog("GIF file size ($$pic{fsize}) exceeds maximum file size for this format, skipping..."); … … 374 388 foreach my $a (qw/gifsicle giftext giffix gifinter giftopnm/) { 375 389 unless (defined $conf->{"focr_bin_$a"}) { 376 390 errorlog("Cannot exec $a, skipping image"); 377 next IMAGE;391 next; 378 392 } 379 393 } 394 380 395 my @stderr_data; 381 382 396 my ($retcode, @stdout_data) = save_execute( 383 397 "$conf->{focr_bin_giftext} $file", 384 398 undef, … … 429 443 } 430 444 } 431 445 } 432 433 if (defined($conf->{focr_max_size_gif}) and ( ((stat($tfile))[7])> $conf->{focr_max_size_gif})) {434 infolog("Fixed GIF file size ($ $pic{fsize}) exceeds maximum file size for this format, skipping...");446 my $fixedsize = (stat($tfile))[7]; 447 if (defined($conf->{focr_max_size_gif}) and ($fixedsize > $conf->{focr_max_size_gif})) { 448 infolog("Fixed GIF file size ($fixedsize) exceeds maximum file size for this format, skipping..."); 435 449 next; 436 450 } 437 451 … … 503 517 infolog("Found JPEG header name=\"$$pic{fname}\""); 504 518 if ($conf->{focr_skip_jpeg}) { 505 519 infolog("Skipping image check"); 506 next IMAGE;520 next; 507 521 } 508 522 509 523 if (defined($conf->{focr_max_size_jpeg}) and ($$pic{fsize} > $conf->{focr_max_size_jpeg})) { … … 523 537 foreach my $a (qw/jpegtopnm/) { 524 538 unless (defined $conf->{"focr_bin_$a"}) { 525 539 errorlog("Cannot exec $a, skipping image"); 526 next IMAGE;540 next; 527 541 } 528 542 } 529 543 printf RAWERR qq(## $conf->{focr_bin_jpegtopnm} $file >$pfile 2>>$efile\n) if ($haserr>0); … … 545 559 infolog("Found PNG header name=\"$$pic{fname}\""); 546 560 if ($conf->{focr_skip_png}) { 547 561 infolog("Skipping image check"); 548 next IMAGE;562 next; 549 563 } 550 564 if (defined($conf->{focr_max_size_png}) and ($$pic{fsize} > $conf->{focr__max_size_png})) { 551 565 infolog("PNG file size ($$pic{fsize}) exceeds maximum file size for this format, skipping..."); … … 562 576 foreach my $a (qw/pngtopnm/) { 563 577 unless (defined $conf->{"focr_bin_$a"}) { 564 578 errorlog("Cannot exec $a, skipping image"); 565 next IMAGE;579 next; 566 580 } 567 581 } 568 582 … … 585 599 infolog("Found BMP header name=\"$$pic{fname}\""); 586 600 if ($conf->{focr_skip_bmp}) { 587 601 infolog("Skipping image check"); 588 next IMAGE;602 next; 589 603 } 590 604 if (defined($conf->{focr_max_size_bmp}) and ($$pic{fsize} > $conf->{focr_max_size_bmp})) { 591 605 infolog("BMP file size ($$pic{fsize}) exceeds maximum file size for this format, skipping..."); … … 602 616 foreach my $a (qw/bmptopnm/) { 603 617 unless (defined $conf->{"focr_bin_$a"}) { 604 618 errorlog("Cannot exec $a, skipping image"); 605 next IMAGE;619 next; 606 620 } 607 621 } 608 622 printf RAWERR qq(## $conf->{focr_bin_bmptopnm} $file >$pfile 2>>$efile\n) if ($haserr>0); … … 624 638 infolog("Found TIFF header name=\"$$pic{fname}\""); 625 639 if ($conf->{focr_skip_tiff}) { 626 640 infolog("Skipping image check"); 627 next IMAGE;641 next; 628 642 } 629 643 if (defined($conf->{focr_max_size_tiff}) and ($$pic{fsize} > $conf->{focr_max_size_tiff})) { 630 644 infolog("TIFF file size ($$pic{fsize}) exceeds maximum file size for this format, skipping..."); … … 642 656 foreach my $a (qw/tifftopnm/) { 643 657 unless (defined $conf->{"focr_bin_$a"}) { 644 658 errorlog("Cannot exec $a, skipping image"); 645 next IMAGE;659 next; 646 660 } 647 661 } 648 662 printf RAWERR qq(## $conf->{focr_bin_tifftopnm} $file >$pfile 2>>$efile\n) if ($haserr>0); … … 688 702 ($score,$dinfo) = check_image_hash_db($digest, $whash, $$pic{fname}, $$pic{ctype}, $$pic{ftype}); 689 703 if ($score > 0) { 690 704 infolog("Image in KNOWN_GOOD. Skipping OCR checks..."); 691 next IMAGE;705 next; 692 706 } 693 707 } 694 708 if ($digest eq '') { 695 709 infolog("Empty Hash, skipping..."); 696 next IMAGE;710 next; 697 711 } 698 712 } else { 699 713 infolog("Image hashing disabled in configuration, skipping..."); … … 711 725 712 726 my @ocr_results = (); 713 727 my $scansets = get_scansets(); 728 my $newlist = ''; 729 foreach my $s (@$scansets) { 730 $newlist .= ' ' . $s->{label} . '(' . $s->{hit_counter} . ')'; 731 } 732 infolog("Scanset Order:$newlist"); 714 733 my $mcnt = 0; 715 734 my $modus = 0; 716 735 my $modus_match = 0; … … 832 851 } 833 852 } 834 853 } 835 infolog("Resorting scanset list..."); 836 @$scansets = sort { $b->{hit_counter} <=> $a->{hit_counter} } @$scansets; 854 837 855 } 838 856 last; 839 857 } -
FuzzyOcr/Config.pm
old new 130 130 } 131 131 132 132 sub get_scansets { 133 if ($conf->{focr_autosort_scanset}) { 134 @scansets = sort { $b->{hit_counter} <=> $a->{hit_counter} } @scansets; 135 } 133 136 return \@scansets; 134 137 } 135 138 … … 194 197 type => $Mail::SpamAssassin::Conf::CONF_TYPE_NUMERIC 195 198 }); 196 199 } 200 foreach my $t (qw/height width/) { 201 push (@cmds, { 202 setting => 'focr_min_'.$t, 203 default => 4, 204 type => $Mail::SpamAssassin::Conf::CONF_TYPE_NUMERIC 205 }); 206 } 197 207 push (@cmds, { 198 208 setting => 'focr_threshold', 199 209 default => 0.25, … … 245 255 }); 246 256 247 257 push (@cmds, { 258 setting => 'focr_log_pmsinfo', 259 default => 1, 260 type => $Mail::SpamAssassin::Conf::CONF_TYPE_BOOL 261 }); 262 263 push (@cmds, { 248 264 setting => 'focr_enable_image_hashing', 249 265 default => 0, 250 266 code => sub { -
FuzzyOcr/Hashing.pm
old new 94 94 $dinfo = $data[9] || ''; 95 95 if ($data[2] eq '') { 96 96 infolog("Updating $txt info File-Name:'$fname'"); 97 $ddb->do(qq(update $db.$dbfile set $dbfile.fname= '$fname' where $dbfile.key='$key'));97 $ddb->do(qq(update $db.$dbfile set $dbfile.fname=? where $dbfile.key='$key'),undef,$fname); 98 98 } 99 99 if ($data[3] eq '') { 100 100 infolog("Updating $txt info Content-Type:'$ctype'"); 101 $ddb->do(qq(update $db.$dbfile set $dbfile.ctype= '$ctype' where $dbfile.key='$key'));101 $ddb->do(qq(update $db.$dbfile set $dbfile.ctype=? where $dbfile.key='$key'),undef,$ctype); 102 102 } 103 103 if ($data[4] != $ftype) { 104 104 infolog("Updating $txt info File-Type:'$ftype'"); 105 $ddb->do(qq(update $db.$dbfile set $dbfile.ftype= '$ftype' where $dbfile.key='$key'));105 $ddb->do(qq(update $db.$dbfile set $dbfile.ftype=? where $dbfile.key='$key'),undef,$ftype); 106 106 } 107 107 } 108 108 unless ($match) { … … 134 134 } 135 135 infolog("Matched [$next] time(s). Prev match: ".fmt_time($now - $when)); 136 136 $sql = qq(update $db.$dbfile set $dbfile.match='$next',$dbfile.check='$now' where $dbfile.key='$key'); 137 debuglog($sql,2);138 137 $ddb->do($sql); 138 debuglog($sql); 139 139 } 140 140 return ($ret,$dinfo); 141 141 } … … 143 143 use MLDBM qw(DB_File Storable); 144 144 use MLDBM::Sync; 145 145 my %DB = (); my $dbm; my $sdbm; 146 $sdbm = tie %DB, 'MLDBM::Sync', $dbfile, O_ RDWR or $ret++;146 $sdbm = tie %DB, 'MLDBM::Sync', $dbfile, O_CREAT|O_RDWR or $ret++; 147 147 if ($ret>0) { 148 148 warnlog("No Image Hash database found at \"$dbfile\", or permissions wrong."); 149 149 return (0,''); … … 242 242 if ($conf->{focr_mysql_update_hash}) { 243 243 infolog("Hash already in $db.$table updating..."); 244 244 $sql = "update $db.$table set "; 245 $sql .= "basic='$img'," unless ($data[1] eq $img); 246 $sql .= "fname='$fname'," unless ($data[2] eq $fname); 247 $sql .= "ctype='$ctype'," unless ($data[3] eq $ctype); 248 $sql .= "ftype='$ftype'," unless ($data[4] == $ftype); 249 $sql .= "score='$score'," unless ($data[8] == $score); 250 $sql .= "dinfo='$dinfo'," unless ($data[9] eq $dinfo); 245 my @params; 246 unless ($data[1] eq $img) { 247 $sql .= "basic=?,"; push @params,$img; 248 } 249 unless ($data[2] eq $fname) { 250 $sql .= "fname=?,"; push @params,$fname; 251 } 252 unless ($data[3] eq $ctype) { 253 $sql .= "ctype=?,"; push @params,$ctype; 254 } 255 unless ($data[4] == $ftype) { 256 $sql .= "ftype=?,"; push @params,$ftype; 257 } 258 unless ($data[8] == $score) { 259 $sql .= "score=?,"; push @params,$score; 260 } 261 unless ($data[9] == $dinfo) { 262 $sql .= "dinfo=?,"; push @params,$dinfo; 263 } 251 264 $sql =~ s/,$//; 252 265 $sql .= " where $table.key='$key'"; 266 $ddb->do($sql,undef,@params); 267 foreach my $p (@params) { $sql =~ s/\?/$p/; } 253 268 debuglog($sql); 254 $ddb->do($sql);255 269 } else { 256 270 infolog("Hash already in $db.$table skipping..."); 257 271 } 258 272 } else { 259 $sql = 260 "insert into $db.$table values ('". $key 261 . "','" . $img 262 . "','" . $fname 263 . "','" . $ctype 264 . "','" . $ftype 265 . "','" . ($table eq $conf->{focr_mysql_hash} ? 0 : 1) 266 . "','" . time 267 . "','" . time 268 . "','" . $score 269 . "','" . $dinfo 270 . "')"; 273 my @params = ( 274 $key, $img, $fname, $ctype, $ftype, 275 ($table eq $conf->{focr_mysql_hash} ? 0 : 1), 276 time, time, $score, $dinfo); 277 $sql = "insert into $db.$table values (?,?,?,?,?,?,?,?,?,?)"; 278 $ddb->do($sql,undef,@params); 279 foreach my $p (@params) { $sql =~ s/\?/$p/; } 271 280 debuglog($sql); 272 $ddb->do($sql);273 281 } 274 282 } 275 283 }
Note: See TracBrowser for help on using the browser.
