Changeset 38
- Timestamp:
- 16.11.2006 23:47:41 (2 years ago)
- Files:
-
- trunk/devel/FuzzyOcr.cf (modified) (2 diffs)
- trunk/devel/FuzzyOcr.mysql (added)
- trunk/devel/FuzzyOcr.pm (modified) (7 diffs)
- trunk/devel/FuzzyOcr/Config.pm (modified) (6 diffs)
- trunk/devel/FuzzyOcr/Hashing.pm (modified) (9 diffs)
Legend:
- Unmodified
- Added
- Removed
- Modified
- Copied
- Moved
trunk/devel/FuzzyOcr.cf
r37 r38 135 135 # Value = 1 ... use digest_hash only 136 136 # Value = 2 ... use digest_db w/digest_hash import 137 # Value = 3 ... use mysql database 137 138 #focr_enable_image_hashing 2 138 139 # … … 154 155 #focr_db_max_days 15 155 156 # 157 # MySQL options 158 #focr_mysql_db FuzzyOcr 159 #focr_mysql_hash Hash 160 #focr_mysql_safe Safe 161 #focr_mysql_user fuzzyocr 162 #focr_mysql_pass fuzzyocr 163 #focr_mysql_host localhost 164 #focr_mysql_port 3306 165 #focr_mysql_socket /tmp/mysql.sock (Default: empty) 166 # 156 167 # Automatically add hashes of spam images recognized by OCR to the Image Hash database, to disable, set to 0.0 (Default value: 1) 157 168 #focr_hashing_learn_scanned 1 trunk/devel/FuzzyOcr.pm
r37 r38 19 19 20 20 use lib qw(.); # Allow placing of FuzzyOcr in siteconfigdir 21 use FuzzyOcr::Config qw(get_pms save_pms get_ thresholds get_scansets get_config get_wordlist set_config finish_parsing_end load_global_words load_personal_words debuglog logfile);21 use FuzzyOcr::Config qw(get_pms save_pms get_ddb get_thresholds get_scansets get_config get_wordlist set_config finish_parsing_end load_global_words load_personal_words debuglog logfile); 22 22 use FuzzyOcr::Hashing qw(check_image_hash_db check_image_hash_db add_image_hash_db calc_image_hash); 23 23 use FuzzyOcr::Deanimate qw(deanimate); … … 74 74 debuglog("Variable \$ENV{HOME} not defined and getpwuid failed, personal wordlist function not available..."); 75 75 } 76 } 77 if ($conf->{focr_enable_image_hashing} == 3) { 78 $conf->{focr_ddb} = get_ddb(); 76 79 } 77 80 … … 428 431 debuglog("Error calculating the image hash, skipping hash check..."); 429 432 } else { 430 my ($score, $dinfo); 431 ($score,$dinfo) = check_image_hash_db($digest, $conf->{"focr_db_hash"}, $$pic{fname}, $$pic{ctype}); 433 my ($score, $dinfo, $whash); 434 $whash = $conf->{focr_enable_image_hashing} == 3 435 ? $conf->{focr_mysql_hash} 436 : $conf->{focr_db_hash}; 437 ($score,$dinfo) = check_image_hash_db($digest, $whash, $$pic{fname}, $$pic{ctype}); 432 438 if ($score > 0) { 433 439 known_img_hash($score,$dinfo); … … 436 442 return 0; 437 443 } 438 ($score,$dinfo) = check_image_hash_db($digest, $conf->{"focr_db_safe"}, $$pic{fname}, $$pic{ctype}); 444 $whash = $conf->{focr_enable_image_hashing} == 3 445 ? $conf->{focr_mysql_safe} 446 : $conf->{focr_db_safe}; 447 ($score,$dinfo) = check_image_hash_db($digest, $whash, $$pic{fname}, $$pic{ctype}); 439 448 if ($score > 0) { 440 449 debuglog("Image in KNOWN_GOOD. Skipping OCR checks..."); … … 521 530 522 531 if ($cnt == 0) { 523 if ($conf->{"focr_enable_image_hashing"} == 2and @hashes) {532 if ($conf->{"focr_enable_image_hashing"} > 1 and @hashes) { 524 533 debuglog("Message is ham, saving..."); 525 534 foreach my $h (@hashes) { 526 535 my ($mcnt,$fname,$ctype,$digest) = split('::',$h,4); 527 536 next if $mcnt; 528 add_image_hash_db($digest,0,$conf->{"focr_db_safe"},$fname,$ctype); 537 my $whash = $conf->{focr_enable_image_hashing} == 3 538 ? $conf->{focr_mysql_safe} 539 : $conf->{focr_db_safe}; 540 add_image_hash_db($digest,0,$whash,$fname,$ctype); 529 541 } 530 542 } … … 548 560 my ($mcnt,$fname,$ctype,$digest) = split('::',$h,4); 549 561 next unless $mcnt; 550 add_image_hash_db($digest,$score,$conf->{"focr_db_hash"},$fname,$ctype,$debuginfo); 562 my $whash = $conf->{focr_enable_image_hashing} == 3 563 ? $conf->{focr_mysql_hash} 564 : $conf->{focr_db_hash}; 565 add_image_hash_db($digest,$score,$whash,$fname,$ctype,$debuginfo); 551 566 } 552 567 } … … 562 577 if ($imgerr == 0 and $conf->{"focr_keep_bad_images"}<2) { 563 578 removedir($imgdir); 579 } 580 if ($conf->{focr_enable_image_hashing} == 3) { 581 $conf->{focr_ddb}->disconnect; 564 582 } 565 583 debuglog("FuzzyOcr ending successfully..."); trunk/devel/FuzzyOcr/Config.pm
r35 r38 10 10 get_wordlist 11 11 set_config 12 get_ddb 12 13 finish_parsing_end 13 14 load_global_words … … 26 27 our $conf; 27 28 our $pms; 28 29 our @bin_utils = qw/gifsicle giffix giftext gifinter giftopnm 30 jpegtopnm pngtopnm bmptopnm tifftopnm ppmhist pamfile gocr ocrad/; 31 32 our @pgm_scores = qw/base add corrupt corrupt_unfixable wrongctype 33 autodisable/; 34 35 our @pgm_opts = qw/personal_wordlist global_wordlist logfile 36 threshold counts_required verbose timeout max_size_gif max_size_jpeg 37 max_size_tiff max_size_bmp db_hash db_safe db_max_days path_bin 38 scansets keep_bad_images score_ham enable_image_hashing digest_db 39 hashing_learn_scanned/; 29 our $ddb; 30 31 our @bin_utils = qw/gifsicle 32 giffix 33 giftext 34 gifinter 35 giftopnm 36 jpegtopnm 37 pngtopnm 38 bmptopnm 39 tifftopnm 40 ppmhist 41 pamfile 42 ocrad 43 gocr/; 40 44 41 45 our @paths = qw(/usr/local/netpbm/bin /usr/local/bin /usr/bin); … … 75 79 } 76 80 81 sub get_ddb { 82 my $conf = get_config(); 83 my %dopts = ( AutoCommit => 1 ); 84 my $dsn = sprintf "dbi:mysql:%s\@%s:%d", 85 $conf->{focr_mysql_db}, 86 $conf->{focr_mysql_host}, 87 $conf->{focr_mysql_port} 88 ); 89 my $ddb = DBI->connect($dsn, 90 $conf->{focr_mysql_user}, 91 $conf->{focr_mysql_pass}, 92 \%dopts); 93 return $ddb; 94 } 95 77 96 sub set_config { 78 97 my($self, $conf) = @_; … … 125 144 return $Mail::SpamAssassin::Conf::MISSING_REQUIRED_VALUE; 126 145 } 127 unless ($value =~ m/^[012 ]$/) {146 unless ($value =~ m/^[0123]$/) { 128 147 return $Mail::SpamAssassin::Conf::INVALID_VALUE; 129 148 } … … 146 165 push (@cmds, { 147 166 setting => 'focr_global_wordlist', 148 default => " /etc/mail/spamassassin/FuzzyOcr.words",167 default => "__userstate__/FuzzyOcr.words", 149 168 type => $Mail::SpamAssassin::Conf::CONF_TYPE_STRING 150 169 }); … … 260 279 default => '$gocr -i $pfile, $gocr -l 180 -d 2 -i $pfile, $gocr -l 140 -d 2 -i $pfile', 261 280 type => $Mail::SpamAssassin::Conf::CONF_TYPE_STRING 262 }); 281 }); 282 283 push (@cmds, { 284 setting => 'focr_mysql_host', 285 default => 'localhost', 286 type => $Mail::SpamAssassin::Conf::CONF_TYPE_STRING 287 }); 288 289 push (@cmds, { 290 setting => 'focr_mysql_port', 291 default => 3306, 292 type => $Mail::SpamAssassin::Conf::CONF_TYPE_NUMERIC 293 }); 294 295 push (@cmds, { 296 setting => 'focr_mysql_db', 297 default => 'FuzzyOcr', 298 type => $Mail::SpamAssassin::Conf::CONF_TYPE_STRING 299 }); 300 301 push (@cmds, { 302 setting => 'focr_mysql_hash', 303 default => 'Hash', 304 type => $Mail::SpamAssassin::Conf::CONF_TYPE_STRING 305 }); 306 307 push (@cmds, { 308 setting => 'focr_mysql_safe', 309 default => 'Safe', 310 type => $Mail::SpamAssassin::Conf::CONF_TYPE_STRING 311 }); 312 313 foreach (qw/user pass/) { 314 push (@cmds, { 315 setting => 'focr_mysql_'.$_, 316 default => 'fuzzyocr', 317 type => $Mail::SpamAssassin::Conf::CONF_TYPE_STRING 318 }); 319 } 263 320 264 321 $conf->{parser}->register_commands(\@cmds); trunk/devel/FuzzyOcr/Hashing.pm
r20 r38 10 10 use lib "../"; 11 11 use FuzzyOcr::Config qw(get_thresholds get_config set_config debuglog logfile); 12 use DBI; 12 13 use MLDBM qw(DB_File Storable); 13 14 use Fcntl; … … 58 59 sub check_image_hash_db { 59 60 my $conf = get_config(); 60 return (0,'') unless ( 61 ($conf->{'focr_enable_image_hashing'} > 0) and 62 ($conf->{'focr_enable_image_hashing'} < 3) 63 ); 61 return (0,'') if ($conf->{focr_enable_image_hashing} == 0); 64 62 my $digest = $_[0]; 65 63 my $dbfile = $_[1] || $conf->{"focr_db_hash"}; … … 68 66 my ($img, $key) = split('::', $digest,2); 69 67 return (0,'') unless defined $key; 68 my $now = time; 70 69 my $hash = $digest; 71 70 my $ret = 0; my $txt = 'Exact'; 72 71 my $dinfo; 73 my %DB = (); my $dbm; 74 75 if ($conf->{"focr_enable_image_hashing"} == 2) { 72 73 if ($conf->{focr_enable_image_hashing} == 3) { 74 unless (defined $conf->{focr_ddb}) { 75 debuglog("Cannot connect to '$conf->{focr_mysql_db}\@$conf->{focr_mysql_host}:$conf->{focr_mysql_port}"); 76 return (0,''); 77 } 78 my $ddb = $conf->{focr_ddb}; 79 my @data = $ddb->selectrow_array(qq(select * from $fname where key='$key')); 80 my $next = 0; 81 my $when = 0; 82 if (scalar(@data)>0) { 83 $next = $data[4] || 0; $next++; 84 $when = $data[6] || $now; 85 $ret = $data[7] || 0.001; 86 $dinfo = $data[8] || ''; 87 if ($data[2] eq '') { 88 debuglog("Updateing $txt info File:'$fname'"); 89 $ddb->do(qq(update $fname set fname='$fname' where key='$key')); 90 } 91 if ($data[3] eq '') { 92 debuglog("Updateing $txt info Type:'$ctype'"); 93 $ddb->do(qq(update $fname set ctype='$ctype' where key='$key')); 94 } 95 } else { 96 my $then = time - ($conf->{"focr_db_max_days"}*86400); 97 my $sth = $ddb->prepare(qq(select * from $fname)); $sth->execute; 98 while (my @row = $sth->fetchrow_array) { 99 my $hash2 = $row[1] || "0:0:0:0"; 100 $hash2 .= "::$row[0]"; 101 if (within_threshold($digest,$hash2)) { 102 $txt = 'Approx'; 103 $key = $row[0]; 104 $next = $row[4] + 1; 105 $when = $row[6] || $now; 106 $ret = $dbfile eq $conf->{"focr_mysql_hash"} ? $row[7] : $row[4]; 107 $dinfo = $row[8] || ''; 108 debuglog("Found in Table:'$dbfile'"); 109 last; 110 } 111 } 112 # Expire old records... 113 $ddb->do(qq(delete from $fname where check < $then)); 114 } 115 if ($ret > 0) { 116 if ($dbfile eq $conf->{"focr_mysql_hash"}) { 117 debuglog("Found Score <$ret> for $txt Image Hash"); 118 } 119 debuglog("Matched [$next] time(s). Prev match: ".fmt_time($when)); 120 $ddb->do(qq(update $fname set match='$next',match='$now' where key='$key')); 121 } 122 } 123 elsif ($conf->{"focr_enable_image_hashing"} == 2) { 124 my %DB = (); my $dbm; 76 125 tie %DB, 'MLDBM', $dbfile, O_RDWR or $ret++; 77 126 if ($ret>0) { … … 91 140 } 92 141 if ($ret == 0) { 93 my $ now= time - ($conf->{"focr_db_max_days"}*86400);142 my $then = time - ($conf->{"focr_db_max_days"}*86400); 94 143 foreach my $k (keys %DB) { 95 144 $dbm = $DB{$k}; … … 103 152 # Has the record expired?? 104 153 $dbm->{check} = $now - 1 unless defined $dbm->{check}; 105 if ($dbm->{check} < $ now) {154 if ($dbm->{check} < $then) { 106 155 debuglog("Expiring <$k> older than $conf->{'focr_db_max_days'} days"); 107 156 delete $DB{$k}; … … 121 170 untie %DB; 122 171 return ($ret,$dinfo); 123 } elsif ($conf->{"focr_enable_image_hashing"} == 1) { 172 } 173 elsif ($conf->{"focr_enable_image_hashing"} == 1) { 124 174 $ret = open HASH, $conf->{"focr_digest_db"}; 125 175 unless($ret) { … … 142 192 sub add_image_hash_db { 143 193 my $conf = get_config(); 144 return unless ( 145 ($conf->{'focr_enable_image_hashing'} > 0) and 146 ($conf->{'focr_enable_image_hashing'} < 3) 147 ); 194 return if ($conf->{focr_enable_image_hashing} == 0); 148 195 my $digest = $_[0]; 149 196 my $score = $_[1]; 150 197 my $ret = 0; 151 198 152 if ($conf->{"focr_enable_image_hashing"} == 2) { 199 if ($conf->{focr_enable_image_hashing} == 3) { 200 unless (defined $conf->{focr_ddb}) { 201 debuglog("Cannot connect to '$conf->{focr_mysql_db}\@$conf->{focr_mysql_host}:$conf->{focr_mysql_port}"); 202 return; 203 } 204 my $ddb = $conf->{focr_ddb}; 205 my $table = $_[2] || $conf->{focr_mysql_hash}; 206 debuglog("Adding Hash to \"$table\""); 207 my ($img,$key) = split('::',$digest,2); 208 if (defined $key) { 209 my $sql = "insert into $table values ("; 210 $sql .= "'$key','$img','$_[3]','$_[4]',"; 211 $sql .= sprintf ("'%d','%d','%d','%d','%s'", 212 $table eq $conf->{focr_mysql_hash} ? 0 : 1, 213 time,time,$score,$_[5]); 214 $ddb->do($sql); 215 } 216 } 217 elsif ($conf->{"focr_enable_image_hashing"} == 2) { 153 218 my $dbfile = $_[2] || $conf->{"focr_db_hash"}; 154 219 my %DB = (); … … 159 224 } 160 225 debuglog("Adding Hash to \"$dbfile\""); 161 162 226 my ($img,$key) = split('::',$digest,2); 163 227 if (defined $key) { … … 174 238 } 175 239 untie %DB; 176 } elsif ($conf->{"focr_enable_image_hashing"} == 1) { 240 } 241 elsif ($conf->{"focr_enable_image_hashing"} == 1) { 177 242 if (-e $conf->{"focr_digest_db"}) { 178 243 $ret = open DB, ">>$conf->{'focr_digest_db'}";
