Changeset 68
- Timestamp:
- 30.11.2006 00:36:22 (2 years ago)
- Files:
-
- trunk (modified) (1 prop)
- trunk/devel/FuzzyOcr.pm (modified) (8 diffs)
- trunk/devel/FuzzyOcr.preps (modified) (1 diff)
- trunk/devel/FuzzyOcr.scansets (modified) (1 diff)
- trunk/devel/FuzzyOcr/Config.pm (modified) (29 diffs)
- trunk/devel/FuzzyOcr/Deanimate.pm (modified) (2 diffs)
- trunk/devel/FuzzyOcr/Hashing.pm (modified) (1 diff)
- trunk/devel/FuzzyOcr/Logging.pm (modified) (1 diff)
- trunk/devel/FuzzyOcr/Misc.pm (modified) (5 diffs)
- trunk/devel/FuzzyOcr/Preprocessor.pm (modified) (1 diff)
- trunk/devel/FuzzyOcr/Scanset.pm (modified) (1 diff)
- trunk/devel/FuzzyOcr/Scoring.pm (modified) (1 diff)
Legend:
- Unmodified
- Added
- Removed
- Modified
- Copied
- Moved
trunk
- Property svn:ignore set to devel2
trunk/devel/FuzzyOcr.pm
r67 r68 14 14 use Mail::SpamAssassin::Plugin; 15 15 16 use Time::HiRes qw( time usleep ualarmgettimeofday tv_interval );16 use Time::HiRes qw( gettimeofday tv_interval ); 17 17 use String::Approx 'adistr'; 18 18 use FileHandle; 19 19 use Fcntl ':flock'; 20 use POSIX; 20 21 21 22 use lib qw(. /etc/mail/spamassassin); # Allow placing of FuzzyOcr in siteconfigdir 22 23 23 use FuzzyOcr::Config qw(kill_pid get_tmpdir set_tmpdir get_pms save_pms get_timeout get_ddb get_thresholds get_scansets get_config get_wordlist set_config finish_parsing_end parse_config load_global_words load_personal_words debuglog logfile); 24 use FuzzyOcr::Hashing qw(check_image_hash_db check_image_hash_db add_image_hash_db calc_image_hash); 24 use FuzzyOcr::Logging qw(debuglog); 25 use FuzzyOcr::Config qw(kill_pid 26 get_tmpdir 27 set_tmpdir 28 get_pms 29 save_pms 30 get_timeout 31 get_mysql_ddb 32 get_scansets 33 get_wordlist 34 set_config 35 get_config 36 finish_parsing_end 37 read_words); 38 use FuzzyOcr::Hashing qw(check_image_hash_db add_image_hash_db calc_image_hash); 25 39 use FuzzyOcr::Deanimate qw(deanimate); 26 40 use FuzzyOcr::Scoring qw(wrong_ctype corrupt_img known_img_hash); … … 46 60 47 61 sub fuzzyocr_check { 62 my ( $self, $pms ) = @_; 48 63 my $conf = get_config(); 49 my ( $self, $pms ) = @_;50 64 51 65 save_pms($pms); … … 84 98 my ( $self, $conf, $pms ) = @_; 85 99 86 if ( $pms->get_score() > $conf->{focr_autodisable_score} ) { 87 debuglog("Scan canceled, message has already more than $conf->{focr_autodisable_score} points."); 100 my $current_score = $pms->get_score(); 101 my $score = $conf->{focr_autodisable_score} || 100; 102 103 if ( $current_score > $score ) { 104 debuglog("Scan canceled, message has already more than $score points ($current_score)."); 88 105 return 0; 89 106 } 90 107 91 if ( $pms->get_score() < $conf->{focr_autodisable_negative_score} ) { 92 debuglog("Scan canceled, message has less than $conf->{focr_autodisable_negative_score} points."); 108 $score = 
$conf->{focr_autodiable_negative_score} || -100; 109 if ( $current_score < $score ) { 110 debuglog("Scan canceled, message has less than $score points ($current_score)."); 93 111 return 0; 94 112 } … … 101 119 my $cnt = 0; 102 120 my $imgerr = 0; 103 my $main = $self->{main};121 my $main = $self->{main}; 104 122 105 123 debuglog("Starting FuzzyOcr..."); 106 124 debuglog("Attempting to load personal wordlist..."); 107 125 if ($conf->{focr_personal_wordlist} =~ m/^\//) { 108 load_personal_words( $conf->{focr_personal_wordlist} );126 read_words( $conf->{focr_personal_wordlist} ); 109 127 } else { 110 my $peruserlist = $main->sed_path($conf->{focr_personal_wordlist}); 111 unless ($peruserlist) { 128 my $peruserlist = $main->sed_path($conf->{focr_personal_wordlist}); 129 if (-r $peruserlist) { 130 read_words($peruserlist); 131 } else { 112 132 debuglog("Error getting personal wordlist, skipping..."); 113 133 } 114 load_personal_words($peruserlist);115 134 } 116 135 … … 264 283 debuglog("Found: $cnt images"); $cnt = 0; 265 284 if ($conf->{focr_enable_image_hashing} == 3) { 266 $ddb = $conf->{focr_ddb} = get_ ddb();285 $ddb = $conf->{focr_ddb} = get_mysql_ddb(); 267 286 } 268 287 … … 310 329 my ($retcode, @stdout_data) = save_execute( 311 330 "$conf->{focr_bin_giftext} $file", 312 undef,331 undef, 313 332 ">$imgdir/giftext.info", 314 333 ">>$imgdir/giftext.err", 1); … … 389 408 printf RAWERR qq(## $conf->{focr_bin_gifinter} $cfile >$tfile 2>>$efile\n) if ($haserr>0); 390 409 391 $retcode = save_execute("$conf->{focr_bin_gifinter} $cfile", undef, ">$tfile", ">>$efile");410 $retcode = save_execute("$conf->{focr_bin_gifinter} $cfile", undef, ">$tfile", ">>$efile"); 392 411 393 412 if ($retcode<0) { … … 406 425 printf RAWERR qq(## $conf->{focr_bin_giftopnm} $tfile >$pfile 2>>$efile\n) if ($haserr>0); 407 426 408 $retcode = save_execute("$conf->{focr_bin_giftopnm} $tfile", undef, ">$pfile", ">>$efile");427 $retcode = save_execute("$conf->{focr_bin_giftopnm} $tfile", undef, 
">$pfile", ">>$efile"); 409 428 410 429 if ($retcode<0) { trunk/devel/FuzzyOcr.preps
r60 r68 10 10 } 11 11 12 # requires ImageMagic convert 12 13 preprocessor maketiff { 13 14 command = convert $input tiff:$output trunk/devel/FuzzyOcr.scansets
r62 r68 8 8 scanset ocrad-invert { 9 9 ocr_command = $ocrad -s5 -i $input 10 } 11 12 scanset gocr { 13 ocr_command = $gocr -i $input 14 } 15 16 scanset gocr-180 { 17 ocr_command = $gocr -l 180 -d 2 -i $input 10 18 } 11 19 trunk/devel/FuzzyOcr/Config.pm
r67 r68 2 2 package FuzzyOcr::Config; 3 3 4 use FuzzyOcr::Logging qw(debuglog logfile);5 4 use lib qw(..); 5 use FuzzyOcr::Logging qw(debuglog); 6 6 use FuzzyOcr::Scanset; 7 7 use FuzzyOcr::Preprocessor; 8 8 use Mail::SpamAssassin::Logger; 9 9 10 10 use base 'Exporter'; 11 our @EXPORT_OK = qw( 11 our @EXPORT_OK = qw/ 12 parse_config 13 finish_parsing_end 14 get_config 15 set_config 12 16 set_pid 13 17 unset_pid … … 21 25 get_preprocessor 22 26 get_thresholds 23 get_config24 27 get_wordlist 25 set_config 26 get_ddb 27 finish_parsing_end 28 parse_config 29 load_global_words 30 load_personal_words 31 debuglog 32 logfile); 33 34 use Fcntl ':flock'; 35 use POSIX; 28 get_mysql_ddb 29 read_words 30 /; 36 31 37 32 use constant HAS_DBI => eval { require DBI; }; … … 150 145 } 151 146 152 sub get_ ddb {147 sub get_mysql_ddb { 153 148 return undef unless (HAS_DBI and HAS_DBD_MYSQL); 154 149 use DBI; … … 205 200 default => 10, 206 201 type => $Mail::SpamAssassin::Conf::CONF_TYPE_NUMERIC 207 });202 }); 208 203 209 204 push (@cmds, { … … 211 206 default => 0, 212 207 type => $Mail::SpamAssassin::Conf::CONF_TYPE_BOOL 213 });208 }); 214 209 215 210 push (@cmds, { 216 211 setting => 'focr_logfile', 217 212 type => $Mail::SpamAssassin::Conf::CONF_TYPE_STRING 218 });213 }); 219 214 220 215 push (@cmds, { … … 222 217 default => 0, 223 218 type => $Mail::SpamAssassin::Conf::CONF_TYPE_BOOL 224 });219 }); 225 220 226 221 push (@cmds, { … … 237 232 $self->{focr_enable_image_hashing} = $value+0; 238 233 } 239 });234 }); 240 235 241 236 push (@cmds, { … … 243 238 default => 1, 244 239 type => $Mail::SpamAssassin::Conf::CONF_TYPE_BOOL 245 }); 240 }); 241 242 push (@cmds, { 243 setting => 'focr_skip_updates', 244 default => 0, 245 type => $Mail::SpamAssassin::Conf::CONF_TYPE_BOOL 246 }); 246 247 247 248 push (@cmds, { 248 249 setting => 'focr_digest_db', 249 default => " /etc/mail/spamassassin/FuzzyOcr.hashdb",250 type => $Mail::SpamAssassin::Conf::CONF_TYPE_STRING 251 });250 default => 
"__local_rules_dir__/FuzzyOcr.hashdb", 251 type => $Mail::SpamAssassin::Conf::CONF_TYPE_STRING 252 }); 252 253 253 254 push (@cmds, { 254 255 setting => 'focr_global_wordlist', 255 default => " /etc/mail/spamassassin/FuzzyOcr.words",256 type => $Mail::SpamAssassin::Conf::CONF_TYPE_STRING 257 });256 default => "__local_rules_dir__/FuzzyOcr.words", 257 type => $Mail::SpamAssassin::Conf::CONF_TYPE_STRING 258 }); 258 259 259 260 push (@cmds, { … … 261 262 default => "__userstate__/FuzzyOcr.words", 262 263 type => $Mail::SpamAssassin::Conf::CONF_TYPE_STRING 263 });264 }); 264 265 265 266 push (@cmds, { 266 267 setting => 'focr_db_hash', 267 default => " /etc/mail/spamassassin/FuzzyOcr.db",268 type => $Mail::SpamAssassin::Conf::CONF_TYPE_STRING 269 });268 default => "__local_rules_dir__/FuzzyOcr.db", 269 type => $Mail::SpamAssassin::Conf::CONF_TYPE_STRING 270 }); 270 271 271 272 push (@cmds, { 272 273 setting => 'focr_db_safe', 273 default => " /etc/mail/spamassassin/FuzzyOcr.safe.db",274 type => $Mail::SpamAssassin::Conf::CONF_TYPE_STRING 275 });274 default => "__local_rules_dir__/FuzzyOcr.safe.db", 275 type => $Mail::SpamAssassin::Conf::CONF_TYPE_STRING 276 }); 276 277 277 278 push (@cmds, { … … 279 280 default => 35, 280 281 type => $Mail::SpamAssassin::Conf::CONF_TYPE_NUMERIC 281 });282 }); 282 283 283 284 push (@cmds, { … … 294 295 $self->{focr_keep_bad_images} = $value+0; 295 296 } 296 });297 }); 297 298 298 299 push (@cmds, { … … 300 301 default => 0, 301 302 type => $Mail::SpamAssassin::Conf::CONF_TYPE_BOOL 302 });303 }); 303 304 304 305 push (@cmds, { … … 306 307 default => 5, 307 308 type => $Mail::SpamAssassin::Conf::CONF_TYPE_NUMERIC 308 });309 }); 309 310 310 311 push (@cmds, { … … 312 313 default => 1, 313 314 type => $Mail::SpamAssassin::Conf::CONF_TYPE_NUMERIC 314 });315 }); 315 316 316 317 push (@cmds, { … … 318 319 default => 2.5, 319 320 type => $Mail::SpamAssassin::Conf::CONF_TYPE_NUMERIC 320 });321 }); 321 322 322 323 push (@cmds, { … … 324 325 
default => 5, 325 326 type => $Mail::SpamAssassin::Conf::CONF_TYPE_NUMERIC 326 });327 }); 327 328 328 329 push (@cmds, { … … 330 331 default => 1.5, 331 332 type => $Mail::SpamAssassin::Conf::CONF_TYPE_NUMERIC 332 });333 }); 333 334 334 335 push (@cmds, { … … 336 337 default => 10, 337 338 type => $Mail::SpamAssassin::Conf::CONF_TYPE_NUMERIC 338 });339 }); 339 340 340 341 push (@cmds, { … … 342 343 default => -5, 343 344 type => $Mail::SpamAssassin::Conf::CONF_TYPE_NUMERIC 344 });345 }); 345 346 346 347 push (@cmds, { … … 348 349 default => '/usr/local/netpbm/bin:/usr/local/bin:/usr/bin', 349 350 type => $Mail::SpamAssassin::Conf::CONF_TYPE_STRING 350 });351 }); 351 352 352 353 foreach (@bin_utils) { … … 354 355 setting => 'focr_bin_'.$_, 355 356 type => $Mail::SpamAssassin::Conf::CONF_TYPE_STRING 356 });357 }); 357 358 } 358 359 … … 362 363 default => 0, 363 364 type => $Mail::SpamAssassin::Conf::CONF_TYPE_BOOL 364 });365 }); 365 366 push (@cmds, { 366 367 setting => 'focr_max_size_'.$_, 367 368 type => $Mail::SpamAssassin::Conf::CONF_TYPE_NUMERIC 368 });369 }); 369 370 } 370 371 371 372 push (@cmds, { 372 373 setting => 'focr_scanset_file', 373 default => "/etc/mail/spamassassin/FuzzyOcr.scansets",374 type => $Mail::SpamAssassin::Conf::CONF_TYPE_STRING 375 });374 default => '__local_rules_dir__/FuzzyOcr.scansets', 375 type => $Mail::SpamAssassin::Conf::CONF_TYPE_STRING 376 }); 376 377 push (@cmds, { 377 378 setting => 'focr_preprocessor_file', 378 default => "/etc/mail/spamassassin/FuzzyOcr.preps",379 type => $Mail::SpamAssassin::Conf::CONF_TYPE_STRING 380 });381 382 push (@cmds, { 383 setting => 'focr_minimal_scanset',384 default => 0,385 type => $Mail::SpamAssassin::Conf::CONF_TYPE_BOOL386 });387 push (@cmds, { 388 setting => 'focr_autosort_scanset',389 default => 1,390 type => $Mail::SpamAssassin::Conf::CONF_TYPE_BOOL391 });392 push (@cmds, { 393 setting => 'focr_autosort_buffer',394 default => 10,395 type => $Mail::SpamAssassin::Conf::CONF_TYPE_BOOL396 });379 
default => '__local_rules_dir__/FuzzyOcr.preps', 380 type => $Mail::SpamAssassin::Conf::CONF_TYPE_STRING 381 }); 382 383 push (@cmds, { 384 setting => 'focr_minimal_scanset', 385 default => 0, 386 type => $Mail::SpamAssassin::Conf::CONF_TYPE_BOOL 387 }); 388 push (@cmds, { 389 setting => 'focr_autosort_scanset', 390 default => 1, 391 type => $Mail::SpamAssassin::Conf::CONF_TYPE_BOOL 392 }); 393 push (@cmds, { 394 setting => 'focr_autosort_buffer', 395 default => 10, 396 type => $Mail::SpamAssassin::Conf::CONF_TYPE_BOOL 397 }); 397 398 push (@cmds, { 398 399 setting => 'focr_mysql_host', 399 400 default => 'localhost', 400 401 type => $Mail::SpamAssassin::Conf::CONF_TYPE_STRING 401 });402 }); 402 403 403 404 push (@cmds, { … … 405 406 default => 3306, 406 407 type => $Mail::SpamAssassin::Conf::CONF_TYPE_NUMERIC 407 }); 408 408 }); 409 409 push (@cmds, { 410 410 setting => 'focr_mysql_socket', 411 411 type => $Mail::SpamAssassin::Conf::CONF_TYPE_STRING 412 }); 413 412 }); 414 413 push (@cmds, { 415 414 setting => 'focr_mysql_db', 416 415 default => 'FuzzyOcr', 417 416 type => $Mail::SpamAssassin::Conf::CONF_TYPE_STRING 418 }); 419 417 }); 420 418 push (@cmds, { 421 419 setting => 'focr_mysql_hash', 422 420 default => 'Hash', 423 421 type => $Mail::SpamAssassin::Conf::CONF_TYPE_STRING 424 }); 425 422 }); 426 423 push (@cmds, { 427 424 setting => 'focr_mysql_safe', 428 425 default => 'Safe', 429 426 type => $Mail::SpamAssassin::Conf::CONF_TYPE_STRING 430 }); 431 427 }); 432 428 push (@cmds, { 433 429 setting => 'focr_mysql_update_hash', 434 430 default => 0, 435 431 type => $Mail::SpamAssassin::Conf::CONF_TYPE_BOOL 436 }); 437 432 }); 438 433 foreach (qw/user pass/) { 439 434 push (@cmds, { … … 441 436 default => 'fuzzyocr', 442 437 type => $Mail::SpamAssassin::Conf::CONF_TYPE_STRING 443 });438 }); 444 439 } 445 440 … … 451 446 if ($opts->{key} eq 'focr_end_config') { 452 447 $conf = $opts->{conf}; 448 my $retcode; 449 info("FuzzyOcr: focr_end_config"); 450 453 451 # 
Parse preprocessor file 454 452 my $pfile = $conf->{'focr_preprocessor_file'}; 455 453 debuglog("Starting preprocessor parser for file \"$pfile\"..."); 456 ( my$retcode, @preprocessors) = parse_preprocessors($pfile);454 ($retcode, @preprocessors) = parse_preprocessors($pfile); 457 455 if ($retcode) { 458 456 warn("Error parsing preprocessor file \"$pfile\", aborting..."); … … 475 473 sub finish_parsing_end { 476 474 my ($self, $opts) = @_; 475 my $main = $self->{main}; 477 476 $conf = $opts->{conf}; 478 477 479 #add_facilities('FuzzyOcr'); 478 479 # fix paths 480 foreach (qw/focr_personal_wordlist focr_global_wordlist 481 focr_db_hash focr_db_safe focr_hash_db 482 focr_scanset_file focr_preprocessor_file/) { 483 next unless defined $conf->{$_}; 484 my $path = $main->sed_path($conf->{$_}); 485 $conf->{$_} = $path ? $path : $_; 486 debuglog("$_ => $path"); 487 } 488 480 489 # find external binaries 481 @paths = split(/:/, $conf->{ 'focr_path_bin'});490 @paths = split(/:/, $conf->{focr_path_bin}); 482 491 debuglog("Searching in: $_") foreach @paths; 483 492 foreach my $a (@bin_utils) { 484 if (defined $conf->{"focr_bin_$a"} and ! -x $conf->{"focr_bin_$a"}) { 493 my $b = "focr_bin_$a"; 494 if (defined $conf->{$b} and ! -x $conf->{$b}) { 485 495 debuglog("cannot exec $a, removing..."); 486 delete $conf->{ "focr_bin_$a"};496 delete $conf->{$b}; 487 497 } 488 498 foreach my $p (@paths) { 489 499 my $f = "$p/$a"; 490 if (! defined $conf->{ "focr_bin_$a"} and -x $f) {491 $conf->{ "focr_bin_$a"} = $f;500 if (! 
defined $conf->{$b} and -x $f) { 501 $conf->{$b} = $f; 492 502 last; 493 503 } 494 504 } 495 if (defined $conf->{ "focr_bin_$a"}) {496 debuglog("Using $a => ".$conf->{"focr_bin_$a"});505 if (defined $conf->{$b}) { 506 debuglog("Using $a => $b"); 497 507 } else { 498 508 debuglog("Cannot find executable for $a"); … … 500 510 } 501 511 512 # Display All Options 502 513 foreach my $k (sort keys %{$conf}) { 503 514 next unless $k =~ m/^focr_/; … … 535 546 } 536 547 } 537 if ($conf->{focr_enable_image_hashing} == 2 and -r $conf->{focr_digest_db}) { 538 my %DB; my $dbm; my $err = 0; 539 my $now = time - ($conf->{focr_db_max_days}*86400); 540 debuglog($conf->{focr_db_hash}); 541 tie %DB, 'MLDBM', $conf->{focr_db_hash} or $err++; 542 if ($err) { 543 debuglog("Could not open \"$conf->{focr_db_hash}\""); 544 } else { 545 my $hash = 0; 546 debuglog("Expiring records prior to: ".scalar(localtime($now))); 547 foreach my $k (keys %DB) { 548 my $db = $DB{$k}; 549 if ($db->{check} < $now) { 550 debuglog("Expire: <$k> Reason: $db->{check} < $now"); 551 delete $DB{$k}; $hash++; 552 } 553 } 554 debuglog("Expired <$hash> Image Hashes after $conf->{focr_db_max_days} day(s)") 555 if ($hash>0); 556 $hash = 0; 557 open HASH, $conf->{focr_digest_db}; 558 while (<HASH>) { 559 chomp; 560 my($score,$basic,$key) = split('::',$_,3); 561 next if (defined $DB{$key}); 562 $dbm = $DB{$key}; 563 $dbm->{score} = $score; 564 $dbm->{basic} = $basic; 565 $dbm->{input} = 566 $dbm->{check} = time; 567 $dbm->{match} = 1; 568 $DB{$key} = $dbm; 569 $hash++; 570 } 571 close HASH; 572 debuglog("Imported <$hash> Image Hashes from \"$conf->{focr_digest_db}\"") 573 if ($hash>0); 574 $hash = scalar(keys %DB); 575 debuglog("<$hash> Known BAD Image Hashes Available"); 576 untie %DB; 577 } 578 $err = 0; 579 tie %DB, 'MLDBM', $conf->{focr_db_safe} or $err++; 580 if ($err) { 581 debuglog("Could not open \"$conf->{focr_db_safe}\""); 582 } else { 583 my $hash = 0; 584 foreach my $k (keys %DB) { 585 my $db = $DB{$k}; 586 
if ($db->{check} < $now) { 587 debuglog("Expire: <$k> Reason: $db->{check} < $now"); 588 delete $DB{$k}; $hash++; 589 } 590 } 591 debuglog("Expired <$hash> Image Hashes after $conf->{focr_db_max_days} day(s)") 592 if ($hash>0); 593 $hash = scalar(keys %DB); 594 debuglog("<$hash> Known GOOD Image Hashes Available"); 595 untie %DB; 596 } 597 } 598 if ($conf->{focr_enable_image_hashing} == 3) { 599 my $ddb = get_ddb(); 600 if (defined $ddb) { 601 my $db = $conf->{focr_mysql_db}; 602 my $tab = $conf->{focr_mysql_hash}; 603 my $file = $conf->{focr_db_hash}; 548 unless ($conf->{focr_skip_updates}) { 549 use DBI; 550 use MLDBM qw(DB_File Storable); 551 if ($conf->{focr_enable_image_hashing} == 2 and -r $conf->{focr_digest_db}) { 604 552 my %DB; my $dbm; my $err = 0; 605 tie %DB, 'MLDBM', $file or $err++; 553 my $now = time - ($conf->{focr_db_max_days}*86400); 554 debuglog($conf->{focr_db_hash}); 555 tie %DB, 'MLDBM', $conf->{focr_db_hash} or $err++; 606 556 if ($err) { 607 debuglog("Could not open \"$ file\"");557 debuglog("Could not open \"$conf->{focr_db_hash}\""); 608 558 } else { 559 my $hash = 0; 560 debuglog("Expiring records prior to: ".scalar(localtime($now))); 609 561 foreach my $k (keys %DB) { 610 my $dbm = $DB{$k}; 611 my $sql = qq(select score from $db.$tab where $tab.key='$k'); 612 my @data = $ddb->selectrow_array($sql); 613 unless (scalar(@data)>0) { 614 $sql = "insert into $db.$tab values ('$k'"; 615 foreach my $y (qw/basic fname ctype/) { 616 my $val = defined($dbm->{$y}) ? 
$dbm->{$y} : ''; 617 $sql .= ",'$val'"; 562 my $db = $DB{$k}; 563 if ($db->{check} < $now) { 564 debuglog("Expire: <$k> Reason: $db->{check} < $now"); 565 delete $DB{$k}; $hash++; 566 } 567 } 568 debuglog("Expired <$hash> Image Hashes after $conf->{focr_db_max_days} day(s)") 569 if ($hash>0); 570 $hash = 0; 571 open HASH, $conf->{focr_digest_db}; 572 while (<HASH>) { 573 chomp; 574 my($score,$basic,$key) = split('::',$_,3); 575 next if (defined $DB{$key}); 576 $dbm = $DB{$key}; 577 $dbm->{score} = $score; 578 $dbm->{basic} = $basic; 579 $dbm->{input} = 580 $dbm->{check} = time; 581 $dbm->{match} = 1; 582 $DB{$key} = $dbm; 583 $hash++; 584 } 585 close HASH; 586 debuglog("Imported <$hash> Image Hashes from \"$conf->{focr_digest_db}\"") 587 if ($hash>0); 588 $hash = scalar(keys %DB); 589 debuglog("<$hash> Known BAD Image Hashes Available"); 590 untie %DB; 591 } 592 $err = 0; 593 tie %DB, 'MLDBM', $conf->{focr_db_safe} or $err++; 594 if ($err) { 595 debuglog("Could not open \"$conf->{focr_db_safe}\""); 596 } else { 597 my $hash = 0; 598 foreach my $k (keys %DB) { 599 my $db = $DB{$k}; 600 if ($db->{check} < $now) { 601 debuglog("Expire: <$k> Reason: $db->{check} < $now"); 602 delete $DB{$k}; $hash++; 603 } 604 } 605 debuglog("Expired <$hash> Image Hashes after $conf->{focr_db_max_days} day(s)") 606 if ($hash>0); 607 $hash = scalar(keys %DB); 608 debuglog("<$hash> Known GOOD Image Hashes Available"); 609 untie %DB; 610 } 611 } 612 if ($conf->{focr_enable_image_hashing} == 3) { 613 my $ddb = get_mysql_ddb(); 614 if (defined $ddb) { 615 my $db = $conf->{focr_mysql_db}; 616 my $tab = $conf->{focr_mysql_hash}; 617 my $file = $conf->{focr_db_hash}; 618 my %DB; my $dbm; my $err = 0; 619 tie %DB, 'MLDBM', $file or $err++; 620 if ($err) { 621 debuglog("Could not open \"$file\""); 622 } else { 623 foreach my $k (keys %DB) { 624 my $dbm = $DB{$k}; 625 my $sql = qq(select score from $db.$tab where $tab.key='$k'); 626 my @data = $ddb->selectrow_array($sql); 627 unless 
(scalar(@data)>0) { 628 $sql = "insert into $db.$tab values ('$k'"; 629 foreach my $y (qw/basic fname ctype/) { 630 my $val = defined($dbm->{$y}) ? $dbm->{$y} : ''; 631 $sql .= ",'$val'"; 632 } 633 if ($dbm->{ctype} =~ m/gif/i) { $sql .= ",'1'"; } 634 elsif ($dbm->{ctype} =~ m/jpg|jpeg/i) { $sql .= ",'2'"; } 635 elsif ($dbm->{ctype} =~ m/png/i) { $sql .= ",'3'"; } 636 elsif ($dbm->{ctype} =~ m/bmp/i) { $sql .= ",'4'"; } 637 elsif ($dbm->{ctype} =~ m/tiff/i) { $sql .= ",'5'"; } 638 else { $sql .= ",'0'"; } 639 foreach my $y (qw/match input check score dinfo/) { 640 my $val = defined($dbm->{$y}) ? $dbm->{$y} : ''; 641 $sql .= ",'$val'"; 642 } 643 $sql .= ")"; 644 debuglog($sql,2); 645 $ddb->do($sql); $err++; 618 646 } 619 if ($dbm->{ctype} =~ m/gif/i) { $sql .= ",'1'"; } 620 elsif ($dbm->{ctype} =~ m/jpg|jpeg/i) { $sql .= ",'2'"; } 621 elsif ($dbm->{ctype} =~ m/png/i) { $sql .= ",'3'"; } 622 elsif ($dbm->{ctype} =~ m/bmp/i) { $sql .= ",'4'"; } 623 elsif ($dbm->{ctype} =~ m/tiff/i) { $sql .= ",'5'"; } 624 else { $sql .= ",'0'"; } 625 foreach my $y (qw/match input check score dinfo/) { 626 my $val = defined($dbm->{$y}) ? $dbm->{$y} : ''; 627 $sql .= ",'$val'"; 647 } 648 untie %DB; 649 debuglog("Stored [$err] Hashes in $db.$tab") if $err>0; 650 } 651 $tab = $conf->{focr_mysql_safe}; 652 $file = $conf->{focr_db_safe}; 653 $err = 0; 654 tie %DB, 'MLDBM', $file or $err++; 655 if ($err) { 656 debuglog("Could not open \"$file\""); 657 } else { 658 foreach my $k (keys %DB) { 659 my $dbm = $DB{$k}; 660 my $sql = qq(select score from $db.$tab where $tab.key='$k'); 661 my @data = $ddb->selectrow_array($sql); 662 unless (scalar(@data)>0) { 663 $sql = "insert into $db.$tab values ('$k'"; 664 foreach my $y (qw/basic fname ctype/) { 665 my $val = defined($dbm->{$y}) ? 
$dbm->{$y} : ''; 666 $sql .= ",'$val'"; 667 } 668 if ($dbm->{ctype} =~ m/gif/i) { $sql .= ",'1'"; } 669 elsif ($dbm->{ctype} =~ m/jpg|jpeg/i) { $sql .= ",'2'"; } 670 elsif ($dbm->{ctype} =~ m/png/i) { $sql .= ",'3'"; } 671 elsif ($dbm->{ctype} =~ m/bmp/i) { $sql .= ",'4'"; } 672 elsif ($dbm->{ctype} =~ m/tiff/i) { $sql .= ",'5'"; } 673 else { $sql .= ",'0'"; } 674 foreach my $y (qw/match input check score dinfo/) { 675 my $val = defined($dbm->{$y}) ? $dbm->{$y} : ''; 676 $sql .= ",'$val'"; 677 } 678 $sql .= ")"; 679 debuglog($sql,2); 680 $ddb->do($sql); $err++; 628 681 } 629 $sql .= ")";630 debuglog($sql,2);631 $ddb->do($sql); $err++;632 682 } 633 } 634 untie %DB; 635 debuglog("Stored [$err] Hashes in $db.$tab") if $err>0; 636 } 637 $tab = $conf->{focr_mysql_safe}; 638 $file = $conf->{focr_db_safe}; 639 $err = 0; 640 tie %DB, 'MLDBM', $file or $err++; 641 if ($err) { 642 debuglog("Could not open \"$file\""); 643 } else { 644 foreach my $k (keys %DB) { 645 my $dbm = $DB{$k}; 646 my $sql = qq(select score from $db.$tab where $tab.key='$k'); 647 my @data = $ddb->selectrow_array($sql); 648 unless (scalar(@data)>0) { 649 $sql = "insert into $db.$tab values ('$k'"; 650 foreach my $y (qw/basic fname ctype/) { 651 my $val = defined($dbm->{$y}) ? $dbm->{$y} : ''; 652 $sql .= ",'$val'"; 653 } 654 if ($dbm->{ctype} =~ m/gif/i) { $sql .= ",'1'"; } 655 elsif ($dbm->{ctype} =~ m/jpg|jpeg/i) { $sql .= ",'2'"; } 656 elsif ($dbm->{ctype} =~ m/png/i) { $sql .= ",'3'"; } 657 elsif ($dbm->{ctype} =~ m/bmp/i) { $sql .= ",'4'"; } 658 elsif ($dbm->{ctype} =~ m/tiff/i) { $sql .= ",'5'"; } 659 else { $sql .= ",'0'"; } 660 foreach my $y (qw/match input check score dinfo/) { 661 my $val = defined($dbm->{$y}) ? 
$dbm->{$y} : ''; 662 $sql .= ",'$val'"; 663 } 664 $sql .= ")"; 665 debuglog($sql,2); 666 $ddb->do($sql); $err++; 667 } 668 } 669 untie %DB; 670 debuglog("Stored [$err] Hashes in $db.$tab") if $err>0; 671 } 672 $ddb->disconnect; 673 debuglog("done updating MySQL database"); 674 } 675 } 676 load_global_words( $conf->{focr_global_wordlist} ); 677 } 678 679 sub load_global_words { 680 unless ( -r $_[0] ) { 681 debuglog("Cannot read Global wordlist: \"$_[0]\"\n Please check file path and permissions are correct."); 683 untie %DB; 684 debuglog("Stored [$err] Hashes in $db.$tab") if $err>0; 685 } 686 $ddb->disconnect; 687 debuglog("done updating MySQL database"); 688 } 689 } 690 } 691 read_words( $conf->{focr_global_wordlist} , 'Global'); 692 1; 693 } 694 695 sub read_words { 696 my $wfile = $_[0]; 697 my $tfile = $_[1] || 'Personal'; 698 unless ( -r $wfile ) { 699 debuglog("Cannot read $tfile wordlist: \"$wfile\"\n Please check file path and permissions are correct."); 682 700 return; 683 701 } 684 702 my $cnt = 0; 685 open WORDLIST, "<$ _[0]";703 open WORDLIST, "<$wfile"; 686 704 while(my $w = <WORDLIST>) { 687 705 chomp($w); … … 701 719 } 702 720 close WORDLIST; 703 debuglog("Loaded <$cnt> words from \"$_[0]\""); 704 } 705 706 sub load_personal_words { 707 unless ( -e $_[0] ) { 708 #debuglog("Personal wordlist <$_[0]> not found, skipping..."); 709 return; 710 } 711 unless ( -r $_[0] ) { 712 debuglog("Cannot read from wordlist \"$_[0]\"\n Please make sure that permissions are correct."); 713 return; 714 } 715 my $cnt = 0; 716 open WORDLIST, "<$_[0]"; 717 while(my $w = <WORDLIST>) { 718 chomp($w); 719 $w =~ s/\s*//; 720 $w =~ s/#(.*)//; 721 next unless $w; 722 my $wt = $conf->{focr_threshold}; 723 if ($w =~ /^(.*?)::(0(\.\d+){0,1})/) { 724 ($w, $wt) = ($1, $2); 725 $wt = $conf->{focr_threshold} unless ($wt =~ m/[\d\.]+/); 726 } else { 727 $wt *= 0.750 if length($w) == 5; 728 $wt *= 0.500 if length($w) == 4; 729 $wt *= 0.250 if length($w) < 4; 730 } 731 $words{$w} = $wt; 
$cnt++; 732 } 733 close WORDLIST; 734 debuglog("Updated Word List with $cnt words from $_[0]"); 721 debuglog("Added <$cnt> words from \"$wfile\"") if ($cnt>0); 735 722 } 736 723 trunk/devel/FuzzyOcr/Deanimate.pm
r61 r68 5 5 our @EXPORT_OK = qw(deanimate); 6 6 7 use lib "../";7 use lib qw(..); 8 8 use FuzzyOcr::Config qw(get_config set_config get_tmpdir); 9 9 use FuzzyOcr::Misc qw(save_execute); 10 use FuzzyOcr::Logging qw(debuglog logfile);10 use FuzzyOcr::Logging qw(debuglog); 11 11 12 12 # Provide functions to deanimate gifs … … 78 78 ($retcode, @stdout_data) = save_execute( 79 79 "$conf->{focr_bin_gifsicle} --info $giffile", 80 undef,80 undef, 81 81 ">$imgdir/gifsicle.info", 82 82 ">>$imgdir/gifsicle.err", 1); trunk/devel/FuzzyOcr/Hashing.pm
r61 r68 7 7 calc_image_hash); 8 8 9 use lib "../";9 use lib qw(..); 10 10 use FuzzyOcr::Config qw(get_thresholds get_config set_config get_tmpdir); 11 11 use FuzzyOcr::Misc qw(save_execute); 12 use FuzzyOcr::Logging qw(debuglog logfile);12 use FuzzyOcr::Logging qw(debuglog); 13 13 use Fcntl; 14 14 use Fcntl ':flock';
