Changeset 38

Show
Ignore:
Timestamp:
16.11.2006 23:47:41 (2 years ago)
Author:
jorge
Message:

Added an option to store image hashes in a MySQL database, making it easier for multiple servers to share the Hash/Safe DB.

Files:

Legend:

Unmodified
Added
Removed
Modified
Copied
Moved
  • trunk/devel/FuzzyOcr.cf

    r37 r38  
    135135# Value = 1 ... use digest_hash only 
    136136# Value = 2 ... use digest_db w/digest_hash import 
     137# Value = 3 ... use mysql database 
    137138#focr_enable_image_hashing 2 
    138139# 
     
    154155#focr_db_max_days 15 
    155156# 
     157# MySQL options 
     158#focr_mysql_db FuzzyOcr 
     159#focr_mysql_hash Hash 
     160#focr_mysql_safe Safe 
     161#focr_mysql_user fuzzyocr 
     162#focr_mysql_pass fuzzyocr 
     163#focr_mysql_host localhost 
     164#focr_mysql_port 3306 
     165#focr_mysql_socket /tmp/mysql.sock (Default: empty) 
     166# 
    156167# Automatically add hashes of spam images recognized by OCR to the Image Hash database. To disable, set to 0.0 (Default value: 1) 
    157168#focr_hashing_learn_scanned 1 
  • trunk/devel/FuzzyOcr.pm

    r37 r38  
    1919 
    2020use lib qw(.); # Allow placing of FuzzyOcr in siteconfigdir 
    21 use FuzzyOcr::Config qw(get_pms save_pms get_thresholds get_scansets get_config get_wordlist set_config finish_parsing_end load_global_words load_personal_words debuglog logfile); 
     21use FuzzyOcr::Config qw(get_pms save_pms get_ddb get_thresholds get_scansets get_config get_wordlist set_config finish_parsing_end load_global_words load_personal_words debuglog logfile); 
    2222use FuzzyOcr::Hashing qw(check_image_hash_db check_image_hash_db add_image_hash_db calc_image_hash); 
    2323use FuzzyOcr::Deanimate qw(deanimate); 
     
    7474            debuglog("Variable \$ENV{HOME} not defined and getpwuid failed, personal wordlist function not available..."); 
    7575        } 
     76    } 
     77    if ($conf->{focr_enable_image_hashing} == 3) { 
     78        $conf->{focr_ddb} = get_ddb(); 
    7679    } 
    7780 
     
    428431                debuglog("Error calculating the image hash, skipping hash check..."); 
    429432            } else { 
    430                 my ($score, $dinfo); 
    431                 ($score,$dinfo) = check_image_hash_db($digest, $conf->{"focr_db_hash"}, $$pic{fname}, $$pic{ctype}); 
     433                my ($score, $dinfo, $whash); 
     434                $whash = $conf->{focr_enable_image_hashing} == 3 
     435                    ? $conf->{focr_mysql_hash}  
     436                    : $conf->{focr_db_hash}; 
     437                ($score,$dinfo) = check_image_hash_db($digest, $whash, $$pic{fname}, $$pic{ctype}); 
    432438                if ($score > 0) { 
    433439                    known_img_hash($score,$dinfo); 
     
    436442                    return 0; 
    437443                } 
    438                 ($score,$dinfo) = check_image_hash_db($digest, $conf->{"focr_db_safe"}, $$pic{fname}, $$pic{ctype}); 
     444                $whash = $conf->{focr_enable_image_hashing} == 3 
     445                    ? $conf->{focr_mysql_safe}  
     446                    : $conf->{focr_db_safe}; 
     447                ($score,$dinfo) = check_image_hash_db($digest, $whash, $$pic{fname}, $$pic{ctype}); 
    439448                if ($score > 0) { 
    440449                    debuglog("Image in KNOWN_GOOD. Skipping OCR checks..."); 
     
    521530     
    522531    if ($cnt == 0) { 
    523         if ($conf->{"focr_enable_image_hashing"} == 2 and @hashes) { 
     532        if ($conf->{"focr_enable_image_hashing"} > 1 and @hashes) { 
    524533            debuglog("Message is ham, saving..."); 
    525534            foreach my $h (@hashes) { 
    526535                my ($mcnt,$fname,$ctype,$digest) = split('::',$h,4); 
    527536                next if $mcnt; 
    528                 add_image_hash_db($digest,0,$conf->{"focr_db_safe"},$fname,$ctype); 
     537                my $whash = $conf->{focr_enable_image_hashing} == 3 
     538                    ? $conf->{focr_mysql_safe}  
     539                    : $conf->{focr_db_safe}; 
     540                add_image_hash_db($digest,0,$whash,$fname,$ctype); 
    529541            } 
    530542        } 
     
    548560                my ($mcnt,$fname,$ctype,$digest) = split('::',$h,4); 
    549561                next unless $mcnt; 
    550                 add_image_hash_db($digest,$score,$conf->{"focr_db_hash"},$fname,$ctype,$debuginfo); 
     562                my $whash = $conf->{focr_enable_image_hashing} == 3 
     563                    ? $conf->{focr_mysql_hash}  
     564                    : $conf->{focr_db_hash}; 
     565                add_image_hash_db($digest,$score,$whash,$fname,$ctype,$debuginfo); 
    551566            } 
    552567        } 
     
    562577    if ($imgerr == 0 and $conf->{"focr_keep_bad_images"}<2) { 
    563578        removedir($imgdir); 
     579    } 
     580    if ($conf->{focr_enable_image_hashing} == 3) { 
     581        $conf->{focr_ddb}->disconnect; 
    564582    } 
    565583    debuglog("FuzzyOcr ending successfully..."); 
  • trunk/devel/FuzzyOcr/Config.pm

    r35 r38  
    1010    get_wordlist  
    1111    set_config  
     12    get_ddb 
    1213    finish_parsing_end  
    1314    load_global_words  
     
    2627our $conf; 
    2728our $pms; 
    28  
    29 our @bin_utils = qw/gifsicle giffix giftext gifinter giftopnm  
    30     jpegtopnm pngtopnm bmptopnm tifftopnm ppmhist pamfile gocr ocrad/; 
    31  
    32 our @pgm_scores = qw/base add corrupt corrupt_unfixable wrongctype 
    33     autodisable/; 
    34  
    35 our @pgm_opts = qw/personal_wordlist global_wordlist logfile 
    36     threshold counts_required verbose timeout max_size_gif max_size_jpeg 
    37     max_size_tiff max_size_bmp db_hash db_safe db_max_days path_bin  
    38     scansets keep_bad_images score_ham enable_image_hashing digest_db 
    39     hashing_learn_scanned/; 
     29our $ddb; 
     30 
     31our @bin_utils = qw/gifsicle 
     32    giffix 
     33    giftext  
     34    gifinter  
     35    giftopnm  
     36    jpegtopnm  
     37    pngtopnm  
     38    bmptopnm  
     39    tifftopnm  
     40    ppmhist  
     41    pamfile  
     42    ocrad 
     43    gocr/;  
    4044 
    4145our @paths = qw(/usr/local/netpbm/bin /usr/local/bin /usr/bin); 
     
    7579} 
    7680 
     81sub get_ddb { 
     82    my $conf = get_config(); 
     83    my %dopts = ( AutoCommit => 1 ); 
     84    my $dsn = sprintf "dbi:mysql:%s\@%s:%d", 
     85        $conf->{focr_mysql_db}, 
     86        $conf->{focr_mysql_host}, 
     87        $conf->{focr_mysql_port} 
     88        ); 
     89    my $ddb = DBI->connect($dsn, 
     90        $conf->{focr_mysql_user}, 
     91        $conf->{focr_mysql_pass}, 
     92        \%dopts); 
     93    return $ddb; 
     94} 
     95 
    7796sub set_config { 
    7897    my($self, $conf) = @_; 
     
    125144                return $Mail::SpamAssassin::Conf::MISSING_REQUIRED_VALUE; 
    126145            } 
    127             unless ($value =~ m/^[012]$/) { 
     146            unless ($value =~ m/^[0123]$/) { 
    128147                return $Mail::SpamAssassin::Conf::INVALID_VALUE; 
    129148            } 
     
    146165    push (@cmds, { 
    147166        setting => 'focr_global_wordlist', 
    148         default => "/etc/mail/spamassassin/FuzzyOcr.words", 
     167        default => "__userstate__/FuzzyOcr.words", 
    149168        type => $Mail::SpamAssassin::Conf::CONF_TYPE_STRING 
    150169       }); 
     
    260279        default => '$gocr -i $pfile, $gocr -l 180 -d 2 -i $pfile, $gocr -l 140 -d 2 -i $pfile', 
    261280        type => $Mail::SpamAssassin::Conf::CONF_TYPE_STRING 
    262        }); 
     281        }); 
     282 
     283    push (@cmds, { 
     284        setting => 'focr_mysql_host', 
     285        default => 'localhost', 
     286        type => $Mail::SpamAssassin::Conf::CONF_TYPE_STRING 
     287        }); 
     288 
     289    push (@cmds, { 
     290        setting => 'focr_mysql_port', 
     291        default => 3306, 
     292        type => $Mail::SpamAssassin::Conf::CONF_TYPE_NUMERIC 
     293        }); 
     294 
     295    push (@cmds, { 
     296        setting => 'focr_mysql_db', 
     297        default => 'FuzzyOcr', 
     298        type => $Mail::SpamAssassin::Conf::CONF_TYPE_STRING 
     299        }); 
     300 
     301    push (@cmds, { 
     302        setting => 'focr_mysql_hash', 
     303        default => 'Hash', 
     304        type => $Mail::SpamAssassin::Conf::CONF_TYPE_STRING 
     305        }); 
     306 
     307    push (@cmds, { 
     308        setting => 'focr_mysql_safe', 
     309        default => 'Safe', 
     310        type => $Mail::SpamAssassin::Conf::CONF_TYPE_STRING 
     311        }); 
     312 
     313    foreach (qw/user pass/) { 
     314        push (@cmds, { 
     315           setting => 'focr_mysql_'.$_, 
     316            default => 'fuzzyocr', 
     317            type =>  $Mail::SpamAssassin::Conf::CONF_TYPE_STRING 
     318            }); 
     319    } 
    263320 
    264321    $conf->{parser}->register_commands(\@cmds); 
  • trunk/devel/FuzzyOcr/Hashing.pm

    r20 r38  
    1010use lib "../"; 
    1111use FuzzyOcr::Config qw(get_thresholds get_config set_config debuglog logfile); 
     12use DBI; 
    1213use MLDBM qw(DB_File Storable); 
    1314use Fcntl; 
     
    5859sub check_image_hash_db { 
    5960    my $conf = get_config(); 
    60     return (0,'') unless ( 
    61         ($conf->{'focr_enable_image_hashing'} > 0) and 
    62         ($conf->{'focr_enable_image_hashing'} < 3) 
    63     ); 
     61    return (0,'') if ($conf->{focr_enable_image_hashing} == 0); 
    6462    my $digest = $_[0]; 
    6563    my $dbfile = $_[1] || $conf->{"focr_db_hash"}; 
     
    6866    my ($img, $key) = split('::', $digest,2); 
    6967    return (0,'') unless defined $key; 
     68    my $now = time; 
    7069    my $hash = $digest; 
    7170    my $ret = 0; my $txt = 'Exact'; 
    7271    my $dinfo; 
    73     my %DB = (); my $dbm; 
    74  
    75     if ($conf->{"focr_enable_image_hashing"} == 2) { 
     72 
     73    if ($conf->{focr_enable_image_hashing} == 3) { 
     74        unless (defined $conf->{focr_ddb}) { 
     75            debuglog("Cannot connect to '$conf->{focr_mysql_db}\@$conf->{focr_mysql_host}:$conf->{focr_mysql_port}"); 
     76            return (0,''); 
     77        } 
     78        my $ddb  = $conf->{focr_ddb}; 
     79        my @data = $ddb->selectrow_array(qq(select * from $fname where key='$key')); 
     80        my $next = 0; 
     81        my $when = 0; 
     82        if (scalar(@data)>0) { 
     83            $next  = $data[4] || 0; $next++; 
     84            $when  = $data[6] || $now; 
     85            $ret   = $data[7] || 0.001; 
     86            $dinfo = $data[8] || ''; 
     87            if ($data[2] eq '') { 
     88                debuglog("Updateing $txt info File:'$fname'"); 
     89                $ddb->do(qq(update $fname set fname='$fname' where key='$key')); 
     90            } 
     91            if ($data[3] eq '') { 
     92                debuglog("Updateing $txt info Type:'$ctype'"); 
     93                $ddb->do(qq(update $fname set ctype='$ctype' where key='$key')); 
     94            } 
     95        } else { 
     96            my $then = time - ($conf->{"focr_db_max_days"}*86400); 
     97            my $sth  = $ddb->prepare(qq(select * from $fname)); $sth->execute; 
     98            while (my @row = $sth->fetchrow_array) { 
     99                my $hash2 = $row[1] || "0:0:0:0"; 
     100                $hash2 .= "::$row[0]"; 
     101                if (within_threshold($digest,$hash2)) { 
     102                    $txt   = 'Approx'; 
     103                    $key   = $row[0]; 
     104                    $next  = $row[4] + 1; 
     105                    $when  = $row[6] || $now; 
     106                    $ret   = $dbfile eq $conf->{"focr_mysql_hash"} ? $row[7] : $row[4]; 
     107                    $dinfo = $row[8] || ''; 
     108                    debuglog("Found in Table:'$dbfile'"); 
     109                    last; 
     110                } 
     111            } 
     112            # Expire old records... 
     113            $ddb->do(qq(delete from $fname where check < $then)); 
     114        } 
     115        if ($ret > 0) { 
     116            if ($dbfile eq $conf->{"focr_mysql_hash"}) { 
     117                debuglog("Found Score <$ret> for $txt Image Hash"); 
     118            } 
     119            debuglog("Matched [$next] time(s). Prev match: ".fmt_time($when)); 
     120            $ddb->do(qq(update $fname set match='$next',match='$now' where key='$key')); 
     121        } 
     122    } 
     123    elsif ($conf->{"focr_enable_image_hashing"} == 2) { 
     124        my %DB = (); my $dbm; 
    76125        tie %DB, 'MLDBM', $dbfile, O_RDWR or $ret++; 
    77126        if ($ret>0) { 
     
    91140        } 
    92141        if ($ret == 0) { 
    93             my $now = time - ($conf->{"focr_db_max_days"}*86400); 
     142            my $then = time - ($conf->{"focr_db_max_days"}*86400); 
    94143            foreach my $k (keys %DB) { 
    95144                $dbm  = $DB{$k}; 
     
    103152                # Has the record expired?? 
    104153                $dbm->{check} = $now - 1 unless defined $dbm->{check}; 
    105                 if ($dbm->{check} < $now) { 
     154                if ($dbm->{check} < $then) { 
    106155                    debuglog("Expiring <$k> older than $conf->{'focr_db_max_days'} days"); 
    107156                    delete $DB{$k}; 
     
    121170        untie %DB; 
    122171        return ($ret,$dinfo); 
    123     } elsif ($conf->{"focr_enable_image_hashing"} == 1) { 
     172    } 
     173    elsif ($conf->{"focr_enable_image_hashing"} == 1) { 
    124174        $ret = open HASH, $conf->{"focr_digest_db"}; 
    125175        unless($ret) { 
     
    142192sub add_image_hash_db { 
    143193    my $conf = get_config(); 
    144     return unless ( 
    145         ($conf->{'focr_enable_image_hashing'} > 0) and 
    146         ($conf->{'focr_enable_image_hashing'} < 3) 
    147     ); 
     194    return if ($conf->{focr_enable_image_hashing} == 0); 
    148195    my $digest = $_[0]; 
    149196    my $score  = $_[1]; 
    150197    my $ret = 0; 
    151198 
    152     if ($conf->{"focr_enable_image_hashing"} == 2) { 
     199    if ($conf->{focr_enable_image_hashing} == 3) { 
     200        unless (defined $conf->{focr_ddb}) { 
     201            debuglog("Cannot connect to '$conf->{focr_mysql_db}\@$conf->{focr_mysql_host}:$conf->{focr_mysql_port}"); 
     202            return; 
     203        } 
     204        my $ddb  = $conf->{focr_ddb}; 
     205        my $table = $_[2] || $conf->{focr_mysql_hash}; 
     206        debuglog("Adding Hash to \"$table\""); 
     207        my ($img,$key) = split('::',$digest,2); 
     208        if (defined $key) { 
     209            my $sql = "insert into $table values ("; 
     210            $sql .= "'$key','$img','$_[3]','$_[4]',"; 
     211            $sql .= sprintf ("'%d','%d','%d','%d','%s'", 
     212                $table eq $conf->{focr_mysql_hash} ? 0 : 1, 
     213                time,time,$score,$_[5]); 
     214            $ddb->do($sql); 
     215        } 
     216    } 
     217    elsif ($conf->{"focr_enable_image_hashing"} == 2) { 
    153218        my $dbfile = $_[2] || $conf->{"focr_db_hash"}; 
    154219        my %DB = (); 
     
    159224        } 
    160225        debuglog("Adding Hash to \"$dbfile\""); 
    161  
    162226        my ($img,$key) = split('::',$digest,2); 
    163227        if (defined $key) { 
     
    174238        } 
    175239        untie %DB; 
    176     } elsif ($conf->{"focr_enable_image_hashing"} == 1) { 
     240    } 
     241    elsif ($conf->{"focr_enable_image_hashing"} == 1) { 
    177242        if (-e $conf->{"focr_digest_db"}) { 
    178243            $ret = open DB, ">>$conf->{'focr_digest_db'}";