Changeset 89
- Timestamp:
- 06.12.2006 23:46:25 (2 years ago)
- Files:
-
- trunk/devel/FuzzyOcr.pm (modified) (3 diffs)
- trunk/devel/FuzzyOcr/Config.pm (modified) (4 diffs)
- trunk/devel/Utils/fuzzy-find (modified) (9 diffs)
Legend:
- Unmodified
- Added
- Removed
- Modified
- Copied
- Moved
trunk/devel/FuzzyOcr.pm
r88 r89 139 139 my $test = 0; 140 140 $test++ if ($ctype =~ /image/i); 141 $test++ if ($fname =~ / (gif|jpg|jpeg|png|bmp|tiff?)$/i);141 $test++ if ($fname =~ /\.(gif|jpe?g|png|bmp|tiff?)$/i); 142 142 143 143 if ($test == 0) { … … 262 262 print PICT $pdata; 263 263 close PICT; 264 infolog("Saved: $imgfilename");264 debuglog("Saved: $imgfilename"); 265 265 $cnt++; 266 266 } … … 756 756 $wcnt++; 757 757 infolog( 758 " Found word \"$w\" in line\n \"$_\" \nwith fuzz of "758 "Scanset \"$scanlabel\" found word \"$w\" with fuzz of " 759 759 . sprintf("%0.4f",$matched) 760 . " scanned with scanset \"$scanlabel\""760 . "\nline: \"$_\"" 761 761 ); 762 762 if ($conf->{focr_unique_matches}) { trunk/devel/FuzzyOcr/Config.pm
r88 r89 68 68 my @img_types = qw/gif png jpeg bmp tiff/; 69 69 70 # Default thresolds71 $Threshold{s} =72 $Threshold{h} =73 $Threshold{w} =74 $Threshold{cn} = 0.01;75 $Threshold{c} = 5;76 $Threshold{max_hash} = 5;77 78 70 sub get_timeout { 79 71 unless (defined $timeout) { … … 164 156 $dsn .= ";port=".$conf->{focr_mysql_port} if $conf->{focr_mysql_port} != 3306; 165 157 } 166 infolog("Connecting to: $dsn");158 debuglog("Connecting to: $dsn"); 167 159 my $ddb = DBI->connect($dsn, 168 160 $conf->{focr_mysql_user}, … … 176 168 my @cmds = (); 177 169 170 foreach my $t (qw/s h w cn/) { 171 push (@cmds, { 172 setting => 'focr_threshold_'.$t, 173 default => 0.01, 174 type => $Mail::SpamAssassin::Conf::CONF_TYPE_NUMERIC 175 }); 176 } 177 foreach my $t (qw/c max_hash/) { 178 push (@cmds, { 179 setting => 'focr_threshold_'.$t, 180 default => 5, 181 type => $Mail::SpamAssassin::Conf::CONF_TYPE_NUMERIC 182 }); 183 } 178 184 push (@cmds, { 179 185 setting => 'focr_threshold', … … 552 558 } 553 559 560 # Allow scanning if in debug mode? 561 $conf->{focr_autodisable_score} = 1000 562 if $Mail::SpamAssassin::Logger::LOG_SA{level} == 3; 563 564 # Extract Thresholds 565 foreach my $k (keys %{$conf}) { 566 if ($k =~ m/^focr_threshold_(\S+)/) { 567 $Threshold{$1} = $conf->{$k}; 568 debuglog("Threshold[$1] => $conf->{$k}"); 569 } 570 } 554 571 # Display All Options 555 572 foreach my $k (sort keys %{$conf}) { trunk/devel/Utils/fuzzy-find
r40 r89 1 #!/usr/ bin/perl1 #!/usr/local/bin/perl 2 2 use Getopt::Long; 3 use DBI; 3 4 use MLDBM qw(DB_File Storable); 4 5 my %Files = ( … … 7 8 ); 8 9 10 my %MySQL = ( 11 db => 'FuzzyOcr' 12 ,hash => 'Hash' 13 ,safe => 'Safe' 14 ,user => 'fuzzyocr' 15 ,pass => 'fuzzyocr' 16 ,host => 'localhost' 17 ,port => 3306 18 ); 19 9 20 # defaults 10 my $cfgfile = " FuzzyOcr.cf";21 my $cfgfile = "/etc/mail/spamassassin/FuzzyOcr.cf"; 11 22 my %App; 12 23 my @bin_utils = qw/pamfile ppmhist jpegtopnm giftopnm pngtopnm bmptopnm/; 13 foreach (@bin_utils) {14 $App{$_} = "/usr/bin/$_";15 }16 24 17 25 my $delete = 0; … … 19 27 my $learn_ham = 0; 20 28 my $learn_spam = 0; 29 my $score; 21 30 GetOptions( 22 31 'verbose' => \$verbose, 23 32 'delete' => \$delete, 33 'config=s' => \$cfgfile, 34 'score=f' => \$score, 24 35 'learn-ham' => \$learn_ham, 25 36 'learn-spam' => \$learn_spam, … … 30 41 print "\n"; 31 42 print "Available options:\n"; 43 print "--config=s Specify location of FuzzyOcr.cf\n"; 44 print " Default: /etc/mail/spamassasin/FuzzyOcr.cf\n"; 32 45 print "--delete Removes the hash from the database\n"; 33 46 print "--learn-ham Add the hash as ham to the database\n"; 34 47 print "--learn-spam Add the hash as spam to the database\n"; 48 print "--score=i Score to use when adding ham/spam\n"; 35 49 print "--verbose Show more informations\n"; 36 50 print "\n"; … … 38 52 } 39 53 54 # Setup default score 55 unless (defined $score) { 56 $score = $learn_ham ? 10 : 0; 57 } 58 40 59 # Read custom paths from FuzzyOcr.cf 60 my $app_path = q(/usr/local/netpbm/bin:/usr/local/bin:/usr/bin); 41 61 open CONFIG, "< $cfgfile" or warn "Can't read configuration file, using defaults...\n"; 42 62 … … 44 64 chomp; 45 65 if ($_ =~ m/^focr_bin_(\w+) (.+)/) { 46 $App{$1} = $2; 47 printf "Found custom path \"$2\" for application \"$1\"\n" if $verbose 66 $App{$1} = $2; 67 printf "Found custom path \"$2\" for application \"$1\"\n" if $verbose; 68 } 69 if ($_ =~ m/^focr_path_bin (.+)/) { 70 $app_path = $1; 71 printf "Found new path: \"$1\"\n" if $verbose; 72 } 73 if ($_ =~ m/^focr_enable_image_hashing (\d)/) { 74 $App{hashing_type} = $1; 75 printf "Found DB Hashing\n" if ($verbose and $1 == 2); 76 printf "Found MySQL Hashing\n" if ($verbose and $1 == 3); 77 } 78 if ($_ =~ m/^focr_mysql_(\w+) (.+)/) { 79 $MySQL{$1} = $2; 80 printf "Found MySQL option $1 => '$2'\n" if $verbose; 81 } 82 if ($_ =~ m/^focr_threshold_max_hash (.+)/) { 83 $App{max_hash} = $1; 84 printf "Updated Thresold{max_hash} = $1\n" if $verbose; 48 85 } 49 86 } 50 87 51 88 close CONFIG; 89 90 # make shure we have this threshold set 91 $App{max_hash} = 5 unless defined $App{max_hash}; 92 93 # search path for bin_util unless already specified in configuration file 94 foreach my $app (@bin_utils) { 95 next if defined $App{$app}; 96 foreach my $d (split(':',$app_path)) { 97 if (-x "$d/$app") { 98 $App{$app} = "$d/$app"; 99 last; 100 } 101 } 102 } 103 104 sub get_ddb { 105 my %dopts = ( AutoCommit => 1 ); 106 my $dsn = "dbi:mysql:database=".$MySQL{db}; 107 if (defined $MySQL{socket}) { 108 $dsn .= ";mysql_socket=$MySQL{socket}"; 109 } else { 110 $dsn .= ";host=$MySQL{host}"; 111 $dns .= ";port=$MySQL{port}" unless $MySQL{port} == 3306; 112 } 113 printf "Connecting to: $dsn\n" if $verbose; 114 return DBI->connect($dsn,$MySQL{user},$MySQL{pass},\%dopts); 115 } 52 116 53 117 while (@ARGV) { … … 64 128 my $key = ''; 65 129 my $ctype = ''; 130 my $ftype = 0; 66 131 unless (@data) { 67 132 my $app; 68 133 if (($file =~ m/\.jpg$/i) or ($file =~ m/\.jpeg$/i)) { 69 134 $app = $App{jpegtopnm}; 70 $ctype = "image/jpeg"; 135 $ctype = "image/jpeg"; 136 $ftype = 2; 71 137 } elsif ($file =~ m/\.png$/i) { 72 138 $app = $App{pngtopnm}; 73 $ctype = "image/png"; 139 $ctype = "image/png"; 140 $ftype = 3; 74 141 } elsif ($file =~ m/\.bmp$/i) { 142 $ctype = "image/bmp"; 75 143 $app = $App{bmptopnm}; 76 $ctype = "image/bmp"; 144 $ftype = 4; 145 } elsif ($file =~ m/\.tiff?$/i) { 146 $app = $App{tifftopnm}; 147 $ctype = "image/tiff"; 148 $ftype = 5; 149 } elsif ($file =~ m/\.gif$/i) { 150 $app = $App{giftopnm}; 151 $ctype = "image/gif"; 152 $ftype = 1; 77 153 } elsif ($file =~ m/\.pnm$/i) { 78 $ctype = "image/pnm";79 154 $app = '/bin/cat'; 80 } elsif ($file =~ m/\.gif$/i) { 81 $ctype = "image/gif"; 82 $app = $App{giftopnm}; 155 $ctype = "image/pnm"; 83 156 } else { 84 print "Unknown extension given in \"$file\", aborting...\n";85 exit 1;86 }87 my @hist = `$app $file |$App{ppmhist} -noheader -`;88 my @res = `$app $file |$App{pamfile} -`;157 print "Unknown extension given in \"$file\", aborting...\n"; 158 exit 1; 159 } 160 my @hist = `$app $file 2>/dev/null |$App{ppmhist} -noheader -`; 161 my @res = `$app $file 2>/dev/null |$App{pamfile} -`; 89 162 my ($h,$w) = (0,0); 90 163 if ($res[0] =~ m/(\d+) by (\d+)/) { … … 99 172 my @d = split(' ',$_); 100 173 $hash .= sprintf("::%d:%d:%d:%d:%d",@d); 101 last if ($cnt++ ge 5);174 last if ($cnt++ ge $App{max_hash}); 102 175 } 103 176 $key = substr($hash,2); … … 106 179 printf "key = <$key>\n" if ($key); 107 180 if ($learn_spam || $learn_ham) { 108 my %DB; 109 my $ff = $learn_spam ? 'db_hash' : 'db_safe'; 110 tie %DB, 'MLDBM', $Files{$ff} or die "Can't open $ff"; 111 print "Adding key to database...\n"; 112 if (defined $key) { 113 my $dbm = $DB{$key}; 114 $dbm->{fname} = $file; 115 $dbm->{ctype} = $ctype; 116 $dbm->{dinfo} = "Manually added to the database\n"; 117 $dbm->{basic} = join(':', @data); 118 $dbm->{score} = $learn_spam ? 10 : 0; 119 $dbm->{input} = 120 $dbm->{check} = time; 121 $dbm->{match} = $learn_spam ? 0 : 1; 122 $DB{$key} = $dbm; 123 } 124 untie %DB; 125 exit 0; 181 if ($App{hashing_type} == 2) { 182 my %DB; 183 my $ff = $learn_spam ? 'db_hash' : 'db_safe'; 184 tie %DB, 'MLDBM', $Files{$ff} or die "Can't open $ff"; 185 print "Adding key to database...\n"; 186 if (defined $key) { 187 my $dbm = $DB{$key}; 188 $dbm->{fname} = $file; 189 $dbm->{ctype} = $ctype; 190 $dbm->{dinfo} = "Manually added to the database\n"; 191 $dbm->{basic} = join(':', @data); 192 $dbm->{score} = $score; 193 $dbm->{input} = 194 $dbm->{check} = time; 195 $dbm->{match} = $learn_spam ? 0 : 1; 196 $DB{$key} = $dbm; 197 } 198 untie %DB; 199 exit 0; 200 } elsif ($App{hashing_type} == 3) { 201 my $ddb = get_ddb(); 202 if ($ddb) { 203 my $now = time; 204 my $tab = $learn_spam ? 'hash' : 'safe'; 205 my $sql = "INSERT INTO $MySQL{$tab} VALUES ('" . $key 206 . "','" . join(':',@data)."','" 207 . "','" . $file 208 . "','" . $ctype 209 . "','" . $ftype 210 . "','" . $learn_spam ? 0 : 1 211 . "','" . $now 212 . "','" . $now 213 . "','" . $score 214 . "','" . "Manually added to the database\n"; 215 $ddb->do($sql); 216 $dbb->disconnect; 217 } else { 218 printf "Cannot connect to $dsn\n"; 219 exit 1; 220 } 221 exit 0; 222 } 126 223 } else { 127 foreach my $ff (keys %Files) { 128 my %DB; 129 tie %DB, 'MLDBM', $Files{$ff} or next; 130 printf "Searching $Files{$ff}...\n"; 131 foreach my $kk (keys %DB) { 132 my $db = $DB{$kk}; 133 my @dd = split('::',$kk); 134 shift @dd if ($dd[0] !~ m/:/); 135 my $dd = join('::',@dd); 136 if ($key eq '') { 137 next unless ($db->{basic} eq join(':',@data)); 138 } else { 139 next unless ($dd eq $key); 140 } 141 printf "%s HASH\n",($delete)?'Removing':'Found'; 142 if ($delete) { 143 delete $DB{$kk}; 144 } else { 145 printf "ImageInfo : %9d:%d:%d:%d\n",split(':',$db->{basic}); 146 printf "Matched : %4d Time(s)\n",$db->{match}; 147 printf "Calc.Score : %9.3f\n",$db->{score}; 148 printf "in DB since: %s\n",scalar(localtime($db->{input})); 149 printf "Last Match : %s\n",scalar(localtime($db->{check})); 150 } 151 } 152 untie %DB; 153 } 154 } 155 } 224 if ($App{hashing_type} == 2) { 225 foreach my $ff (keys %Files) { 226 my %DB; 227 tie %DB, 'MLDBM', $Files{$ff} or next; 228 printf "Searching $Files{$ff}...\n"; 229 foreach my $kk (keys %DB) { 230 my $db = $DB{$kk}; 231 my @dd = split('::',$kk); 232 shift @dd if ($dd[0] !~ m/:/); 233 my $dd = join('::',@dd); 234 if ($key eq '') { 235 next unless ($db->{basic} eq join(':',@data)); 236 } else { 237 next unless ($dd eq $key); 238 } 239 printf "%s HASH\n",($delete)?'Removing':'Found'; 240 if ($delete) { 241 delete $DB{$kk}; 242 } else { 243 printf "ImageInfo : %9d:%d:%d:%d\n",split(':',$db->{basic}); 244 printf "Matched : %4d Time(s)\n",$db->{match}; 245 printf "Calc.Score : %9.3f\n",$db->{score}; 246 printf "in DB since: %s\n",scalar(localtime($db->{input})); 247 printf "Last Match : %s\n",scalar(localtime($db->{check})); 248 } 249 } 250 untie %DB; 251 } 252 } elsif ($App{hashing_type} == 3) { 253 my $ddb = get_ddb(); 254 if ($ddb) { 255 foreach my $ff (sort keys %Files) { 256 my $sql; 257 if ($delete) { 258 $sql = "DELETE FROM $ff WHERE $MySQL{$tab}.key=?"; 259 $ddb->do($sql,undef,$key); 260 } else { 261 my $tab = $ff; $tab =~ s/db_//; 262 $sql = "SELECT * FROM $MySQL{$tab} where $MySQL{$tab}.key=?"; 263 my @data = $ddb->selectrow_array($sql,undef,$key); 264 if (scalar(@data)) { 265 printf "ImageInfo : %9d:%d:%d:%d\n",split(':',$data[1]); 266 printf "Matched : %4d Time(s)\n",$data[5]; 267 printf "Calc.Score : %9.3f\n",$data[8]; 268 printf "in DB since: %s\n",scalar(localtime($data[6])); 269 printf "Last Match : %s\n",scalar(localtime($data[7])); 270 } 271 } 272 } 273 $ddb->disconnect; 274 } 275 } 276 } 277 }
