| 1 |
|
|---|
| 2 |
|
|---|
| 3 |
|
|---|
| 4 |
|
|---|
| 5 |
|
|---|
| 6 |
|
|---|
| 7 |
|
|---|
| 8 |
|
|---|
| 9 |
|
|---|
| 10 |
|
|---|
| 11 |
|
|---|
| 12 |
|
|---|
| 13 |
|
|---|
| 14 |
|
|---|
| 15 |
|
|---|
| 16 |
|
|---|
| 17 |
|
|---|
| 18 |
|
|---|
| 19 |
|
|---|
| 20 |
use Getopt::Long; |
|---|
| 21 |
use DBI; |
|---|
| 22 |
use MLDBM qw(DB_File Storable); |
|---|
| 23 |
# Berkeley DB files searched/updated when hashing type 2 is configured:
# db_hash receives --learn-spam entries, db_safe receives --learn-ham entries.
my %Files = (
    db_hash => '/etc/mail/spamassassin/FuzzyOcr.db',
    db_safe => '/etc/mail/spamassassin/FuzzyOcr.safe.db',
);

# MySQL defaults used when hashing type 3 is configured.  Any
# "focr_mysql_<name> <value>" line in the configuration file overrides the
# entry of the same name (and may add new ones, e.g. "socket").
my %MySQL = (
    db   => 'FuzzyOcr',
    hash => 'Hash',
    safe => 'Safe',
    user => 'fuzzyocr',
    pass => 'fuzzyocr',
    host => 'localhost',
    port => 3306,
);

# Location of the FuzzyOcr configuration; --config overrides it.
my $cfgfile = "/etc/mail/spamassassin/FuzzyOcr.cf";

# Tool paths (keyed by program name) plus the hashing_type / max_hash settings.
my %App;

# Helper programs looked up in the binary search path.  tifftopnm is listed
# because .tif/.tiff input is converted through $App{tifftopnm}; without it
# TIFF images had no converter unless the configuration named one explicitly.
my @bin_utils = qw/pamfile ppmhist jpegtopnm giftopnm pngtopnm bmptopnm tifftopnm/;

# Command-line switches and their defaults.
my $delete     = 0;
my $verbose    = 0;
my $learn_ham  = 0;
my $learn_spam = 0;
my $score;    # stays undef unless --score is given

GetOptions(
    'verbose'    => \$verbose,
    'delete'     => \$delete,
    'config=s'   => \$cfgfile,
    'score=f'    => \$score,
    'learn-ham'  => \$learn_ham,
    'learn-spam' => \$learn_spam,
);
|---|
| 56 |
|
|---|
| 57 |
# Without at least one image hash or image file argument there is nothing to
# do: print the usage summary and exit with status 1.
# The --score line advertises "=f" because the option is parsed as a float.
unless (@ARGV) {
    print "Usage: fuzzy-find.pl [Options] (imagehash|imagefile) \n",
          "\n",
          "Available options:\n",
          "--config=s Specify location of FuzzyOcr.cf\n",
          " Default: /etc/mail/spamassassin/FuzzyOcr.cf\n",
          "--delete Removes the hash from the database\n",
          "--learn-ham Add the hash as ham to the database\n",
          "--learn-spam Add the hash as spam to the database\n",
          "--score=f Score to use when adding ham/spam\n",
          "--verbose Show more informations\n",
          "\n";
    exit 1;
}
|---|
| 71 |
|
|---|
| 72 |
|
|---|
| 73 |
# No --score on the command line: fall back to 10 when learning ham and to 0
# in every other case.
if (!defined $score) {
    if ($learn_ham) {
        $score = 10;
    }
    else {
        $score = 0;
    }
}
|---|
| 76 |
|
|---|
| 77 |
|
|---|
| 78 |
# Colon-separated directories searched for the helper binaries; a
# "focr_path_bin" line in the configuration file replaces the whole list.
my $app_path = q(/usr/local/netpbm/bin:/usr/local/bin:/usr/bin);

# Read the settings this tool cares about from the FuzzyOcr configuration:
#   focr_bin_<tool> <path>            explicit path for one helper program
#   focr_path_bin <dirs>              replacement search path
#   focr_enable_image_hashing <n>     hashing backend (2 = DB, 3 = MySQL)
#   focr_mysql_<name> <value>         MySQL connection setting
#   focr_threshold_max_hash <n>       histogram depth used for the key
# A missing/unreadable file is not fatal; the built-in defaults are kept.
# The handle is lexical and opened with three-argument open so a strange
# file name cannot be misread as an open mode.
if (open my $cfg_fh, '<', $cfgfile) {
    while (my $line = <$cfg_fh>) {
        chomp $line;
        # Drop trailing blanks and a CR from DOS-style files so they do not
        # end up inside paths, table names or passwords.
        $line =~ s/\s+$//;

        # Verbose messages use print, not printf: the values come from the
        # file and a literal '%' in them must not be treated as a format.
        if ($line =~ m/^focr_bin_(\w+) (.+)/) {
            $App{$1} = $2;
            print "Found custom path \"$2\" for application \"$1\"\n" if $verbose;
        }
        if ($line =~ m/^focr_path_bin (.+)/) {
            $app_path = $1;
            print "Found new path: \"$1\"\n" if $verbose;
        }
        if ($line =~ m/^focr_enable_image_hashing (\d)/) {
            $App{hashing_type} = $1;
            print "Found DB Hashing\n"    if ($verbose and $1 == 2);
            print "Found MySQL Hashing\n" if ($verbose and $1 == 3);
        }
        if ($line =~ m/^focr_mysql_(\w+) (.+)/) {
            $MySQL{$1} = $2;
            print "Found MySQL option $1 => '$2'\n" if $verbose;
        }
        if ($line =~ m/^focr_threshold_max_hash (.+)/) {
            $App{max_hash} = $1;
            print "Updated Thresold{max_hash} = $1\n" if $verbose;
        }
    }
    close $cfg_fh;
}
else {
    warn "Can't read configuration file, using defaults...\n";
}
|---|
| 107 |
|
|---|
| 108 |
|
|---|
| 109 |
# Histogram depth defaults to 5 when the configuration did not provide one.
unless (defined $App{max_hash}) {
    $App{max_hash} = 5;
}

# For every helper program that has no explicit path yet, take the first
# directory of the search path that contains an executable of that name.
# Programs found nowhere simply stay undefined in %App.
for my $tool (@bin_utils) {
    next if defined $App{$tool};
    DIR:
    for my $dir (split(':', $app_path)) {
        my $candidate = "$dir/$tool";
        next DIR unless -x $candidate;
        $App{$tool} = $candidate;
        last DIR;
    }
}
|---|
| 121 |
|
|---|
| 122 |
# Open a DBI connection to the MySQL database described by %MySQL.
#
# The DSN addresses the server through $MySQL{socket} when that is set;
# otherwise through $MySQL{host}, adding $MySQL{port} only when it differs
# from 3306.  With --verbose the DSN is printed before connecting.
#
# Takes no arguments.  Returns the result of DBI->connect (a database handle,
# or a false value when the connection could not be made).
sub get_ddb {
    my %dopts = ( AutoCommit => 1 );
    my $dsn   = "dbi:mysql:database=" . $MySQL{db};

    if (defined $MySQL{socket}) {
        $dsn .= ";mysql_socket=$MySQL{socket}";
    }
    else {
        $dsn .= ";host=$MySQL{host}";
        # This used to append to a misspelled variable ($dns), so a
        # non-default port never reached the DSN.
        $dsn .= ";port=$MySQL{port}" unless $MySQL{port} == 3306;
    }

    # print rather than printf: the DSN is data, not a format string.
    print "Connecting to: $dsn\n" if $verbose;
    return DBI->connect($dsn, $MySQL{user}, $MySQL{pass}, \%dopts);
}
|---|
| 134 |
|
|---|
| 135 |
# Quote one argument so it can be interpolated into a /bin/sh command line
# without the shell interpreting blanks or metacharacters in it.
sub _ff_shell_quote {
    my ($arg) = @_;
    $arg =~ s/'/'\\''/g;
    return "'$arg'";
}

# Process every command-line argument.  An argument is either
#   * "a:b:c:d"  - four numbers used directly as the basic image info,
#   * ":::0"     - the special basic info with only the last field (0) set,
#   * otherwise  - an image file whose basic info and hash key are computed
#                  by running the netpbm helpers on it.
# Depending on the switches the entry is then learned (--learn-spam /
# --learn-ham), deleted (--delete) or looked up and displayed, in the DB
# files (hashing type 2) or the MySQL tables (hashing type 3).
while (@ARGV) {
    my $file  = shift @ARGV;
    my @data  = ();    # size, height, width, number of colours
    my $key   = '';    # stays empty unless computed from an image file
    my $ctype = '';
    my $ftype = 0;

    if ($file =~ m/(\d+):(\d+):(\d+):(\d+)/) {
        push @data, $1, $2, $3, $4;
    }
    elsif ($file eq ':::0') {
        # Only the fourth field is set, so join(':', @data) gives ":::0".
        # (An old assignment of this string to $key never took effect,
        # because the lexical $key was declared after it; the key stays
        # empty here as before.)
        $data[3] = 0;
    }
    elsif (!-r $file) {
        warn "Skipping \"$file\": not a readable file\n";
        next;
    }

    unless (@data) {
        # Pick the converter, MIME type and numeric file type from the
        # file extension.
        my $app;
        if ($file =~ m/\.jpe?g$/i) {
            $app   = $App{jpegtopnm};
            $ctype = "image/jpeg";
            $ftype = 2;
        }
        elsif ($file =~ m/\.png$/i) {
            $app   = $App{pngtopnm};
            $ctype = "image/png";
            $ftype = 3;
        }
        elsif ($file =~ m/\.bmp$/i) {
            $app   = $App{bmptopnm};
            $ctype = "image/bmp";
            $ftype = 4;
        }
        elsif ($file =~ m/\.tiff?$/i) {
            $app   = $App{tifftopnm};
            $ctype = "image/tiff";
            $ftype = 5;
        }
        elsif ($file =~ m/\.gif$/i) {
            $app   = $App{giftopnm};
            $ctype = "image/gif";
            $ftype = 1;
        }
        elsif ($file =~ m/\.pnm$/i) {
            $app   = '/bin/cat';
            $ctype = "image/pnm";
        }
        else {
            print "Unknown extension given in \"$file\", aborting...\n";
            exit 1;
        }

        # Without the converter, ppmhist and pamfile the pipelines below
        # would only yield an empty, meaningless hash.
        unless (defined $app and defined $App{ppmhist} and defined $App{pamfile}) {
            print "Required netpbm tools for \"$file\" not found, skipping...\n";
            next;
        }

        # The file name comes straight from the command line: quote it so
        # blanks or shell metacharacters cannot alter the command.
        my $quoted = _ff_shell_quote($file);
        my @hist = `$app $quoted 2>/dev/null |$App{ppmhist} -noheader -`;
        my @res  = `$app $quoted 2>/dev/null |$App{pamfile} -`;

        # pamfile reports "<n> by <m>"; the first number is kept as width
        # and the second as height.  Both stay 0 when nothing was reported.
        my ($h, $w) = (0, 0);
        if (@res and $res[0] =~ m/(\d+) by (\d+)/) {
            ($w, $h) = ($1, $2);
            print "Found ($h,$w)\n" if $verbose;
        }

        my $c = scalar(@hist);
        printf "Colors: %d\n", $c if $verbose;
        push @data, (stat($file))[7], $h, $w, $c;

        # Build the key from the leading histogram lines.  $hash is local
        # to this file; it used to be a global that kept growing from one
        # argument to the next, corrupting the key of every later file.
        my $hash = '';
        my $cnt  = 0;
        foreach my $line (@hist) {
            my @d = split(' ', $line);
            $hash .= sprintf("::%d:%d:%d:%d:%d", @d);
            # NOTE(review): "ge" compares as strings, so a max_hash of 10
            # or more stops early (e.g. "2" ge "10").  Left unchanged
            # because the key must equal what the FuzzyOcr plugin stores;
            # confirm the plugin's comparison before switching to ">=".
            last if ($cnt++ ge $App{max_hash});
        }
        $key = length($hash) ? substr($hash, 2) : '';
    }

    printf "Img = %9d %dx%dx%d\n", @data;
    print "key = <$key>\n" if ($key);

    my $hashing = defined $App{hashing_type} ? $App{hashing_type} : 0;

    if ($learn_spam || $learn_ham) {
        # Learning needs a real key; hash-style arguments and images that
        # produced no histogram have none.
        if ($key eq '') {
            print "No image hash available for \"$file\", nothing to learn...\n";
            next;
        }

        # A score of 0 (or none) falls back to +5 for spam and -5 for ham,
        # for both storage backends.
        my $dfscore     = $learn_spam ? 5 : -5;
        my $learn_score = $score ? $score : $dfscore;

        if ($hashing == 2) {
            my $ff = $learn_spam ? 'db_hash' : 'db_safe';
            my %DB;
            tie %DB, 'MLDBM', $Files{$ff} or die "Can't open $ff";
            print "Adding key to database...\n";

            # Fetch, modify and store back the whole record, since the
            # tied hash holds serialized values.
            my $dbm = $DB{$key};
            $dbm->{fname} = $file;
            $dbm->{ctype} = $ctype;
            $dbm->{dinfo} = "Manually added to the database\n";
            $dbm->{basic} = join(':', @data);
            $dbm->{score} = $learn_score;
            $dbm->{input} = $dbm->{check} = time;
            $dbm->{match} = $learn_spam ? 0 : 1;
            $DB{$key} = $dbm;
            untie %DB;
            # Continue with the remaining arguments (this used to exit,
            # so only the first file was ever learned).
            next;
        }
        elsif ($hashing == 3) {
            my $ddb = get_ddb();
            unless ($ddb) {
                print "Cannot connect to database \"$MySQL{db}\"\n";
                exit 1;
            }
            my $now = time;
            my $tab = $learn_spam ? 'hash' : 'safe';
            # Placeholders replace the hand-built statement, which was
            # reduced to "0" by ?: precedence, lacked its closing "')" and
            # carried a stray empty column.  Ten values in this order line
            # up with the column positions read back by the lookup below
            # (1 = basic, 5 = match, 6/7 = timestamps, 8 = score).
            # TODO confirm against the table definition.
            my $sql = "INSERT INTO $MySQL{$tab} VALUES (?,?,?,?,?,?,?,?,?,?)";
            $ddb->do(
                $sql, undef,
                $key, join(':', @data), $file, $ctype, $ftype,
                ($learn_spam ? 0 : 1), $now, $now, $learn_score,
                "Manually added to the database\n",
            );
            $ddb->disconnect;
            next;
        }
        else {
            print "No supported image hashing configured, cannot learn \"$file\"\n";
        }
    }
    else {
        if ($hashing == 2) {
            my $basic = join(':', @data);
            foreach my $ff (keys %Files) {
                my %DB;
                tie %DB, 'MLDBM', $Files{$ff} or next;
                print "Searching $Files{$ff}...\n";
                foreach my $kk (keys %DB) {
                    my $db = $DB{$kk};
                    # Stored keys may start with a "::"-separated prefix
                    # that contains no ':'; strip it before comparing.
                    my @dd = split('::', $kk);
                    shift @dd if ($dd[0] !~ m/:/);
                    my $dd = join('::', @dd);

                    # Without a key, match on the basic image info.
                    if ($key eq '') {
                        next unless ($db->{basic} eq $basic);
                    }
                    else {
                        next unless ($dd eq $key);
                    }

                    printf "%s HASH\n", ($delete) ? 'Removing' : 'Found';
                    if ($delete) {
                        delete $DB{$kk};
                    }
                    else {
                        printf "ImageInfo : %9d:%d:%d:%d\n", split(':', $db->{basic});
                        printf "Matched : %4d Time(s)\n", $db->{match};
                        printf "Calc.Score : %9.3f\n", $db->{score};
                        printf "in DB since: %s\n", scalar(localtime($db->{input}));
                        printf "Last Match : %s\n", scalar(localtime($db->{check}));
                    }
                }
                untie %DB;
            }
        }
        elsif ($hashing == 3) {
            my $ddb = get_ddb();
            if ($ddb) {
                foreach my $ff (sort keys %Files) {
                    # "db_hash"/"db_safe" select $MySQL{hash}/$MySQL{safe}.
                    # The DELETE used to name the %Files key as the table
                    # and read $tab before it was set.
                    (my $tab = $ff) =~ s/db_//;
                    my $table = $MySQL{$tab};
                    if ($delete) {
                        $ddb->do("DELETE FROM $table WHERE $table.key=?", undef, $key);
                    }
                    else {
                        my @row = $ddb->selectrow_array(
                            "SELECT * FROM $table where $table.key=?", undef, $key);
                        if (scalar(@row)) {
                            printf "ImageInfo : %9d:%d:%d:%d\n", split(':', $row[1]);
                            printf "Matched : %4d Time(s)\n", $row[5];
                            printf "Calc.Score : %9.3f\n", $row[8];
                            printf "in DB since: %s\n", scalar(localtime($row[6]));
                            printf "Last Match : %s\n", scalar(localtime($row[7]));
                        }
                    }
                }
                $ddb->disconnect;
            }
            else {
                print "Cannot connect to database \"$MySQL{db}\"\n";
            }
        }
    }
}
|---|