| 1 |
|
|---|
| 2 |
|
|---|
| 3 |
|
|---|
| 4 |
|
|---|
| 5 |
|
|---|
| 6 |
|
|---|
| 7 |
|
|---|
| 8 |
|
|---|
| 9 |
|
|---|
| 10 |
|
|---|
| 11 |
|
|---|
| 12 |
|
|---|
| 13 |
|
|---|
| 14 |
|
|---|
| 15 |
|
|---|
| 16 |
|
|---|
| 17 |
|
|---|
| 18 |
|
|---|
| 19 |
|
|---|
| 20 |
|
|---|
| 21 |
|
|---|
| 22 |
|
|---|
| 23 |
|
|---|
| 24 |
|
|---|
| 25 |
|
|---|
| 26 |
|
|---|
| 27 |
|
|---|
| 28 |
|
|---|
| 29 |
|
|---|
| 30 |
|
|---|
| 31 |
|
|---|
| 32 |
|
|---|
| 33 |
|
|---|
| 34 |
|
|---|
| 35 |
|
|---|
| 36 |
|
|---|
| 37 |
|
|---|
| 38 |
|
|---|
| 39 |
|
|---|
| 40 |
|
|---|
| 41 |
|
|---|
| 42 |
|
|---|
| 43 |
|
|---|
| 44 |
package FuzzyOcr; |
|---|
| 45 |
|
|---|
| 46 |
use strict; |
|---|
| 47 |
use warnings; |
|---|
| 48 |
use Mail::SpamAssassin; |
|---|
| 49 |
use Mail::SpamAssassin::Logger; |
|---|
| 50 |
use Mail::SpamAssassin::Util; |
|---|
| 51 |
use Mail::SpamAssassin::Timeout; |
|---|
| 52 |
use Mail::SpamAssassin::Plugin; |
|---|
| 53 |
|
|---|
| 54 |
use String::Approx 'adistr'; |
|---|
| 55 |
use Image::Magick; |
|---|
| 56 |
use MLDBM qw(DB_File Storable); |
|---|
| 57 |
use FileHandle; |
|---|
| 58 |
use Fcntl ':flock'; |
|---|
| 59 |
|
|---|
| 60 |
our @ISA = qw (Mail::SpamAssassin::Plugin); |
|---|
| 61 |
|
|---|
| 62 |
our %App = (); |
|---|
| 63 |
our %Option = (); |
|---|
| 64 |
our %Score = (); |
|---|
| 65 |
our %Threshold = (); |
|---|
| 66 |
our %words = (); |
|---|
| 67 |
our $self; |
|---|
| 68 |
our $pms; |
|---|
| 69 |
our @scansets; |
|---|
| 70 |
|
|---|
| 71 |
our @bin_utils = qw/giffix giftext gifinter giftopnm |
|---|
| 72 |
jpegtopnm pngtopnm bmptopnm ppmhist gocr/; |
|---|
| 73 |
|
|---|
| 74 |
our @pgm_scores = qw/base add corrupt corrupt_unfixable wrongctype |
|---|
| 75 |
autodisable/; |
|---|
| 76 |
|
|---|
| 77 |
our @pgm_opts = qw/personal_wordlist global_wordlist logfile |
|---|
| 78 |
threshold counts_required verbose timeout gif_max_frames |
|---|
| 79 |
db_hash db_safe db_max_days path_bin scansets keep_bad_images |
|---|
| 80 |
anim_delay anim_max_frames score_ham |
|---|
| 81 |
enable_image_hashing digest_db hashing_learn_scanned/; |
|---|
| 82 |
|
|---|
| 83 |
our @paths = qw(/usr/local/netpbm/bin /usr/local/bin /usr/bin); |
|---|
| 84 |
|
|---|
| 85 |
|
|---|
| 86 |
$Option{threshold} = 0.3; |
|---|
| 87 |
$Option{counts_required} = 2; |
|---|
| 88 |
$Option{verbose} = 1; |
|---|
| 89 |
$Option{timeout} = 10; |
|---|
| 90 |
$Option{gif_max_frames} = 5; |
|---|
| 91 |
$Option{logfile} = "stderr"; |
|---|
| 92 |
$Option{enable_image_hashing} = 0; |
|---|
| 93 |
$Option{hashing_learn_scanned} = 1; |
|---|
| 94 |
$Option{digest_db} = "/etc/mail/spamassassin/FuzzyOcr.hashdb"; |
|---|
| 95 |
$Option{global_wordlist} = "/etc/mail/spamassassin/FuzzyOcr.words"; |
|---|
| 96 |
$Option{personal_wordlist} = ".spamassassin/fuzzyocr.words"; |
|---|
| 97 |
$Option{db_hash} = "/etc/mail/spamassassin/FuzzyOcr.db"; |
|---|
| 98 |
$Option{db_safe} = "/etc/mail/spamassassin/FuzzyOcr.safe.db"; |
|---|
| 99 |
$Option{db_max_days} = 35; |
|---|
| 100 |
$Option{keep_bad_images} = 0; |
|---|
| 101 |
$Option{anim_delay} = 100; |
|---|
| 102 |
$Option{anim_max_frames} = 2; |
|---|
| 103 |
$Option{score_ham} = 0; |
|---|
| 104 |
|
|---|
| 105 |
|
|---|
| 106 |
$Score{base} = 4; |
|---|
| 107 |
$Score{add} = 1; |
|---|
| 108 |
$Score{corrupt} = 2.5; |
|---|
| 109 |
$Score{corrupt_unfixable} = 5; |
|---|
| 110 |
$Score{wrongctype} = 1.5; |
|---|
| 111 |
$Score{autodisable} = 10; |
|---|
| 112 |
|
|---|
| 113 |
|
|---|
| 114 |
$Threshold{s} = |
|---|
| 115 |
$Threshold{h} = |
|---|
| 116 |
$Threshold{w} = |
|---|
| 117 |
$Threshold{cn} = 0.01; |
|---|
| 118 |
$Threshold{c} = 5; |
|---|
| 119 |
$Threshold{max_hash} = 5; |
|---|
| 120 |
|
|---|
| 121 |
|
|---|
| 122 |
sub new { |
|---|
| 123 |
my ( $class, $mailsa ) = @_; |
|---|
| 124 |
$class = ref($class) || $class; |
|---|
| 125 |
my $self = $class->SUPER::new($mailsa); |
|---|
| 126 |
bless( $self, $class ); |
|---|
| 127 |
$self->register_eval_rule("fuzzyocr_check"); |
|---|
| 128 |
$self->register_eval_rule("dummy_check"); |
|---|
| 129 |
return $self; |
|---|
| 130 |
} |
|---|
| 131 |
|
|---|
| 132 |
sub parse_config { |
|---|
| 133 |
my ( $self, $opts ) = @_; |
|---|
| 134 |
if ( $opts->{key} =~ /^focr_bin_/i ) { |
|---|
| 135 |
my $p = lc $opts->{key}; |
|---|
| 136 |
$p =~ s/focr_bin_//; |
|---|
| 137 |
if (grep {m/$p/} @bin_utils) { |
|---|
| 138 |
$App{$p} = $opts->{value}; |
|---|
| 139 |
debuglog("App{$p} => $App{$p}"); |
|---|
| 140 |
} else { |
|---|
| 141 |
debuglog("unknown App: $opts->{key}"); |
|---|
| 142 |
} |
|---|
| 143 |
} |
|---|
| 144 |
elsif ( $opts->{key} =~ m/_score$/i ) { |
|---|
| 145 |
my $o = lc $opts->{key}; |
|---|
| 146 |
$o =~ s/focr_//; |
|---|
| 147 |
$o =~ s/_score//; |
|---|
| 148 |
if (grep {m/$o/} @pgm_scores) { |
|---|
| 149 |
$Score{$o} = $opts->{value}; |
|---|
| 150 |
debuglog("Score{$o} = $Score{$o}"); |
|---|
| 151 |
} else { |
|---|
| 152 |
debuglog("unknown Score: $opts->{key}"); |
|---|
| 153 |
} |
|---|
| 154 |
} |
|---|
| 155 |
else { |
|---|
| 156 |
my $o = lc $opts->{key}; |
|---|
| 157 |
$o =~ s/focr_//; |
|---|
| 158 |
if (grep {m/$o/} @pgm_opts) { |
|---|
| 159 |
if ($o eq 'scansets') { |
|---|
| 160 |
@scansets = (); |
|---|
| 161 |
foreach my $s (split(',',$opts->{value})) { |
|---|
| 162 |
$s =~ s/^\s*//; $s =~ s/\s*$//; |
|---|
| 163 |
push @scansets,$s; |
|---|
| 164 |
debuglog("Found scan: $s"); |
|---|
| 165 |
} |
|---|
| 166 |
} elsif ($o eq 'path_bin') { |
|---|
| 167 |
@paths = (); |
|---|
| 168 |
foreach my $p (split(':',$opts->{value})) { |
|---|
| 169 |
next unless -d $p; |
|---|
| 170 |
push @paths,$p; |
|---|
| 171 |
debuglog("Valid search path: $p"); |
|---|
| 172 |
} |
|---|
| 173 |
} else { |
|---|
| 174 |
$Option{$o} = $opts->{value}; |
|---|
| 175 |
debuglog("Option $o = $Option{$o}"); |
|---|
| 176 |
} |
|---|
| 177 |
} else { |
|---|
| 178 |
debuglog("unknown Option: $opts->{key}"); |
|---|
| 179 |
} |
|---|
| 180 |
} |
|---|
| 181 |
1; |
|---|
| 182 |
} |
|---|
| 183 |
|
|---|
| 184 |
sub finish_parsing_end { |
|---|
| 185 |
if ($Option{logfile} !~ m/stderr/i) { |
|---|
| 186 |
if(!Mail::SpamAssassin::Logger::add(method=>'file', filename=>$Option{logfile})) { |
|---|
| 187 |
debuglog("Could not log into $Option{logfile}"); |
|---|
| 188 |
} else { |
|---|
| 189 |
Mail::SpamAssassin::Logger::add_facilities('file'); |
|---|
| 190 |
debuglog("Now logging to \"$Option{logfile}\""); |
|---|
| 191 |
} |
|---|
| 192 |
} |
|---|
| 193 |
unless (@paths) { |
|---|
| 194 |
foreach my $p ( '/usr/local/netpbm/bin', '/usr/local/bin', '/usr/bin' ) { |
|---|
| 195 |
if (-d $p) { |
|---|
| 196 |
push @paths, $p; |
|---|
| 197 |
debuglog("Searching in: $p"); |
|---|
| 198 |
} |
|---|
| 199 |
} |
|---|
| 200 |
} |
|---|
| 201 |
foreach my $a (@bin_utils) { |
|---|
| 202 |
if (defined $App{$a} and ! -x $App{$a}) { |
|---|
| 203 |
debuglog("cannot exec $a, removing..."); |
|---|
| 204 |
delete $App{$a}; |
|---|
| 205 |
} |
|---|
| 206 |
foreach my $p (@paths) { |
|---|
| 207 |
my $f = "$p/$a"; |
|---|
| 208 |
if (! defined $App{$a} and -x $f) { |
|---|
| 209 |
$App{$a} = $f; |
|---|
| 210 |
last; |
|---|
| 211 |
} |
|---|
| 212 |
} |
|---|
| 213 |
if (defined $App{$a}) { |
|---|
| 214 |
debuglog("Using $a => $App{$a}"); |
|---|
| 215 |
} else { |
|---|
| 216 |
debuglog("Cannot find executable for $a"); |
|---|
| 217 |
} |
|---|
| 218 |
} |
|---|
| 219 |
if ($Option{enable_image_hashing} == 2 and -r $Option{digest_db}) { |
|---|
| 220 |
my %DB; my $dbm; my $err = 0; |
|---|
| 221 |
my $now = time - ($Option{db_max_days}*86400); |
|---|
| 222 |
tie %DB, 'MLDBM', $Option{db_hash} or $err++; |
|---|
| 223 |
if ($err) { |
|---|
| 224 |
debuglog("Could not open \"$Option{db_hash}\""); |
|---|
| 225 |
} else { |
|---|
| 226 |
my $hash = 0; |
|---|
| 227 |
debuglog("Expiring records prior to: ".scalar(localtime($now))); |
|---|
| 228 |
foreach my $k (keys %DB) { |
|---|
| 229 |
my $db = $DB{$k}; |
|---|
| 230 |
if ($db->{check} < $now) { |
|---|
| 231 |
debuglog("Expire: <$k> Reason: $db->{check} < $now"); |
|---|
| 232 |
delete $DB{$k}; $hash++; |
|---|
| 233 |
} |
|---|
| 234 |
} |
|---|
| 235 |
debuglog("Expired <$hash> Image Hashes after $Option{db_max_days} day(s)") |
|---|
| 236 |
if ($hash>0); |
|---|
| 237 |
$hash = 0; |
|---|
| 238 |
open HASH, $Option{digest_db}; |
|---|
| 239 |
while (<HASH>) { |
|---|
| 240 |
chomp; |
|---|
| 241 |
my($score,$basic,$key) = split('::',$_,3); |
|---|
| 242 |
next if (defined $DB{$key}); |
|---|
| 243 |
$dbm = $DB{$key}; |
|---|
| 244 |
$dbm->{score} = $score; |
|---|
| 245 |
$dbm->{basic} = $basic; |
|---|
| 246 |
$dbm->{input} = |
|---|
| 247 |
$dbm->{check} = time; |
|---|
| 248 |
$dbm->{match} = 1; |
|---|
| 249 |
$DB{$key} = $dbm; |
|---|
| 250 |
$hash++; |
|---|
| 251 |
} |
|---|
| 252 |
close HASH; |
|---|
| 253 |
debuglog("Imported <$hash> Image Hashes from \"$Option{digest_db}\"") |
|---|
| 254 |
if ($hash>0); |
|---|
| 255 |
$hash = scalar(keys %DB); |
|---|
| 256 |
debuglog("<$hash> Known BAD Image Hashes Available"); |
|---|
| 257 |
} |
|---|
| 258 |
untie %DB; $err = 0; |
|---|
| 259 |
tie %DB, 'MLDBM', $Option{db_safe} or $err++; |
|---|
| 260 |
if ($err) { |
|---|
| 261 |
debuglog("Could not open \"$Option{db_safe}\""); |
|---|
| 262 |
} else { |
|---|
| 263 |
my $hash = 0; |
|---|
| 264 |
foreach my $k (keys %DB) { |
|---|
| 265 |
my $db = $DB{$k}; |
|---|
| 266 |
if ($db->{check} < $now) { |
|---|
| 267 |
debuglog("Expire: <$k> Reason: $db->{check} < $now"); |
|---|
| 268 |
delete $DB{$k}; $hash++; |
|---|
| 269 |
} |
|---|
| 270 |
} |
|---|
| 271 |
debuglog("Expired <$hash> Image Hashes after $Option{db_max_days} day(s)") |
|---|
| 272 |
if ($hash>0); |
|---|
| 273 |
$hash = scalar(keys %DB); |
|---|
| 274 |
debuglog("<$hash> Known GOOD Image Hashes Available"); |
|---|
| 275 |
} |
|---|
| 276 |
untie %DB; |
|---|
| 277 |
} |
|---|
| 278 |
load_global_words( $Option{global_wordlist} ); |
|---|
| 279 |
|
|---|
| 280 |
unless (@scansets) { |
|---|
| 281 |
@scansets = ( '$gocr -i $pfile', '$gocr -l 180 -d 2 -i $pfile'); |
|---|
| 282 |
} |
|---|
| 283 |
foreach (@scansets) { |
|---|
| 284 |
debuglog("Using scan: $_"); |
|---|
| 285 |
} |
|---|
| 286 |
} |
|---|
| 287 |
|
|---|
| 288 |
sub dummy_check { |
|---|
| 289 |
return 0; |
|---|
| 290 |
} |
|---|
| 291 |
|
|---|
| 292 |
sub load_global_words { |
|---|
| 293 |
unless ( -r $_[0] ) { |
|---|
| 294 |
debuglog("Cannot read Global wordlist: \"$_[0]\"\n Please check file path and permissions are correct."); |
|---|
| 295 |
return; |
|---|
| 296 |
} |
|---|
| 297 |
my $cnt = 0; |
|---|
| 298 |
open WORDLIST, "<$_[0]"; |
|---|
| 299 |
while(my $w = <WORDLIST>) { |
|---|
| 300 |
chomp($w); |
|---|
| 301 |
$w =~ s/\s*//; |
|---|
| 302 |
$w =~ s/ |
|---|
| 303 |
next unless $w; |
|---|
| 304 |
my $wt = $Option{threshold}; |
|---|
| 305 |
if ($w =~ /^(.*?)::(0(\.\d+){0,1})/) { |
|---|
| 306 |
($w, $wt) = (lc($1), $2); |
|---|
| 307 |
$wt = $Option{threshold} unless ($wt =~ m/[\d\.]+/); |
|---|
| 308 |
} |
|---|
| 309 |
$words{$w} = $wt; $cnt++; |
|---|
| 310 |
} |
|---|
| 311 |
close WORDLIST; |
|---|
| 312 |
debuglog("Loaded <$cnt> words from \"$_[0]\""); |
|---|
| 313 |
} |
|---|
| 314 |
|
|---|
| 315 |
sub load_personal_words { |
|---|
| 316 |
unless ( -e $_[0] ) { |
|---|
| 317 |
|
|---|
| 318 |
return; |
|---|
| 319 |
} |
|---|
| 320 |
unless ( -r $_[0] ) { |
|---|
| 321 |
debuglog("Cannot read from wordlist \"$_[0]\"\n Please make sure that permissions are correct." |
|---|
| 322 |
); |
|---|
| 323 |
return; |
|---|
| 324 |
} |
|---|
| 325 |
my $cnt = 0; |
|---|
| 326 |
open WORDLIST, "<$_[0]"; |
|---|
| 327 |
while(my $w = <WORDLIST>) { |
|---|
| 328 |
chomp($w); |
|---|
| 329 |
$w =~ s/\s*//; |
|---|
| 330 |
$w =~ s/ |
|---|
| 331 |
next unless $w; |
|---|
| 332 |
my $wt = $Option{threshold}; |
|---|
| 333 |
if ($w =~ /^(.*?)::(0(\.\d+){0,1})/) { |
|---|
| 334 |
($w, $wt) = ($1, $2); |
|---|
| 335 |
$wt = $Option{threshold} unless ($wt =~ m/[\d\.]+/); |
|---|
| 336 |
} |
|---|
| 337 |
$words{$w} = $wt; $cnt++; |
|---|
| 338 |
} |
|---|
| 339 |
close WORDLIST; |
|---|
| 340 |
debuglog("Updated Word List with <$cnt> words from \"$_[0]\""); |
|---|
| 341 |
} |
|---|
| 342 |
|
|---|
| 343 |
sub max { |
|---|
| 344 |
unless ( defined( $_[0] ) and defined( $_[1] ) ) { return 0 } |
|---|
| 345 |
unless ( defined( $_[0] ) ) { return $_[1] } |
|---|
| 346 |
unless ( defined( $_[1] ) ) { return $_[0] } |
|---|
| 347 |
if ( $_[0] < $_[1] ) { return $_[1] } |
|---|
| 348 |
else { return $_[0] } |
|---|
| 349 |
} |
|---|
| 350 |
|
|---|
| 351 |
sub within_threshold { |
|---|
| 352 |
my $digest = shift; |
|---|
| 353 |
my $record = shift; |
|---|
| 354 |
|
|---|
| 355 |
my ($dimg,$dkey) = split('::',$digest); |
|---|
| 356 |
my ($rimg,$rkey) = split('::',$record); |
|---|
| 357 |
my ($ds, $dh, $dw, $dcn) = split(':',$dimg); |
|---|
| 358 |
my ($rs, $rh, $rw, $rcn) = split(':',$rimg); |
|---|
| 359 |
return(0) unless $rs; |
|---|
| 360 |
return(0) unless $rh; |
|---|
| 361 |
return(0) unless $rw; |
|---|
| 362 |
return(0) unless $rcn; |
|---|
| 363 |
return(0) unless $rkey; |
|---|
| 364 |
return(0) if ((abs($ds - $rs ) / $rs ) > $Threshold{s}); |
|---|
| 365 |
return(0) if ((abs($dh - $rh ) / $rh ) > $Threshold{h}); |
|---|
| 366 |
return(0) if ((abs($dw - $rw ) / $rw ) > $Threshold{w}); |
|---|
| 367 |
return(0) if ((abs($dcn - $rcn) / $rcn) > $Threshold{cn}); |
|---|
| 368 |
|
|---|
| 369 |
my @rcf = split('::',$rkey); |
|---|
| 370 |
my @dcf = split('::',$dkey); |
|---|
| 371 |
|
|---|
| 372 |
my (@dcfs, @rcfs); |
|---|
| 373 |
foreach (@dcf) { push @dcfs,split(':',$_); } |
|---|
| 374 |
foreach (@rcf) { push @rcfs,split(':',$_); } |
|---|
| 375 |
|
|---|
| 376 |
my $total = scalar(@rcfs); |
|---|
| 377 |
if ($total == scalar(@dcfs)) { |
|---|
| 378 |
my $match = 0; |
|---|
| 379 |
foreach (0 .. ($total-1)) { |
|---|
| 380 |
$match++ if (abs($dcfs[$_] - $rcfs[$_]) <= $Threshold{c}); |
|---|
| 381 |
} |
|---|
| 382 |
debuglog("image matched <$match> of <$total> colors"); |
|---|
| 383 |
return(1) if ($match == $total); |
|---|
| 384 |
} |
|---|
| 385 |
return(0); |
|---|
| 386 |
} |
|---|
| 387 |
|
|---|
| 388 |
sub fmt_time { |
|---|
| 389 |
my $when = time - $_[0]; |
|---|
| 390 |
my $ret; |
|---|
| 391 |
|
|---|
| 392 |
if ($when>86400) { |
|---|
| 393 |
my $d = int($when/86400); |
|---|
| 394 |
$when -= $d*86400; |
|---|
| 395 |
$ret = "$d days,"; |
|---|
| 396 |
} |
|---|
| 397 |
if ($when>3600) { |
|---|
| 398 |
my $h = int($when/3600); |
|---|
| 399 |
$when -= $h*3600; |
|---|
| 400 |
$ret .= " $h hours"; |
|---|
| 401 |
} |
|---|
| 402 |
if ($when>60) { |
|---|
| 403 |
my $m = int($when/60); |
|---|
| 404 |
$when -= $m*60; |
|---|
| 405 |
$ret .= " $m minutes"; |
|---|
| 406 |
} |
|---|
| 407 |
if ($when>0) { |
|---|
| 408 |
$ret .= " $when seconds"; |
|---|
| 409 |
} |
|---|
| 410 |
$ret .= " ago"; |
|---|
| 411 |
return $ret; |
|---|
| 412 |
} |
|---|
| 413 |
|
|---|
| 414 |
sub check_image_hash_db { |
|---|
| 415 |
my $digest = $_[0]; |
|---|
| 416 |
my $dbfile = $_[1] || $Option{db_hash}; |
|---|
| 417 |
my $fname = $_[2]; |
|---|
| 418 |
my $ctype = $_[3]; |
|---|
| 419 |
my ($img, $key) = split('::', $digest,2); |
|---|
| 420 |
my $hash = $digest; |
|---|
| 421 |
my $ret = 0; my $txt = 'Exact'; |
|---|
| 422 |
my $dinfo; |
|---|
| 423 |
my %DB = (); my $dbm; my $new = $key; |
|---|
| 424 |
|
|---|
| 425 |
if ($Option{enable_image_hashing} == 2) { |
|---|
| 426 |
tie %DB, 'MLDBM', $dbfile, O_RDWR or $ret++; |
|---|
| 427 |
if ($ret>0) { |
|---|
| 428 |
debuglog("No Image Hash database found at \"$dbfile\", or permissions wrong."); |
|---|
| 429 |
return (0,''); |
|---|
| 430 |
} |
|---|
| 431 |
if (defined $DB{$key}) { |
|---|
| 432 |
$dbm = $DB{$key}; |
|---|
| 433 |
if ($img eq $dbm->{basic}) { |
|---|
| 434 |
$ret = $dbm->{score} || 0.001; |
|---|
| 435 |
$dinfo = $dbm->{dinfo} || ''; |
|---|
| 436 |
$dbm->{fname} = $fname; |
|---|
| 437 |
$dbm->{ctype} = $ctype; |
|---|
| 438 |
debuglog("Updating $txt info File:'$fname' Type:'$ctype'"); |
|---|
| 439 |
$DB{$key} = $dbm; |
|---|
| 440 |
} |
|---|
| 441 |
} |
|---|
| 442 |
if ($ret == 0) { |
|---|
| 443 |
my $now = time - ($Option{db_max_days}*86400); |
|---|
| 444 |
foreach my $k (keys %DB) { |
|---|
| 445 |
$dbm = $DB{$k}; |
|---|
| 446 |
$hash = $dbm->{basic} ? $dbm->{basic} : '0:0:0:0' . '::' . $k; |
|---|
| 447 |
if (within_threshold($digest,$hash)) { |
|---|
| 448 |
$ret = $dbfile eq $Option{db_hash} ? $dbm->{score} : $dbm->{match}; |
|---|
| 449 |
$txt = 'Approx'; $new = $k; $dinfo = $dbm->{dinfo} || ''; |
|---|
| 450 |
debuglog("Found in: <$dbfile>"); |
|---|
| 451 |
last; |
|---|
| 452 |
} |
|---|
| 453 |
|
|---|
| 454 |
$dbm->{check} = $now - 1 unless defined $dbm->{check}; |
|---|
| 455 |
if ($dbm->{check} < $now) { |
|---|
| 456 |
debuglog("Expiring <$k> older than $Option{db_max_days} days"); |
|---|
| 457 |
delete $DB{$k}; |
|---|
| 458 |
} |
|---|
| 459 |
} |
|---|
| 460 |
} |
|---|
| 461 |
if ($ret>0) { |
|---|
| 462 |
$dbm->{match}++; |
|---|
| 463 |
if ($dbfile eq $Option{db_hash}) { |
|---|
| 464 |
$ret = sprintf("%0.3f",$dbm->{score}); |
|---|
| 465 |
debuglog("Found Score <$ret> for $txt Image Hash"); |
|---|
| 466 |
} |
|---|
| 467 |
debuglog("Matched [$dbm->{match}] time(s). Prev match: ".fmt_time($dbm->{check})); |
|---|
| 468 |
$dbm->{check} = time; |
|---|
| 469 |
$DB{$new} = $dbm; |
|---|
| 470 |
} |
|---|
| 471 |
untie %DB; |
|---|
| 472 |
return ($ret,$dinfo); |
|---|
| 473 |
} elsif ($Option{enable_image_hashing} == 1) { |
|---|
| 474 |
$ret = open HASH, $Option{digest_db}; |
|---|
| 475 |
unless($ret) { |
|---|
| 476 |
debuglog("No Image Hash database found at \"$Option{digest_db}\", or permissions wrong."); |
|---|
| 477 |
return (0,''); |
|---|
| 478 |
} |
|---|
| 479 |
while (<HASH>) { |
|---|
| 480 |
chomp; |
|---|
| 481 |
($ret,$hash) = split('::',$_,2); |
|---|
| 482 |
if (within_threshold($digest,$hash)) { |
|---|
| 483 |
debuglog("Found Score <$ret> for Hash <$hash>"); |
|---|
| 484 |
return ($ret,''); |
|---|
| 485 |
} |
|---|
| 486 |
} |
|---|
| 487 |
close HASH; |
|---|
| 488 |
return (0,''); |
|---|
| 489 |
} |
|---|
| 490 |
} |
|---|
| 491 |
|
|---|
| 492 |
sub add_image_hash_db { |
|---|
| 493 |
my $digest = $_[0]; |
|---|
| 494 |
my $score = $_[1]; |
|---|
| 495 |
my $ret = 0; |
|---|
| 496 |
|
|---|
| 497 |
if ($Option{enable_image_hashing} == 2) { |
|---|
| 498 |
my $dbfile = $_[2] || $Option{db_hash}; |
|---|
| 499 |
my %DB = (); |
|---|
| 500 |
tie %DB, 'MLDBM', $dbfile or $ret++; |
|---|
| 501 |
if ($ret>0) { |
|---|
| 502 |
debuglog("Unable to open/create Image Hash database at \"$dbfile\", check permissions."); |
|---|
| 503 |
return; |
|---|
| 504 |
} |
|---|
| 505 |
debuglog("Adding Hash to \"$dbfile\""); |
|---|
| 506 |
|
|---|
| 507 |
my ($img,$key) = split('::',$digest,2); |
|---|
| 508 |
my $dbm = $DB{$key}; |
|---|
| 509 |
$dbm->{fname} = $_[3]; |
|---|
| 510 |
$dbm->{ctype} = $_[4]; |
|---|
| 511 |
$dbm->{dinfo} = $_[5]; |
|---|
| 512 |
$dbm->{basic} = $img; |
|---|
| 513 |
$dbm->{score} = $score; |
|---|
| 514 |
$dbm->{input} = |
|---|
| 515 |
$dbm->{check} = time; |
|---|
| 516 |
$dbm->{match} = $dbfile eq $Option{db_hash} ? 0 : 1; |
|---|
| 517 |
$DB{$key} = $dbm; |
|---|
| 518 |
untie %DB; |
|---|
| 519 |
} elsif ($Option{enable_image_hashing} == 1) { |
|---|
| 520 |
if (-e $Option{digest_db}) { |
|---|
| 521 |
$ret = open DB, ">>$Option{digest_db}"; |
|---|
| 522 |
} else { |
|---|
| 523 |
$ret = open DB, ">$Option{digest_db}"; |
|---|
| 524 |
} |
|---|
| 525 |
unless ($ret) { |
|---|
| 526 |
debuglog("Unable to open/create Image Hash database at \"$Option{digest_db}\", check permissions."); |
|---|
| 527 |
return; |
|---|
| 528 |
} |
|---|
| 529 |
debuglog("Adding Hash to \"$Option{digest_db}\""); |
|---|
| 530 |
flock( DB, LOCK_EX ); |
|---|
| 531 |
seek( DB, 0, 2 ); |
|---|
| 532 |
print DB "${score}::${digest}\n"; |
|---|
| 533 |
flock( DB, LOCK_UN ); |
|---|
| 534 |
close(DB); |
|---|
| 535 |
} |
|---|
| 536 |
debuglog("Digest: $digest"); |
|---|
| 537 |
} |
|---|
| 538 |
|
|---|
| 539 |
sub calc_image_hash { |
|---|
| 540 |
my $pfile = $_[0]; |
|---|
| 541 |
my ($rcode, $hash); |
|---|
| 542 |
|
|---|
| 543 |
foreach my $a (qw/ppmhist/) { |
|---|
| 544 |
unless (defined $App{$a}) { |
|---|
| 545 |
info("FuzzyOcr: calc_image_hash cannot exec $a"); |
|---|
| 546 |
return (1, ''); |
|---|
| 547 |
} |
|---|
| 548 |
} |
|---|
| 549 |
|
|---|
| 550 |
my $img = new Image::Magick; |
|---|
| 551 |
my ($w,$h,$s,$t) = $img->ping($pfile); |
|---|
| 552 |
|
|---|
| 553 |
$t = Mail::SpamAssassin::Timeout->new({ secs => $Option{timeout} }); |
|---|
| 554 |
my @stdout_data; |
|---|
| 555 |
$rcode = $t->run_and_catch(sub { |
|---|
| 556 |
@stdout_data = qx($App{ppmhist} -noheader $pfile 2>/dev/null); |
|---|
| 557 |
}); |
|---|
| 558 |
if ($rcode) { |
|---|
| 559 |
chomp $rcode; |
|---|
| 560 |
debuglog("$App{ppmhist}: Timed out [$rcode], skipping..."); |
|---|
| 561 |
return (1, ''); |
|---|
| 562 |
} |
|---|
| 563 |
my $cnt = 0; |
|---|
| 564 |
my $c = scalar(@stdout_data); |
|---|
| 565 |
$hash = sprintf "%d:%d:%d:%d",$s,$h,$w,$c; |
|---|
| 566 |
if ($Threshold{max_hash}) { |
|---|
| 567 |
foreach (@stdout_data) { |
|---|
| 568 |
$_ =~ s/ +/ /g; |
|---|
| 569 |
my(@d) = split(' ', $_); |
|---|
| 570 |
$hash .= sprintf("::%d:%d:%d:%d:%d",@d); |
|---|
| 571 |
if ($cnt++ ge $Threshold{max_hash}) { |
|---|
| 572 |
last; |
|---|
| 573 |
} |
|---|
| 574 |
} |
|---|
| 575 |
} |
|---|
| 576 |
debuglog("Got: <$hash>"); |
|---|
| 577 |
return(0, $hash); |
|---|
| 578 |
} |
|---|
| 579 |
|
|---|
| 580 |
sub debuglog { |
|---|
| 581 |
my @lines = split('\n',$_[0]); |
|---|
| 582 |
my $limit = $_[1] || 1; |
|---|
| 583 |
if ( $Option{verbose} > $limit ) { |
|---|
| 584 |
foreach (@lines) { info("FuzzyOcr: $_"); } |
|---|
| 585 |
} else { |
|---|
| 586 |
foreach (@lines) { dbg ("FuzzyOcr: $_"); } |
|---|
| 587 |
} |
|---|
| 588 |
} |
|---|
| 589 |
|
|---|
| 590 |
sub wrong_ctype { |
|---|
| 591 |
my ( $format, $ctype ) = @_; |
|---|
| 592 |
if ($Score{wrongctype}) { |
|---|
| 593 |
my $debuginfo = ""; |
|---|
| 594 |
if ( $Option{verbose} > 0 ) { |
|---|
| 595 |
$debuginfo = |
|---|
| 596 |
("Image has format \"$format\" but content-type is \"$ctype\""); |
|---|
| 597 |
debuglog($debuginfo); |
|---|
| 598 |
} |
|---|
| 599 |
for my $set ( 0 .. 3 ) { |
|---|
| 600 |
$pms->{conf}->{scoreset}->[$set]->{"FUZZY_OCR_WRONG_CTYPE"} = |
|---|
| 601 |
sprintf( "%0.3f", $Score{wrongctype} ); |
|---|
| 602 |
} |
|---|
| 603 |
$pms->_handle_hit( "FUZZY_OCR_WRONG_CTYPE", $Score{wrongctype}, "BODY: ", |
|---|
| 604 |
$pms->{conf}->{descriptions}->{FUZZY_OCR_WRONG_CTYPE} . "\n$debuginfo" ); |
|---|
| 605 |
} |
|---|
| 606 |
} |
|---|
| 607 |
|
|---|
| 608 |
sub corrupt_img { |
|---|
| 609 |
my ($score, $err) = @_; |
|---|
| 610 |
if ($score>0) { |
|---|
| 611 |
my $debuginfo = ""; |
|---|
| 612 |
if ( $Option{verbose} > 0 ) { |
|---|
| 613 |
chomp($err); |
|---|
| 614 |
$debuginfo = ("Corrupt image: $err"); |
|---|
| 615 |
debuglog($debuginfo); |
|---|
| 616 |
} |
|---|
| 617 |
for my $set ( 0 .. 3 ) { |
|---|
| 618 |
$pms->{conf}->{scoreset}->[$set]->{"FUZZY_OCR_CORRUPT_IMG"} = |
|---|
| 619 |
sprintf( "%0.3f", $score ); |
|---|
| 620 |
} |
|---|
| 621 |
$pms->_handle_hit( "FUZZY_OCR_CORRUPT_IMG", $score, "BODY: ", |
|---|
| 622 |
$pms->{conf}->{descriptions}->{FUZZY_OCR_CORRUPT_IMG} . "\n$debuginfo" ); |
|---|
| 623 |
} |
|---|
| 624 |
} |
|---|
| 625 |
|
|---|
| 626 |
sub known_img_hash { |
|---|
| 627 |
my $score = $_[0] || $Score{base}; |
|---|
| 628 |
my $dinfo = $_[1] ? "\n$_[1]" : ''; |
|---|
| 629 |
for my $set ( 0 .. 3 ) { |
|---|
| 630 |
$pms->{conf}->{scoreset}->[$set]->{"FUZZY_OCR_KNOWN_HASH"} = |
|---|
| 631 |
sprintf( "%0.3f", $score ); |
|---|
| 632 |
} |
|---|
| 633 |
$pms->_handle_hit( "FUZZY_OCR_KNOWN_HASH", $score, "BODY: ", |
|---|
| 634 |
$pms->{conf}->{descriptions}->{FUZZY_OCR_KNOWN_HASH} . $dinfo ); |
|---|
| 635 |
} |
|---|
| 636 |
|
|---|
| 637 |
sub removedir { |
|---|
| 638 |
my $dir = $_[0]; |
|---|
| 639 |
return unless -d $dir; |
|---|
| 640 |
opendir D, $dir; |
|---|
| 641 |
my @files = readdir D; |
|---|
| 642 |
closedir D; |
|---|
| 643 |
foreach my $f (@files) { |
|---|
| 644 |
next if $f eq '.'; |
|---|
| 645 |
next if $f eq '..'; |
|---|
| 646 |
my $ff = Mail::SpamAssassin::Util::untaint_file_path("$dir/$f"); |
|---|
| 647 |
unless (unlink $ff) { |
|---|
| 648 |
debuglog("Cannot remove: $ff"); |
|---|
| 649 |
} |
|---|
| 650 |
} |
|---|
| 651 |
debuglog("Remove DIR: $dir"); |
|---|
| 652 |
unless(rmdir $dir) { |
|---|
| 653 |
debuglog("Cannot remove DIR: $dir"); |
|---|
| 654 |
} |
|---|
| 655 |
} |
|---|
| 656 |
|
|---|
| 657 |
sub fuzzyocr_check { |
|---|
| 658 |
( $self, $pms ) = @_; |
|---|
| 659 |
if ( $pms->get_score() > $Score{autodisable} ) { |
|---|
| 660 |
debuglog("Scan canceled, message has already more than $Score{autodisable} points."); |
|---|
| 661 |
return 0; |
|---|
| 662 |
} |
|---|
| 663 |
|
|---|
| 664 |
my $imgdir; |
|---|
| 665 |
my %imgfiles = (); |
|---|
| 666 |
my @found = (); |
|---|
| 667 |
my @hashes = (); |
|---|
| 668 |
my $cnt = 0; |
|---|
| 669 |
my $imgerr = 0; |
|---|
| 670 |
|
|---|
| 671 |
|
|---|
| 672 |
|
|---|
| 673 |
if ($Option{personal_wordlist} =~ m/^\//) { |
|---|
| 674 |
load_personal_words( $Option{personal_wordlist} ); |
|---|
| 675 |
} else { |
|---|
| 676 |
my $homedir = (getpwuid($<))[7]; |
|---|
| 677 |
if ($homedir) { |
|---|
| 678 |
load_personal_words( $homedir . "/$Option{personal_wordlist}" ); |
|---|
| 679 |
} elsif (defined($ENV{HOME})) { |
|---|
| 680 |
load_personal_words( $ENV{HOME} . "/$Option{personal_wordlist}" ); |
|---|
| 681 |
} else { |
|---|
| 682 |
debuglog("Variable \$ENV{HOME} not defined and getpwuid failed, personal wordlist function not available..."); |
|---|
| 683 |
} |
|---|
| 684 |
} |
|---|
| 685 |
|
|---|
| 686 |
foreach my $p ( |
|---|
| 687 |
$pms->{msg}->find_parts(qr(^image\b)i), |
|---|
| 688 |
$pms->{msg}->find_parts(qr(Application/Octet-Stream)i) |
|---|
| 689 |
) { |
|---|
| 690 |
my $ctype = $p->{'type'}; |
|---|
| 691 |
my $fname = $p->{'name'} || 'unknown'; |
|---|
| 692 |
if (($fname eq 'unknown') and |
|---|
| 693 |
(defined $p->{'headers'}->{'content-id'}) |
|---|
| 694 |
){ |
|---|
| 695 |
$fname = join('',@{$p->{'headers'}->{'content-id'}}); |
|---|
| 696 |
$fname =~ s/[<>]//g; |
|---|
| 697 |
$fname =~ tr/\@\$\%\&/_/s; |
|---|
| 698 |
} |
|---|
| 699 |
my $test = 0; |
|---|
| 700 |
$test++ if ($ctype =~ /image/i); |
|---|
| 701 |
$test++ if ($fname =~ /(gif|jpg|jpeg|png|bmp|tiff)$/i); |
|---|
| 702 |
|
|---|
| 703 |
if ($test == 0) { |
|---|
| 704 |
debuglog("Skipping file with content-type=\"$ctype\" name=\"$fname\""); |
|---|
| 705 |
next; |
|---|
| 706 |
} |
|---|
| 707 |
|
|---|
| 708 |
$imgdir = Mail::SpamAssassin::Util::secure_tmpdir() unless ($imgdir); |
|---|
| 709 |
unless ($imgdir) { |
|---|
| 710 |
debuglog("Scan canceled, cannot create Image TMPDIR."); |
|---|
| 711 |
return 0; |
|---|
| 712 |
} |
|---|
| 713 |
|
|---|
| 714 |
my $imgfilename = $imgdir . "/raw.eml"; |
|---|
| 715 |
unless (-e $imgfilename) { |
|---|
| 716 |
if (open RAW, ">$imgfilename") { |
|---|
| 717 |
print RAW $pms->{msg}->get_pristine(); |
|---|
| 718 |
close RAW; |
|---|
| 719 |
debuglog("Saved: $imgfilename"); |
|---|
| 720 |
} |
|---|
| 721 |
} |
|---|
| 722 |
|
|---|
| 723 |
$fname =~ tr{a-zA-Z0-9\.}{_}cs; |
|---|
| 724 |
$imgfilename = Mail::SpamAssassin::Util::untaint_file_path( |
|---|
| 725 |
$imgdir . "/" . $fname |
|---|
| 726 |
); |
|---|
| 727 |
my $unique = 0; |
|---|
| 728 |
while (-e $imgfilename) { |
|---|
| 729 |
$imgfilename = Mail::SpamAssassin::Util::untaint_file_path( |
|---|
| 730 |
$imgdir . "/" . chr(65+$unique) . "." . $fname |
|---|
| 731 |
); |
|---|
| 732 |
$unique++; |
|---|
| 733 |
} |
|---|
| 734 |
|
|---|
| 735 |
unless (open PICT, ">$imgfilename") { |
|---|
| 736 |
debuglog("Cannot write \"$imgfilename\", skipping..."); |
|---|
| 737 |
next; |
|---|
| 738 |
} |
|---|
| 739 |
my $pdata = $p->decode(); |
|---|
| 740 |
binmode PICT; |
|---|
| 741 |
print PICT $pdata; |
|---|
| 742 |
close PICT; |
|---|
| 743 |
debuglog("Wrote: $imgfilename"); $cnt++; |
|---|
| 744 |
$imgfiles{$imgfilename}{header} = substr($pdata,0,6); |
|---|
| 745 |
$imgfiles{$imgfilename}{ctype} = $ctype; |
|---|
| 746 |
$imgfiles{$imgfilename}{fname} = $fname; |
|---|
| 747 |
} |
|---|
| 748 |
|
|---|
| 749 |
if ($cnt == 0) { |
|---|
| 750 |
|
|---|
| 751 |
removedir($imgdir) if (defined($imgdir) and ($Option{keep_bad_images}<2)); |
|---|
| 752 |
return 0; |
|---|
| 753 |
} |
|---|
| 754 |
debuglog("Found: $cnt images"); $cnt = 0; |
|---|
| 755 |
my $t = Mail::SpamAssassin::Timeout->new({ secs => $Option{timeout} }); |
|---|
| 756 |
my $retcode; |
|---|
| 757 |
|
|---|
| 758 |
my $haserr = open RAWERR, ">$imgdir/raw.err"; |
|---|
| 759 |
debuglog("Errors to: $imgdir/raw.err") if ($haserr>0); |
|---|
| 760 |
|
|---|
| 761 |
IMAGE: |
|---|
| 762 |
foreach my $file (keys %imgfiles) { |
|---|
| 763 |
my $pic = $imgfiles{$file}; |
|---|
| 764 |
debuglog("Analyzing file with content-type=\"$$pic{ctype}\""); |
|---|
| 765 |
my @used_scansets = (); |
|---|
| 766 |
my $corrupt = 0; |
|---|
| 767 |
my $digest; |
|---|
| 768 |
my $ptype = 0; |
|---|
| 769 |
my $tfile = $file; |
|---|
| 770 |
my $pfile = $file . ".pnm"; |
|---|
| 771 |
my $efile = $file . ".err"; |
|---|
| 772 |
debuglog("pfile => $pfile"); |
|---|
| 773 |
debuglog("efile => $efile"); |
|---|
| 774 |
|
|---|
| 775 |
if ( substr($$pic{header},0,3) eq "\x47\x49\x46" ) { |
|---|
| 776 |
debuglog("Found GIF header name=\"$$pic{fname}\""); |
|---|