| 1 |
|
|---|
| 2 |
|
|---|
| 3 |
|
|---|
| 4 |
|
|---|
| 5 |
|
|---|
| 6 |
package FuzzyOcr; |
|---|
| 7 |
|
|---|
| 8 |
use strict; |
|---|
| 9 |
use warnings; |
|---|
| 10 |
use Mail::SpamAssassin; |
|---|
| 11 |
use Mail::SpamAssassin::Logger; |
|---|
| 12 |
use Mail::SpamAssassin::Util; |
|---|
| 13 |
use Mail::SpamAssassin::Timeout; |
|---|
| 14 |
use Mail::SpamAssassin::Plugin; |
|---|
| 15 |
|
|---|
| 16 |
use Time::HiRes qw( gettimeofday tv_interval ); |
|---|
| 17 |
use String::Approx 'adistr'; |
|---|
| 18 |
use FileHandle; |
|---|
| 19 |
use Fcntl ':flock'; |
|---|
| 20 |
use POSIX; |
|---|
| 21 |
|
|---|
| 22 |
use lib qw(/etc/mail/spamassassin); |
|---|
| 23 |
|
|---|
| 24 |
use FuzzyOcr::Logging qw(debuglog errorlog warnlog infolog); |
|---|
| 25 |
use FuzzyOcr::Config qw(kill_pid |
|---|
| 26 |
get_tmpdir |
|---|
| 27 |
set_tmpdir |
|---|
| 28 |
get_all_tmpdirs |
|---|
| 29 |
get_pms |
|---|
| 30 |
save_pms |
|---|
| 31 |
get_timeout |
|---|
| 32 |
get_mysql_ddb |
|---|
| 33 |
get_scansets |
|---|
| 34 |
get_wordlist |
|---|
| 35 |
set_config |
|---|
| 36 |
get_config |
|---|
| 37 |
parse_config |
|---|
| 38 |
finish_parsing_end |
|---|
| 39 |
read_words); |
|---|
| 40 |
use FuzzyOcr::Hashing qw(check_image_hash_db add_image_hash_db calc_image_hash); |
|---|
| 41 |
use FuzzyOcr::Deanimate qw(deanimate); |
|---|
| 42 |
use FuzzyOcr::Scoring qw(wrong_ctype wrong_extension corrupt_img known_img_hash); |
|---|
| 43 |
use FuzzyOcr::Misc qw(max removedir removedirs save_execute); |
|---|
| 44 |
|
|---|
| 45 |
our @ISA = qw(Mail::SpamAssassin::Plugin); |
|---|
| 46 |
|
|---|
| 47 |
|
|---|
| 48 |
sub new { |
|---|
| 49 |
my ( $class, $mailsa ) = @_; |
|---|
| 50 |
$class = ref($class) || $class; |
|---|
| 51 |
my $self = $class->SUPER::new($mailsa); |
|---|
| 52 |
bless( $self, $class ); |
|---|
| 53 |
$self->register_eval_rule("fuzzyocr_check"); |
|---|
| 54 |
$self->register_eval_rule("dummy_check"); |
|---|
| 55 |
$self->set_config($mailsa->{conf}); |
|---|
| 56 |
return $self; |
|---|
| 57 |
} |
|---|
| 58 |
|
|---|
| 59 |
sub dummy_check { |
|---|
| 60 |
return 0; |
|---|
| 61 |
} |
|---|
| 62 |
|
|---|
| 63 |
sub fuzzyocr_check { |
|---|
| 64 |
my ( $self, $pms ) = @_; |
|---|
| 65 |
my $conf = get_config(); |
|---|
| 66 |
|
|---|
| 67 |
save_pms($pms); |
|---|
| 68 |
|
|---|
| 69 |
my $end; |
|---|
| 70 |
my $begin = [gettimeofday]; |
|---|
| 71 |
if ($conf->{focr_global_timeout}) { |
|---|
| 72 |
my $t = get_timeout(); |
|---|
| 73 |
debuglog("Global Timeout set at ".$conf->{focr_timeout}." sec."); |
|---|
| 74 |
$t->run(sub { |
|---|
| 75 |
$end = fuzzyocr_do( $self, $conf, $pms ); |
|---|
| 76 |
}); |
|---|
| 77 |
if ($t->timed_out()) { |
|---|
| 78 |
infolog("Scan timed out after $conf->{focr_timeout} seconds."); |
|---|
| 79 |
infolog("Killing possibly running pid..."); |
|---|
| 80 |
my ($ret, $pid) = kill_pid(); |
|---|
| 81 |
if ($ret > 0) { |
|---|
| 82 |
infolog("Successfully killed PID $pid"); |
|---|
| 83 |
} elsif ($ret < 0) { |
|---|
| 84 |
infolog("No processes left... exiting"); |
|---|
| 85 |
} else { |
|---|
| 86 |
infolog("Failed to kill PID $pid, stale process!"); |
|---|
| 87 |
} |
|---|
| 88 |
infolog("Removing possibly leftover tempdirs..."); |
|---|
| 89 |
removedirs(get_all_tmpdirs()); |
|---|
| 90 |
return 0; |
|---|
| 91 |
} |
|---|
| 92 |
} else { |
|---|
| 93 |
$end = fuzzyocr_do( $self, $conf, $pms ); |
|---|
| 94 |
} |
|---|
| 95 |
debuglog("Processed in ". |
|---|
| 96 |
sprintf("%.6f",tv_interval($begin, [gettimeofday])) |
|---|
| 97 |
." sec."); |
|---|
| 98 |
return $end; |
|---|
| 99 |
} |
|---|
| 100 |
|
|---|
| 101 |
sub fuzzyocr_do { |
|---|
| 102 |
my ( $self, $conf, $pms ) = @_; |
|---|
| 103 |
|
|---|
| 104 |
my $internal_score = 0; |
|---|
| 105 |
my $current_score = $pms->get_score(); |
|---|
| 106 |
my $score = $conf->{focr_autodisable_score} || 100; |
|---|
| 107 |
|
|---|
| 108 |
if ( $current_score > $score ) { |
|---|
| 109 |
infolog("Scan canceled, message has already more than $score points ($current_score)."); |
|---|
| 110 |
return 0; |
|---|
| 111 |
} |
|---|
| 112 |
|
|---|
| 113 |
my $nscore = $conf->{focr_autodiable_negative_score} || -100; |
|---|
| 114 |
if ( $current_score < $nscore ) { |
|---|
| 115 |
infolog("Scan canceled, message has less than $nscore points ($current_score)."); |
|---|
| 116 |
return 0; |
|---|
| 117 |
} |
|---|
| 118 |
|
|---|
| 119 |
my $imgdir; |
|---|
| 120 |
my %imgfiles = (); |
|---|
| 121 |
my @found = (); |
|---|
| 122 |
my @hashes = (); |
|---|
| 123 |
my $cnt = 0; |
|---|
| 124 |
my $imgerr = 0; |
|---|
| 125 |
my $main = $self->{main}; |
|---|
| 126 |
|
|---|
| 127 |
my $from = $pms->get('From') ? $pms->get('From') : "<no sender>"; |
|---|
| 128 |
my $to = $pms->get('To') ? $pms->get('To') : "<no receipients>"; |
|---|
| 129 |
my $msgid = $pms->get('Message-Id') ? $pms->get('Message-Id') : "<no messageid>"; |
|---|
| 130 |
|
|---|
| 131 |
chomp($from, $to, $msgid); |
|---|
| 132 |
|
|---|
| 133 |
debuglog("Starting FuzzyOcr..."); |
|---|
| 134 |
infolog("Processing Message with ID \"$msgid\" ($from -> $to)"); |
|---|
| 135 |
foreach my $p ( |
|---|
| 136 |
$pms->{msg}->find_parts(qr(^image\b)i), |
|---|
| 137 |
$pms->{msg}->find_parts(qr(Application/Octet-Stream)i) |
|---|
| 138 |
) { |
|---|
| 139 |
my $ctype = $p->{'type'}; |
|---|
| 140 |
my $fname = $p->{'name'} || 'unknown'; |
|---|
| 141 |
if (($fname eq 'unknown') and |
|---|
| 142 |
(defined $p->{'headers'}->{'content-id'}) |
|---|
| 143 |
){ |
|---|
| 144 |
$fname = join('',@{$p->{'headers'}->{'content-id'}}); |
|---|
| 145 |
$fname =~ s/[<>]//g; |
|---|
| 146 |
$fname =~ tr/\@\$\%\&/_/s; |
|---|
| 147 |
} |
|---|
| 148 |
|
|---|
| 149 |
my $filename = $fname; $filename =~ tr{a-zA-Z0-9\.}{_}cs; |
|---|
| 150 |
my $pdata = $p->decode(); |
|---|
| 151 |
my $pdatalen = length($pdata); |
|---|
| 152 |
my $w = 0; my $h = 0; |
|---|
| 153 |
|
|---|
| 154 |
my $blah = substr($pdata,0,3); |
|---|
| 155 |
|
|---|
| 156 |
if ( substr($pdata,0,3) eq "\x47\x49\x46" ) { |
|---|
| 157 |
|
|---|
| 158 |
$imgfiles{$filename}{ftype} = 1; |
|---|
| 159 |
($w,$h) = unpack("vv",substr($pdata,6,4)); |
|---|
| 160 |
infolog("GIF: [${h}x${w}] $filename"); |
|---|
| 161 |
$imgfiles{$filename}{width} = $w; |
|---|
| 162 |
$imgfiles{$filename}{height} = $h; |
|---|
| 163 |
} elsif ( substr($pdata,0,2) eq "\xff\xd8" ) { |
|---|
| 164 |
|
|---|
| 165 |
my @Markers = (0xC0,0xC1,0xC2,0xC3,0xC5,0xC6,0xC7,0xC9,0xCA,0xCB,0xCD,0xCE,0xCF); |
|---|
| 166 |
my $pos = 2; |
|---|
| 167 |
while ($pos < $pdatalen) { |
|---|
| 168 |
my ($b,$m) = unpack("CC",substr($pdata,$pos,2)); $pos += 2; |
|---|
| 169 |
if ($b != 0xff) { |
|---|
| 170 |
infolog("Invalid JPEG image"); |
|---|
| 171 |
$pos = $pdatalen + 1; |
|---|
| 172 |
last; |
|---|
| 173 |
} |
|---|
| 174 |
my $skip = 0; |
|---|
| 175 |
foreach my $mm (@Markers) { |
|---|
| 176 |
if ($mm == $m) { |
|---|
| 177 |
$skip++; last; |
|---|
| 178 |
} |
|---|
| 179 |
} |
|---|
| 180 |
last if ($skip); |
|---|
| 181 |
$pos += unpack("n",substr($pdata,$pos,2)); |
|---|
| 182 |
} |
|---|
| 183 |
if ($pos > $pdatalen) { |
|---|
| 184 |
errorlog("Cannot find image dimensions"); |
|---|
| 185 |
} else { |
|---|
| 186 |
($h,$w) = unpack("nn",substr($pdata,$pos+3,4)); |
|---|
| 187 |
infolog("JPEG: [${h}x${w}] $filename"); |
|---|
| 188 |
$imgfiles{$filename}{ftype} = 2; |
|---|
| 189 |
$imgfiles{$filename}{height} = $h; |
|---|
| 190 |
$imgfiles{$filename}{width} = $w; |
|---|
| 191 |
} |
|---|
| 192 |
} elsif ( substr($pdata,0,4) eq "\x89\x50\x4e\x47" ) { |
|---|
| 193 |
|
|---|
| 194 |
($w,$h) = unpack("NN",substr($pdata,16,8)); |
|---|
| 195 |
$imgfiles{$filename}{ftype} = 3; |
|---|
| 196 |
$imgfiles{$filename}{width} = $w; |
|---|
| 197 |
$imgfiles{$filename}{height} = $h; |
|---|
| 198 |
infolog("PNG: [${h}x${w}] $filename"); |
|---|
| 199 |
} elsif ( substr($pdata,0,2) eq "BM" ) { |
|---|
| 200 |
|
|---|
| 201 |
($w,$h) = unpack("NN",substr($pdata,18,8)); |
|---|
| 202 |
$imgfiles{$filename}{ftype} = 4; |
|---|
| 203 |
$imgfiles{$filename}{width} = $w; |
|---|
| 204 |
$imgfiles{$filename}{height} = $h; |
|---|
| 205 |
infolog("BMP: [${h}x${w}] $filename"); |
|---|
| 206 |
} elsif ( |
|---|
| 207 |
|
|---|
| 208 |
(substr($pdata,0,4) eq "\x4d\x4d\x00\x2a") or |
|---|
| 209 |
(substr($pdata,0,4) eq "\x49\x49\x2a\x00") |
|---|
| 210 |
) { |
|---|
| 211 |
my $worder = (substr($pdata,0,2) eq "\x4d\x4d") ? 0 : 1; |
|---|
| 212 |
my $offset = unpack($worder?"V":"N",substr($pdata,4,4)); |
|---|
| 213 |
my $number = unpack($worder?"v":"n",substr($pdata,$offset,2)) - 1; |
|---|
| 214 |
foreach my $n (0 .. $number) { |
|---|
| 215 |
my $add = 2 + ($n * 12); |
|---|
| 216 |
my ($id,$tag,$cnt,$val) = unpack($worder?"vvVV":"nnNN",substr($pdata,$offset+$add,12)); |
|---|
| 217 |
$h = $val if ($id == 256); |
|---|
| 218 |
$w = $val if ($id == 257); |
|---|
| 219 |
last if ($h != 0 and $w != 0); |
|---|
| 220 |
} |
|---|
| 221 |
infolog("TIFF: [${h}x${w}] $filename ($worder)"); |
|---|
| 222 |
infolog("Cannot determine size of TIFF image, setting to '1x1'") if ($h == 0 and $w == 0); |
|---|
| 223 |
$imgfiles{$filename}{ftype} = 5; |
|---|
| 224 |
$imgfiles{$filename}{width} = $w ? $w : 1; |
|---|
| 225 |
$imgfiles{$filename}{height} = $h ? $h : 1; |
|---|
| 226 |
} |
|---|
| 227 |
|
|---|
| 228 |
|
|---|
| 229 |
unless (defined $imgfiles{$filename}{ftype}) { |
|---|
| 230 |
infolog("Skipping file with content-type=\"$ctype\" name=\"$fname\""); |
|---|
| 231 |
delete $imgfiles{$filename}; |
|---|
| 232 |
next; |
|---|
| 233 |
|
|---|
| 234 |
} |
|---|
| 235 |
|
|---|
| 236 |
|
|---|
| 237 |
$imgdir = Mail::SpamAssassin::Util::secure_tmpdir(); |
|---|
| 238 |
unless ($imgdir) { |
|---|
| 239 |
errorlog("Scan canceled, cannot create Image TMPDIR."); |
|---|
| 240 |
return 0; |
|---|
| 241 |
} |
|---|
| 242 |
set_tmpdir($imgdir); |
|---|
| 243 |
|
|---|
| 244 |
|
|---|
| 245 |
my $imgfilename = Mail::SpamAssassin::Util::untaint_file_path( |
|---|
| 246 |
$imgdir . "/" . $fname |
|---|
| 247 |
); |
|---|
| 248 |
my $unique = 0; |
|---|
| 249 |
while (-e $imgfilename) { |
|---|
| 250 |
$imgfilename = Mail::SpamAssassin::Util::untaint_file_path( |
|---|
| 251 |
$imgdir . "/" . chr(65+$unique) . "." . $fname |
|---|
| 252 |
); |
|---|
| 253 |
$unique++; |
|---|
| 254 |
} |
|---|
| 255 |
|
|---|
| 256 |
|
|---|
| 257 |
$imgfiles{$filename}{fname} = $fname; |
|---|
| 258 |
$imgfiles{$filename}{ctype} = $ctype; |
|---|
| 259 |
$imgfiles{$filename}{fsize} = $pdatalen; |
|---|
| 260 |
$imgfiles{$filename}{fpath} = $imgfilename; |
|---|
| 261 |
|
|---|
| 262 |
|
|---|
| 263 |
unless (open PICT, ">$imgfilename") { |
|---|
| 264 |
errorlog("Cannot write \"$imgfilename\", skipping..."); |
|---|
| 265 |
delete $imgfiles{$filename}; |
|---|
| 266 |
removedir($imgdir); |
|---|
| 267 |
next; |
|---|
| 268 |
} |
|---|
| 269 |
binmode PICT; |
|---|
| 270 |
print PICT $pdata; |
|---|
| 271 |
close PICT; |
|---|
| 272 |
debuglog("Saved: $imgfilename"); |
|---|
| 273 |
|
|---|
| 274 |
|
|---|
| 275 |
$cnt++; |
|---|
| 276 |
|
|---|
| 277 |
|
|---|
| 278 |
my $rawfilename = $imgdir . "/raw.eml"; |
|---|
| 279 |
if (open RAW, ">$rawfilename") { |
|---|
| 280 |
print RAW $pms->{msg}->get_pristine(); |
|---|
| 281 |
close RAW; |
|---|
| 282 |
debuglog("Saved: $rawfilename"); |
|---|
| 283 |
} |
|---|
| 284 |
|
|---|
| 285 |
} |
|---|
| 286 |
|
|---|
| 287 |
if ($cnt == 0) { |
|---|
| 288 |
debuglog("Skipping OCR, no image files found..."); |
|---|
| 289 |
return 0; |
|---|
| 290 |
} |
|---|
| 291 |
infolog("Found: $cnt images"); $cnt = 0; |
|---|
| 292 |
if ($conf->{focr_enable_image_hashing} == 3) { |
|---|
| 293 |
$conf->{focr_mysql_ddb} = get_mysql_ddb(); |
|---|
| 294 |
} |
|---|
| 295 |
|
|---|
| 296 |
|
|---|
| 297 |
if ($conf->{focr_personal_wordlist} =~ m/^\//) { |
|---|
| 298 |
read_words( $conf->{focr_personal_wordlist} ); |
|---|
| 299 |
} else { |
|---|
| 300 |
my $peruserlist = $main->sed_path($conf->{focr_personal_wordlist}); |
|---|
| 301 |
if ( -r $peruserlist ) { |
|---|
| 302 |
read_words( $peruserlist ); |
|---|
| 303 |
} else { |
|---|
| 304 |
|
|---|
| 305 |
if ( -e $peruserlist ) { |
|---|
| 306 |
errorlog("Cannot read personal_wordlist: $peruserlist, skipping..."); |
|---|
| 307 |
} |
|---|
| 308 |
} |
|---|
| 309 |
} |
|---|
| 310 |
|
|---|
| 311 |
IMAGE: |
|---|
| 312 |
my $haserr; |
|---|
| 313 |
foreach my $filename (keys %imgfiles) { |
|---|
| 314 |
my $pic = $imgfiles{$filename}; |
|---|
| 315 |
|
|---|
| 316 |
my @used_scansets = (); |
|---|
| 317 |
my $corrupt = 0; |
|---|
| 318 |
my $suffix = 0; |
|---|
| 319 |
my $generic_ctype = 0; |
|---|
| 320 |
my $digest; |
|---|
| 321 |
my $file = $$pic{fpath}; |
|---|
| 322 |
my $tfile = $file; |
|---|
| 323 |
my $pfile = $file . ".pnm"; |
|---|
| 324 |
my $efile = $file . ".err"; |
|---|
| 325 |
debuglog("pfile => $pfile"); |
|---|
| 326 |
debuglog("efile => $efile"); |
|---|
| 327 |
|
|---|
| 328 |
|
|---|
| 329 |
$haserr = $Mail::SpamAssassin::Logger::LOG_SA{level} == 3; |
|---|
| 330 |
|
|---|
| 331 |
if ($haserr) { |
|---|
| 332 |
$haserr = open RAWERR, ">$imgdir/raw.err"; |
|---|
| 333 |
debuglog("Errors to: $imgdir/raw.err") if ($haserr>0); |
|---|
| 334 |
} |
|---|
| 335 |
|
|---|
| 336 |
my $mimetype = $$pic{ctype}; |
|---|
| 337 |
if($mimetype =~ m'application/octet-stream'i) { |
|---|
| 338 |
$generic_ctype = 1; |
|---|
| 339 |
} |
|---|
| 340 |
|
|---|
| 341 |
if($$pic{fname} =~ /\.([\w-]+)$/) { |
|---|
| 342 |
$suffix = $1; |
|---|
| 343 |
} |
|---|
| 344 |
if ($suffix) { |
|---|
| 345 |
debuglog("File has Content-Type \"$mimetype\" and File Extension \"$suffix\""); |
|---|
| 346 |
} else { |
|---|
| 347 |
debuglog("File has Content-Type \"$mimetype\" and no File Extension"); |
|---|
| 348 |
} |
|---|
| 349 |
|
|---|
| 350 |
if ( $$pic{ftype} == 1 ) { |
|---|
| 351 |
infolog("Found GIF header name=\"$$pic{fname}\""); |
|---|
| 352 |
if ($conf->{focr_skip_gif}) { |
|---|
| 353 |
infolog("Skipping image check"); |
|---|
| 354 |
next IMAGE; |
|---|
| 355 |
} |
|---|
| 356 |
if (defined($conf->{focr_max_size_gif}) and ($$pic{fsize} > $conf->{focr_max_size_gif})) { |
|---|
| 357 |
infolog("GIF file size ($$pic{fsize}) exceeds maximum file size for this format, skipping..."); |
|---|
| 358 |
next; |
|---|
| 359 |
} |
|---|
| 360 |
|
|---|
| 361 |
if ( ($$pic{ctype} !~ /gif/i) and not $generic_ctype) { |
|---|
| 362 |
wrong_ctype( "GIF", $$pic{ctype} ); |
|---|
| 363 |
$internal_score += $conf->{'focr_wrongctype_score'}; |
|---|
| 364 |
} |
|---|
| 365 |
|
|---|
| 366 |
if ( $suffix and $suffix !~ /gif/i) { |
|---|
| 367 |
wrong_extension( "GIF", $suffix); |
|---|
| 368 |
$internal_score += $conf->{'focr_wrongext_score'}; |
|---|
| 369 |
} |
|---|
| 370 |
|
|---|
| 371 |
my $interlaced_gif = 0; |
|---|
| 372 |
my $image_count = 0; |
|---|
| 373 |
|
|---|
| 374 |
foreach my $a (qw/gifsicle giftext giffix gifinter giftopnm/) { |
|---|
| 375 |
unless (defined $conf->{"focr_bin_$a"}) { |
|---|
| 376 |
errorlog("Cannot exec $a, skipping image"); |
|---|
| 377 |
next IMAGE; |
|---|
| 378 |
} |
|---|
| 379 |
} |
|---|
| 380 |
my @stderr_data; |
|---|
| 381 |
|
|---|
| 382 |
my ($retcode, @stdout_data) = save_execute( |
|---|
| 383 |
"$conf->{focr_bin_giftext} $file", |
|---|
| 384 |
undef, |
|---|
| 385 |
">$imgdir/giftext.info", |
|---|
| 386 |
">>$imgdir/giftext.err", 1); |
|---|
| 387 |
|
|---|
| 388 |
if ($retcode<0) { |
|---|
| 389 |
chomp $retcode; |
|---|
| 390 |
errorlog("$conf->{focr_bin_giftext} Timed out [$retcode], skipping..."); |
|---|
| 391 |
++$imgerr if $conf->{focr_keep_bad_images}>0; next; |
|---|
| 392 |
} |
|---|
| 393 |
|
|---|
| 394 |
foreach (@stdout_data) { |
|---|
| 395 |
unless ($interlaced_gif) { |
|---|
| 396 |
if ( $_ =~ /Image is Interlaced/i ) { |
|---|
| 397 |
$interlaced_gif = 1; |
|---|
| 398 |
} |
|---|
| 399 |
} |
|---|
| 400 |
if ( $_ =~ /^Image |
|---|
| 401 |
$image_count++; |
|---|
| 402 |
} |
|---|
| 403 |
} |
|---|
| 404 |
if ($interlaced_gif or ($image_count > 1)) { |
|---|
| 405 |
infolog("Image is interlaced or animated..."); |
|---|
| 406 |
} |
|---|
| 407 |
else { |
|---|
| 408 |
infolog("Image is single non-interlaced..."); |
|---|
| 409 |
$tfile .= "-fixed.gif"; |
|---|
| 410 |
printf RAWERR "## $conf->{focr_bin_giffix} $file >$tfile 2>>$efile\n" if ($haserr>0); |
|---|
| 411 |
|
|---|
| 412 |
$retcode = save_execute("$conf->{focr_bin_giffix} $file", undef, ">$tfile", ">>$efile"); |
|---|
| 413 |
|
|---|
| 414 |
if ($retcode<0) { |
|---|
| 415 |
chomp $retcode; |
|---|
| 416 |
errorlog("$conf->{focr_bin_giffix}: Timed out [$retcode], skipping..."); |
|---|
| 417 |
printf RAWERR "?? Timed out > $retcode\n" if ($haserr>0); |
|---|
| 418 |
++$imgerr if $conf->{focr_keep_bad_images}>0; next; |
|---|
| 419 |
} |
|---|
| 420 |
|
|---|
| 421 |
if (open ERR, $efile) { |
|---|
| 422 |
@stderr_data = <ERR>; |
|---|
| 423 |
close ERR; |
|---|
| 424 |
foreach (@stderr_data) { |
|---|
| 425 |
if ( $_ =~ /GIF-LIB error/i ) { |
|---|
| 426 |
$corrupt = $_; |
|---|
| 427 |
last; |
|---|
| 428 |
} |
|---|
| 429 |
} |
|---|
| 430 |
} |
|---|
| 431 |
} |
|---|
| 432 |
|
|---|
| 433 |
if (defined($conf->{focr_max_size_gif}) and (((stat($tfile))[7]) > $conf->{focr_max_size_gif})) { |
|---|
| 434 |
infolog("Fixed GIF file size ($$pic{fsize}) exceeds maximum file size for this format, skipping..."); |
|---|
| 435 |
next; |
|---|
| 436 |
} |
|---|
| 437 |
|
|---|
| 438 |
if ($corrupt) { |
|---|
| 439 |
if ($interlaced_gif or ($image_count > 1)) { |
|---|
| 440 |
infolog("Skipping corrupted interlaced image..."); |
|---|
| 441 |
corrupt_img($conf->{focr_corrupt_unfixable_score}, $corrupt); |
|---|
| 442 |
$internal_score += $conf->{focr_corrupt_unfixable_score}; |
|---|
| 443 |
next; |
|---|
| 444 |
} |
|---|
| 445 |
if (-z $tfile) { |
|---|
| 446 |
infolog("Uncorrectable corruption detected, skipping non-interlaced image..."); |
|---|
| 447 |
corrupt_img($conf->{focr_corrupt_unfixable_score}, $corrupt); |
|---|
| 448 |
$internal_score += $conf->{focr_corrupt_unfixable_score}; |
|---|
| 449 |
next; |
|---|
| 450 |
} |
|---|
| 451 |
infolog("Image is corrupt, but seems fixable, continuing..."); |
|---|
| 452 |
corrupt_img($conf->{focr_corrupt_score}, $corrupt); |
|---|
| 453 |
$internal_score += $conf->{focr_corrupt_score}; |
|---|
| 454 |
} |
|---|
| 455 |
|
|---|
| 456 |
if ($image_count > 1) { |
|---|
| 457 |
infolog("File contains <$image_count> images, deanimating..."); |
|---|
| 458 |
$tfile = deanimate($tfile); |
|---|
| 459 |
} |
|---|
| 460 |
|
|---|
| 461 |
if ($interlaced_gif) { |
|---|
| 462 |
infolog("Processing interlaced_gif $tfile..."); |
|---|
| 463 |
my $cfile = $tfile; |
|---|
| 464 |
if ($tfile =~ m/\.gif$/i) { |
|---|
| 465 |
$tfile =~ s/\.gif$/-fixed.gif/i; |
|---|
| 466 |
} else { |
|---|
| 467 |
$tfile .= ".gif"; |
|---|
| 468 |
} |
|---|
| 469 |
printf RAWERR qq( |
|---|
| 470 |
|
|---|
| 471 |
$retcode = save_execute("$conf->{focr_bin_gifinter} $cfile", undef, ">$tfile", ">>$efile"); |
|---|
| 472 |
|
|---|
| 473 |
if ($retcode<0) { |
|---|
| 474 |
chomp $retcode; |
|---|
| 475 |
printf RAWERR "?? Timed out > $retcode\n" if ($haserr>0); |
|---|
| 476 |
errorlog("$conf->{focr_bin_gifinter}: Timed out [$retcode], skipping..."); |
|---|
| 477 |
++$imgerr if $conf->{focr_keep_bad_images}>0; next; |
|---|
| 478 |
} elsif ($retcode>0) { |
|---|
| 479 |
chomp $retcode; |
|---|
| 480 |
printf RAWERR "?? [$retcode] returned from $conf->{focr_bin_gifinter}\n" if ($haserr>0); |
|---|
| 481 |
errorlog("$conf->{focr_bin_gifinter}: Returned [$retcode], skipping..."); |
|---|
| 482 |
++$imgerr if $conf->{focr_keep_bad_images}>0; next; |
|---|
| 483 |
} |
|---|
| 484 |
} |
|---|
| 485 |
|
|---|
| 486 |
printf RAWERR qq( |
|---|
| 487 |
|
|---|
| 488 |
$retcode = save_execute("$conf->{focr_bin_giftopnm} $tfile", undef, ">$pfile", ">>$efile"); |
|---|
| 489 |
|
|---|
| 490 |
if ($retcode<0) { |
|---|
| 491 |
chomp $retcode; |
|---|
| 492 |
printf RAWERR "?? Timed out > $retcode\n" if ($haserr>0); |
|---|
| 493 |
errorlog("$conf->{focr_bin_giftopnm}: Timed out [$retcode], skipping..."); |
|---|
| 494 |
++$imgerr if $conf->{focr_keep_bad_images}>0; next; |
|---|
| 495 |
} elsif ($retcode>0) { |
|---|
| 496 |
chomp $retcode; |
|---|
| 497 |
printf RAWERR "?? [$retcode] returned from $conf->{focr_bin_giftopnm}\n" if ($haserr>0); |
|---|
| 498 |
errorlog("$conf->{focr_bin_giftopnm}: Returned [$retcode], skipping..."); |
|---|
| 499 |
++$imgerr if $conf->{focr_keep_bad_images}>0; next; |
|---|
| 500 |
} |
|---|
| 501 |
} |
|---|
| 502 |
elsif ( $$pic{ftype} == 2 ) { |
|---|
| 503 |
infolog("Found JPEG header name=\"$$pic{fname}\""); |
|---|
| 504 |
if ($conf->{focr_skip_jpeg}) { |
|---|
| 505 |
infolog("Skipping image check"); |
|---|
| 506 |
next IMAGE; |
|---|
| 507 |
} |
|---|
| 508 |
|
|---|
| 509 |
if (defined($conf->{focr_max_size_jpeg}) and ($$pic{fsize} > $conf->{focr_max_size_jpeg})) { |
|---|
| 510 |
infolog("JPEG file size ($$pic{fsize}) exceeds maximum file size for this format, skipping..."); |
|---|
| 511 |
next; |
|---|
| 512 |
} |
|---|
| 513 |
if ( ($$pic{ctype} !~ /(jpeg|jpg)/i) and not $generic_ctype) { |
|---|
| 514 |
wrong_ctype( "JPEG", $$pic{ctype} ); |
|---|
| 515 |
$internal_score += $conf->{'focr_wrongctype_score'}; |
|---|
| 516 |
} |
|---|
| 517 |
|
|---|
| 518 |
if ( $suffix and $suffix !~ /(jpeg|jpg|jfif)/i) { |
|---|
| 519 |
wrong_extension( "JPEG", $suffix); |
|---|
| 520 |
$internal_score += $conf->{'focr_wrongext_score'}; |
|---|
| 521 |
} |
|---|
| 522 |
|
|---|
| 523 |
foreach my $a (qw/jpegtopnm/) { |
|---|
| 524 |
unless (defined $conf->{"focr_bin_$a"}) { |
|---|
| 525 |
errorlog("Cannot exec $a, skipping image"); |
|---|
| 526 |
next IMAGE; |
|---|
| 527 |
} |
|---|
| 528 |
} |
|---|
| 529 |
printf RAWERR qq( |
|---|
| 530 |
my $retcode = save_execute("$conf->{focr_bin_jpegtopnm} $file", undef, ">$pfile", ">>$efile"); |
|---|
| 531 |
|
|---|
| 532 |
if ($retcode<0) { |
|---|
| 533 |
chomp $retcode; |
|---|
| 534 |
printf RAWERR "?? Timed out > $retcode\n" if ($haserr>0); |
|---|
| 535 |
errorlog("$conf->{focr_bin_jpegtopnm}: Timed out [$retcode], skipping..."); |
|---|
| 536 |
++$imgerr if $conf->{focr_keep_bad_images}>0; next; |
|---|
| 537 |
} elsif ($retcode>0) { |
|---|
| 538 |
chomp $retcode; |
|---|
| 539 |
printf RAWERR "?? [$retcode] returned from $conf->{focr_bin_jpegtopnm}\n" if ($haserr>0); |
|---|
| 540 |
errorlog("$conf->{focr_bin_jpegtopnm}: Returned [$retcode], skipping..."); |
|---|
| 541 |
++$imgerr if $conf->{focr_keep_bad_images}>0; next; |
|---|
| 542 |
} |
|---|
| 543 |
} |
|---|
| 544 |
elsif ( $$pic{ftype} == 3 ) { |
|---|
| 545 |
infolog("Found PNG header name=\"$$pic{fname}\""); |
|---|
| 546 |
if ($conf->{focr_skip_png}) { |
|---|
| 547 |
infolog("Skipping image check"); |
|---|
| 548 |
next IMAGE; |
|---|
| 549 |
} |
|---|
| 550 |
if (defined($conf->{focr_max_size_png}) and ($$pic{fsize} > $conf->{focr__max_size_png})) { |
|---|
| 551 |
infolog("PNG file size ($$pic{fsize}) exceeds maximum file size for this format, skipping..."); |
|---|
| 552 |
next; |
|---|
| 553 |
} |
|---|
| 554 |
if ( ($$pic{ctype} !~ /png/i) and not $generic_ctype) { |
|---|
| 555 |
wrong_ctype( "PNG", $$pic{ctype} ); |
|---|
| 556 |
$internal_score += $conf->{'focr_wrongctype_score'}; |
|---|
| 557 |
} |
|---|
| 558 |
if ( $suffix and $suffix !~ /(png)/i) { |
|---|
| 559 |
wrong_extension( "PNG", $suffix); |
|---|
| 560 |
$internal_score += $conf->{'focr_wrongext_score'}; |
|---|
| 561 |
} |
|---|
| 562 |
foreach my $a (qw/pngtopnm/) { |
|---|
| 563 |
unless (defined $conf->{"focr_bin_$a"}) { |
|---|
| 564 |
errorlog("Cannot exec $a, skipping image"); |
|---|
| 565 |
next IMAGE; |
|---|
| 566 |
} |
|---|
| 567 |
} |
|---|
| 568 |
|
|---|
| 569 |
printf RAWERR qq( |
|---|
| 570 |
my $retcode = save_execute("$conf->{focr_bin_pngtopnm} $file", undef, ">$pfile", ">>$efile"); |
|---|
| 571 |
|
|---|
| 572 |
if ($retcode<0) { |
|---|
| 573 |
chomp $retcode; |
|---|
| 574 |
printf RAWERR "?? Timed out > $retcode\n" if ($haserr>0); |
|---|
| 575 |
errorlog("$conf->{focr_bin_pngtopnm}: Timed out [$retcode], skipping..."); |
|---|
| 576 |
++$imgerr if $conf->{focr_keep_bad_images}>0; next; |
|---|
| 577 |
} elsif ($retcode>0) { |
|---|
| 578 |
chomp $retcode; |
|---|
| 579 |
printf RAWERR "?? [$retcode] returned from $conf->{focr_bin_pngtopnm}\n" if ($haserr>0); |
|---|
| 580 |
errorlog("$conf->{focr_bin_pngtopnm}: Returned [$retcode], skipping..."); |
|---|
| 581 |
++$imgerr if $conf->{focr_keep_bad_images}>0; next; |
|---|
| 582 |
} |
|---|
| 583 |
} |
|---|
| 584 |
elsif ( $$pic{ftype} == 4 ) { |
|---|
| 585 |
infolog("Found BMP header name=\"$$pic{fname}\""); |
|---|
| 586 |
if ($conf->{focr_skip_bmp}) { |
|---|
| 587 |
infolog("Skipping image check"); |
|---|
| 588 |
next IMAGE; |
|---|
| 589 |
} |
|---|
| 590 |
if (defined($conf->{focr_max_size_bmp}) and ($$pic{fsize} > $conf->{focr_max_size_bmp})) { |
|---|
| 591 |
infolog("BMP file size ($$pic{fsize}) exceeds maximum file size for this format, skipping..."); |
|---|
| 592 |
next; |
|---|
| 593 |
} |
|---|
| 594 |
if ( ($$pic{ctype} !~ /bmp/i) and not $generic_ctype) { |
|---|
| 595 |
wrong_ctype( "BMP", $$pic{ctype} ); |
|---|
| 596 |
$internal_score += $conf->{'focr_wrongctype_score'}; |
|---|
| 597 |
} |
|---|
| 598 |
if ( $suffix and $suffix !~ /(bmp)/i) { |
|---|
| 599 |
wrong_extension( "BMP", $suffix); |
|---|
| 600 |
$internal_score += $conf->{'focr_wrongext_score'}; |
|---|
| 601 |
} |
|---|
| 602 |
foreach my $a (qw/bmptopnm/) { |
|---|
| 603 |
unless (defined $conf->{"focr_bin_$a"}) { |
|---|
| 604 |
errorlog("Cannot exec $a, skipping image"); |
|---|
| 605 |
next IMAGE; |
|---|
| 606 |
} |
|---|
| 607 |
} |
|---|
| 608 |
printf RAWERR qq( |
|---|
| 609 |
|
|---|
| 610 |
my $retcode = save_execute("$conf->{focr_bin_bmptopnm} $file", undef, ">$pfile", ">>$efile"); |
|---|
| 611 |
if ($retcode<0) { |
|---|
| 612 |
chomp $retcode; |
|---|
| 613 |
printf RAWERR "?? Timed out > $retcode\n" if ($haserr>0); |
|---|
| 614 |
errorlog("$conf->{focr_bin_bmptopnm}: Timed out [$retcode], skipping..."); |
|---|
| 615 |
++$imgerr if $conf->{focr_keep_bad_images}>0; next; |
|---|
| 616 |
} elsif ($retcode>0) { |
|---|
| 617 |
chomp $retcode; |
|---|
| 618 |
printf RAWERR "?? [$retcode] returned from $conf->{focr_bin_bmptopnm}\n" if ($haserr>0); |
|---|
| 619 |
errorlog("$conf->{focr_bin_bmptopnm}: Returned [$retcode], skipping..."); |
|---|
| 620 |
++$imgerr if $conf->{focr_keep_bad_images}>0; next; |
|---|
| 621 |
} |
|---|
| 622 |
} |
|---|
| 623 |
elsif ( $$pic{ftype} == 5 ) { |
|---|
| 624 |
infolog("Found TIFF header name=\"$$pic{fname}\""); |
|---|
| 625 |
if ($conf->{focr_skip_tiff}) { |
|---|
| 626 |
infolog("Skipping image check"); |
|---|
| 627 |
next IMAGE; |
|---|
| 628 |
} |
|---|
| 629 |
if (defined($conf->{focr_max_size_tiff}) and ($$pic{fsize} > $conf->{focr_max_size_tiff})) { |
|---|
| 630 |
infolog("TIFF file size ($$pic{fsize}) exceeds maximum file size for this format, skipping..."); |
|---|
| 631 |
next; |
|---|
| 632 |
} |
|---|
| 633 |
if ( ($$pic{ctype} !~ /tif/i) and not $generic_ctype) { |
|---|
| 634 |
wrong_ctype( "TIFF", $$pic{ctype} ); |
|---|
| 635 |
$internal_score += $conf->{'focr_wrongctype_score'}; |
|---|
| 636 |
} |
|---|
| 637 |
if ( $suffix and $suffix !~ /tif/i) { |
|---|
| 638 |
wrong_extension( "TIFF", $suffix); |
|---|
| 639 |
$internal_score += $conf->{'focr_wrongext_score'}; |
|---|
| 640 |
} |
|---|
| 641 |
|
|---|
| 642 |
foreach my $a (qw/tifftopnm/) { |
|---|
| 643 |
unless (defined $conf->{"focr_bin_$a"}) { |
|---|
| 644 |
errorlog("Cannot exec $a, skipping image"); |
|---|
| 645 |
next IMAGE; |
|---|
| 646 |
} |
|---|
| 647 |
} |
|---|
| 648 |
printf RAWERR qq( |
|---|
| 649 |
my $retcode = save_execute("$conf->{focr_bin_tifftopnm} $file", undef, ">$pfile", ">>$efile"); |
|---|
| 650 |
|
|---|
| 651 |
if ($retcode<0) { |
|---|
| 652 |
chomp $retcode; |
|---|
| 653 |
printf RAWERR "?? Timed out > $retcode\n" if ($haserr>0); |
|---|
| 654 |
errorlog("$conf->{focr_bin_tifftopnm}: Timed out [$retcode], skipping..."); |
|---|
| 655 |
++$imgerr if $conf->{focr_keep_bad_images}>0; next; |
|---|
| 656 |
} elsif ($retcode>0) { |
|---|
| 657 |
chomp $retcode; |
|---|
| 658 |
printf RAWERR "?? [$retcode] returned from $conf->{focr_bin_tifftopnm}\n" if ($haserr>0); |
|---|
| 659 |
errorlog("$conf->{focr_bin_tifftopnm}: Returned [$retcode], skipping..."); |
|---|
| 660 |
++$imgerr if $conf->{focr_keep_bad_images}>0; next; |
|---|
| 661 |
} |
|---|
| 662 |
} |
|---|
| 663 |
else { |
|---|
| 664 |
errorlog("Image type not recognized, unknown format. Skipping this image..."); |
|---|
| 665 |
next; |
|---|
| 666 |
} |
|---|
| 667 |
|
|---|
| 668 |
if($conf->{focr_enable_image_hashing}) { |
|---|
| 669 |
infolog("Calculating image hash for: $pfile"); |
|---|
| 670 |
($corrupt, $digest) = calc_image_hash($pfile,$pic); |
|---|
| 671 |
if ($corrupt) { |
|---|
| 672 |
infolog("Error calculating the image hash, skipping hash check..."); |
|---|
| 673 |
} else { |
|---|
| 674 |
my ($score, $dinfo, $whash); |
|---|
| 675 |
$whash = $conf->{focr_enable_image_hashing} == 3 |
|---|
| 676 |
? $conf->{focr_mysql_hash} |
|---|
| 677 |
: $conf->{focr_db_hash}; |
|---|
| 678 |
($score,$dinfo) = check_image_hash_db($digest, $whash, $$pic{fname}, $$pic{ctype}, $$pic{ftype}); |
|---|
| 679 |
if ($score > 0) { |
|---|
| 680 |
known_img_hash($score,$dinfo); |
|---|
| 681 |
infolog("Message is SPAM. $dinfo") if ($conf->{focr_enable_image_hashing} < 3); |
|---|
| 682 |
removedirs(get_all_tmpdirs()); |
|---|
| 683 |
return 0; |
|---|
| 684 |
} |
|---|
| 685 |
$whash = $conf->{focr_enable_image_hashing} == 3 |
|---|
| 686 |
? $conf->{focr_mysql_safe} |
|---|
| 687 |
: $conf->{focr_db_safe}; |
|---|
| 688 |
($score,$dinfo) = check_image_hash_db($digest, $whash, $$pic{fname}, $$pic{ctype}, $$pic{ftype}); |
|---|
| 689 |
if ($score > 0) { |
|---|
| 690 |
infolog("Image in KNOWN_GOOD. Skipping OCR checks..."); |
|---|
| 691 |
next IMAGE; |
|---|
| 692 |
} |
|---|
| 693 |
} |
|---|
| 694 |
if ($digest eq '') { |
|---|
| 695 |
infolog("Empty Hash, skipping..."); |
|---|
| 696 |
next IMAGE; |
|---|
| 697 |
} |
|---|
| 698 |
} else { |
|---|
| 699 |
infolog("Image hashing disabled in configuration, skipping..."); |
|---|
| 700 |
} |
|---|
| 701 |
|
|---|
| 702 |
|
|---|
| 703 |
|
|---|
| 704 |
|
|---|