| 1 | # <@LICENSE> |
|---|
| 2 | # Licensed to the Apache Software Foundation (ASF) under one or more |
|---|
| 3 | # contributor license agreements. See the NOTICE file distributed with |
|---|
| 4 | # this work for additional information regarding copyright ownership. |
|---|
| 5 | # The ASF licenses this file to you under the Apache License, Version 2.0 |
|---|
| 6 | # (the "License"); you may not use this file except in compliance with |
|---|
| 7 | # the License. You may obtain a copy of the License at: |
|---|
| 8 | # |
|---|
| 9 | # http://www.apache.org/licenses/LICENSE-2.0 |
|---|
| 10 | # |
|---|
| 11 | # Unless required by applicable law or agreed to in writing, software |
|---|
| 12 | # distributed under the License is distributed on an "AS IS" BASIS, |
|---|
| 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
|---|
| 14 | # See the License for the specific language governing permissions and |
|---|
| 15 | # limitations under the License. |
|---|
| 16 | # </@LICENSE> |
|---|
| 17 | |
|---|
| 18 | # FuzzyOcr plugin, version 3.6 |
|---|
| 19 | # |
|---|
| 20 | # written by Christian Holler (decoder_at_own-hero_dot_net) |
|---|
| 21 | # and Jorge Valdes (jorge_at_joval_dot_info) |
|---|
| 22 | |
|---|
| 23 | package FuzzyOcr; |
|---|
| 24 | |
|---|
| 25 | use strict; |
|---|
| 26 | use warnings; |
|---|
| 27 | use POSIX; |
|---|
| 28 | use Fcntl ':flock'; |
|---|
| 29 | use Mail::SpamAssassin; |
|---|
| 30 | use Mail::SpamAssassin::Logger; |
|---|
| 31 | use Mail::SpamAssassin::Util; |
|---|
| 32 | use Mail::SpamAssassin::Timeout; |
|---|
| 33 | use Mail::SpamAssassin::Plugin; |
|---|
| 34 | |
|---|
| 35 | use Time::HiRes qw( gettimeofday tv_interval ); |
|---|
| 36 | use String::Approx 'adistr'; |
|---|
| 37 | use FileHandle; |
|---|
| 38 | |
|---|
| 39 | use lib qw(/etc/mail/spamassassin); # Allow placing of FuzzyOcr in siteconfigdir |
|---|
| 40 | |
|---|
| 41 | use FuzzyOcr::Logging qw(debuglog errorlog warnlog infolog); |
|---|
| 42 | use FuzzyOcr::Config qw(kill_pid |
|---|
| 43 | get_tmpdir |
|---|
| 44 | set_tmpdir |
|---|
| 45 | get_all_tmpdirs |
|---|
| 46 | get_pms |
|---|
| 47 | save_pms |
|---|
| 48 | get_timeout |
|---|
| 49 | get_mysql_ddb |
|---|
| 50 | get_scansets |
|---|
| 51 | get_wordlist |
|---|
| 52 | set_config |
|---|
| 53 | get_config |
|---|
| 54 | parse_config |
|---|
| 55 | finish_parsing_end |
|---|
| 56 | read_words); |
|---|
| 57 | use FuzzyOcr::Hashing qw(check_image_hash_db add_image_hash_db calc_image_hash); |
|---|
| 58 | use FuzzyOcr::Deanimate qw(deanimate); |
|---|
| 59 | use FuzzyOcr::Scoring qw(wrong_ctype wrong_extension corrupt_img known_img_hash); |
|---|
| 60 | use FuzzyOcr::Misc qw(max removedir removedirs save_execute); |
|---|
| 61 | |
|---|
| 62 | our @ISA = qw(Mail::SpamAssassin::Plugin); |
|---|
| 63 | |
|---|
| 64 | # constructor: register the eval rule |
|---|
| 65 | sub new { |
|---|
| 66 | my ( $class, $mailsa ) = @_; |
|---|
| 67 | $class = ref($class) || $class; |
|---|
| 68 | my $self = $class->SUPER::new($mailsa); |
|---|
| 69 | bless( $self, $class ); |
|---|
| 70 | $self->register_eval_rule("fuzzyocr_check"); |
|---|
| 71 | $self->register_eval_rule("dummy_check"); |
|---|
| 72 | $self->set_config($mailsa->{conf}); |
|---|
| 73 | return $self; |
|---|
| 74 | } |
|---|
| 75 | |
|---|
| 76 | sub dummy_check { |
|---|
| 77 | return 0; |
|---|
| 78 | } |
|---|
| 79 | |
|---|
| 80 | sub fuzzyocr_check { |
|---|
| 81 | my ( $self, $pms ) = @_; |
|---|
| 82 | my $conf = get_config(); |
|---|
| 83 | |
|---|
| 84 | save_pms($pms); |
|---|
| 85 | |
|---|
| 86 | my $end; |
|---|
| 87 | my $begin = [gettimeofday]; |
|---|
| 88 | if ($conf->{focr_global_timeout}) { |
|---|
| 89 | my $t = get_timeout(); |
|---|
| 90 | debuglog("Global Timeout set at ".$conf->{focr_timeout}." sec."); |
|---|
| 91 | $t->run(sub { |
|---|
| 92 | $end = fuzzyocr_do( $self, $conf, $pms ); |
|---|
| 93 | }); |
|---|
| 94 | if ($t->timed_out()) { |
|---|
| 95 | infolog("Scan timed out after $conf->{focr_timeout} seconds."); |
|---|
| 96 | infolog("Killing possibly running pid..."); |
|---|
| 97 | my ($ret, $pid) = kill_pid(); |
|---|
| 98 | if ($ret > 0) { |
|---|
| 99 | infolog("Successfully killed PID $pid"); |
|---|
| 100 | } elsif ($ret < 0) { |
|---|
| 101 | infolog("No processes left... exiting"); |
|---|
| 102 | } else { |
|---|
| 103 | infolog("Failed to kill PID $pid, stale process!"); |
|---|
| 104 | } |
|---|
| 105 | infolog("Removing possibly leftover tempdirs..."); |
|---|
| 106 | removedirs(get_all_tmpdirs()); |
|---|
| 107 | return 0; |
|---|
| 108 | } |
|---|
| 109 | } else { |
|---|
| 110 | $end = fuzzyocr_do( $self, $conf, $pms ); |
|---|
| 111 | } |
|---|
| 112 | debuglog("Processed in ". |
|---|
| 113 | sprintf("%.6f",tv_interval($begin, [gettimeofday])) |
|---|
| 114 | ." sec."); |
|---|
| 115 | return $end; |
|---|
| 116 | } |
|---|
| 117 | |
|---|
| 118 | sub fuzzyocr_do { |
|---|
| 119 | my ( $self, $conf, $pms ) = @_; |
|---|
| 120 | |
|---|
| 121 | my $internal_score = 0; |
|---|
| 122 | my $current_score = $pms->get_score(); |
|---|
| 123 | my $score = $conf->{focr_autodisable_score} || 100; |
|---|
| 124 | |
|---|
| 125 | if ( $current_score > $score ) { |
|---|
| 126 | infolog("Scan canceled, message has already more than $score points ($current_score)."); |
|---|
| 127 | return 0; |
|---|
| 128 | } |
|---|
| 129 | |
|---|
| 130 | my $nscore = $conf->{focr_autodisable_negative_score} || -100; |
|---|
| 131 | if ( $current_score < $nscore ) { |
|---|
| 132 | infolog("Scan canceled, message has less than $nscore points ($current_score)."); |
|---|
| 133 | return 0; |
|---|
| 134 | } |
|---|
| 135 | |
|---|
| 136 | my $imgdir; |
|---|
| 137 | my %imgfiles = (); |
|---|
| 138 | my @found = (); |
|---|
| 139 | my @hashes = (); |
|---|
| 140 | my $cnt = 0; |
|---|
| 141 | my $imgerr = 0; |
|---|
| 142 | my $main = $self->{main}; |
|---|
| 143 | |
|---|
| 144 | debuglog("Starting FuzzyOcr..."); |
|---|
| 145 | |
|---|
| 146 | #Show PMS info if asked to |
|---|
| 147 | if ($conf->{focr_log_pmsinfo}) { |
|---|
| 148 | my $msgid = $pms->get('Message-Id') ? $pms->get('Message-Id') : "<no messageid>"; |
|---|
| 149 | my $from = $pms->get('From') ? $pms->get('From') : "<no sender>"; |
|---|
| 150 | my $to = $pms->get('To') ? $pms->get('To') : "<no receipients>"; |
|---|
| 151 | chomp($from, $to, $msgid); |
|---|
| 152 | infolog("Processing Message with ID \"$msgid\" ($from -> $to)"); |
|---|
| 153 | } |
|---|
| 154 | |
|---|
| 155 | foreach my $p ( |
|---|
| 156 | $pms->{msg}->find_parts(qr(^image\b)i), |
|---|
| 157 | $pms->{msg}->find_parts(qr(Application/Octet-Stream)i), |
|---|
| 158 | $pms->{msg}->find_parts(qr(application/pdf)i) |
|---|
| 159 | ) { |
|---|
| 160 | my $ctype = $p->{'type'}; |
|---|
| 161 | my $fname = $p->{'name'} || 'unknown'; |
|---|
| 162 | if (($fname eq 'unknown') and |
|---|
| 163 | (defined $p->{'headers'}->{'content-id'}) |
|---|
| 164 | ){ |
|---|
| 165 | $fname = join('',@{$p->{'headers'}->{'content-id'}}); |
|---|
| 166 | $fname =~ s/[<>]//g; |
|---|
| 167 | $fname =~ tr/\@\$\%\&/_/s; |
|---|
| 168 | } |
|---|
| 169 | |
|---|
| 170 | my $filename = $fname; $filename =~ tr{a-zA-Z0-9\-.}{_}cs; |
|---|
| 171 | debuglog("fname: \"$fname\" => \"$filename\""); |
|---|
| 172 | my $pdata = $p->decode(); |
|---|
| 173 | my $pdatalen = length($pdata); |
|---|
| 174 | my $w = 0; my $h = 0; |
|---|
| 175 | |
|---|
| 176 | if ( substr($pdata,0,3) eq "\x47\x49\x46" ) { |
|---|
| 177 | ## GIF File |
|---|
| 178 | $imgfiles{$filename}{ftype} = 1; |
|---|
| 179 | ($w,$h) = unpack("vv",substr($pdata,6,4)); |
|---|
| 180 | infolog("GIF: [${h}x${w}] $filename ($pdatalen)"); |
|---|
| 181 | $imgfiles{$filename}{width} = $w; |
|---|
| 182 | $imgfiles{$filename}{height} = $h; |
|---|
| 183 | } elsif ( substr($pdata,0,2) eq "\xff\xd8" ) { |
|---|
| 184 | ## JPEG File |
|---|
| 185 | my @Markers = (0xC0,0xC1,0xC2,0xC3,0xC5,0xC6,0xC7,0xC9,0xCA,0xCB,0xCD,0xCE,0xCF); |
|---|
| 186 | my $pos = 2; |
|---|
| 187 | while ($pos < $pdatalen) { |
|---|
| 188 | my ($b,$m) = unpack("CC",substr($pdata,$pos,2)); $pos += 2; |
|---|
| 189 | if ($b != 0xff) { |
|---|
| 190 | infolog("Invalid JPEG image"); |
|---|
| 191 | $pos = $pdatalen + 1; |
|---|
| 192 | last; |
|---|
| 193 | } |
|---|
| 194 | my $skip = 0; |
|---|
| 195 | foreach my $mm (@Markers) { |
|---|
| 196 | if ($mm == $m) { |
|---|
| 197 | $skip++; last; |
|---|
| 198 | } |
|---|
| 199 | } |
|---|
| 200 | last if ($skip); |
|---|
| 201 | $pos += unpack("n",substr($pdata,$pos,2)); |
|---|
| 202 | } |
|---|
| 203 | if ($pos > $pdatalen) { |
|---|
| 204 | errorlog("Cannot find image dimensions"); |
|---|
| 205 | } else { |
|---|
| 206 | ($h,$w) = unpack("nn",substr($pdata,$pos+3,4)); |
|---|
| 207 | infolog("JPEG: [${h}x${w}] $filename ($pdatalen)"); |
|---|
| 208 | $imgfiles{$filename}{ftype} = 2; |
|---|
| 209 | $imgfiles{$filename}{height} = $h; |
|---|
| 210 | $imgfiles{$filename}{width} = $w; |
|---|
| 211 | } |
|---|
| 212 | } elsif ( substr($pdata,0,4) eq "\x89\x50\x4e\x47" ) { |
|---|
| 213 | # PNG File |
|---|
| 214 | ($w,$h) = unpack("NN",substr($pdata,16,8)); |
|---|
| 215 | $imgfiles{$filename}{ftype} = 3; |
|---|
| 216 | $imgfiles{$filename}{width} = $w; |
|---|
| 217 | $imgfiles{$filename}{height} = $h; |
|---|
| 218 | infolog("PNG: [${h}x${w}] $filename ($pdatalen)"); |
|---|
| 219 | } elsif ( substr($pdata,0,2) eq "BM" ) { |
|---|
| 220 | ## BMP File |
|---|
| 221 | ($w,$h) = unpack("VV",substr($pdata,18,8)); |
|---|
| 222 | $imgfiles{$filename}{ftype} = 4; |
|---|
| 223 | $imgfiles{$filename}{width} = $w; |
|---|
| 224 | $imgfiles{$filename}{height} = $h; |
|---|
| 225 | infolog("BMP: [${h}x${w}] $filename ($pdatalen)"); |
|---|
| 226 | } elsif ( |
|---|
| 227 | ## TIFF File |
|---|
| 228 | (substr($pdata,0,4) eq "\x4d\x4d\x00\x2a") or |
|---|
| 229 | (substr($pdata,0,4) eq "\x49\x49\x2a\x00") |
|---|
| 230 | ) { |
|---|
| 231 | my $worder = (substr($pdata,0,2) eq "\x4d\x4d") ? 0 : 1; |
|---|
| 232 | my $offset = unpack($worder?"V":"N",substr($pdata,4,4)); |
|---|
| 233 | my $number = unpack($worder?"v":"n",substr($pdata,$offset,2)) - 1; |
|---|
| 234 | foreach my $n (0 .. $number) { |
|---|
| 235 | my $add = 2 + ($n * 12); |
|---|
| 236 | my ($id,$tag,$cnt,$val) = unpack($worder?"vvVV":"nnNN",substr($pdata,$offset+$add,12)); |
|---|
| 237 | $h = $val if ($id == 256); |
|---|
| 238 | $w = $val if ($id == 257); |
|---|
| 239 | last if ($h != 0 and $w != 0); |
|---|
| 240 | } |
|---|
| 241 | infolog("TIFF: [${h}x${w}] $filename ($pdatalen) ($worder)"); |
|---|
| 242 | infolog("Cannot determine size of TIFF image, setting to '1x1'") if ($h == 0 and $w == 0); |
|---|
| 243 | $imgfiles{$filename}{ftype} = 5; |
|---|
| 244 | $imgfiles{$filename}{width} = $w ? $w : 1; |
|---|
| 245 | $imgfiles{$filename}{height} = $h ? $h : 1; |
|---|
| 246 | } elsif (substr($pdata,0,5) eq "\x25\x50\x44\x46\x2d") { |
|---|
| 247 | my $version = substr($pdata,5,3); |
|---|
| 248 | infolog("PDF: [version $version] $filename ($pdatalen)"); |
|---|
| 249 | $imgfiles{$filename}{ftype} = 6; |
|---|
| 250 | $imgfiles{$filename}{version} = $version; |
|---|
| 251 | $imgfiles{$filename}{width} = 0; |
|---|
| 252 | $imgfiles{$filename}{height} = 0; |
|---|
| 253 | } |
|---|
| 254 | |
|---|
| 255 | #Skip unless we found the right header |
|---|
| 256 | unless (defined $imgfiles{$filename}{ftype}) { |
|---|
| 257 | infolog("Skipping file with content-type=\"$ctype\" name=\"$fname\""); |
|---|
| 258 | delete $imgfiles{$filename}; |
|---|
| 259 | next; |
|---|
| 260 | } |
|---|
| 261 | if ($imgfiles{$filename}{ftype} == 6) { |
|---|
| 262 | unless ($conf->{focr_scan_pdfs}) { |
|---|
| 263 | infolog("Skipping PDF file: PDF Scanning was disabled in config"); |
|---|
| 264 | next; |
|---|
| 265 | } |
|---|
| 266 | } else { |
|---|
| 267 | #Skip images that cannot contain text |
|---|
| 268 | if ($imgfiles{$filename}{height} < $conf->{focr_min_height}) { |
|---|
| 269 | infolog("Skipping image: height < $conf->{focr_min_height}"); |
|---|
| 270 | delete $imgfiles{$filename}; |
|---|
| 271 | next; |
|---|
| 272 | } |
|---|
| 273 | |
|---|
| 274 | #Skip images that cannot contain text |
|---|
| 275 | if ($imgfiles{$filename}{width} < $conf->{focr_min_width}) { |
|---|
| 276 | infolog("Skipping image: width < $conf->{focr_min_width}"); |
|---|
| 277 | delete $imgfiles{$filename}; |
|---|
| 278 | next; |
|---|
| 279 | } |
|---|
| 280 | |
|---|
| 281 | #Skip too big images, screenshots etc |
|---|
| 282 | if ($imgfiles{$filename}{height} > $conf->{focr_max_height}) { |
|---|
| 283 | infolog("Skipping image: height > $conf->{focr_max_height}"); |
|---|
| 284 | delete $imgfiles{$filename}; |
|---|
| 285 | next; |
|---|
| 286 | } |
|---|
| 287 | |
|---|
| 288 | #Skip too big images, screenshots etc |
|---|
| 289 | if ($imgfiles{$filename}{width} > $conf->{focr_max_width}) { |
|---|
| 290 | infolog("Skipping image: width > $conf->{focr_max_width}"); |
|---|
| 291 | delete $imgfiles{$filename}; |
|---|
| 292 | next; |
|---|
| 293 | } |
|---|
| 294 | } |
|---|
| 295 | #Found Image!! Get a temporary dir to save image |
|---|
| 296 | $imgdir = Mail::SpamAssassin::Util::secure_tmpdir(); |
|---|
| 297 | unless ($imgdir) { |
|---|
| 298 | errorlog("Scan canceled, cannot create Image TMPDIR."); |
|---|
| 299 | return 0; |
|---|
| 300 | } |
|---|
| 301 | set_tmpdir($imgdir); |
|---|
| 302 | |
|---|
| 303 | #Generete unique filename to store image |
|---|
| 304 | my $imgfilename = Mail::SpamAssassin::Util::untaint_file_path( |
|---|
| 305 | $imgdir . "/" . $filename |
|---|
| 306 | ); |
|---|
| 307 | my $unique = 0; |
|---|
| 308 | while (-e $imgfilename) { |
|---|
| 309 | $imgfilename = Mail::SpamAssassin::Util::untaint_file_path( |
|---|
| 310 | $imgdir . "/" . chr(65+$unique) . "." . $filename |
|---|
| 311 | ); |
|---|
| 312 | $unique++; |
|---|
| 313 | } |
|---|
| 314 | |
|---|
| 315 | #Save important constants |
|---|
| 316 | $imgfiles{$filename}{fname} = $fname; |
|---|
| 317 | $imgfiles{$filename}{ctype} = $ctype; |
|---|
| 318 | $imgfiles{$filename}{fsize} = $pdatalen; |
|---|
| 319 | $imgfiles{$filename}{fpath} = $imgfilename; |
|---|
| 320 | |
|---|
| 321 | #Save Image to disk. |
|---|
| 322 | unless (open PICT, ">$imgfilename") { |
|---|
| 323 | errorlog("Cannot write \"$imgfilename\", skipping..."); |
|---|
| 324 | delete $imgfiles{$filename}; |
|---|
| 325 | removedir($imgdir); |
|---|
| 326 | next; |
|---|
| 327 | } |
|---|
| 328 | binmode PICT; |
|---|
| 329 | print PICT $pdata; |
|---|
| 330 | close PICT; |
|---|
| 331 | debuglog("Saved: $imgfilename"); |
|---|
| 332 | |
|---|
| 333 | #Increment valid image file counter |
|---|
| 334 | $cnt++; |
|---|
| 335 | |
|---|
| 336 | #keep raw email for debugging later |
|---|
| 337 | my $rawfilename = $imgdir . "/raw.eml"; |
|---|
| 338 | if (open RAW, ">$rawfilename") { |
|---|
| 339 | print RAW $pms->{msg}->get_pristine(); |
|---|
| 340 | close RAW; |
|---|
| 341 | debuglog("Saved: $rawfilename"); |
|---|
| 342 | } |
|---|
| 343 | |
|---|
| 344 | } |
|---|
| 345 | |
|---|
| 346 | if ($cnt == 0) { |
|---|
| 347 | debuglog("Skipping OCR, no image files found..."); |
|---|
| 348 | return 0; |
|---|
| 349 | } |
|---|
| 350 | infolog("Found: $cnt images"); $cnt = 0; |
|---|
| 351 | if ($conf->{focr_enable_image_hashing} == 3) { |
|---|
| 352 | $conf->{focr_mysql_ddb} = get_mysql_ddb(); |
|---|
| 353 | } |
|---|
| 354 | |
|---|
| 355 | # Try to load personal wordlist |
|---|
| 356 | unless ($conf->{focr_no_homedirs}) { |
|---|
| 357 | if ($conf->{focr_personal_wordlist} =~ m/^\//) { |
|---|
| 358 | read_words( $conf->{focr_personal_wordlist} ); |
|---|
| 359 | } else { |
|---|
| 360 | my $peruserlist = $main->sed_path($conf->{focr_personal_wordlist}); |
|---|
| 361 | if ( -r $peruserlist ) { |
|---|
| 362 | read_words( $peruserlist ); |
|---|
| 363 | } else { |
|---|
| 364 | # Only complain if the file exists |
|---|
| 365 | if ( -e $peruserlist ) { |
|---|
| 366 | errorlog("Cannot read personal_wordlist: $peruserlist, skipping..."); |
|---|
| 367 | } |
|---|
| 368 | } |
|---|
| 369 | } |
|---|
| 370 | } |
|---|
| 371 | my $haserr; |
|---|
| 372 | foreach my $filename (keys %imgfiles) { |
|---|
| 373 | my $pic = $imgfiles{$filename}; |
|---|
| 374 | #infolog("Analyzing file with content-type=\"$$pic{ctype}\""); |
|---|
| 375 | my @used_scansets = (); |
|---|
| 376 | my $corrupt = 0; |
|---|
| 377 | my $suffix = 0; |
|---|
| 378 | my $generic_ctype = 0; |
|---|
| 379 | my $digest; |
|---|
| 380 | my $file = $$pic{fpath}; |
|---|
| 381 | my $tfile = $file; |
|---|
| 382 | my $pfile = $file . ".pnm"; |
|---|
| 383 | my $efile = $file . ".err"; |
|---|
| 384 | debuglog("pfile => $pfile"); |
|---|
| 385 | debuglog("efile => $efile"); |
|---|
| 386 | |
|---|
| 387 | #Open ERRORLOG |
|---|
| 388 | $haserr = $Mail::SpamAssassin::Logger::LOG_SA{level} == 3; |
|---|
| 389 | |
|---|
| 390 | if ($haserr) { |
|---|
| 391 | $haserr = open RAWERR, ">$imgdir/raw.err"; |
|---|
| 392 | debuglog("Errors to: $imgdir/raw.err") if ($haserr>0); |
|---|
| 393 | } |
|---|
| 394 | |
|---|
| 395 | my $mimetype = $$pic{ctype}; |
|---|
| 396 | if($mimetype =~ m'application/octet-stream'i) { |
|---|
| 397 | $generic_ctype = 1; |
|---|
| 398 | } |
|---|
| 399 | |
|---|
| 400 | if($$pic{fname} =~ /\.([\w-]+)$/) { |
|---|
| 401 | $suffix = $1; |
|---|
| 402 | } |
|---|
| 403 | if ($suffix) { |
|---|
| 404 | debuglog("File has Content-Type \"$mimetype\" and File Extension \"$suffix\""); |
|---|
| 405 | } else { |
|---|
| 406 | debuglog("File has Content-Type \"$mimetype\" and no File Extension"); |
|---|
| 407 | } |
|---|
| 408 | |
|---|
| 409 | if ( $$pic{ftype} == 1 ) { |
|---|
| 410 | infolog("Found GIF header name=\"$$pic{fname}\""); |
|---|
| 411 | if ($conf->{focr_skip_gif}) { |
|---|
| 412 | infolog("Skipping image check"); |
|---|
| 413 | next; |
|---|
| 414 | } |
|---|
| 415 | if (defined($conf->{focr_max_size_gif}) and ($$pic{fsize} > $conf->{focr_max_size_gif})) { |
|---|
| 416 | infolog("GIF file size ($$pic{fsize}) exceeds maximum file size for this format, skipping..."); |
|---|
| 417 | next; |
|---|
| 418 | } |
|---|
| 419 | |
|---|
| 420 | if ( ($$pic{ctype} !~ /gif/i) and not $generic_ctype) { |
|---|
| 421 | wrong_ctype( "GIF", $$pic{ctype} ); |
|---|
| 422 | $internal_score += $conf->{'focr_wrongctype_score'}; |
|---|
| 423 | } |
|---|
| 424 | |
|---|
| 425 | if ( $suffix and $suffix !~ /gif/i) { |
|---|
| 426 | wrong_extension( "GIF", $suffix); |
|---|
| 427 | $internal_score += $conf->{'focr_wrongext_score'}; |
|---|
| 428 | } |
|---|
| 429 | |
|---|
| 430 | my $interlaced_gif = 0; |
|---|
| 431 | my $image_count = 0; |
|---|
| 432 | |
|---|
| 433 | foreach my $a (qw/gifsicle giftext giffix gifinter giftopnm/) { |
|---|
| 434 | unless (defined $conf->{"focr_bin_$a"}) { |
|---|
| 435 | errorlog("Cannot exec $a, skipping image"); |
|---|
| 436 | next; |
|---|
| 437 | } |
|---|
| 438 | } |
|---|
| 439 | |
|---|
| 440 | my @stderr_data; |
|---|
| 441 | my ($retcode, @stdout_data) = save_execute( |
|---|
| 442 | "$conf->{focr_bin_giftext} $file", |
|---|
| 443 | undef, |
|---|
| 444 | ">$imgdir/giftext.info", |
|---|
| 445 | ">>$imgdir/giftext.err", 1); |
|---|
| 446 | |
|---|
| 447 | if ($retcode<0) { # only care if we timed out |
|---|
| 448 | chomp $retcode; |
|---|
| 449 | errorlog("$conf->{focr_bin_giftext} Timed out [$retcode], skipping..."); |
|---|
| 450 | ++$imgerr if $conf->{focr_keep_bad_images}>0; next; |
|---|
| 451 | } |
|---|
| 452 | |
|---|
| 453 | foreach (@stdout_data) { |
|---|
| 454 | unless ($interlaced_gif) { |
|---|
| 455 | if ( $_ =~ /Image is Interlaced/i ) { |
|---|
| 456 | $interlaced_gif = 1; |
|---|
| 457 | } |
|---|
| 458 | } |
|---|
| 459 | if ( $_ =~ /^Image #/ ) { |
|---|
| 460 | $image_count++; |
|---|
| 461 | } |
|---|
| 462 | } |
|---|
| 463 | if ($interlaced_gif or ($image_count > 1)) { |
|---|
| 464 | infolog("Image is interlaced or animated..."); |
|---|
| 465 | } |
|---|
| 466 | else { |
|---|
| 467 | infolog("Image is single non-interlaced..."); |
|---|
| 468 | $tfile .= "-fixed.gif"; |
|---|
| 469 | printf RAWERR "## $conf->{focr_bin_giffix} $file >$tfile 2>>$efile\n" if ($haserr>0); |
|---|
| 470 | |
|---|
| 471 | $retcode = save_execute("$conf->{focr_bin_giffix} $file", undef, ">$tfile", ">>$efile"); |
|---|
| 472 | |
|---|
| 473 | if ($retcode<0) { # only care if we timed out |
|---|
| 474 | chomp $retcode; |
|---|
| 475 | errorlog("$conf->{focr_bin_giffix}: Timed out [$retcode], skipping..."); |
|---|
| 476 | printf RAWERR "?? Timed out > $retcode\n" if ($haserr>0); |
|---|
| 477 | ++$imgerr if $conf->{focr_keep_bad_images}>0; next; |
|---|
| 478 | } |
|---|
| 479 | |
|---|
| 480 | if (open ERR, $efile) { |
|---|
| 481 | @stderr_data = <ERR>; |
|---|
| 482 | close ERR; |
|---|
| 483 | foreach (@stderr_data) { |
|---|
| 484 | if ( $_ =~ /GIF-LIB error/i ) { |
|---|
| 485 | $corrupt = $_; |
|---|
| 486 | last; |
|---|
| 487 | } |
|---|
| 488 | } |
|---|
| 489 | } |
|---|
| 490 | } |
|---|
| 491 | my $fixedsize = (stat($tfile))[7]; |
|---|
| 492 | if (defined($conf->{focr_max_size_gif}) and ($fixedsize > $conf->{focr_max_size_gif})) { |
|---|
| 493 | infolog("Fixed GIF file size ($fixedsize) exceeds maximum file size for this format, skipping..."); |
|---|
| 494 | next; |
|---|
| 495 | } |
|---|
| 496 | |
|---|
| 497 | if ($corrupt) { |
|---|
| 498 | if ($interlaced_gif or ($image_count > 1)) { |
|---|
| 499 | infolog("Skipping corrupted interlaced image..."); |
|---|
| 500 | corrupt_img($conf->{focr_corrupt_unfixable_score}, $corrupt); |
|---|
| 501 | $internal_score += $conf->{focr_corrupt_unfixable_score}; |
|---|
| 502 | next; |
|---|
| 503 | } |
|---|
| 504 | if (-z $tfile) { |
|---|
| 505 | infolog("Uncorrectable corruption detected, skipping non-interlaced image..."); |
|---|
| 506 | corrupt_img($conf->{focr_corrupt_unfixable_score}, $corrupt); |
|---|
| 507 | $internal_score += $conf->{focr_corrupt_unfixable_score}; |
|---|
| 508 | next; |
|---|
| 509 | } |
|---|
| 510 | infolog("Image is corrupt, but seems fixable, continuing..."); |
|---|
| 511 | corrupt_img($conf->{focr_corrupt_score}, $corrupt); |
|---|
| 512 | $internal_score += $conf->{focr_corrupt_score}; |
|---|
| 513 | } |
|---|
| 514 | |
|---|
| 515 | if ($image_count > 1) { |
|---|
| 516 | infolog("File contains <$image_count> images, deanimating..."); |
|---|
| 517 | $tfile = deanimate($tfile); |
|---|
| 518 | } |
|---|
| 519 | |
|---|
| 520 | if ($interlaced_gif) { |
|---|
| 521 | infolog("Processing interlaced_gif $tfile..."); |
|---|
| 522 | my $cfile = $tfile; |
|---|
| 523 | if ($tfile =~ m/\.gif$/i) { |
|---|
| 524 | $tfile =~ s/\.gif$/-fixed.gif/i; |
|---|
| 525 | } else { |
|---|
| 526 | $tfile .= ".gif"; |
|---|
| 527 | } |
|---|
| 528 | printf RAWERR qq(## $conf->{focr_bin_gifinter} $cfile >$tfile 2>>$efile\n) if ($haserr>0); |
|---|
| 529 | |
|---|
| 530 | $retcode = save_execute("$conf->{focr_bin_gifinter} $cfile", undef, ">$tfile", ">>$efile"); |
|---|
| 531 | |
|---|
| 532 | if ($retcode<0) { |
|---|
| 533 | chomp $retcode; |
|---|
| 534 | printf RAWERR "?? Timed out > $retcode\n" if ($haserr>0); |
|---|
| 535 | errorlog("$conf->{focr_bin_gifinter}: Timed out [$retcode], skipping..."); |
|---|
| 536 | ++$imgerr if $conf->{focr_keep_bad_images}>0; next; |
|---|
| 537 | } elsif ($retcode>0) { |
|---|
| 538 | chomp $retcode; |
|---|
| 539 | printf RAWERR "?? [$retcode] returned from $conf->{focr_bin_gifinter}\n" if ($haserr>0); |
|---|
| 540 | errorlog("$conf->{focr_bin_gifinter}: Returned [$retcode], skipping..."); |
|---|
| 541 | ++$imgerr if $conf->{focr_keep_bad_images}>0; next; |
|---|
| 542 | } |
|---|
| 543 | } |
|---|
| 544 | |
|---|
| 545 | printf RAWERR qq(## $conf->{focr_bin_giftopnm} $tfile >$pfile 2>>$efile\n) if ($haserr>0); |
|---|
| 546 | |
|---|
| 547 | $retcode = save_execute("$conf->{focr_bin_giftopnm} $tfile", undef, ">$pfile", ">>$efile"); |
|---|
| 548 | |
|---|
| 549 | if ($retcode<0) { |
|---|
| 550 | chomp $retcode; |
|---|
| 551 | printf RAWERR "?? Timed out > $retcode\n" if ($haserr>0); |
|---|
| 552 | errorlog("$conf->{focr_bin_giftopnm}: Timed out [$retcode], skipping..."); |
|---|
| 553 | ++$imgerr if $conf->{focr_keep_bad_images}>0; next; |
|---|
| 554 | } elsif ($retcode>0) { |
|---|
| 555 | chomp $retcode; |
|---|
| 556 | printf RAWERR "?? [$retcode] returned from $conf->{focr_bin_giftopnm}\n" if ($haserr>0); |
|---|
| 557 | errorlog("$conf->{focr_bin_giftopnm}: Returned [$retcode], skipping..."); |
|---|
| 558 | ++$imgerr if $conf->{focr_keep_bad_images}>0; next; |
|---|
| 559 | } |
|---|
| 560 | } |
|---|
| 561 | elsif ( $$pic{ftype} == 2 ) { |
|---|
| 562 | infolog("Found JPEG header name=\"$$pic{fname}\""); |
|---|
| 563 | if ($conf->{focr_skip_jpeg}) { |
|---|
| 564 | infolog("Skipping image check"); |
|---|
| 565 | next; |
|---|
| 566 | } |
|---|
| 567 | |
|---|
| 568 | if (defined($conf->{focr_max_size_jpeg}) and ($$pic{fsize} > $conf->{focr_max_size_jpeg})) { |
|---|
| 569 | infolog("JPEG file size ($$pic{fsize}) exceeds maximum file size for this format, skipping..."); |
|---|
| 570 | next; |
|---|
| 571 | } |
|---|
| 572 | if ( ($$pic{ctype} !~ /(jpeg|jpg)/i) and not $generic_ctype) { |
|---|
| 573 | wrong_ctype( "JPEG", $$pic{ctype} ); |
|---|
| 574 | $internal_score += $conf->{'focr_wrongctype_score'}; |
|---|
| 575 | } |
|---|
| 576 | |
|---|
| 577 | if ( $suffix and $suffix !~ /(jpeg|jpg|jfif)/i) { |
|---|
| 578 | wrong_extension( "JPEG", $suffix); |
|---|
| 579 | $internal_score += $conf->{'focr_wrongext_score'}; |
|---|
| 580 | } |
|---|
| 581 | |
|---|
| 582 | foreach my $a (qw/jpegtopnm/) { |
|---|
| 583 | unless (defined $conf->{"focr_bin_$a"}) { |
|---|
| 584 | errorlog("Cannot exec $a, skipping image"); |
|---|
| 585 | next; |
|---|
| 586 | } |
|---|
| 587 | } |
|---|
| 588 | printf RAWERR qq(## $conf->{focr_bin_jpegtopnm} $file >$pfile 2>>$efile\n) if ($haserr>0); |
|---|
| 589 | my $retcode = save_execute("$conf->{focr_bin_jpegtopnm} $file", undef, ">$pfile", ">>$efile"); |
|---|
| 590 | |
|---|
| 591 | if ($retcode<0) { |
|---|
| 592 | chomp $retcode; |
|---|
| 593 | printf RAWERR "?? Timed out > $retcode\n" if ($haserr>0); |
|---|
| 594 | errorlog("$conf->{focr_bin_jpegtopnm}: Timed out [$retcode], skipping..."); |
|---|
| 595 | ++$imgerr if $conf->{focr_keep_bad_images}>0; next; |
|---|
| 596 | } elsif ($retcode>0) { |
|---|
| 597 | chomp $retcode; |
|---|
| 598 | printf RAWERR "?? [$retcode] returned from $conf->{focr_bin_jpegtopnm}\n" if ($haserr>0); |
|---|
| 599 | errorlog("$conf->{focr_bin_jpegtopnm}: Returned [$retcode], skipping..."); |
|---|
| 600 | ++$imgerr if $conf->{focr_keep_bad_images}>0; next; |
|---|
| 601 | } |
|---|
| 602 | } |
|---|
| 603 | elsif ( $$pic{ftype} == 3 ) { |
|---|
| 604 | infolog("Found PNG header name=\"$$pic{fname}\""); |
|---|
| 605 | if ($conf->{focr_skip_png}) { |
|---|
| 606 | infolog("Skipping image check"); |
|---|
| 607 | next; |
|---|
| 608 | } |
|---|
| 609 | if (defined($conf->{focr_max_size_png}) and ($$pic{fsize} > $conf->{focr_max_size_png})) { |
|---|
| 610 | infolog("PNG file size ($$pic{fsize}) exceeds maximum file size for this format, skipping..."); |
|---|
| 611 | next; |
|---|
| 612 | } |
|---|
| 613 | if ( ($$pic{ctype} !~ /png/i) and not $generic_ctype) { |
|---|
| 614 | wrong_ctype( "PNG", $$pic{ctype} ); |
|---|
| 615 | $internal_score += $conf->{'focr_wrongctype_score'}; |
|---|
| 616 | } |
|---|
| 617 | if ( $suffix and $suffix !~ /(png)/i) { |
|---|
| 618 | wrong_extension( "PNG", $suffix); |
|---|
| 619 | $internal_score += $conf->{'focr_wrongext_score'}; |
|---|
| 620 | } |
|---|
| 621 | foreach my $a (qw/pngtopnm/) { |
|---|
| 622 | unless (defined $conf->{"focr_bin_$a"}) { |
|---|
| 623 | errorlog("Cannot exec $a, skipping image"); |
|---|
| 624 | next; |
|---|
| 625 | } |
|---|
| 626 | } |
|---|
| 627 | |
|---|
| 628 | printf RAWERR qq(## $conf->{focr_bin_pngtopnm} $file >$pfile 2>>$efile\n) if ($haserr>0); |
|---|
| 629 | my $retcode = save_execute("$conf->{focr_bin_pngtopnm} $file", undef, ">$pfile", ">>$efile"); |
|---|
| 630 | |
|---|
| 631 | if ($retcode<0) { |
|---|
| 632 | chomp $retcode; |
|---|
| 633 | printf RAWERR "?? Timed out > $retcode\n" if ($haserr>0); |
|---|
| 634 | errorlog("$conf->{focr_bin_pngtopnm}: Timed out [$retcode], skipping..."); |
|---|
| 635 | ++$imgerr if $conf->{focr_keep_bad_images}>0; next; |
|---|
| 636 | } elsif ($retcode>0) { |
|---|
| 637 | chomp $retcode; |
|---|
| 638 | printf RAWERR "?? [$retcode] returned from $conf->{focr_bin_pngtopnm}\n" if ($haserr>0); |
|---|
| 639 | errorlog("$conf->{focr_bin_pngtopnm}: Returned [$retcode], skipping..."); |
|---|
| 640 | ++$imgerr if $conf->{focr_keep_bad_images}>0; next; |
|---|
| 641 | } |
|---|
| 642 | } |
|---|
| 643 | elsif ( $$pic{ftype} == 4 ) { |
|---|
| 644 | infolog("Found BMP header name=\"$$pic{fname}\""); |
|---|
| 645 | if ($conf->{focr_skip_bmp}) { |
|---|
| 646 | infolog("Skipping image check"); |
|---|
| 647 | next; |
|---|
| 648 | } |
|---|
| 649 | if (defined($conf->{focr_max_size_bmp}) and ($$pic{fsize} > $conf->{focr_max_size_bmp})) { |
|---|
| 650 | infolog("BMP file size ($$pic{fsize}) exceeds maximum file size for this format, skipping..."); |
|---|
| 651 | next; |
|---|
| 652 | } |
|---|
| 653 | if ( ($$pic{ctype} !~ /bmp/i) and not $generic_ctype) { |
|---|
| 654 | wrong_ctype( "BMP", $$pic{ctype} ); |
|---|
| 655 | $internal_score += $conf->{'focr_wrongctype_score'}; |
|---|
| 656 | } |
|---|
| 657 | if ( $suffix and $suffix !~ /(bmp)/i) { |
|---|
| 658 | wrong_extension( "BMP", $suffix); |
|---|
| 659 | $internal_score += $conf->{'focr_wrongext_score'}; |
|---|
| 660 | } |
|---|
| 661 | foreach my $a (qw/bmptopnm/) { |
|---|
| 662 | unless (defined $conf->{"focr_bin_$a"}) { |
|---|
| 663 | errorlog("Cannot exec $a, skipping image"); |
|---|
| 664 | next; |
|---|
| 665 | } |
|---|
| 666 | } |
|---|
| 667 | printf RAWERR qq(## $conf->{focr_bin_bmptopnm} $file >$pfile 2>>$efile\n) if ($haserr>0); |
|---|
| 668 | |
|---|
| 669 | my $retcode = save_execute("$conf->{focr_bin_bmptopnm} $file", undef, ">$pfile", ">>$efile"); |
|---|
| 670 | if ($retcode<0) { |
|---|
| 671 | chomp $retcode; |
|---|
| 672 | printf RAWERR "?? Timed out > $retcode\n" if ($haserr>0); |
|---|
| 673 | errorlog("$conf->{focr_bin_bmptopnm}: Timed out [$retcode], skipping..."); |
|---|
| 674 | ++$imgerr if $conf->{focr_keep_bad_images}>0; next; |
|---|
| 675 | } elsif ($retcode>0) { |
|---|
| 676 | chomp $retcode; |
|---|
| 677 | printf RAWERR "?? [$retcode] returned from $conf->{focr_bin_bmptopnm}\n" if ($haserr>0); |
|---|
| 678 | errorlog("$conf->{focr_bin_bmptopnm}: Returned [$retcode], skipping..."); |
|---|
| 679 | ++$imgerr if $conf->{focr_keep_bad_images}>0; next; |
|---|
| 680 | } |
|---|
| 681 | } |
|---|
| 682 | elsif ( $$pic{ftype} == 5 ) { |
|---|
| 683 | infolog("Found TIFF header name=\"$$pic{fname}\""); |
|---|
| 684 | if ($conf->{focr_skip_tiff}) { |
|---|
| 685 | infolog("Skipping image check"); |
|---|
| 686 | next; |
|---|
| 687 | } |
|---|
| 688 | if (defined($conf->{focr_max_size_tiff}) and ($$pic{fsize} > $conf->{focr_max_size_tiff})) { |
|---|
| 689 | infolog("TIFF file size ($$pic{fsize}) exceeds maximum file size for this format, skipping..."); |
|---|
| 690 | next; |
|---|
| 691 | } |
|---|
| 692 | if ( ($$pic{ctype} !~ /tif/i) and not $generic_ctype) { |
|---|
| 693 | wrong_ctype( "TIFF", $$pic{ctype} ); |
|---|
| 694 | $internal_score += $conf->{'focr_wrongctype_score'}; |
|---|
| 695 | } |
|---|
| 696 | if ( $suffix and $suffix !~ /tif/i) { |
|---|
| 697 | wrong_extension( "TIFF", $suffix); |
|---|
| 698 | $internal_score += $conf->{'focr_wrongext_score'}; |
|---|
| 699 | } |
|---|
| 700 | |
|---|
| 701 | foreach my $a (qw/tifftopnm/) { |
|---|
| 702 | unless (defined $conf->{"focr_bin_$a"}) { |
|---|
| 703 | errorlog("Cannot exec $a, skipping image"); |
|---|
| 704 | next; |
|---|
| 705 | } |
|---|
| 706 | } |
|---|
| 707 | printf RAWERR qq(## $conf->{focr_bin_tifftopnm} $file >$pfile 2>>$efile\n) if ($haserr>0); |
|---|
| 708 | my $retcode = save_execute("$conf->{focr_bin_tifftopnm} $file", undef, ">$pfile", ">>$efile"); |
|---|
| 709 | |
|---|
| 710 | if ($retcode<0) { |
|---|
| 711 | chomp $retcode; |
|---|
| 712 | printf RAWERR "?? Timed out > $retcode\n" if ($haserr>0); |
|---|
| 713 | errorlog("$conf->{focr_bin_tifftopnm}: Timed out [$retcode], skipping..."); |
|---|
| 714 | ++$imgerr if $conf->{focr_keep_bad_images}>0; next; |
|---|
| 715 | } elsif ($retcode>0) { |
|---|
| 716 | chomp $retcode; |
|---|
| 717 | printf RAWERR "?? [$retcode] returned from $conf->{focr_bin_tifftopnm}\n" if ($haserr>0); |
|---|
| 718 | errorlog("$conf->{focr_bin_tifftopnm}: Returned [$retcode], skipping..."); |
|---|
| 719 | ++$imgerr if $conf->{focr_keep_bad_images}>0; next; |
|---|
| 720 | } |
|---|
| 721 | } elsif ($$pic{ftype} == 6) { |
|---|
| 722 | infolog("Found PDF header name=\"$$pic{fname}\""); |
|---|
| 723 | |
|---|
| 724 | my $missing_bin = 0; |
|---|
| 725 | foreach my $a (qw/pdftops pstopnm pdfinfo/) { |
|---|
| 726 | unless (defined $conf->{"focr_bin_$a"}) { |
|---|
| 727 | $missing_bin = 1; |
|---|
| 728 | errorlog("Cannot exec $a, skipping image"); |
|---|
| 729 | next; |
|---|
| 730 | } |
|---|
| 731 | } |
|---|
| 732 | |
|---|
| 733 | if ($missing_bin) { |
|---|
| 734 | next; |
|---|
| 735 | } |
|---|
| 736 | |
|---|
| 737 | my @stderr_data; |
|---|
| 738 | my ($retcode, @stdout_data) = save_execute( |
|---|
| 739 | "$conf->{focr_bin_pdfinfo} $file", |
|---|
| 740 | undef, |
|---|
| 741 | ">$imgdir/pdfinfo.info", |
|---|
| 742 | ">>$imgdir/pdfinfo.err", 1); |
|---|
| 743 | |
|---|
| 744 | foreach (@stdout_data) { |
|---|
| 745 | if ($_ =~ /^Pages:\s*([0-9]+)/) { |
|---|
| 746 | $$pic{pages} = $1; |
|---|
| 747 | } |
|---|
| 748 | } |
|---|
| 749 | |
|---|
| 750 | unless ($$pic{pages}) { |
|---|
| 751 | infolog("Can't determine page count of PDF Document\n"); |
|---|
| 752 | } |
|---|
| 753 | |
|---|
| 754 | if ($$pic{pages} > $conf->{focr_pdf_maxpages}) { |
|---|
| 755 | infolog("PDF has too many pages, skipping this file...\n"); |
|---|
| 756 | next; |
|---|
| 757 | } |
|---|
| 758 | |
|---|
| 759 | if ( ($$pic{ctype} !~ /pdf/i) and not $generic_ctype) { |
|---|
| 760 | wrong_ctype( "Application/PDF", $$pic{ctype} ); |
|---|
| 761 | $internal_score += $conf->{'focr_wrongctype_score'}; |
|---|
| 762 | } |
|---|
| 763 | |
|---|
| 764 | $retcode = save_execute("$conf->{focr_bin_pdftops} $file -", undef, ">$file.ps", ">>$efile"); |
|---|
| 765 | |
|---|
| 766 | if ($retcode<0) { |
|---|
| 767 | chomp $retcode; |
|---|
| 768 | printf RAWERR "?? Timed out > $retcode\n" if ($haserr>0); |
|---|
| 769 | errorlog("$conf->{focr_bin_pdftops}: Timed out [$retcode], skipping..."); |
|---|
| 770 | ++$imgerr if $conf->{focr_keep_bad_images}>0; next; |
|---|
| 771 | } elsif ($retcode>0) { |
|---|
| 772 | chomp $retcode; |
|---|
| 773 | printf RAWERR "?? [$retcode] returned from $conf->{focr_bin_pdftops}\n" if ($haserr>0); |
|---|
| 774 | errorlog("$conf->{focr_bin_pdftops}: Returned [$retcode], skipping..."); |
|---|
| 775 | ++$imgerr if $conf->{focr_keep_bad_images}>0; next; |
|---|
| 776 | } |
|---|
| 777 | |
|---|
| 778 | $retcode = save_execute("$conf->{focr_bin_pstopnm} -stdout -xsize=1000 $file.ps", undef, ">$pfile", ">>$efile"); |
|---|
| 779 | |
|---|
| 780 | if ($retcode<0) { |
|---|
| 781 | chomp $retcode; |
|---|
| 782 | printf RAWERR "?? Timed out > $retcode\n" if ($haserr>0); |
|---|
| 783 | errorlog("$conf->{focr_bin_pstopnm}: Timed out [$retcode], skipping..."); |
|---|
| 784 | ++$imgerr if $conf->{focr_keep_bad_images}>0; next; |
|---|
| 785 | } elsif ($retcode>0) { |
|---|
| 786 | chomp $retcode; |
|---|
| 787 | printf RAWERR "?? [$retcode] returned from $conf->{focr_bin_pstopnm}\n" if ($haserr>0); |
|---|
| 788 | errorlog("$conf->{focr_bin_pstopnm}: Returned [$retcode], skipping..."); |
|---|
| 789 | ++$imgerr if $conf->{focr_keep_bad_images}>0; next; |
|---|
| 790 | } |
|---|
| 791 | } |
|---|
| 792 | else { |
|---|
| 793 | errorlog("Image type not recognized, unknown format. Skipping this image..."); |
|---|
| 794 | next; |
|---|
| 795 | } |
|---|
| 796 | |
|---|
| 797 | if($conf->{focr_enable_image_hashing}) { |
|---|
| 798 | infolog("Calculating image hash for: $pfile"); |
|---|
| 799 | ($corrupt, $digest) = calc_image_hash($pfile,$pic); |
|---|
| 800 | if ($corrupt) { |
|---|
| 801 | infolog("Error calculating the image hash, skipping hash check..."); |
|---|
| 802 | } else { |
|---|
| 803 | my ($score, $dinfo, $whash); |
|---|
| 804 | $whash = $conf->{focr_enable_image_hashing} == 3 |
|---|
| 805 | ? $conf->{focr_mysql_hash} |
|---|
| 806 | : $conf->{focr_db_hash}; |
|---|
| 807 | ($score,$dinfo) = check_image_hash_db($digest, $whash, $$pic{fname}, $$pic{ctype}, $$pic{ftype}); |
|---|
| 808 | if ($score > 0) { |
|---|
| 809 | known_img_hash($score,$dinfo); |
|---|
| 810 | infolog("Message is SPAM. $dinfo") if ($conf->{focr_enable_image_hashing} < 3); |
|---|
| 811 | removedirs(get_all_tmpdirs()); |
|---|
| 812 | return 0; |
|---|
| 813 | } |
|---|
| 814 | $whash = $conf->{focr_enable_image_hashing} == 3 |
|---|
| 815 | ? $conf->{focr_mysql_safe} |
|---|
| 816 | : $conf->{focr_db_safe}; |
|---|
| 817 | ($score,$dinfo) = check_image_hash_db($digest, $whash, $$pic{fname}, $$pic{ctype}, $$pic{ftype}); |
|---|
| 818 | if ($score > 0) { |
|---|
| 819 | infolog("Image in KNOWN_GOOD. Skipping OCR checks..."); |
|---|
| 820 | next; |
|---|
| 821 | } |
|---|
| 822 | } |
|---|
| 823 | if ($digest eq '') { |
|---|
| 824 | infolog("Empty Hash, skipping..."); |
|---|
| 825 | next; |
|---|
| 826 | } |
|---|
| 827 | } else { |
|---|
| 828 | infolog("Image hashing disabled in configuration, skipping..."); |
|---|
| 829 | } |
|---|
| 830 | |
|---|
| 831 | # Note: $current_score is here the score that the message had at the beginning |
|---|
| 832 | # and $score is the autodisable_score defined in the config |
|---|
| 833 | # $internal_score describes the score that the message got by FuzzyOcr so far. |
|---|
| 834 | if ($internal_score + $current_score > $score) { |
|---|
| 835 | my $total = $internal_score + $current_score; |
|---|
| 836 | warnlog("FuzzyOcr stopped, message got $internal_score points by other FuzzyOcr tests ($total>$score)."); |
|---|
| 837 | #infolog("OCR canceled, message got already more than $score points ($total) by other FuzzyOcr tests."); |
|---|
| 838 | return 0; |
|---|
| 839 | } |
|---|
| 840 | |
|---|
| 841 | my @ocr_results = (); |
|---|
| 842 | my $scansets = get_scansets(); |
|---|
| 843 | my $newlist = ''; |
|---|
| 844 | foreach my $s (@$scansets) { |
|---|
| 845 | $newlist .= ' ' . $s->{label} . '(' . $s->{hit_counter} . ')'; |
|---|
| 846 | } |
|---|
| 847 | infolog("Scanset Order:$newlist"); |
|---|
| 848 | my $mcnt = 0; |
|---|
| 849 | my $modus = 0; |
|---|
| 850 | my $modus_match = 0; |
|---|
| 851 | my $wref = get_wordlist(); |
|---|
| 852 | my %words = %$wref; |
|---|
| 853 | |
|---|
| 854 | foreach my $scanset (@$scansets) { |
|---|
| 855 | my $scanlabel = $scanset->{label}; |
|---|
| 856 | my $scancmd = $scanset->{command}; |
|---|
| 857 | if ($scancmd =~ m/^\$/) { |
|---|
| 858 | warnlog("Skipping $scanlabel, invalid command '$scancmd'"); |
|---|
| 859 | next; |
|---|
| 860 | } |
|---|
| 861 | if (($$pic{ftype} != 6) and ($scancmd =~ m/ocrad/) and |
|---|
| 862 | ($$pic{width} < 16 or $$pic{height} < 16)) { |
|---|
| 863 | warnlog("Skipping $scanlabel, image too small"); |
|---|
| 864 | next; |
|---|
| 865 | } |
|---|
| 866 | my $cmcnt = 0; |
|---|
| 867 | my @cfound; |
|---|
| 868 | if (defined $scanset->{args}) { |
|---|
| 869 | $scancmd .= ' ' . $scanset->{args}; |
|---|
| 870 | } |
|---|
| 871 | printf RAWERR qq(## $scancmd\n) if ($haserr>0); |
|---|
| 872 | my ($retcode, @result) = $scanset->run($pfile); |
|---|
| 873 | if ($retcode<0) { |
|---|
| 874 | if ($retcode == -1) { |
|---|
| 875 | printf RAWERR qq(Timeout[$conf->{focr_timeout}]: $scancmd\n) if ($haserr>0); |
|---|
| 876 | errorlog("Timeout[$scanlabel]: \"$scancmd\" took more than $conf->{focr_timeout} sec."); |
|---|
| 877 | } elsif ($retcode == -2) { |
|---|
| 878 | printf RAWERR qq(Cannot exec[$scanlabel]: $scancmd\n) if ($haserr>0); |
|---|
| 879 | errorlog("Cannot execute($scanlabel): \"$scancmd\""); |
|---|
| 880 | } else { |
|---|
| 881 | printf RAWERR qq(Unknown error <$retcode>: $scancmd\n) if ($haserr>0); |
|---|
| 882 | errorlog("Unknown error: [$retcode]..."); |
|---|
| 883 | } |
|---|
| 884 | infolog("Skipping scanset, trying next..."); |
|---|
| 885 | next; |
|---|
| 886 | } elsif ($retcode>0) { |
|---|
| 887 | chomp $retcode; |
|---|
| 888 | my $errstr = "Return code: $retcode, Error: "; |
|---|
| 889 | $errstr .= join( '', @result ); |
|---|
| 890 | warnlog("Errors in Scanset \"$scanlabel\""); |
|---|
| 891 | warnlog($errstr); |
|---|
| 892 | warnlog("Skipping scanset because of errors, trying next..."); |
|---|
| 893 | printf RAWERR qq($errstr\n) if ($haserr>0); |
|---|
| 894 | next; |
|---|
| 895 | } |
|---|
| 896 | |
|---|
| 897 | debuglog("ocrdata=>>".join("",@result)."<<=end"); |
|---|
| 898 | foreach $modus (0 .. 1) { |
|---|
| 899 | $cmcnt = 0; |
|---|
| 900 | @cfound = (); |
|---|
| 901 | foreach my $ww (keys %words) { |
|---|
| 902 | my $w = lc $ww; |
|---|
| 903 | $w =~ s/[^a-z0-9 ]//g; |
|---|
| 904 | if ($modus) { |
|---|
| 905 | $w =~ s/ //g; |
|---|
| 906 | } |
|---|
| 907 | if ($conf->{focr_strip_numbers}) { |
|---|
| 908 | $w =~ s/[0-9]//g; |
|---|
| 909 | } |
|---|
| 910 | my $wcnt = 0; |
|---|
| 911 | foreach (@result) { |
|---|
| 912 | $_ = lc; |
|---|
| 913 | if ($modus) { |
|---|
| 914 | s/ //g; |
|---|
| 915 | } |
|---|
| 916 | if ($conf->{focr_strip_numbers}) { |
|---|
| 917 | tr/!;|(0815/iiicoals/; |
|---|
| 918 | s/[0-9]//g; |
|---|
| 919 | } else { |
|---|
| 920 | tr/!;|(/iiic/; |
|---|
| 921 | } |
|---|
| 922 | s/[^a-z0-9 ]//g; |
|---|
| 923 | my $matched = abs(adistr( $w, $_ )); |
|---|
| 924 | if ( $matched < $words{$ww} ) { |
|---|
| 925 | $wcnt++; |
|---|
| 926 | infolog( |
|---|
| 927 | "Scanset \"$scanlabel\" found word \"$w\" with fuzz of " |
|---|
| 928 | . sprintf("%0.4f",$matched) |
|---|
| 929 | . "\nline: \"$_\"" |
|---|
| 930 | ); |
|---|
| 931 | if ($conf->{focr_unique_matches}) { |
|---|
| 932 | last; |
|---|
| 933 | } |
|---|
| 934 | } |
|---|
| 935 | } |
|---|
| 936 | $cmcnt += $wcnt; |
|---|
| 937 | if ( ( $conf->{focr_verbose} > 0 ) and ($wcnt) ) { |
|---|
| 938 | push( @cfound, "\"$w\" in $wcnt lines" ); |
|---|
| 939 | } |
|---|
| 940 | } |
|---|
| 941 | $mcnt = max($mcnt, $cmcnt); |
|---|
| 942 | if ($mcnt == $cmcnt) { |
|---|
| 943 | @found = @cfound; |
|---|
| 944 | } |
|---|
| 945 | if ((not $modus) and ($cmcnt >= $conf->{focr_counts_required})) { |
|---|
| 946 | if ($mcnt == $cmcnt) { |
|---|
| 947 | $modus_match = 0; |
|---|
| 948 | } |
|---|
| 949 | debuglog("Enough OCR Hits without space stripping, skipping second matching pass..."); |
|---|
| 950 | last; |
|---|
| 951 | } elsif (not $modus) { |
|---|
| 952 | debuglog("Not enough OCR Hits without space stripping, doing second matching pass..."); |
|---|
| 953 | if ($mcnt == $cmcnt) { |
|---|
| 954 | $modus_match = 1; |
|---|
| 955 | } |
|---|
| 956 | } |
|---|
| 957 | } |
|---|
| 958 | if ($mcnt >= $conf->{focr_counts_required} and $conf->{focr_minimal_scanset}) { |
|---|
| 959 | infolog("Scanset \"$scanlabel\" generates enough hits ($mcnt), skipping further scansets..."); |
|---|
| 960 | if ($conf->{focr_autosort_scanset}) { |
|---|
| 961 | foreach my $s (@$scansets) { |
|---|
| 962 | if ($s->{label} eq $scanlabel) { |
|---|
| 963 | if ($s->{hit_counter} < $conf->{focr_autosort_buffer}) { |
|---|
| 964 | $s->{hit_counter} = $s->{hit_counter} + 1; |
|---|
| 965 | } |
|---|
| 966 | } else { |
|---|
| 967 | if ($s->{hit_counter} > 0) { |
|---|
| 968 | $s->{hit_counter} = $s->{hit_counter} - 1; |
|---|
| 969 | } |
|---|
| 970 | } |
|---|
| 971 | } |
|---|
| 972 | |
|---|
| 973 | } |
|---|
| 974 | last; |
|---|
| 975 | } |
|---|
| 976 | } |
|---|
| 977 | if ($conf->{focr_enable_image_hashing}) { |
|---|
| 978 | my $info = join('::',$mcnt,$$pic{fname},$$pic{ctype},$$pic{ftype},$digest); |
|---|
| 979 | push(@hashes, $info); |
|---|
| 980 | } |
|---|
| 981 | |
|---|
| 982 | # Normal match or match without spaces? |
|---|
| 983 | if ($modus_match) { |
|---|
| 984 | $cnt += $mcnt; |
|---|
| 985 | } else { |
|---|
| 986 | $cnt += $conf->{focr_twopass_scoring_factor} * $mcnt; |
|---|
| 987 | } |
|---|
| 988 | } |
|---|
| 989 | close RAWERR if ($haserr>0); |
|---|
| 990 | |
|---|
| 991 | if ($cnt == 0) { |
|---|
| 992 | if ($conf->{focr_enable_image_hashing} > 1 and @hashes) { |
|---|
| 993 | infolog("Message is ham, saving..."); |
|---|
| 994 | foreach my $h (@hashes) { |
|---|
| 995 | my ($mcnt,$fname,$ctype,$ftype,$digest) = split('::',$h,5); |
|---|
| 996 | next if $mcnt; |
|---|
| 997 | my $whash = $conf->{focr_enable_image_hashing} == 3 |
|---|
| 998 | ? $conf->{focr_mysql_safe} |
|---|
| 999 | : $conf->{focr_db_safe}; |
|---|
| 1000 | add_image_hash_db($digest,0,$whash,$fname,$ctype,$ftype); |
|---|
| 1001 | } |
|---|
| 1002 | } |
|---|
| 1003 | } else { |
|---|
| 1004 | my $score = '0.000'; |
|---|
| 1005 | my $debuginfo = ("Words found:\n".join( "\n", @found )."\n($cnt word occurrences found)" ); |
|---|
| 1006 | if ($cnt >= $conf->{focr_counts_required}) { |
|---|
| 1007 | $score = sprintf "%0.3f", $conf->{focr_base_score} + |
|---|
| 1008 | (( $cnt - $conf->{focr_counts_required} ) * $conf->{focr_add_score} ); |
|---|
| 1009 | infolog("Message is spam, score = $score"); |
|---|
| 1010 | } else { |
|---|
| 1011 | $score = sprintf("%0.3f", $conf->{focr_add_score} * $cnt) if $conf->{focr_score_ham}; |
|---|
| 1012 | infolog("Message is ham, score = $score"); |
|---|
| 1013 | } |
|---|
| 1014 | if ($conf->{focr_enable_image_hashing} and |
|---|
| 1015 | $conf->{focr_hashing_learn_scanned} and |
|---|
| 1016 | $score > 0) { |
|---|
| 1017 | foreach my $h (@hashes) { |
|---|
| 1018 | my ($mcnt,$fname,$ctype,$ftype,$digest) = split('::',$h,5); |
|---|
| 1019 | next unless $mcnt; |
|---|
| 1020 | my $whash = $conf->{focr_enable_image_hashing} == 3 |
|---|
| 1021 | ? $conf->{focr_mysql_hash} |
|---|
| 1022 | : $conf->{focr_db_hash}; |
|---|
| 1023 | add_image_hash_db($digest,$score,$whash,$fname,$ctype,$ftype,$debuginfo); |
|---|
| 1024 | } |
|---|
| 1025 | } |
|---|
| 1026 | if ( $conf->{focr_verbose} > 0 and $conf->{focr_verbose} < 3 ) { |
|---|
| 1027 | infolog($debuginfo) unless ($conf->{focr_enable_image_hashing} == 3); |
|---|
| 1028 | } |
|---|
| 1029 | for my $set ( 0 .. 3 ) { |
|---|
| 1030 | $pms->{conf}->{scoreset}->[$set]->{"FUZZY_OCR"} = $score; |
|---|
| 1031 | } |
|---|
| 1032 | |
|---|
| 1033 | #$pms->test_log("Words found:"); |
|---|
| 1034 | |
|---|
| 1035 | #foreach (@found) { |
|---|
| 1036 | # $pms->test_log($_); |
|---|
| 1037 | # } |
|---|
| 1038 | |
|---|
| 1039 | # $pms->test_log("($cnt word occurrences found)"); |
|---|
| 1040 | my @dinfo = split('\n', $debuginfo); |
|---|
| 1041 | foreach (@dinfo) { |
|---|
| 1042 | $pms->test_log($_); |
|---|
| 1043 | } |
|---|
| 1044 | |
|---|
| 1045 | $pms->_handle_hit( "FUZZY_OCR", $score, "BODY: ", "BODY", |
|---|
| 1046 | $pms->{conf}->get_description_for_rule("FUZZY_OCR")); |
|---|
| 1047 | } |
|---|
| 1048 | if ($imgerr == 0 and $conf->{focr_keep_bad_images}<2) { |
|---|
| 1049 | removedirs(get_all_tmpdirs()); |
|---|
| 1050 | } |
|---|
| 1051 | if ($conf->{focr_enable_image_hashing} == 3) { |
|---|
| 1052 | if (defined $conf->{focr_mysql_ddb}) { |
|---|
| 1053 | $conf->{focr_mysql_ddb}->disconnect; |
|---|
| 1054 | } |
|---|
| 1055 | } |
|---|
| 1056 | debuglog("FuzzyOcr ending successfully..."); |
|---|
| 1057 | return 0; |
|---|
| 1058 | } |
|---|
| 1059 | |
|---|
| 1060 | 1; |
|---|
| 1061 | #vim: et ts=4 sw=4 |
|---|