| 228 | | } |
|---|
| | 229 | } elsif (substr($pdata,0,5) eq "\x25\x50\x44\x46\x2d") { |
|---|
| | 230 | my $version = substr($pdata,5,3); |
|---|
| | 231 | infolog("PDF: [version $version] $filename ($pdatalen)"); |
|---|
| | 232 | $imgfiles{$filename}{ftype} = 6; |
|---|
| | 233 | $imgfiles{$filename}{version} = $version; |
|---|
| | 234 | $imgfiles{$filename}{width} = 0; |
|---|
| | 235 | $imgfiles{$filename}{height} = 0; |
|---|
| | 236 | } |
|---|
| 236 | | |
|---|
| 237 | | #Skip images that cannot contain text |
|---|
| 238 | | if ($imgfiles{$filename}{height} < $conf->{focr_min_height}) { |
|---|
| 239 | | infolog("Skipping image: height < $conf->{focr_min_height}"); |
|---|
| 240 | | delete $imgfiles{$filename}; |
|---|
| 241 | | next; |
|---|
| 242 | | } |
|---|
| 243 | | |
|---|
| 244 | | #Skip images that cannot contain text |
|---|
| 245 | | if ($imgfiles{$filename}{width} < $conf->{focr_min_width}) { |
|---|
| 246 | | infolog("Skipping image: width < $conf->{focr_min_width}"); |
|---|
| 247 | | delete $imgfiles{$filename}; |
|---|
| 248 | | next; |
|---|
| 249 | | } |
|---|
| 250 | | |
|---|
| 251 | | #Skip too big images, screenshots etc |
|---|
| 252 | | if ($imgfiles{$filename}{height} > $conf->{focr_max_height}) { |
|---|
| 253 | | infolog("Skipping image: height > $conf->{focr_max_height}"); |
|---|
| 254 | | delete $imgfiles{$filename}; |
|---|
| 255 | | next; |
|---|
| 256 | | } |
|---|
| 257 | | |
|---|
| 258 | | #Skip too big images, screenshots etc |
|---|
| 259 | | if ($imgfiles{$filename}{width} > $conf->{focr_max_width}) { |
|---|
| 260 | | infolog("Skipping image: width > $conf->{focr_max_width}"); |
|---|
| 261 | | delete $imgfiles{$filename}; |
|---|
| 262 | | next; |
|---|
| 263 | | } |
|---|
| 264 | | |
|---|
| | 244 | if ($imgfiles{$filename}{ftype} == 6) { |
|---|
| | 245 | unless ($conf->{focr_scan_pdfs}) { |
|---|
| | 246 | infolog("Skipping PDF file: PDF Scanning was disabled in config"); |
|---|
| | 247 | next; |
|---|
| | 248 | } |
|---|
| | 249 | } else { |
|---|
| | 250 | #Skip images that cannot contain text |
|---|
| | 251 | if ($imgfiles{$filename}{height} < $conf->{focr_min_height}) { |
|---|
| | 252 | infolog("Skipping image: height < $conf->{focr_min_height}"); |
|---|
| | 253 | delete $imgfiles{$filename}; |
|---|
| | 254 | next; |
|---|
| | 255 | } |
|---|
| | 256 | |
|---|
| | 257 | #Skip images that cannot contain text |
|---|
| | 258 | if ($imgfiles{$filename}{width} < $conf->{focr_min_width}) { |
|---|
| | 259 | infolog("Skipping image: width < $conf->{focr_min_width}"); |
|---|
| | 260 | delete $imgfiles{$filename}; |
|---|
| | 261 | next; |
|---|
| | 262 | } |
|---|
| | 263 | |
|---|
| | 264 | #Skip too big images, screenshots etc |
|---|
| | 265 | if ($imgfiles{$filename}{height} > $conf->{focr_max_height}) { |
|---|
| | 266 | infolog("Skipping image: height > $conf->{focr_max_height}"); |
|---|
| | 267 | delete $imgfiles{$filename}; |
|---|
| | 268 | next; |
|---|
| | 269 | } |
|---|
| | 270 | |
|---|
| | 271 | #Skip too big images, screenshots etc |
|---|
| | 272 | if ($imgfiles{$filename}{width} > $conf->{focr_max_width}) { |
|---|
| | 273 | infolog("Skipping image: width > $conf->{focr_max_width}"); |
|---|
| | 274 | delete $imgfiles{$filename}; |
|---|
| | 275 | next; |
|---|
| | 276 | } |
|---|
| | 277 | } |
|---|
| 691 | | } |
|---|
| | 704 | } elsif ($$pic{ftype} == 6) { |
|---|
| | 705 | infolog("Found PDF header name=\"$$pic{fname}\""); |
|---|
| | 706 | |
|---|
| | 707 | my $missing_bin = 0; |
|---|
| | 708 | foreach my $a (qw/pdftops pstopnm pdfinfo/) { |
|---|
| | 709 | unless (defined $conf->{"focr_bin_$a"}) { |
|---|
| | 710 | $missing_bin = 1; |
|---|
| | 711 | errorlog("Cannot exec $a, skipping image"); |
|---|
| | 712 | next; |
|---|
| | 713 | } |
|---|
| | 714 | } |
|---|
| | 715 | |
|---|
| | 716 | if ($missing_bin) { |
|---|
| | 717 | next; |
|---|
| | 718 | } |
|---|
| | 719 | |
|---|
| | 720 | my @stderr_data; |
|---|
| | 721 | my ($retcode, @stdout_data) = save_execute( |
|---|
| | 722 | "$conf->{focr_bin_pdfinfo} $file", |
|---|
| | 723 | undef, |
|---|
| | 724 | ">$imgdir/pdfinfo.info", |
|---|
| | 725 | ">>$imgdir/pdfinfo.err", 1); |
|---|
| | 726 | |
|---|
| | 727 | foreach (@stdout_data) { |
|---|
| | 728 | if ($_ =~ /^Pages:\s*([0-9]+)/) { |
|---|
| | 729 | $$pic{pages} = $1; |
|---|
| | 730 | } |
|---|
| | 731 | } |
|---|
| | 732 | |
|---|
| | 733 | unless ($$pic{pages}) { |
|---|
| | 734 | infolog("Can't determine page count of PDF Document\n"); |
|---|
| | 735 | } |
|---|
| | 736 | |
|---|
| | 737 | if ($$pic{pages} > $conf->{focr_pdf_maxpages}) { |
|---|
| | 738 | infolog("PDF has too many pages, skipping this file...\n"); |
|---|
| | 739 | next; |
|---|
| | 740 | } |
|---|
| | 741 | |
|---|
| | 742 | if ( ($$pic{ctype} !~ /pdf/i) and not $generic_ctype) { |
|---|
| | 743 | wrong_ctype( "Application/PDF", $$pic{ctype} ); |
|---|
| | 744 | $internal_score += $conf->{'focr_wrongctype_score'}; |
|---|
| | 745 | } |
|---|
| | 746 | |
|---|
| | 747 | $retcode = save_execute("$conf->{focr_bin_pdftops} $file -", undef, ">$file.ps", ">>$efile"); |
|---|
| | 748 | |
|---|
| | 749 | if ($retcode<0) { |
|---|
| | 750 | chomp $retcode; |
|---|
| | 751 | printf RAWERR "?? Timed out > $retcode\n" if ($haserr>0); |
|---|
| | 752 | errorlog("$conf->{focr_bin_pdftops}: Timed out [$retcode], skipping..."); |
|---|
| | 753 | ++$imgerr if $conf->{focr_keep_bad_images}>0; next; |
|---|
| | 754 | } elsif ($retcode>0) { |
|---|
| | 755 | chomp $retcode; |
|---|
| | 756 | printf RAWERR "?? [$retcode] returned from $conf->{focr_bin_pdftops}\n" if ($haserr>0); |
|---|
| | 757 | errorlog("$conf->{focr_bin_pdftops}: Returned [$retcode], skipping..."); |
|---|
| | 758 | ++$imgerr if $conf->{focr_keep_bad_images}>0; next; |
|---|
| | 759 | } |
|---|
| | 760 | |
|---|
| | 761 | $retcode = save_execute("$conf->{focr_bin_pstopnm} -stdout -xsize=1000 $file.ps", undef, ">$pfile", ">>$efile"); |
|---|
| | 762 | |
|---|
| | 763 | if ($retcode<0) { |
|---|
| | 764 | chomp $retcode; |
|---|
| | 765 | printf RAWERR "?? Timed out > $retcode\n" if ($haserr>0); |
|---|
| | 766 | errorlog("$conf->{focr_bin_pstopnm}: Timed out [$retcode], skipping..."); |
|---|
| | 767 | ++$imgerr if $conf->{focr_keep_bad_images}>0; next; |
|---|
| | 768 | } elsif ($retcode>0) { |
|---|
| | 769 | chomp $retcode; |
|---|
| | 770 | printf RAWERR "?? [$retcode] returned from $conf->{focr_bin_pstopnm}\n" if ($haserr>0); |
|---|
| | 771 | errorlog("$conf->{focr_bin_pstopnm}: Returned [$retcode], skipping..."); |
|---|
| | 772 | ++$imgerr if $conf->{focr_keep_bad_images}>0; next; |
|---|
| | 773 | } |
|---|
| | 774 | } |
|---|