Changeset 60
- Timestamp:
- 27.11.2006 16:53:35 (2 years ago)
- Files:
-
- trunk/devel/FuzzyOcr.cf (modified) (1 diff)
- trunk/devel/FuzzyOcr.pm (modified) (4 diffs)
- trunk/devel/FuzzyOcr.preps (added)
- trunk/devel/FuzzyOcr.scansets (added)
- trunk/devel/FuzzyOcr/Config.pm (modified) (7 diffs)
- trunk/devel/FuzzyOcr/Misc.pm (modified) (1 diff)
- trunk/devel/FuzzyOcr/Preprocessor.pm (modified) (3 diffs)
- trunk/devel/FuzzyOcr/Scanset.pm (modified) (4 diffs)
Legend:
- Unmodified
- Added
- Removed
- Modified
- Copied
- Moved
trunk/devel/FuzzyOcr.cf
r58 r60 56 56 ############################################################################################ 57 57 58 ##### Scansets, comma seperated (Default value: $gocr -i -, $gocr -l 180 -d 2 -i -) ##### 59 # Each scanset consists of one or more commands which make text out of pnm input. 60 # Each scanset is run seperately on the PNM data, results are combined in scoring. 61 #focr_scansets $gocr -i $pfile, $gocr -l 180 -d 2 -i $pfile, $gocr -l 140 -d 2 -i $pfile 58 59 ##### Scansets ##### 62 60 # 63 # An example that involves ocrad as well 64 #focr_scansets $gocr -i $pfile, $gocr -l 180 -d 2 -i $pfile, $ocrad -s5 -T 0.5 $pfile 61 ##Paths to the files containing Scansets and Preprocessors used in Scansets. 65 62 # 66 # Another one for ocrad only67 #focr_scanset s $ocrad -s5 -T 0.5 $pfile63 #focr_preprocessor_file /etc/mail/spamassassin/FuzzyOcr.preps 64 #focr_scanset_file /etc/mail/spamassassin/FuzzyOcr.scansets 68 65 # 69 # To use only one scan with default values, uncomment the next line instead 70 #focr_scansets $gocr -i $pfile 71 # 72 # Some example for more advanced sets 73 # Thisone uses the first the standard scan, then a scanset which first reduces the image to 3 colors and then scans it with custom settings 74 # and then it scans again only with these custom settings 75 # NOTE: This is for advanced users only, if you have questions how to use this, ask on the ML or on IRC 76 #focr_scansets $gocr -i $pfile, pnmnorm $pfile 2>$efile | pnmquant 3 2>>$efile | pnmnorm 2>>$efile | $gocr -l 180 -d 2 -i -, $gocr -l 180 -d 2 -i $pfile 77 ######################################################################################### 66 ######### 67 68 #################### 78 69 79 70 ##### Various Score/Scan settings ##### trunk/devel/FuzzyOcr.pm
r59 r60 140 140 141 141 $imgdir = Mail::SpamAssassin::Util::secure_tmpdir(); 142 142 143 143 set_tmpdir($imgdir); 144 144 … … 602 602 my $scansets = get_scansets(); 603 603 foreach my $scanset (@$scansets) { 604 my $scan = $scanset; 605 $scan =~ s/\$gocr/$conf->{focr_bin_gocr}/; 606 $scan =~ s/\$ocrad/$conf->{focr_bin_ocrad}/; 607 $scan =~ s/\$pfile/$pfile/; 608 $scan =~ s/\$efile/$efile/g; 609 #unlink $efile if (-e $efile); 610 #debuglog("Trying: $scanset"); 611 printf RAWERR qq(## $scan 2>>$efile\n) if ($haserr>0); 612 613 my ($retcode, @ocrdata) = save_execute("$scan", undef, ">$imgdir/ocr.temp", ">>$efile",1); 604 my $scancmd = $scanset->{ocr_command}; 605 my $scanlabel = $scanset->{label}; 606 printf RAWERR qq(## $scancmd\n) if ($haserr>0); 607 my ($retcode, @result) = $scanset->run($pfile); 614 608 if ($retcode<0) { 615 debuglog("Timeout: \"$scan set\" took more than $conf->{focr_timeout} sec.");616 debuglog("Skipping scanset due to timeout, trying next...");617 printf RAWERR qq(Timeout[$conf->{focr_timeout}]: $scan set\n) if ($haserr>0);609 debuglog("Timeout: \"$scancmd\" took more than $conf->{focr_timeout} sec."); 610 debuglog("Skipping scanset \"$scanlabel\" due to timeout, trying next..."); 611 printf RAWERR qq(Timeout[$conf->{focr_timeout}]: $scancmd\n) if ($haserr>0); 618 612 next; 619 613 } elsif ($retcode>0) { 620 614 chomp $retcode; 621 open ERR,$efile; 622 my @stderr = <ERR>; 623 close ERR; 624 my $errstr = join( '', $retcode,@stderr ); 625 debuglog("Errors in \"$scanset\""); 615 my $errstr = join( '', $retcode,@result ); 616 debuglog("Errors in Scanset \"$scanlabel\""); 626 617 debuglog($errstr); 627 618 debuglog("Skipping scanset because of errors, trying next..."); … … 630 621 } 631 622 632 debuglog("ocrdata=>>".join("",@ ocrdata)."<<=end") if ($conf->{focr_verbose}>2);633 push( @ocr_results, [@ ocrdata] );623 debuglog("ocrdata=>>".join("",@result)."<<=end") if ($conf->{focr_verbose}>2); 624 push( @ocr_results, [@result] ); 634 625 push( @used_scansets, $scanset ); 635 626 } … … 654 645 "Found word \"$w\" in line\n \"$_\" \n with fuzz of " 655 646 . sprintf("%0.4f",$matched) 656 . " scanned with scanset $used_scansets[$gcnt]"647 . " scanned with scanset \"$used_scansets[$gcnt]->{label}\"" 657 648 ); 658 649 } trunk/devel/FuzzyOcr/Config.pm
r58 r60 1 1 use strict; 2 2 package FuzzyOcr::Config; 3 4 use FuzzyOcr::Scanset; 5 use FuzzyOcr::Preprocessor; 3 6 4 7 use base 'Exporter'; … … 13 16 get_timeout 14 17 get_scansets 18 get_preprocessor 15 19 get_thresholds 16 20 get_config … … 39 43 our %words = (); 40 44 our @scansets; 45 our @preprocessors; 41 46 our $conf; 42 47 our $pms; … … 128 133 } 129 134 135 sub get_preprocessor { 136 my ($label) = @_; 137 foreach (@preprocessors) { 138 if ($_->{label} eq $label) { 139 return $_; 140 } 141 } 142 return 0; 143 } 144 130 145 sub get_thresholds { 131 146 return \%Threshold; … … 352 367 353 368 push (@cmds, { 354 setting => 'focr_scansets', 355 default => '$gocr -i $pfile, $gocr -l 180 -d 2 -i $pfile, $ocrad -s5 -T 0.5 $pfile', 369 setting => 'focr_scanset_file', 370 default => '/etc/mail/spamassassin/FuzzyOcr.scansets', 371 type => $Mail::SpamAssassin::Conf::CONF_TYPE_STRING 372 }); 373 push (@cmds, { 374 setting => 'focr_preprocessor_file', 375 default => '/etc/mail/spamassassin/FuzzyOcr.preps', 356 376 type => $Mail::SpamAssassin::Conf::CONF_TYPE_STRING 357 377 }); … … 441 461 debuglog(" $k => ".$conf->{$k}); 442 462 } 443 # Use specified scansets 444 @scansets = split(/,\s*/, $conf->{'focr_scansets'}); 463 # Parse preprocessor file 464 my $pfile = $conf->{'focr_preprocessor_file'}; 465 (my $retcode, @preprocessors) = parse_preprocessors($pfile); 466 if ($retcode) { 467 debuglog("Error parsing preprocessor file \"$pfile\", aborting..."); 468 return 1; 469 } 470 foreach my $prep (@preprocessors) { 471 my $prepcmd = $prep->{command}; 472 my $preplabel = $prep->{label}; 473 debuglog("Loaded preprocessor $preplabel: $prepcmd"); 474 } 475 476 # Parse scanset file 477 my $sfile = $conf->{'focr_scanset_file'}; 478 ($retcode, @scansets) = parse_scansets($sfile); 479 if ($retcode) { 480 debuglog("Error parsing scanset file \"$sfile\", aborting..."); 481 return 1; 482 } 445 483 foreach my $scan (@scansets) { 446 if ($scan =~ m/(gocr|ocrad)/) { 447 next unless -x $conf->{"focr_bin_$1"}; 448 } 449 debuglog("Using scan: $scan"); 484 my $scancmd = $scan->{ocr_command}; 485 my $scanlabel = $scan->{label}; 486 debuglog("Using scan $scanlabel: $scancmd"); 450 487 } 451 488 … … 702 739 } 703 740 741 sub parse_scansets { 742 my ($file) = @_; 743 744 open(SFILE, "<$file"); 745 if ($?) { 746 debuglog("Failed to open scanset file \"$file\", aborting..."); 747 return 1; 748 } 749 750 our @slabels; 751 our @scansets; 752 my $scanset; 753 754 while(<SFILE>) { 755 # We are in the middle of a scanset 756 if(defined $scanset) { 757 # Strip comments and ignore blank lines 758 chomp($_); 759 $_ =~ s/(\s)*#(.*)//; 760 unless ($_) { 761 next; 762 } 763 if ($_ =~ /^(\s)*preprocessors(\s)*=(\s)*(.*)$/i) { 764 my $prep = $4; 765 $scanset->{preprocessors} = $prep; 766 $prep =~ s/ //g; 767 my @preps = split(',', $prep); 768 foreach (@preps) { 769 unless(get_preprocessor($_)) { 770 debuglog("Unknown preprocessor \"$_\" used in scansets line $., aborting..."); 771 return 1; 772 } 773 } 774 } elsif ($_ =~ /^(\s)*ocr_command(\s)*=(\s)*(.*)$/i) { 775 my $cmd = $4; 776 if ($cmd =~ /(<|>|\||;)/) { 777 debuglog("OCR Command may not contain \"< > | ;\", aborting..."); 778 return 1; 779 } 780 $scanset->{ocr_command} = $cmd; 781 } elsif ($_ =~ /^(\s)*force_output_in(\s)*=(\s)*(.*)$/i) { 782 $scanset->{force_output_in} = $4; 783 # Scanset is closing 784 } elsif ($_ =~ /^(\s)*\}/) { 785 unless ($scanset->{ocr_command}) { 786 my $l = $scanset->{label}; 787 debuglog("Scanset \"$l\" is missing ocr_command line, aborting..."); 788 return 1; 789 } 790 push(@scansets, $scanset); 791 $scanset = undef; 792 } else { 793 debuglog("Unknown token at line $., aborting..."); 794 return 1; 795 } 796 # Start a new scanset 797 } elsif ($_ =~ /^(\s)*scanset(\s)+(.+?)(\s)+\{$/i) { 798 if (grep $_ eq $3, @slabels) { 799 debuglog("Error, label already used earlier in line $., aborting..."); 800 return 1; 801 } 802 $scanset = FuzzyOcr::Scanset->new($3); 803 push(@slabels, $3); 804 } 805 } 806 close(SFILE); 807 return (0, @scansets); 808 } 809 810 sub parse_preprocessors { 811 my ($file) = @_; 812 813 open(PFILE, "<$file"); 814 if ($?) { 815 debuglog("Failed to open preprocessor file \"$file\", aborting..."); 816 return 1; 817 } 818 819 our @plabels; 820 our @preprocessors; 821 my $preprocessor; 822 823 while(<PFILE>) { 824 chomp($_); 825 $_ =~ s/(\s)*#(.*)//; 826 unless ($_) { 827 next; 828 } 829 # We are in the middle of a preprocessor 830 if(defined $preprocessor) { 831 if ($_ =~ /^(\s)*command(\s)*=(\s)*(.*)$/i) { 832 my $cmd = $4; 833 if ($cmd =~ /(<|>|\||;)/) { 834 debuglog("Preprocessor Command may not contain \"< > | ;\", aborting..."); 835 return 1; 836 } 837 $preprocessor->{command} = $cmd; 838 # Preprocessor is closing 839 } elsif ($_ =~ /^(\s)*\}/) { 840 unless ($preprocessor->{command}) { 841 my $l = $preprocessor->{label}; 842 debuglog("Preprocessor \"$l\" is missing command line, aborting..."); 843 return 1; 844 } 845 push(@preprocessors, $preprocessor); 846 $preprocessor = undef; 847 } else { 848 debuglog("Unknown token at line $., aborting..."); 849 return 1; 850 } 851 # Start a new preprocessor 852 } elsif ($_ =~ /^(\s)*preprocessor(\s)+(.+?)(\s)+\{$/i) { 853 if (grep $_ eq $3, @plabels) { 854 debuglog("Error, label already used earlier in line $., aborting..."); 855 return 1; 856 } 857 $preprocessor = FuzzyOcr::Preprocessor->new($3); 858 push(@plabels, $3); 859 } 860 } 861 close(PFILE); 862 return (0, @preprocessors); 863 } 864 704 865 1; trunk/devel/FuzzyOcr/Misc.pm
r59 r60 6 6 7 7 use lib "../"; 8 use FuzzyOcr::Config qw(set_pid unset_pid get_timeout get_pms get_config set_config debuglog logfile); ;8 use FuzzyOcr::Config qw(set_pid unset_pid get_timeout get_pms get_config set_config debuglog logfile); 9 9 use Time::HiRes qw( time usleep ualarm gettimeofday tv_interval ); 10 10 trunk/devel/FuzzyOcr/Preprocessor.pm
r59 r60 2 2 3 3 use lib "../"; 4 use FuzzyOcr::Config qw(get_config debuglog get_tmpdir); 5 use FuzzyOcr::Misc qw(save_execute); 4 use FuzzyOcr::Config; 6 5 7 6 sub new { 8 my $class = shift; 9 10 my $label = shift: 11 my $command = shift; 7 my ($class, $label, $command) = @_; 12 8 13 9 bless { … … 19 15 sub run { 20 16 my ($self, $input) = @_; 21 my $tmpdir = get_tmpdir();17 my $tmpdir = FuzzyOcr::Config::get_tmpdir(); 22 18 my $label = $self->{label}; 23 19 my $output = "$tmpdir/prep.$label.out"; … … 43 39 44 40 # Run processor 45 my $retcode = save_execute($rcmd, $stdin, $stdout, $stderr);41 my $retcode = FuzzyOcr::Misc::save_execute($rcmd, $stdin, $stdout, $stderr); 46 42 47 43 # Return code trunk/devel/FuzzyOcr/Scanset.pm
r59 r60 2 2 3 3 use lib "../"; 4 use FuzzyOcr::Config qw(get_config debuglog get_tmpdir); 5 use FuzzyOcr::Misc qw(save_execute); 4 use FuzzyOcr::Config; 6 5 7 6 sub new { 8 my $class = shift; 9 10 my $label = shift; 11 my $preprocessors = shift; 12 my $ocr_command = shift; 7 my ($class, $label, $preprocessors, $ocr_command, $output_in) = @_; 13 8 14 9 bless { 15 10 "label" => $label, 16 11 "preprocessors" => $preprocessors, 17 "ocr_command" => $ocr_command 12 "ocr_command" => $ocr_command, 13 "force_output_in" => $output_in 18 14 }, $class; 19 15 } … … 21 17 sub run { 22 18 my ($self, $input) = @_; 23 my $tmpdir = get_tmpdir(); 19 my $conf = FuzzyOcr::Config::get_config(); 20 my $tmpdir = FuzzyOcr::Config::get_tmpdir(); 24 21 my $label = $self->{label}; 25 22 my $output = "$tmpdir/scanset.$label.out"; … … 32 29 # First, run all preprocessors 33 30 my $preprocessors = $self->{preprocessors}; 34 $preprocessors =~ s/ //g; 35 my @prep = split(',', $preprocessors); 36 foreach (@prep) { 37 my $proc = get_preprocessor($_); 38 my $label = $proc->{label}; 39 my $retcode = $proc->run($input); 40 if ($retcode) { 41 #TODO, error here 31 if ($preprocessors) { 32 $preprocessors =~ s/ //g; 33 my @prep = split(',', $preprocessors); 34 foreach (@prep) { 35 my $proc = FuzzyOcr::Config::get_preprocessor($_); 36 my $label = $proc->{label}; 37 my $retcode = $proc->run($input); 38 if ($retcode) { 39 #TODO, error here 40 } 41 # Input of next processor is output of last 42 $input = "$tmpdir/prep.$label.out"; 42 43 } 43 # Input of next processor is output of last44 $input = "$tmpdir/prep.$label.out";45 44 } 46 45 … … 60 59 } 61 60 61 #Replace supported scanner macros by full path 62 $rcmd =~ s/\$gocr/$conf->{focr_bin_gocr}/; 63 $rcmd =~ s/\$ocrad/$conf->{focr_bin_ocrad}/; 64 62 65 # Run scanner 63 my ($retcode, @result) = save_execute($rcmd, $stdin, $stdout, $stderr, 1); 66 my $out_in = $self->{force_output_in}; 67 my $retcode; 68 my @result; 64 69 70 # Scanset enforces OCR output in file $out_in (for example TesserAct has multiple files as output) 71 if ($out_in) { 72 $retcode = FuzzyOcr::Misc::save_execute($rcmd, $stdin, $stdout, $stderr); 73 open(INFILE, "<$out_in"); 74 @result = <INFILE>; 75 close(INFILE); 76 } else { 77 ($retcode, @result) = FuzzyOcr::Misc::save_execute($rcmd, $stdin, $stdout, $stderr, 1); 78 } 79 80 # If there were errors in the scan, return the errors instead of OCR results 81 if ($retcode>0) { 82 $stderr =~ tr/>|</ /; 83 open(INFILE, "<$stderr"); 84 @result = <INFILE>; 85 close(INFILE); 86 } 65 87 # Return scanner results and return code 88 66 89 return ($retcode, @result); 67 90 }
