Changeset 60

Show
Ignore:
Timestamp:
27.11.2006 16:53:35 (2 years ago)
Author:
decoder
Message:

New Scanset and Preprocessor System

Works at first sight but still needs some work and lots of testing
Error handling at some points still missing

Files:

Legend:

Unmodified
Added
Removed
Modified
Copied
Moved
  • trunk/devel/FuzzyOcr.cf

    r58 r60  
    5656############################################################################################ 
    5757 
    58 ##### Scansets, comma separated (Default value: $gocr -i -, $gocr -l 180 -d 2 -i -) ##### 
    59 # Each scanset consists of one or more commands which make text out of pnm input. 
    60 # Each scanset is run separately on the PNM data; the results are combined in scoring. 
    61 #focr_scansets $gocr -i $pfile, $gocr -l 180 -d 2 -i $pfile, $gocr -l 140 -d 2 -i $pfile 
     58 
     59##### Scansets ##### 
    6260# 
    63 # An example that involves ocrad as well 
    64 #focr_scansets $gocr -i $pfile, $gocr -l 180 -d 2 -i $pfile, $ocrad -s5 -T 0.5 $pfile 
     61##Paths to the files containing Scansets and Preprocessors used in Scansets. 
    6562# 
    66 # Another one for ocrad only 
    67 #focr_scansets $ocrad -s5 -T 0.5 $pfile 
     63#focr_preprocessor_file /etc/mail/spamassassin/FuzzyOcr.preps 
     64#focr_scanset_file /etc/mail/spamassassin/FuzzyOcr.scansets 
    6865# 
    69 # To use only one scan with default values, uncomment the next line instead 
    70 #focr_scansets $gocr -i $pfile 
    71 
    72 # Some example for more advanced sets 
    73 # This one first uses the standard scan, then a scanset which first reduces the image to 3 colors and then scans it with custom settings 
    74 # and then it scans again only with these custom settings 
    75 # NOTE: This is for advanced users only, if you have questions how to use this, ask on the ML or on IRC 
    76 #focr_scansets $gocr -i $pfile, pnmnorm $pfile 2>$efile | pnmquant 3 2>>$efile | pnmnorm 2>>$efile | $gocr -l 180 -d 2 -i -, $gocr -l 180 -d 2 -i $pfile 
    77 ######################################################################################### 
     66######### 
     67 
     68#################### 
    7869 
    7970##### Various Score/Scan settings ##### 
  • trunk/devel/FuzzyOcr.pm

    r59 r60  
    140140 
    141141        $imgdir = Mail::SpamAssassin::Util::secure_tmpdir(); 
    142          
     142 
    143143        set_tmpdir($imgdir); 
    144144 
     
    602602        my $scansets = get_scansets(); 
    603603        foreach my $scanset (@$scansets) { 
    604             my $scan = $scanset; 
    605             $scan =~ s/\$gocr/$conf->{focr_bin_gocr}/; 
    606             $scan =~ s/\$ocrad/$conf->{focr_bin_ocrad}/; 
    607             $scan =~ s/\$pfile/$pfile/; 
    608             $scan =~ s/\$efile/$efile/g; 
    609             #unlink $efile if (-e $efile); 
    610             #debuglog("Trying: $scanset"); 
    611             printf RAWERR qq(## $scan 2>>$efile\n) if ($haserr>0); 
    612  
    613             my ($retcode, @ocrdata) = save_execute("$scan", undef, ">$imgdir/ocr.temp", ">>$efile",1); 
     604            my $scancmd = $scanset->{ocr_command}; 
     605            my $scanlabel = $scanset->{label}; 
     606            printf RAWERR qq(## $scancmd\n) if ($haserr>0); 
     607            my ($retcode, @result) = $scanset->run($pfile); 
    614608            if ($retcode<0) { 
    615                 debuglog("Timeout: \"$scanset\" took more than $conf->{focr_timeout} sec."); 
    616                 debuglog("Skipping scanset due to timeout, trying next..."); 
    617                 printf RAWERR qq(Timeout[$conf->{focr_timeout}]: $scanset\n) if ($haserr>0); 
     609                debuglog("Timeout: \"$scancmd\" took more than $conf->{focr_timeout} sec."); 
     610                debuglog("Skipping scanset \"$scanlabel\" due to timeout, trying next..."); 
     611                printf RAWERR qq(Timeout[$conf->{focr_timeout}]: $scancmd\n) if ($haserr>0); 
    618612                next; 
    619613            } elsif ($retcode>0) { 
    620614                chomp $retcode; 
    621                 open ERR,$efile; 
    622                 my @stderr = <ERR>; 
    623                 close ERR; 
    624                 my $errstr = join( '', $retcode,@stderr ); 
    625                 debuglog("Errors in \"$scanset\""); 
     615                my $errstr = join( '', $retcode,@result ); 
     616                debuglog("Errors in Scanset \"$scanlabel\""); 
    626617                debuglog($errstr); 
    627618                debuglog("Skipping scanset because of errors, trying next..."); 
     
    630621            } 
    631622 
    632             debuglog("ocrdata=>>".join("",@ocrdata)."<<=end") if ($conf->{focr_verbose}>2); 
    633             push( @ocr_results, [@ocrdata] ); 
     623            debuglog("ocrdata=>>".join("",@result)."<<=end") if ($conf->{focr_verbose}>2); 
     624            push( @ocr_results, [@result] ); 
    634625            push( @used_scansets, $scanset ); 
    635626        } 
     
    654645                            "Found word \"$w\" in line\n \"$_\" \n with fuzz of " 
    655646                            . sprintf("%0.4f",$matched) 
    656                             . " scanned with scanset $used_scansets[$gcnt]
     647                            . " scanned with scanset \"$used_scansets[$gcnt]->{label}\"
    657648                        ); 
    658649                    } 
  • trunk/devel/FuzzyOcr/Config.pm

    r58 r60  
    11use strict; 
    22package FuzzyOcr::Config; 
     3 
     4use FuzzyOcr::Scanset; 
     5use FuzzyOcr::Preprocessor; 
    36 
    47use base 'Exporter'; 
     
    1316    get_timeout 
    1417    get_scansets  
     18    get_preprocessor  
    1519    get_thresholds  
    1620    get_config  
     
    3943our %words = (); 
    4044our @scansets; 
     45our @preprocessors; 
    4146our $conf; 
    4247our $pms; 
     
    128133} 
    129134 
     135sub get_preprocessor { 
     136    my ($label) = @_; 
     137    foreach (@preprocessors) { 
     138        if ($_->{label} eq $label) { 
     139            return $_; 
     140        } 
     141    } 
     142    return 0; 
     143} 
     144 
    130145sub get_thresholds { 
    131146    return \%Threshold; 
     
    352367 
    353368    push (@cmds, { 
    354         setting => 'focr_scansets', 
    355         default => '$gocr -i $pfile, $gocr -l 180 -d 2 -i $pfile, $ocrad -s5 -T 0.5 $pfile', 
     369        setting => 'focr_scanset_file', 
     370        default => '/etc/mail/spamassassin/FuzzyOcr.scansets', 
     371        type => $Mail::SpamAssassin::Conf::CONF_TYPE_STRING 
     372        }); 
     373    push (@cmds, { 
     374        setting => 'focr_preprocessor_file', 
     375        default => '/etc/mail/spamassassin/FuzzyOcr.preps', 
    356376        type => $Mail::SpamAssassin::Conf::CONF_TYPE_STRING 
    357377        }); 
     
    441461        debuglog(" $k => ".$conf->{$k}); 
    442462    } 
    443     # Use specified scansets 
    444     @scansets = split(/,\s*/, $conf->{'focr_scansets'}); 
     463    # Parse preprocessor file 
     464    my $pfile = $conf->{'focr_preprocessor_file'}; 
     465    (my $retcode, @preprocessors) = parse_preprocessors($pfile); 
     466    if ($retcode) { 
     467        debuglog("Error parsing preprocessor file \"$pfile\", aborting..."); 
     468        return 1; 
     469    } 
     470    foreach my $prep (@preprocessors) { 
     471        my $prepcmd = $prep->{command}; 
     472        my $preplabel = $prep->{label}; 
     473        debuglog("Loaded preprocessor $preplabel: $prepcmd"); 
     474    } 
     475 
     476    # Parse scanset file 
     477    my $sfile = $conf->{'focr_scanset_file'}; 
     478    ($retcode, @scansets) = parse_scansets($sfile); 
     479    if ($retcode) { 
     480        debuglog("Error parsing scanset file \"$sfile\", aborting..."); 
     481        return 1; 
     482    } 
    445483    foreach my $scan (@scansets) { 
    446         if ($scan =~ m/(gocr|ocrad)/) { 
    447             next unless -x $conf->{"focr_bin_$1"}; 
    448         } 
    449         debuglog("Using scan: $scan"); 
     484        my $scancmd = $scan->{ocr_command}; 
     485        my $scanlabel = $scan->{label}; 
     486        debuglog("Using scan $scanlabel: $scancmd"); 
    450487    } 
    451488 
     
    702739} 
    703740 
     741sub parse_scansets { 
     742    my ($file) = @_; 
     743 
     744    open(SFILE, "<$file"); 
     745    if ($?) { 
     746        debuglog("Failed to open scanset file \"$file\", aborting..."); 
     747        return 1; 
     748    } 
     749 
     750    our @slabels; 
     751    our @scansets; 
     752    my $scanset; 
     753 
     754    while(<SFILE>) { 
     755        # We are in the middle of a scanset 
     756        if(defined $scanset) { 
     757            # Strip comments and ignore blank lines 
     758            chomp($_); 
     759            $_ =~ s/(\s)*#(.*)//; 
     760            unless ($_) { 
     761                next; 
     762            } 
     763            if ($_ =~ /^(\s)*preprocessors(\s)*=(\s)*(.*)$/i) { 
     764                my $prep = $4; 
     765                $scanset->{preprocessors} = $prep; 
     766                $prep =~ s/ //g; 
     767                my @preps = split(',', $prep); 
     768                foreach (@preps) { 
     769                    unless(get_preprocessor($_)) { 
     770                        debuglog("Unknown preprocessor \"$_\" used in scansets line $., aborting..."); 
     771                        return 1; 
     772                    } 
     773                } 
     774            } elsif ($_ =~ /^(\s)*ocr_command(\s)*=(\s)*(.*)$/i) { 
     775                my $cmd = $4; 
     776                if ($cmd =~ /(<|>|\||;)/) { 
     777                    debuglog("OCR Command may not contain \"< > | ;\", aborting..."); 
     778                    return 1; 
     779                } 
     780                $scanset->{ocr_command} = $cmd; 
     781            } elsif ($_ =~ /^(\s)*force_output_in(\s)*=(\s)*(.*)$/i) { 
     782                $scanset->{force_output_in} = $4; 
     783            # Scanset is closing 
     784            } elsif ($_ =~ /^(\s)*\}/) { 
     785                unless ($scanset->{ocr_command}) { 
     786                    my $l = $scanset->{label}; 
     787                    debuglog("Scanset \"$l\" is missing ocr_command line, aborting..."); 
     788                    return 1; 
     789                } 
     790                push(@scansets, $scanset); 
     791                $scanset = undef; 
     792            } else { 
     793                debuglog("Unknown token at line $., aborting..."); 
     794                return 1; 
     795            } 
     796        # Start a new scanset 
     797        } elsif ($_ =~ /^(\s)*scanset(\s)+(.+?)(\s)+\{$/i) { 
     798            if (grep $_ eq $3, @slabels) { 
     799                debuglog("Error, label already used earlier in line $., aborting..."); 
     800                return 1; 
     801            } 
     802            $scanset = FuzzyOcr::Scanset->new($3); 
     803            push(@slabels, $3); 
     804        } 
     805    } 
     806    close(SFILE); 
     807    return (0, @scansets); 
     808} 
     809 
     810sub parse_preprocessors { 
     811    my ($file) = @_; 
     812 
     813    open(PFILE, "<$file"); 
     814    if ($?) { 
     815        debuglog("Failed to open preprocessor file \"$file\", aborting..."); 
     816        return 1; 
     817    } 
     818 
     819    our @plabels; 
     820    our @preprocessors; 
     821    my $preprocessor; 
     822 
     823    while(<PFILE>) { 
     824        chomp($_); 
     825        $_ =~ s/(\s)*#(.*)//; 
     826        unless ($_) { 
     827            next; 
     828        } 
     829        # We are in the middle of a preprocessor 
     830        if(defined $preprocessor) { 
     831            if ($_ =~ /^(\s)*command(\s)*=(\s)*(.*)$/i) { 
     832                my $cmd = $4; 
     833                if ($cmd =~ /(<|>|\||;)/) { 
     834                    debuglog("Preprocessor Command may not contain \"< > | ;\", aborting..."); 
     835                    return 1; 
     836                } 
     837                $preprocessor->{command} = $cmd; 
     838            # Preprocessor is closing 
     839            } elsif ($_ =~ /^(\s)*\}/) { 
     840                unless ($preprocessor->{command}) { 
     841                    my $l = $preprocessor->{label}; 
     842                    debuglog("Preprocessor \"$l\" is missing command line, aborting..."); 
     843                    return 1; 
     844                } 
     845                push(@preprocessors, $preprocessor); 
     846                $preprocessor = undef; 
     847            } else { 
     848                debuglog("Unknown token at line $., aborting..."); 
     849                return 1; 
     850            } 
     851        # Start a new preprocessor 
     852        } elsif ($_ =~ /^(\s)*preprocessor(\s)+(.+?)(\s)+\{$/i) { 
     853            if (grep $_ eq $3, @plabels) { 
     854                debuglog("Error, label already used earlier in line $., aborting..."); 
     855                return 1; 
     856            } 
     857            $preprocessor = FuzzyOcr::Preprocessor->new($3); 
     858            push(@plabels, $3); 
     859        } 
     860    } 
     861    close(PFILE); 
     862    return (0, @preprocessors); 
     863} 
     864 
    7048651; 
  • trunk/devel/FuzzyOcr/Misc.pm

    r59 r60  
    66 
    77use lib "../"; 
    8 use FuzzyOcr::Config qw(set_pid unset_pid get_timeout get_pms get_config set_config debuglog logfile);; 
     8use FuzzyOcr::Config qw(set_pid unset_pid get_timeout get_pms get_config set_config debuglog logfile); 
    99use Time::HiRes qw( time usleep ualarm gettimeofday tv_interval ); 
    1010 
  • trunk/devel/FuzzyOcr/Preprocessor.pm

    r59 r60  
    22 
    33use lib "../"; 
    4 use FuzzyOcr::Config qw(get_config debuglog get_tmpdir); 
    5 use FuzzyOcr::Misc qw(save_execute); 
     4use FuzzyOcr::Config; 
    65 
    76sub new { 
    8     my $class = shift; 
    9  
    10     my $label = shift: 
    11     my $command = shift; 
     7    my ($class, $label, $command) = @_; 
    128 
    139    bless { 
     
    1915sub run { 
    2016    my ($self, $input) = @_; 
    21     my $tmpdir = get_tmpdir(); 
     17    my $tmpdir = FuzzyOcr::Config::get_tmpdir(); 
    2218    my $label = $self->{label}; 
    2319    my $output = "$tmpdir/prep.$label.out"; 
     
    4339 
    4440    # Run processor 
    45     my $retcode = save_execute($rcmd, $stdin, $stdout, $stderr); 
     41    my $retcode = FuzzyOcr::Misc::save_execute($rcmd, $stdin, $stdout, $stderr); 
    4642 
    4743    # Return code 
  • trunk/devel/FuzzyOcr/Scanset.pm

    r59 r60  
    22 
    33use lib "../"; 
    4 use FuzzyOcr::Config qw(get_config debuglog get_tmpdir); 
    5 use FuzzyOcr::Misc qw(save_execute); 
     4use FuzzyOcr::Config; 
    65 
    76sub new { 
    8     my $class = shift; 
    9  
    10     my $label = shift; 
    11     my $preprocessors = shift; 
    12     my $ocr_command = shift; 
     7    my ($class, $label, $preprocessors, $ocr_command, $output_in) = @_; 
    138 
    149    bless { 
    1510        "label"         => $label, 
    1611        "preprocessors" => $preprocessors, 
    17         "ocr_command"   => $ocr_command 
     12        "ocr_command"   => $ocr_command, 
     13        "force_output_in" => $output_in 
    1814    }, $class; 
    1915} 
     
    2117sub run { 
    2218    my ($self, $input) = @_; 
    23     my $tmpdir = get_tmpdir(); 
     19    my $conf = FuzzyOcr::Config::get_config(); 
     20    my $tmpdir = FuzzyOcr::Config::get_tmpdir(); 
    2421    my $label = $self->{label}; 
    2522    my $output = "$tmpdir/scanset.$label.out"; 
     
    3229    # First, run all preprocessors 
    3330    my $preprocessors = $self->{preprocessors}; 
    34     $preprocessors =~ s/ //g; 
    35     my @prep = split(',', $preprocessors); 
    36     foreach (@prep) { 
    37         my $proc = get_preprocessor($_); 
    38         my $label = $proc->{label}; 
    39         my $retcode = $proc->run($input); 
    40         if ($retcode) { 
    41             #TODO, error here 
     31        if ($preprocessors) { 
     32        $preprocessors =~ s/ //g; 
     33        my @prep = split(',', $preprocessors); 
     34        foreach (@prep) { 
     35            my $proc = FuzzyOcr::Config::get_preprocessor($_); 
     36            my $label = $proc->{label}; 
     37            my $retcode = $proc->run($input); 
     38            if ($retcode) { 
     39                #TODO, error here 
     40            } 
     41            # Input of next processor is output of last 
     42            $input = "$tmpdir/prep.$label.out"; 
    4243        } 
    43         # Input of next processor is output of last 
    44         $input = "$tmpdir/prep.$label.out"; 
    4544    } 
    4645 
     
    6059    } 
    6160 
     61    #Replace supported scanner macros by full path 
     62    $rcmd =~ s/\$gocr/$conf->{focr_bin_gocr}/; 
     63    $rcmd =~ s/\$ocrad/$conf->{focr_bin_ocrad}/; 
     64 
    6265    # Run scanner 
    63     my ($retcode, @result) = save_execute($rcmd, $stdin, $stdout, $stderr, 1); 
     66    my $out_in = $self->{force_output_in}; 
     67    my $retcode; 
     68    my @result; 
    6469 
     70    # Scanset enforces OCR output in file $out_in (for example TesserAct has multiple files as output) 
     71    if ($out_in) { 
     72        $retcode = FuzzyOcr::Misc::save_execute($rcmd, $stdin, $stdout, $stderr); 
     73        open(INFILE, "<$out_in"); 
     74        @result = <INFILE>; 
     75        close(INFILE); 
     76    } else { 
     77        ($retcode, @result) = FuzzyOcr::Misc::save_execute($rcmd, $stdin, $stdout, $stderr, 1); 
     78    } 
     79 
     80    # If there were errors in the scan, return the errors instead of OCR results 
     81    if ($retcode>0) { 
     82        $stderr =~ tr/>|</   /; 
     83        open(INFILE, "<$stderr"); 
     84        @result = <INFILE>; 
     85        close(INFILE); 
     86    } 
    6587    # Return scanner results and return code 
     88 
    6689    return ($retcode, @result); 
    6790}