Changeset 64

Show
Ignore:
Timestamp:
28.11.2006 00:11:21 (2 years ago)
Author:
decoder
Message:

Implemented new configuration option focr_minimal_scanset
If this is set to 1, the first scanset that produces at least focr_counts_required hits is used and all remaining scansets are skipped:

Ex: [14412] info: FuzzyOcr: Scanset "ocrad-invert" generates enough hits (3), skipping further scansets...

This is a trade-off: fewer resources are used, but the resulting scores may be lower.

Default is 0 (off).

(Note: I had to rewrite the matching routine because matching used to run only after all scans had finished; it now runs inside the scanset
loop.)

Files:

Legend:

Unmodified
Added
Removed
Modified
Copied
Moved
  • trunk/devel/FuzzyOcr.pm

    r60 r64  
    601601        my @ocr_results = (); 
    602602        my $scansets = get_scansets(); 
     603        my $mcnt = 0; 
     604        my $wref = get_wordlist(); 
     605        my %words = %$wref; 
    603606        foreach my $scanset (@$scansets) { 
     607            my $cmcnt = 0; 
     608            my @cfound; 
    604609            my $scancmd = $scanset->{ocr_command}; 
    605610            my $scanlabel = $scanset->{label}; 
     
    622627 
    623628            debuglog("ocrdata=>>".join("",@result)."<<=end") if ($conf->{focr_verbose}>2); 
    624             push( @ocr_results, [@result] ); 
    625             push( @used_scansets, $scanset ); 
    626         } 
    627         my $mcnt = 0; 
    628         my $wref = get_wordlist(); 
    629         my %words = %$wref; 
    630         foreach my $ww (keys %words) { 
    631             my $w = lc $ww; 
    632             $w =~ s/[^a-z]//g; 
    633             my $wcnt = 0; 
    634             my $gcnt = 0; 
    635             foreach my $ocr_set (@ocr_results) { 
    636                 my $cwcnt = 0; 
    637                 foreach (@$ocr_set) { 
     629 
     630            foreach my $ww (keys %words) { 
     631                my $w = lc $ww; 
     632                $w =~ s/[^a-z]//g; 
     633                my $wcnt = 0; 
     634                foreach (@result) { 
    638635                    tr/!;|081/iiioal/; 
    639636                    s/[^a-zA-Z]//g; 
     
    641638                    my $matched = abs(adistr( $w, $_ )); 
    642639                    if ( $matched < $words{$ww} ) { 
    643                         $cwcnt++; 
     640                        $wcnt++; 
    644641                        debuglog( 
    645642                            "Found word \"$w\" in line\n \"$_\" \n with fuzz of " 
    646643                            . sprintf("%0.4f",$matched) 
    647                             . " scanned with scanset \"$used_scansets[$gcnt]->{label}\"" 
     644                            . " scanned with scanset \"$scanlabel\"" 
    648645                        ); 
    649646                    } 
    650647                } 
    651                 $wcnt = max( $wcnt, $cwcnt ); 
    652                 $gcnt++; 
    653             } 
    654             $cnt  += $wcnt; 
    655             $mcnt += $wcnt; 
    656             if ( ( $conf->{focr_verbose} > 0 ) and ($wcnt) ) { 
    657                 push( @found, "\"$w\" in $wcnt lines" ); 
    658             } 
     648                $cmcnt += $wcnt; 
     649                if ( ( $conf->{focr_verbose} > 0 ) and ($wcnt) ) { 
     650                    push( @cfound, "\"$w\" in $wcnt lines" ); 
     651                } 
     652            } 
     653            $mcnt = max($mcnt, $cmcnt); 
     654            if ($mcnt == $cmcnt) { 
     655                @found = @cfound; 
     656            } 
     657 
     658            if ($mcnt >= $conf->{focr_counts_required} and $conf->{focr_minimal_scanset}) { 
     659                debuglog("Scanset \"$scanlabel\" generates enough hits ($mcnt), skipping further scansets..."); 
     660                last; 
     661            } 
     662            #push( @ocr_results, [@result] ); 
     663            #push( @used_scansets, $scanset ); 
    659664        } 
    660665        if ($conf->{focr_enable_image_hashing}) { 
     
    662667            push(@hashes, $info); 
    663668        } 
     669        $cnt += $mcnt; 
    664670    } 
    665671    close RAWERR if ($haserr>0); 
    666      
     672 
    667673    if ($cnt == 0) { 
    668674        if ($conf->{focr_enable_image_hashing} > 1 and @hashes) {