Changeset 81
- Timestamp:
- 03.12.2006 00:03:49 (2 years ago)
- Files:
-
- trunk/devel/FuzzyOcr.cf (modified) (2 diffs)
- trunk/devel/FuzzyOcr.pm (modified) (4 diffs)
- trunk/devel/FuzzyOcr/Config.pm (modified) (1 diff)
- trunk/devel/FuzzyOcr/Preprocessor.pm (modified) (2 diffs)
Legend:
- Unmodified
- Added
- Removed
- Modified
- Copied
- Moved
trunk/devel/FuzzyOcr.cf
r80 r81 54 54 #focr_personal_wordlist fuzzyocr.words 55 55 # 56 ## Optionally, disable this option if you want to scan for numbers 57 ## Setting this to 0 will cause FuzzyOcr not to strip numbers from 58 ## both the wordlist and the OCR results 59 # 60 #focr_strip_numbers 1 61 56 62 57 63 ### … … 188 194 focr_add_score 0.375 189 195 196 # This option defines the factor which is multiplied with the number 197 # of matches that were made without stripping spaces. FuzzyOcr does two 198 # matching attempts on OCR results, one without space stripping and one with. 199 # To weight the first match type more, this factor is applied. 200 # Default value: 1.5 201 #focr_twopass_scoring_factor 1.5 202 190 203 # This is the score to give for a wrong content-type. 191 204 # e.g. JPEG image but content type says GIF trunk/devel/FuzzyOcr.pm
r80 r81 657 657 my $scansets = get_scansets(); 658 658 my $mcnt = 0; 659 my $modus = 0; 660 my $modus_match = 0; 659 661 my $wref = get_wordlist(); 660 662 my %words = %$wref; … … 688 690 } elsif ($retcode>0) { 689 691 chomp $retcode; 690 my $errstr = join( '', $retcode,@result ); 692 my $errstr = "Return code: $retcode, Error: "; 693 my $errstr .= join( '', @result ); 691 694 infolog("Errors in Scanset \"$scanlabel\""); 692 695 infolog($errstr); … … 697 700 698 701 debuglog("ocrdata=>>".join("",@result)."<<=end"); 699 700 foreach my $ww (keys %words) { 701 my $w = lc $ww; 702 $w =~ s/[^a-z]//g; 703 my $wcnt = 0; 704 foreach (@result) { 705 tr/!;|081/iiioal/; 706 s/[^a-zA-Z]//g; 707 $_ = lc; 708 my $matched = abs(adistr( $w, $_ )); 709 if ( $matched < $words{$ww} ) { 710 $wcnt++; 711 infolog( 712 "Found word \"$w\" in line\n \"$_\" \n with fuzz of " 713 . sprintf("%0.4f",$matched) 714 . " scanned with scanset \"$scanlabel\"" 715 ); 716 } 717 } 718 $cmcnt += $wcnt; 719 if ( ( $conf->{focr_verbose} > 0 ) and ($wcnt) ) { 720 push( @cfound, "\"$w\" in $wcnt lines" ); 721 } 722 } 723 $mcnt = max($mcnt, $cmcnt); 724 if ($mcnt == $cmcnt) { 725 @found = @cfound; 726 } 727 702 foreach $modus (0 .. 1) { 703 foreach my $ww (keys %words) { 704 my $w = lc $ww; 705 $w =~ s/[^a-z0-9 ]//g; 706 if ($modus) { 707 $w =~ s/ //g; 708 } 709 if ($conf->{focr_strip_numbers}) { 710 $w =~ s/[0-9]//g; 711 } 712 my $wcnt = 0; 713 foreach (@result) { 714 $_ = lc; 715 if ($modus) { 716 s/ //g; 717 } 718 if ($conf->{focr_strip_numbers}) { 719 tr/!;|081/iiioal/; 720 s/[0-9]//g; 721 } else { 722 tr/!;|/iii/; 723 } 724 s/[^a-z0-9 ]//g; 725 my $matched = abs(adistr( $w, $_ )); 726 if ( $matched < $words{$ww} ) { 727 $wcnt++; 728 infolog( 729 "Found word \"$w\" in line\n \"$_\" \n with fuzz of " 730 . sprintf("%0.4f",$matched) 731 . 
" scanned with scanset \"$scanlabel\"" 732 ); 733 } 734 } 735 $cmcnt += $wcnt; 736 if ( ( $conf->{focr_verbose} > 0 ) and ($wcnt) ) { 737 push( @cfound, "\"$w\" in $wcnt lines" ); 738 } 739 } 740 $mcnt = max($mcnt, $cmcnt); 741 if ($mcnt == $cmcnt) { 742 @found = @cfound; 743 } 744 if ((not $modus) and ($cmcnt >= $conf->{focr_counts_required})) { 745 if ($mcnt == $cmcnt) { 746 $modus_match = 0; 747 } 748 infolog("Enough OCR Hits without space stripping, skipping second matching pass..."); 749 last; 750 } elsif (not $modus) { 751 infolog("Not enough OCR Hits without space stripping, doing second matching pass..."); 752 if ($mcnt == $cmcnt) { 753 $modus_match = 1; 754 } 755 } 756 } 728 757 if ($mcnt >= $conf->{focr_counts_required} and $conf->{focr_minimal_scanset}) { 729 758 infolog("Scanset \"$scanlabel\" generates enough hits ($mcnt), skipping further scansets..."); … … 750 779 push(@hashes, $info); 751 780 } 752 $cnt += $mcnt; 781 782 # Normal match or match without spaces? 783 if ($modus_match) { 784 $cnt += $mcnt; 785 } else { 786 $cnt += $conf->{focr_twopass_scoring_factor} * $mcnt; 787 } 753 788 } 754 789 close RAWERR if ($haserr>0); trunk/devel/FuzzyOcr/Config.pm
r80 r81 295 295 $self->{focr_keep_bad_images} = $value+0; 296 296 } 297 }); 298 299 push (@cmds, { 300 setting => 'focr_strip_numbers', 301 default => 1, 302 type => $Mail::SpamAssassin::Conf::CONF_TYPE_NUMERIC 303 }); 304 305 push (@cmds, { 306 setting => 'focr_twopass_scoring_factor', 307 default => 1.5, 308 type => $Mail::SpamAssassin::Conf::CONF_TYPE_NUMERIC 297 309 }); 298 310 trunk/devel/FuzzyOcr/Preprocessor.pm
r70 r81 23 23 my $rcmd = $self->{command}; 24 24 25 if (defined $args) { 26 $rcmd .= ' ' . $args; 27 } 28 25 29 # Does the processor expect input from file or from stdin? 26 30 if(defined $args and $args =~ /\$input/) { 27 $rcmd .= " $input";31 $rcmd =~ s/\$input/$input/; 28 32 } else { 29 33 $stdin = "<$input"; … … 32 36 # Does it output to file or to stdout? 33 37 if(defined $args and $args =~ /\$output/) { 34 $rcmd .= " $output";38 $rcmd =~ s/\$output/$output/; 35 39 } else { 36 40 $stdout = ">$output";
