Changeset 80

Show
Ignore:
Timestamp:
02.12.2006 21:39:04 (2 years ago)
Author:
decoder
Message:

Content-Type application/octet-stream is now considered generic (always accepted, i.e. it never triggers FUZZY_OCR_WRONG_CTYPE)
New rule FUZZY_OCR_WRONG_EXTENSION, which hits files with fake extensions (e.g. a GIF file named blah.jpeg)

+ Scoring config switch for this rule (focr_wrongext_score, default 1.5)

This is an improved and generalized version of Rosenbaum's patch

Files:

Legend:

Unmodified
Added
Removed
Modified
Copied
Moved
  • trunk/devel/FuzzyOcr.cf

    r79 r80  
    44loadplugin FuzzyOcr FuzzyOcr.pm 
    55 
    6 body     FUZZY_OCR               eval:fuzzyocr_check() 
    7 body     FUZZY_OCR_WRONG_CTYPE   eval:dummy_check() 
    8 body     FUZZY_OCR_CORRUPT_IMG   eval:dummy_check() 
    9 body     FUZZY_OCR_KNOWN_HASH    eval:dummy_check() 
    10  
    11 describe FUZZY_OCR               Mail contains an image with common spam text inside 
    12 describe FUZZY_OCR_WRONG_CTYPE   Mail contains an image with wrong content-type set 
    13 describe FUZZY_OCR_CORRUPT_IMG   Mail contains a corrupted image 
    14 describe FUZZY_OCR_KNOWN_HASH    Mail contains an image with known hash 
     6body     FUZZY_OCR                   eval:fuzzyocr_check() 
     7body     FUZZY_OCR_WRONG_CTYPE       eval:dummy_check() 
     8body     FUZZY_OCR_CORRUPT_IMG       eval:dummy_check() 
     9body     FUZZY_OCR_WRONG_EXTENSION   eval:dummy_check() 
     10body     FUZZY_OCR_KNOWN_HASH        eval:dummy_check() 
     11 
     12describe FUZZY_OCR                   Mail contains an image with common spam text inside 
     13describe FUZZY_OCR_WRONG_CTYPE       Mail contains an image with wrong content-type set 
     14describe FUZZY_OCR_WRONG_EXTENSION   Mail contains an image with wrong file extension 
     15describe FUZZY_OCR_CORRUPT_IMG       Mail contains a corrupted image 
     16describe FUZZY_OCR_KNOWN_HASH        Mail contains an image with known hash 
    1517 
    1618priority FUZZY_OCR 900 
     
    191193#focr_wrongctype_score 1.5 
    192194 
     195# This is the score to give for a wrong file extension. 
     196# e.g. JPEG image but file extension says GIF 
     197# Default value: 1.5 
     198#focr_wrongext_score 1.5 
     199 
    193200# This is the score to give for a corrupted image. 
    194201# This currently affects only GIF images 
  • trunk/devel/FuzzyOcr.pm

    r75 r80  
    3939use FuzzyOcr::Hashing qw(check_image_hash_db add_image_hash_db calc_image_hash); 
    4040use FuzzyOcr::Deanimate qw(deanimate); 
    41 use FuzzyOcr::Scoring qw(wrong_ctype corrupt_img known_img_hash); 
     41use FuzzyOcr::Scoring qw(wrong_ctype wrong_extension corrupt_img known_img_hash); 
    4242use FuzzyOcr::Misc qw(max removedir save_execute); 
    4343 
     
    137137        my $test  = 0; 
    138138        $test++ if ($ctype =~ /image/i); 
    139         $test++ if ($fname =~ /(gif|jpg|jpeg|png|bmp|tiff)$/i); 
     139        $test++ if ($fname =~ /(gif|jpg|jpeg|png|bmp|tiff?)$/i); 
    140140 
    141141        if ($test == 0) { 
     
    177177        my $pdatalen = length($pdata); 
    178178        my $w = 0; my $h = 0; 
     179 
    179180        if ( substr($pdata,0,3) eq "\x47\x49\x46" ) { 
    180181            ## GIF File 
     
    297298        my @used_scansets = (); 
    298299        my $corrupt = 0; 
     300        my $suffix = 0; 
     301        my $generic_ctype = 0; 
    299302        my $digest; 
    300303        my $tfile = $file; 
     
    304307        debuglog("efile => $efile"); 
    305308 
     309        my $mimetype = $$pic{ctype}; 
     310        if($mimetype =~ m'application/octet-stream'i) { 
     311            $generic_ctype = 1; 
     312        } 
     313 
     314        if($$pic{fname} =~ /\.([\w-]+)$/) { 
     315            $suffix = $1; 
     316        } 
     317        if ($suffix) { 
     318            debuglog("File has Content-Type \"$mimetype\" and File Extension \"$suffix\""); 
     319        } else { 
     320            debuglog("File has Content-Type \"$mimetype\" and no File Extension"); 
     321        } 
     322 
    306323        if ( $$pic{ftype} == 1 ) { 
    307324            infolog("Found GIF header name=\"$$pic{fname}\""); 
     
    315332            } 
    316333 
    317             if ( $$pic{ctype} !~ /gif/i ) { 
     334            if ( ($$pic{ctype} !~ /gif/i) and not $generic_ctype) { 
    318335                wrong_ctype( "GIF", $$pic{ctype} ); 
    319336            } 
     337 
     338            if ( $suffix and $suffix !~ /gif/i) { 
     339                wrong_extension( "GIF", $suffix); 
     340            } 
     341 
    320342            my $interlaced_gif = 0; 
    321343            my $image_count = 0; 
     
    452474                next; 
    453475            } 
    454             if ( $$pic{ctype} !~ /(jpeg|jpg)/i ) { 
     476            if ( ($$pic{ctype} !~ /(jpeg|jpg)/i) and not $generic_ctype) { 
    455477                wrong_ctype( "JPEG", $$pic{ctype} ); 
    456478            } 
     479 
     480            if ( $suffix and $suffix !~ /(jpeg|jpg|jfif)/i) { 
     481                wrong_extension( "JPEG", $suffix); 
     482            } 
     483 
    457484            foreach my $a (qw/jpegtopnm/) { 
    458485                unless (defined $conf->{"focr_bin_$a"}) { 
     
    486513                next; 
    487514            } 
    488             if ( $$pic{ctype} !~ /png/i ) { 
     515            if ( ($$pic{ctype} !~ /png/i) and not $generic_ctype) { 
    489516                wrong_ctype( "PNG", $$pic{ctype} ); 
     517            } 
     518            if ( $suffix and $suffix !~ /(png)/i) { 
     519                wrong_extension( "PNG", $suffix); 
    490520            } 
    491521            foreach my $a (qw/pngtopnm/) { 
     
    521551                next; 
    522552            } 
    523             if ( $$pic{ctype} !~ /bmp/i ) { 
     553            if ( ($$pic{ctype} !~ /bmp/i) and not $generic_ctype) { 
    524554                wrong_ctype( "BMP", $$pic{ctype} ); 
     555            } 
     556            if ( $suffix and $suffix !~ /(bmp)/i) { 
     557                wrong_extension( "BMP", $suffix); 
    525558            } 
    526559            foreach my $a (qw/bmptopnm/) { 
     
    555588                next; 
    556589            } 
    557             if ( $$pic{ctype} !~ /tiff/i ) { 
     590            if ( ($$pic{ctype} !~ /tif/i) and not $generic_ctype) { 
    558591                wrong_ctype( "TIFF", $$pic{ctype} ); 
     592            } 
     593            if ( $suffix and $suffix !~ /tif/i) { 
     594                wrong_extension( "TIFF", $suffix); 
    559595            } 
    560596 
  • trunk/devel/FuzzyOcr/Config.pm

    r79 r80  
    329329    push (@cmds, { 
    330330        setting => 'focr_wrongctype_score', 
     331        default => 1.5, 
     332        type => $Mail::SpamAssassin::Conf::CONF_TYPE_NUMERIC 
     333    }); 
     334 
     335    push (@cmds, { 
     336        setting => 'focr_wrongext_score', 
    331337        default => 1.5, 
    332338        type => $Mail::SpamAssassin::Conf::CONF_TYPE_NUMERIC 
  • trunk/devel/FuzzyOcr/Scoring.pm

    r70 r80  
    33 
    44use base 'Exporter'; 
    5 our @EXPORT_OK = qw(wrong_ctype corrupt_img known_img_hash); 
     5our @EXPORT_OK = qw(wrong_ctype corrupt_img known_img_hash wrong_extension); 
    66 
    77use lib qw(..); 
     
    2929            $conf->{'focr_wrongctype_score'}, "BODY: ", 
    3030            $pms->{conf}->{descriptions}->{FUZZY_OCR_WRONG_CTYPE} . "\n$debuginfo" ); 
     31    } 
     32} 
     33 
     34sub wrong_extension { 
     35    my $conf = get_config(); 
     36    my $pms = get_pms(); 
     37    my ( $format, $suffix ) = @_; 
     38    if ($conf->{'focr_wrongext_score'}) { 
     39        my $debuginfo = ""; 
     40        if ( $conf->{"focr_verbose"} > 0 ) { 
     41            $debuginfo =  
     42              ("Image has format \"$format\" but file extension is \"$suffix\""); 
     43        } 
     44        infolog($debuginfo); 
     45        my $ws = sprintf( "%0.3f", $conf->{'focr_wrongext_score'} ); 
     46        for my $set ( 0 .. 3 ) { 
     47            $pms->{conf}->{scoreset}->[$set]->{"FUZZY_OCR_WRONG_EXTENSION"} = $ws; 
     48        } 
     49        $pms->_handle_hit( "FUZZY_OCR_WRONG_EXTENSION", 
     50            $conf->{'focr_wrongext_score'}, "BODY: ", 
     51            $pms->{conf}->{descriptions}->{FUZZY_OCR_WRONG_EXTENSION} . "\n$debuginfo" ); 
    3152    } 
    3253}