Changeset 70

Show
Ignore:
Timestamp:
30.11.2006 18:47:07 (2 years ago)
Author:
jorge
Message:

Helper application configuration is now dynamic (see FuzzyOcr.cf)
Changed format slightly for scansets and preprocessors (see FuzzyOcr.preps and FuzzyOcr.scansets)
Updated FuzzyOcr.cf to better describe the plugin options.
Logging now shows error/info/debug lines in output to stderr.

Files:

Legend:

Unmodified
Added
Removed
Modified
Copied
Moved
  • trunk/devel/FuzzyOcr.cf

    r67 r70  
     1# Syntax: 
     2# loadplugin <Plugin_Name> <Location> 
     3#  <Location> path where Plugin resides. 
    14loadplugin FuzzyOcr FuzzyOcr.pm 
    2 body FUZZY_OCR eval:fuzzyocr_check() 
    3 describe FUZZY_OCR Mail contains an image with common spam text inside 
    4 body FUZZY_OCR_WRONG_CTYPE eval:dummy_check() 
    5 describe FUZZY_OCR_WRONG_CTYPE Mail contains an image with wrong content-type set 
    6 body FUZZY_OCR_CORRUPT_IMG eval:dummy_check() 
    7 describe FUZZY_OCR_CORRUPT_IMG Mail contains a corrupted image 
    8 body FUZZY_OCR_KNOWN_HASH eval:dummy_check() 
    9 describe FUZZY_OCR_KNOWN_HASH Mail contains an image with known hash 
    10  
    11 priority FUZZY_OCR             900 
    12  
    13 ########### Plugin Configuration ############# 
    14  
    15 #### Logging options ##### 
    16 # Verbosity level (see manual) Attention: Don't set to 0, but to 0.0 for quiet operation. (Default value: 1) 
    17 #focr_verbose 2 
    18 
    19 # When focr_verbose>1 you can log to stderr by setting this variable to 1 (Default value: 0) 
     5 
     6body     FUZZY_OCR               eval:fuzzyocr_check() 
     7body     FUZZY_OCR_WRONG_CTYPE   eval:dummy_check() 
     8body     FUZZY_OCR_CORRUPT_IMG   eval:dummy_check() 
     9body     FUZZY_OCR_KNOWN_HASH    eval:dummy_check() 
     10 
     11describe FUZZY_OCR               Mail contains an image with common spam text inside 
     12describe FUZZY_OCR_WRONG_CTYPE   Mail contains an image with wrong content-type set 
     13describe FUZZY_OCR_CORRUPT_IMG   Mail contains a corrupted image 
     14describe FUZZY_OCR_KNOWN_HASH    Mail contains an image with known hash 
     15 
     16priority FUZZY_OCR 900 
     17 
     18### 
     19### Plugin Configuration  
     20### 
     21 
     22### 
     23### Logging options 
     24### 
     25 
     26# Verbosity level (see manual) 
     27# Attention: Don't set to 0, but to 0.0 for quiet operation. 
     28# Default value: 1 
     29#focr_verbose 3 
     30 
     31# Send logging output to stderr. 
     32# Default value: 0 
    2033#focr_log_stderr 1 
    21 
    22 # Logfile (make sure it is writable by the plugin) (Default value: undefined (means no logfile if you don't set one!) 
    23 #focr_logfile /etc/mail/spamassassin/FuzzyOcr.log 
    24 ########################## 
    25  
    26 ##### Wordlists ##### 
    27 # Here we defined the words to scan for (Default value: /etc/mail/spamassassin/FuzzyOcr.words) 
     34 
     35# Logfile (make sure it is writable by the plugin)  
     36# Default value: none 
     37#focr_logfile /tmp/FuzzyOcr.log 
     38 
     39### 
     40### Wordlists  
     41### 
     42 
     43# Here we define the words to scan for 
     44# Default value: /etc/mail/spamassassin/FuzzyOcr.words 
    2845#focr_global_wordlist /etc/mail/spamassassin/FuzzyOcr.words 
    2946# 
    30 # This is the path RELATIVE to the respektive home directory for the personalized list 
    31 # This list is merged with the global word list on execution (Default value: .spamassassin/fuzzyocr.words) 
    32 # If focr_personal_wordlist begins with '/', treats option as fixed path and does not search HOME 
    33 #focr_personal_wordlist .spamassassin/fuzzyocr.words 
    34 ##################### 
     47# This is the path RELATIVE to the respective home directory 
     48# for the personalized list. This list is merged with the global  
     49# word list on execution. 
     50# Default value: ~/.spamassassin/fuzzyocr.words  
     51# If value begins with '/', it is treated as fixed path. 
     52#focr_personal_wordlist fuzzyocr.words 
     53
     54 
     55### 
     56### Helper Applications 
     57### 
    3558 
    3659# These parameters can be used to change other detection settings 
    3760# If you leave these commented out, the defaults will be used. 
    3861# Do not use " " around any parameters! 
    39 
    40 ##### Location of helper applications (path + binary) (Default values: /usr/bin/<app>) ##### 
     62 
     63### 
     64### Step 1: 
     65### Inform the plugin which helper apps are required. 
     66### 
     67 
     68# The following are already included by default: 
     69
     70#focr_bin_helper gifsicle, giffix, giftext, gifinter, giftopnm 
     71#focr_bin_helper jpegtopnm, pngtopnm, bmptopnm, tifftopnm, ppmhist 
     72#focr_bin_helper gocr, ocrad 
     73 
     74# Include additional scanner/preprocessor commands here: 
     75
     76focr_bin_helper pnmnorm, pnminvert, convert 
     77focr_bin_helper tesseract 
     78 
     79### 
     80### Step 2: 
     81### Inform the plugin of the search path to find all helper apps. 
     82### Only the first match will be considered, so the order is important. 
     83### 
     84 
     85# Search path for locating helper applications 
     86#focr_path_bin /usr/local/netpbm/bin:/usr/local/bin:/usr/bin 
     87 
     88### 
     89### Step 3: 
     90### You can optionally define a helper application location, bypassing 
     91### the search path algorithm. Please note that if the helper app is not 
     92### previously defined, it will generate an error: 
     93 
    4194#focr_bin_gifsicle /usr/bin/gifsicle 
    4295#focr_bin_giffix /usr/bin/giffix 
     
    51104#focr_bin_gocr /usr/bin/gocr 
    52105#focr_bin_ocrad /usr/bin/ocrad 
    53 
    54 #focr_path_bin /usr/local/netpbm/bin:/usr/local/bin:/usr/bin 
    55 
    56 ############################################################################################ 
    57  
    58  
    59 ##### Scansets ##### 
    60 
    61 ##Paths to the files containing Scansets and Preprocessors used in Scansets. 
     106 
     107#focr_bin_pnmnorm /usr/bin/pnmnorm 
     108#focr_bin_pnminvert /usr/bin/pnminvert 
     109#focr_bin_convert /usr/bin/convert 
     110 
     111### 
     112### Scansets  
     113### 
     114 
     115# Paths to the files containing Scansets and Preprocessors definitions 
    62116# 
    63117#focr_preprocessor_file /etc/mail/spamassassin/FuzzyOcr.preps 
    64118#focr_scanset_file /etc/mail/spamassassin/FuzzyOcr.scansets 
    65 
    66 ## Setting this to 1 will cause FuzzyOcr to skip all other scansets, if a scanset has 
    67 ## reached the amount of hits specified in focr_counts_required. (i.e. if the image is detected as spam). 
    68 ## This saves resources, but lowers the scores because not the best, but the first best scanset is taken as result. 
    69 #focr_minimal_scanset 0.0 
    70 
    71 ## This option is only used when focr_minimal_scanset is enabled. Basically, this counts the effectiveness 
    72 ## of a scanset on the current mail traffic and resorts the scansets with the most effective first. 
    73 ## This saves unnecessary scanner passes and saves resources. The default is 1. 
    74 #focr_autosort_scanset 1 
    75 
    76 ## This is a parameter for the focr_autosort_scanset function, and specifies the maximum value of the effectiveness 
    77 ## counter used in each scanset. If you increase this, it will take longer until the autosort function adapts to new 
    78 ## types of spam, setting it too low will lower the effectiveness of the function. Default is 10. 
     119 
     120# Setting this to 1 will cause FuzzyOcr to skip all other scansets, 
     121# if a scanset has reached the amount of hits specified in  
     122# focr_counts_required. (i.e. if the image is detected as spam). 
     123# This saves resources, but lowers the scores because not the best,  
     124# but the first best scanset is taken as result. 
     125# Default value: 0 
     126#focr_minimal_scanset 1 
     127 
     128# This option is only used when focr_minimal_scanset is enabled.  
     129# Basically, this counts the effectiveness of a scanset on the current  
     130# mail traffic and resorts the scansets with the most effective first. 
     131# This saves unnecessary scanner passes and saves resources.  
     132# Default value: 1. 
     133#focr_autosort_scanset 0 
     134 
     135# This is a parameter for the focr_autosort_scanset function, and specifies 
     136# the maximum value of the effectiveness counter used in each scanset. If you 
     137# increase this, it will take longer until the autosort function adapts to new 
     138# types of spam, setting it too low will lower the effectiveness of the  
     139# function.  
     140# Default value: 10. 
    79141#focr_autosort_buffer 10 
    80 
    81 ######### 
    82  
    83 #################### 
    84  
    85 ##### Various Score/Scan settings ##### 
    86 # Timeout for the plugin, in seconds. (Default value: 10) 
    87 #focr_timeout 15 
    88 
    89 # Configures whether the timeout is global or per helper application (Default value: 0) 
    90 #focr_global_timeout 1 
    91 
    92 # Maximum file size for different formats in byte, bigger pictures will not be scanned (Default values: Unlimited) 
     142 
     143### 
     144### Scan Settings 
     145### 
     146 
     147# Timeout for the plugin, in seconds. (Maximum runtime of the plugin) 
     148# Default value: 10 
     149focr_timeout 15 
     150 
     151# Use a global timeout value instead of per helper application. 
     152# Default value: 0 
     153focr_global_timeout 1 
     154 
     155# Maximum file size for different formats in byte, bigger pictures  
     156# will not be scanned  
     157# Default values: Unlimited 
    93158#focr_max_size_gif 80000 
    94159#focr_max_size_jpeg 100000 
     
    96161#focr_max_size_bmp 500000 
    97162#focr_max_size_tiff 500000 
    98 
    99 # Skip checking the following image types (default: 0 check all) 
     163 
     164# Skip checking the following image types  
     165# Default value: 0 (check image type) 
    100166#focr_skip_gif 1 
    101167#focr_skip_jpeg 1 
     
    103169#focr_skip_bmp 1 
    104170#focr_skip_tiff 1 
    105 
    106 # Default detection treshold (see manual) (Default value: 0.3) (Can be changed on a per word basis in the wordlist). 
    107 #focr_threshold 0.3 
    108 
    109 # This is the score for a hit after focr_counts_required matches 
    110 #focr_base_score 5 
    111 
    112 # This is the additional score for every additional match after focr_counts_required matches (Default value: 1) 
    113 #focr_add_score 0.375 
    114 
    115 # This is the score to give for a wrong content-type (e.g. JPEG image but content type says GIF) (Default value: 1.5) 
    116 #focr_wrongctype_score 1.5 
    117 
    118 # This is the score to give for a corrupted image (This currently affects only GIF images) (Default value: 2.5) 
    119 #focr_corrupt_score 2.5 
    120 
    121 # This is the score to give for a corrupted unfixable image (This currently affects only GIF images) (Default value: 5) 
    122 #focr_corrupt_unfixable_score 5 
    123 
    124 # This is used to disable the OCR engine if the message has already more points than this value (Default value: 10) 
    125 #focr_autodisable_score 10 
    126 
    127 # This is used to disable the OCR engine if the message has less points than this value (Default value: -5) 
    128 #focr_autodisable_negative_score -5 
    129 
     171 
     172# Default detection threshold (see manual)  
     173# Default value: 0.25 (Can be changed on a per word basis in the wordlist). 
     174#focr_threshold 0.20 
     175 
    130176# Number of minimum matches before the rule scores (Default value: 2) 
    131177#focr_counts_required 3 
    132 
    133 ####################################### 
    134  
    135 ##### Image Hash Database settings (Experimental, disabled by default) ##### 
    136 
    137 # Set this to 1 to enable the Image Hash database feature (Default value: 0.0) 
    138 # Value = 1 ... use digest_hash only 
    139 # Value = 2 ... use digest_db w/digest_hash import 
    140 # Value = 3 ... use mysql database 
     178 
     179# This is the score for a hit after focr_counts_required matches 
     180# Default value: 5 
     181focr_base_score 5 
     182 
     183# This is the additional score for every additional match after  
     184# focr_counts_required matches 
     185# Default value: 1 
     186focr_add_score 0.375 
     187 
     188# This is the score to give for a wrong content-type. 
     189# e.g. JPEG image but content type says GIF 
     190# Default value: 1.5 
     191#focr_wrongctype_score 1.5 
     192 
     193# This is the score to give for a corrupted image. 
     194# This currently affects only GIF images 
     195# Default value: 2.5 
     196#focr_corrupt_score 2.5 
     197 
     198# This is the score to give for a corrupted unfixable image. 
     199# This currently affects only GIF images. 
     200# Default value: 5 
     201#focr_corrupt_unfixable_score 5 
     202 
     203# This is used to disable the OCR engine if the message has  
     204# already more points than this value  
     205# Default value: 10 
     206#focr_autodisable_score 30 
     207 
     208# This is used to disable the OCR engine if the message has 
     209# already less points than this value  
     210# Default value: -5 
     211#focr_autodisable_negative_score -5 
     212 
     213 
     214### 
     215### Hashing Options (Optional) 
     216### 
     217 
     218# Select which type of image hashing to use: 
     219# Default value: 0 (disabled) 
     220# Allowed values: 
     221#  1 ... use digest_hash only (deprecated) 
     222#  2 ... use digest_db w/digest_hash import (see requirements) 
     223#  3 ... use mysql database (see requirements) 
     224#-- 
     225# The score is saved with the hash in the database, allowing the plugin to 
     226# skip the scans when the image is found in the database, using the score 
     227# from the previous scans. 
     228#-- 
    141229#focr_enable_image_hashing 3 
    142 
    143 # The score is saved with the hash in the database, so no extra scoring for a db hit is required. 
    144 
    145 # If the image hash database feature is enabled, specify the file here to use as database 
    146 # (Default value: /etc/mail/spamassassin/FuzzyOcr.hashdb) 
    147 focr_digest_db /etc/mail/spamassassin/FuzzyOcr.hashdb 
    148 
    149 # If the image hash db feature is enabled, specify the file here to use as database 
    150 # (Default value: ~/.spamassassin/FuzzyOcr.db) 
    151 focr_db_hash /etc/mail/spamassassin/FuzzyOcr.db 
    152 
    153 # If the image hash db feature is enabled, specify the file here to use as database 
    154 # (Default value: ~/.spamassassin/FuzzyOcr.safe.db) 
    155 focr_db_safe /etc/mail/spamassassin/FuzzyOcr.safe.db 
    156 
    157 # Expire records from focr_digest_db after (Default: 35) days 
     230 
     231# Set this to skip updating the hashing database at startup 
     232# Default value: 0 (update at startup) 
     233#focr_skip_updates 1 
     234 
     235# Automatically add hashes of spam images recognized by OCR to the Image  
     236# Hash database, to disable, set to 0.0  
     237# Default value: 1 (learn) 
     238#focr_hashing_learn_scanned 1 
     239 
     240# Score images whose global word count is below focr_counts_required using  
     241# the following formula: (focr_add_score * word count) as score. 
     242# Default value: 0 (ignore images) 
     243#focr_score_ham 1 
     244 
     245# If the image hash database feature is enabled (Type 1 Hashing), 
     246# specify the file to use as database 
     247# Default value: /etc/mail/spamassassin/FuzzyOcr.hashdb 
     248#focr_digest_db /etc/mail/spamassassin/FuzzyOcr.hashdb 
     249 
     250# If the image hash db feature is enabled (Type 2 Hashing), 
     251# specify the file to use as the SPAM database 
     252# Default value: /etc/mail/spamassassin/FuzzyOcr.db 
     253#focr_db_hash /etc/mail/spamassassin/FuzzyOcr.db 
     254 
     255# If the image hash db feature is enabled (Type 2 Hashing),  
     256# specify the file to use as the HAM database 
     257# Default value: /etc/mail/spamassassin/FuzzyOcr.safe.db 
     258#focr_db_safe /etc/mail/spamassassin/FuzzyOcr.safe.db 
     259 
     260# Auto-prune: Expire records from hashing databases after this many days 
     261# Default value: 35 
    158262focr_db_max_days 15 
    159 
    160 # MySQL options 
     263 
     264### 
     265### MySQL options 
     266### 
     267 
    161268#focr_mysql_db FuzzyOcr 
    162269#focr_mysql_hash Hash 
     
    164271#focr_mysql_user fuzzyocr 
    165272#focr_mysql_pass fuzzyocr 
     273#focr_mysql_host localhost 
    166274#focr_mysql_port 3306 
    167275#focr_mysql_socket /tmp/mysql.sock 
    168 
    169 # Automatically add hashes of spam images recognized by OCR to the Image Hash database, to disable, set to 0.0 (Default value: 1) 
    170 #focr_hashing_learn_scanned 1 
    171 
    172 # Keep files that generate errors 
    173 #  0 = always cleanup 
     276 
     277# If set, the database table is updated with different data from one of 
     278# the following: 
     279#  + filename,  
     280#  + image-params, 
     281#  + content-type,  
     282#  + file-type,  
     283#  + score,  
     284#  + word-info 
     285# Default value: 0 
     286#focr_mysql_update_hash 1 
     287 
     288### 
     289### Miscellaneous Options 
     290### 
     291 
     292# The plugin uses a temporary directory to store intermediate information. 
     293# In order to keep these files for debugging purposes use any of these 
     294# values: 
     295#  0 = always cleanup (default value) 
    174296#  1 = keep only if error 
    175297#  2 = always keep 
     298#-- 
     299# Keeping these intermediate files could fill your HDD _very_ fast! 
     300# Make sure you periodically empty your temp dir (usually: /tmp) or 
     301# suffer the consequences.  You've been warned!! 
     302#-- 
    176303#focr_keep_bad_images 1 
    177 
    178 # Score images who's global word count is below focr_counts_required using focr_add_score * word count as score. 
    179 #focr_score_ham 1 
    180 ###################################################################### 
    181  
     304 
     305################################################################# 
    182306# DO NOT REMOVE THIS LINE, IT IS REQUIRED UNDER ALL CIRCUMSTANCES 
    183307focr_end_config 
  • trunk/devel/FuzzyOcr.pm

    r69 r70  
    2020use POSIX; 
    2121 
    22 use lib qw(. /etc/mail/spamassassin); # Allow placing of FuzzyOcr in siteconfigdir 
    23  
    24 use FuzzyOcr::Logging qw(debuglog); 
     22use lib qw(.); # Allow placing of FuzzyOcr in siteconfigdir 
     23 
     24use FuzzyOcr::Logging qw(debuglog errorlog infolog); 
    2525use FuzzyOcr::Config qw(kill_pid 
    2626    get_tmpdir 
     
    7070    if ($conf->{focr_global_timeout}) { 
    7171        my $t = get_timeout(); 
    72         debuglog("Global Timeout set at ".$conf->{focr_timeout}." sec."); 
     72        infolog("Global Timeout set at ".$conf->{focr_timeout}." sec."); 
    7373        $t->run(sub { 
    7474            $end = fuzzyocr_do( $self, $conf, $pms ); 
    7575        }); 
    7676        if ($t->timed_out()) { 
    77             debuglog("Scan timed out after $conf->{focr_timeout} seconds."); 
    78             debuglog("Killing possibly running pid..."); 
     77            infolog("Scan timed out after $conf->{focr_timeout} seconds."); 
     78            infolog("Killing possibly running pid..."); 
    7979            my ($ret, $pid) = kill_pid(); 
    8080            if ($ret > 0) { 
    81                     debuglog("Successfully killed PID $pid"); 
     81                    infolog("Successfully killed PID $pid"); 
    8282            } elsif ($ret < 0) { 
    83                 debuglog("No processes left... exiting"); 
     83                infolog("No processes left... exiting"); 
    8484            } else { 
    85                 debuglog("Failed to kill PID $pid, stale process!"); 
     85                infolog("Failed to kill PID $pid, stale process!"); 
    8686            } 
    8787            return 0; 
     
    9090        $end = fuzzyocr_do( $self, $conf, $pms ); 
    9191    } 
    92     debuglog("Processed in ". 
     92    infolog("Processed in ". 
    9393        sprintf("%.6f",tv_interval($begin, [gettimeofday])) 
    9494        ." sec."); 
     
    103103 
    104104    if ( $current_score > $score ) { 
    105         debuglog("Scan canceled, message has already more than $score points ($current_score)."); 
     105        infolog("Scan canceled, message has already more than $score points ($current_score)."); 
    106106        return 0; 
    107107    } 
     
    109109    $score = $conf->{focr_autodiable_negative_score} || -100; 
    110110    if ( $current_score < $score ) { 
    111         debuglog("Scan canceled, message has less than $score points ($current_score)."); 
     111        infolog("Scan canceled, message has less than $score points ($current_score)."); 
    112112        return 0; 
    113113    } 
    114114 
    115     my $ddb; 
    116115    my $imgdir; 
    117116    my %imgfiles = (); 
     
    123122 
    124123    debuglog("Starting FuzzyOcr..."); 
    125     debuglog("Attempting to load personal wordlist..."); 
    126124    if ($conf->{focr_personal_wordlist} =~ m/^\//) { 
     125        debuglog("Attempting to load personal wordlist..."); 
    127126        read_words( $conf->{focr_personal_wordlist} ); 
    128     } else { 
    129         my $peruserlist = $main->sed_path($conf->{focr_personal_wordlist}); 
    130         if (-r $peruserlist) { 
    131             read_words($peruserlist); 
    132         } else { 
    133             debuglog("Error getting personal wordlist, skipping..."); 
    134         } 
    135127    } 
    136128 
     
    153145 
    154146        if ($test == 0) { 
    155             debuglog("Skipping file with content-type=\"$ctype\" name=\"$fname\""); 
     147            infolog("Skipping file with content-type=\"$ctype\" name=\"$fname\""); 
    156148            next; 
    157149        } 
     
    162154 
    163155        unless ($imgdir) { 
    164             debuglog("Scan canceled, cannot create Image TMPDIR."); 
     156            errorlog("Scan canceled, cannot create Image TMPDIR."); 
    165157            return 0; 
    166158        } 
     
    194186            $imgfiles{$imgfilename}{ftype} = 1;  
    195187            ($w,$h) = unpack("vv",substr($pdata,6,4)); 
    196             debuglog("GIF: $imgfilename '${h}x${w}'"); 
     188            debuglog("GIF: [${h}x${w}] $imgfilename"); 
    197189            $imgfiles{$imgfilename}{width}  = $w; 
    198190            $imgfiles{$imgfilename}{height} = $h; 
     
    204196                my ($b,$m) = unpack("CC",substr($pdata,$pos,2)); $pos += 2; 
    205197                if ($b != 0xff) { 
    206                    debuglog("Invalid JPEG image"); 
     198                   infolog("Invalid JPEG image"); 
    207199                   $pos = $pdatalen + 1; 
    208200                   last; 
     
    218210            } 
    219211            if ($pos > $pdatalen) { 
    220                 debuglog("Cannot find image dimensions"); 
     212                errorlog("Cannot find image dimensions"); 
    221213            } else { 
    222214                ($h,$w) = unpack("nn",substr($pdata,$pos+3,4)); 
    223                 debuglog("JPEG: $imgfilename '${h}x${w}'"); 
     215                debuglog("JPEG: [${h}x${w}] $imgfilename"); 
    224216                $imgfiles{$imgfilename}{ftype} = 2; 
    225217                $imgfiles{$imgfilename}{height} = $h; 
     
    232224            $imgfiles{$imgfilename}{width}  = $w; 
    233225            $imgfiles{$imgfilename}{height} = $h; 
    234             debuglog("PNG: $imgfilename '${h}x${w}'"); 
     226            debuglog("PNG: [${h}x${w}] $imgfilename"); 
    235227        } elsif ( substr($pdata,0,2) eq "BM" ) { 
    236228            ## BMP File 
     
    239231            $imgfiles{$imgfilename}{width}  = $w; 
    240232            $imgfiles{$imgfilename}{height} = $h; 
    241             debuglog("BMP: $imgfilename '${h}x${w}'"); 
     233            debuglog("BMP: [${h}x${w}] $imgfilename"); 
    242234        } elsif ( 
    243235            (substr($pdata,0,4) eq "\x4d\x4d\x00\x2a") or 
     
    250242                my $add = 2 + ($n * 12); 
    251243                my ($id,$tag,$cnt,$val)  = unpack($worder?"vvVV":"nnNN",substr($pdata,$offset+$add,12)); 
    252                 $h = $val if ($tag == 256); 
    253                 $w = $val if ($tag == 257); 
     244                $h = $val if ($id == 256); 
     245                $w = $val if ($id == 257); 
    254246                last if ($h != 0 and $w != 0); 
    255247            } 
    256             debuglog("TIFF: $imgfilename ($worder) '${h}x${w}"); 
    257             debuglog("Cannot determite size of TIFF image, setting to '1x1'") 
    258                 if ($h == 0 and $w  == 0); 
     248            debuglog("TIFF: [${h}x${w}] $imgfilename ($worder)"); 
     249            debuglog("Cannot determite size of TIFF image, setting to '1x1'") if ($h == 0 and $w == 0); 
    259250            $imgfiles{$imgfilename}{ftype}  = 5; 
    260251            $imgfiles{$imgfilename}{width}  = $w ? $w : 1; 
     
    266257        $imgfiles{$imgfilename}{fsize} = $pdatalen; 
    267258        unless (open PICT, ">$imgfilename") { 
    268             debuglog("Cannot write \"$imgfilename\", skipping..."); 
     259            errorlog("Cannot write \"$imgfilename\", skipping..."); 
    269260            delete $imgfiles{$imgfilename}; 
    270261            next; 
     
    278269 
    279270    if ($cnt == 0) { 
    280         debuglog("Skipping OCR, no image files found..."); 
     271        infolog("Skipping OCR, no image files found..."); 
    281272        removedir($imgdir) if (defined($imgdir) and ($conf->{focr_keep_bad_images}<2)); 
    282273        return 0; 
    283274    } 
    284     debuglog("Found: $cnt images"); $cnt = 0; 
     275    infolog("Found: $cnt images"); $cnt = 0; 
    285276    if ($conf->{focr_enable_image_hashing} == 3) { 
    286         $ddb = $conf->{focr_ddb} = get_mysql_ddb(); 
     277        $conf->{focr_ddb} = get_mysql_ddb(); 
    287278    } 
    288279 
     
    293284    foreach my $file (keys %imgfiles) { 
    294285        my $pic = $imgfiles{$file}; 
    295         debuglog("Analyzing file with content-type=\"$$pic{ctype}\""); 
     286        infolog("Analyzing file with content-type=\"$$pic{ctype}\""); 
    296287        my @used_scansets = (); 
    297288        my $corrupt = 0; 
     
    304295 
    305296        if ( $$pic{ftype} == 1 ) { 
    306             debuglog("Found GIF header name=\"$$pic{fname}\""); 
     297            infolog("Found GIF header name=\"$$pic{fname}\""); 
    307298            if ($conf->{focr_skip_gif}) { 
    308                 debuglog("Skipping image check"); 
     299                infolog("Skipping image check"); 
    309300                next IMAGE; 
    310301            } 
    311302            if (defined($conf->{focr_max_size_gif}) and ($$pic{fsize} > $conf->{focr_max_size_gif})) { 
    312                 debuglog("GIF file size ($$pic{fsize}) exceeds maximum file size for this format, skipping..."); 
     303                infolog("GIF file size ($$pic{fsize}) exceeds maximum file size for this format, skipping..."); 
    313304                next; 
    314305            } 
     
    322313            foreach my $a (qw/gifsicle giftext giffix gifinter giftopnm/) { 
    323314                unless (defined $conf->{"focr_bin_$a"}) { 
    324                     debuglog("Cannot exec $a, skipping image"); 
     315                    errorlog("Cannot exec $a, skipping image"); 
    325316                    next IMAGE; 
    326317                } 
     
    336327            if ($retcode<0) { # only care if we timed out 
    337328                chomp $retcode; 
    338                 debuglog("$conf->{focr_bin_giftext} Timed out [$retcode], skipping..."); 
     329                errorlog("$conf->{focr_bin_giftext} Timed out [$retcode], skipping..."); 
    339330                ++$imgerr if $conf->{focr_keep_bad_images}>0; next; 
    340331            } 
     
    351342            } 
    352343            if ($interlaced_gif or ($image_count > 1)) { 
    353                 debuglog("Image is interlaced or animated..."); 
     344                infolog("Image is interlaced or animated..."); 
    354345            } 
    355346            else { 
    356                 debuglog("Image is single non-interlaced..."); 
     347                infolog("Image is single non-interlaced..."); 
    357348                $tfile .= "-fixed.gif"; 
    358349                printf RAWERR "## $conf->{focr_bin_giffix} $file >$tfile 2>>$efile\n" if ($haserr>0); 
     
    362353                if ($retcode<0) { # only care if we timed out 
    363354                    chomp $retcode; 
    364                     debuglog("$conf->{focr_bin_giffix}: Timed out [$retcode], skipping..."); 
     355                    errorlog("$conf->{focr_bin_giffix}: Timed out [$retcode], skipping..."); 
    365356                    printf RAWERR "?? Timed out > $retcode\n" if ($haserr>0); 
    366357                    ++$imgerr if $conf->{focr_keep_bad_images}>0; next; 
     
    381372            if ($corrupt) { 
    382373                if ($interlaced_gif or ($image_count > 1)) { 
    383                     debuglog("Skipping corrupted interlaced image..."); 
     374                    infolog("Skipping corrupted interlaced image..."); 
    384375                    corrupt_img($conf->{focr_corrupt_unfixable_score}, $corrupt); 
    385376                    next; 
    386377                } 
    387378                if (-z $tfile) { 
    388                     debuglog("Uncorrectable corruption detected, skipping non-interlaced image..."); 
     379                    infolog("Uncorrectable corruption detected, skipping non-interlaced image..."); 
    389380                    corrupt_img($conf->{focr_corrupt_unfixable_score}, $corrupt); 
    390381                    next; 
    391382                } 
    392                 debuglog("Image is corrupt, but seems fixable, continuing..."); 
     383                infolog("Image is corrupt, but seems fixable, continuing..."); 
    393384                corrupt_img($conf->{focr_corrupt_score}, $corrupt); 
    394385            } 
    395386 
    396387            if ($image_count > 1) { 
    397                 debuglog("File contains <$image_count> images, deanimating..."); 
     388                infolog("File contains <$image_count> images, deanimating..."); 
    398389                $tfile = deanimate($tfile); 
    399390            } 
    400391 
    401392            if ($interlaced_gif) { 
    402                 debuglog("Processing interlaced_gif $tfile..."); 
     393                infolog("Processing interlaced_gif $tfile..."); 
    403394                my $cfile = $tfile; 
    404395                if ($tfile =~ m/\.gif$/i) { 
     
    414405                    chomp $retcode; 
    415406                    printf RAWERR "?? Timed out > $retcode\n" if ($haserr>0); 
    416                     debuglog("$conf->{focr_bin_gifinter}: Timed out [$retcode], skipping..."); 
     407                    errorlog("$conf->{focr_bin_gifinter}: Timed out [$retcode], skipping..."); 
    417408                    ++$imgerr if $conf->{focr_keep_bad_images}>0; next; 
    418409                } elsif ($retcode>0) { 
    419410                    chomp $retcode; 
    420411                    printf RAWERR "?? [$retcode] returned from $conf->{focr_bin_gifinter}\n" if ($haserr>0); 
    421                     debuglog("$conf->{focr_bin_gifinter}: Returned [$retcode], skipping..."); 
     412                    errorlog("$conf->{focr_bin_gifinter}: Returned [$retcode], skipping..."); 
    422413                    ++$imgerr if $conf->{focr_keep_bad_images}>0; next; 
    423414                } 
     
    431422                chomp $retcode; 
    432423                printf RAWERR "?? Timed out > $retcode\n" if ($haserr>0); 
    433                 debuglog("$conf->{focr_bin_giftopnm}: Timed out [$retcode], skipping..."); 
     424                errorlog("$conf->{focr_bin_giftopnm}: Timed out [$retcode], skipping..."); 
    434425                ++$imgerr if $conf->{focr_keep_bad_images}>0; next; 
    435426            } elsif ($retcode>0) { 
    436427                chomp $retcode; 
    437428                printf RAWERR "?? [$retcode] returned from $conf->{focr_bin_giftopnm}\n" if ($haserr>0); 
    438                 debuglog("$conf->{focr_bin_giftopnm}: Returned [$retcode], skipping..."); 
     429                errorlog("$conf->{focr_bin_giftopnm}: Returned [$retcode], skipping..."); 
    439430                ++$imgerr if $conf->{focr_keep_bad_images}>0; next; 
    440431            } 
    441432        } 
    442433        elsif ( $$pic{ftype} == 2 ) { 
    443             debuglog("Found JPEG header name=\"$$pic{fname}\""); 
     434            infolog("Found JPEG header name=\"$$pic{fname}\""); 
    444435            if ($conf->{focr_skip_jpeg}) { 
    445                 debuglog("Skipping image check"); 
     436                infolog("Skipping image check"); 
    446437                next IMAGE; 
    447438            } 
    448439 
    449440            if (defined($conf->{focr_max_size_jpeg}) and ($$pic{fsize} > $conf->{focr_max_size_jpeg})) { 
    450                 debuglog("JPEG file size ($$pic{fsize}) exceeds maximum file size for this format, skipping..."); 
     441                infolog("JPEG file size ($$pic{fsize}) exceeds maximum file size for this format, skipping..."); 
    451442                next; 
    452443            } 
     
    456447            foreach my $a (qw/jpegtopnm/) { 
    457448                unless (defined $conf->{"focr_bin_$a"}) { 
    458                     debuglog("Cannot exec $a, skipping image"); 
     449                    errorlog("Cannot exec $a, skipping image"); 
    459450                    next IMAGE; 
    460451                } 
     
    466457                chomp $retcode; 
    467458                printf RAWERR "?? Timed out > $retcode\n" if ($haserr>0); 
    468                 debuglog("$conf->{focr_bin_jpegtopnm}: Timed out [$retcode], skipping..."); 
     459                errorlog("$conf->{focr_bin_jpegtopnm}: Timed out [$retcode], skipping..."); 
    469460                ++$imgerr if $conf->{focr_keep_bad_images}>0; next; 
    470461            } elsif ($retcode>0) { 
    471462                chomp $retcode; 
    472463                printf RAWERR "?? [$retcode] returned from $conf->{focr_bin_jpegtopnm}\n" if ($haserr>0); 
    473                 debuglog("$conf->{focr_bin_jpegtopnm}: Returned [$retcode], skipping..."); 
     464                errorlog("$conf->{focr_bin_jpegtopnm}: Returned [$retcode], skipping..."); 
    474465                ++$imgerr if $conf->{focr_keep_bad_images}>0; next; 
    475466            } 
    476467        } 
    477468        elsif ( $$pic{ftype} == 3 ) { 
    478             debuglog("Found PNG header name=\"$$pic{fname}\""); 
     469            infolog("Found PNG header name=\"$$pic{fname}\""); 
    479470            if ($conf->{focr_skip_png}) { 
    480                 debuglog("Skipping image check"); 
     471                infolog("Skipping image check"); 
    481472                next IMAGE; 
    482473            } 
    483474            if (defined($conf->{focr_max_size_png}) and ($$pic{fsize} > $conf->{focr_max_size_png})) { 
    484                 debuglog("PNG file size ($$pic{fsize}) exceeds maximum file size for this format, skipping..."); 
     475                infolog("PNG file size ($$pic{fsize}) exceeds maximum file size for this format, skipping..."); 
    485476                next; 
    486477            } 
     
    490481            foreach my $a (qw/pngtopnm/) { 
    491482                unless (defined $conf->{"focr_bin_$a"}) { 
    492                     debuglog("Cannot exec $a, skipping image"); 
     483                    errorlog("Cannot exec $a, skipping image"); 
    493484                    next IMAGE; 
    494485                } 
     
    501492                chomp $retcode; 
    502493                printf RAWERR "?? Timed out > $retcode\n" if ($haserr>0); 
    503                 debuglog("$conf->{focr_bin_pngtopnm}: Timed out [$retcode], skipping..."); 
     494                errorlog("$conf->{focr_bin_pngtopnm}: Timed out [$retcode], skipping..."); 
    504495                ++$imgerr if $conf->{focr_keep_bad_images}>0; next; 
    505496            } elsif ($retcode>0) { 
    506497                chomp $retcode; 
    507498                printf RAWERR "?? [$retcode] returned from $conf->{focr_bin_pngtopnm}\n" if ($haserr>0); 
    508                 debuglog("$conf->{focr_bin_pngtopnm}: Returned [$retcode], skipping..."); 
     499                errorlog("$conf->{focr_bin_pngtopnm}: Returned [$retcode], skipping..."); 
    509500                ++$imgerr if $conf->{focr_keep_bad_images}>0; next; 
    510501            } 
    511502        } 
    512503        elsif ( $$pic{ftype} == 4 ) { 
    513             debuglog("Found BMP header name=\"$$pic{fname}\""); 
     504            infolog("Found BMP header name=\"$$pic{fname}\""); 
    514505            if ($conf->{focr_skip_bmp}) { 
    515                 debuglog("Skipping image check"); 
     506                infolog("Skipping image check"); 
    516507                next IMAGE; 
    517508            } 
    518509            if (defined($conf->{focr_max_size_bmp}) and ($$pic{fsize} > $conf->{focr_max_size_bmp})) { 
    519                 debuglog("BMP file size ($$pic{fsize}) exceeds maximum file size for this format, skipping..."); 
     510                infolog("BMP file size ($$pic{fsize}) exceeds maximum file size for this format, skipping..."); 
    520511                next; 
    521512            } 
     
    525516            foreach my $a (qw/bmptopnm/) { 
    526517                unless (defined $conf->{"focr_bin_$a"}) { 
    527                     debuglog("Cannot exec $a, skipping image"); 
     518                    errorlog("Cannot exec $a, skipping image"); 
    528519                    next IMAGE; 
    529520                } 
     
    535526                chomp $retcode; 
    536527                printf RAWERR "?? Timed out > $retcode\n" if ($haserr>0); 
    537                 debuglog("$conf->{focr_bin_bmptopnm}: Timed out [$retcode], skipping..."); 
     528                errorlog("$conf->{focr_bin_bmptopnm}: Timed out [$retcode], skipping..."); 
    538529                ++$imgerr if $conf->{focr_keep_bad_images}>0; next; 
    539530            } elsif ($retcode>0) { 
    540531                chomp $retcode; 
    541532                printf RAWERR "?? [$retcode] returned from $conf->{focr_bin_bmptopnm}\n" if ($haserr>0); 
    542                 debuglog("$conf->{focr_bin_bmptopnm}: Returned [$retcode], skipping..."); 
     533                errorlog("$conf->{focr_bin_bmptopnm}: Returned [$retcode], skipping..."); 
    543534                ++$imgerr if $conf->{focr_keep_bad_images}>0; next; 
    544535            } 
    545536        } 
    546537        elsif ( $$pic{ftype} == 5 ) { 
    547             debuglog("Found TIFF header name=\"$$pic{fname}\""); 
     538            infolog("Found TIFF header name=\"$$pic{fname}\""); 
    548539            if ($conf->{focr_skip_tiff}) { 
    549                 debuglog("Skipping image check"); 
     540                infolog("Skipping image check"); 
    550541                next IMAGE; 
    551542            } 
    552543            if (defined($conf->{focr_max_size_tiff}) and ($$pic{fsize} > $conf->{focr_max_size_tiff})) { 
    553                 debuglog("TIFF file size ($$pic{fsize}) exceeds maximum file size for this format, skipping..."); 
     544                infolog("TIFF file size ($$pic{fsize}) exceeds maximum file size for this format, skipping..."); 
    554545                next; 
    555546            } 
     
    560551            foreach my $a (qw/tifftopnm/) { 
    561552                unless (defined $conf->{"focr_bin_$a"}) { 
    562                     debuglog("Cannot exec $a, skipping image"); 
     553                    errorlog("Cannot exec $a, skipping image"); 
    563554                    next IMAGE; 
    564555                } 
     
    570561                chomp $retcode; 
    571562                printf RAWERR "?? Timed out > $retcode\n" if ($haserr>0); 
    572                 debuglog("$conf->{focr_bin_tifftopnm}: Timed out [$retcode], skipping..."); 
     563                errorlog("$conf->{focr_bin_tifftopnm}: Timed out [$retcode], skipping..."); 
    573564                ++$imgerr if $conf->{focr_keep_bad_images}>0; next; 
    574565            } elsif ($retcode>0) { 
    575566                chomp $retcode; 
    576567                printf RAWERR "?? [$retcode] returned from $conf->{focr_bin_tifftopnm}\n" if ($haserr>0); 
    577                 debuglog("$conf->{focr_bin_tifftopnm}: Returned [$retcode], skipping..."); 
     568                errorlog("$conf->{focr_bin_tifftopnm}: Returned [$retcode], skipping..."); 
    578569                ++$imgerr if $conf->{focr_keep_bad_images}>0; next; 
    579570            } 
    580571        } 
    581572        else { 
    582             debuglog("Image type not recognized, unknown format. Skipping this image..."); 
     573            errorlog("Image type not recognized, unknown format. Skipping this image..."); 
    583574            next; 
    584575        } 
    585576 
    586577        if($conf->{focr_enable_image_hashing}) { 
    587             debuglog("Calculating the image hash: $pfile"); 
     578            infolog("Calculating image hash for: $pfile"); 
    588579            ($corrupt, $digest) = calc_image_hash($pfile,$pic); 
    589580            if ($corrupt) { 
    590                 debuglog("Error calculating the image hash, skipping hash check..."); 
     581                infolog("Error calculating the image hash, skipping hash check..."); 
    591582            } else { 
    592583                my ($score, $dinfo, $whash); 
     
    597588                if ($score > 0) { 
    598589                    known_img_hash($score,$dinfo); 
    599                     debuglog("Message is SPAM. $dinfo") if ($conf->{focr_enable_image_hashing} < 3); 
     590                    infolog("Message is SPAM. $dinfo") if ($conf->{focr_enable_image_hashing} < 3); 
    600591                    removedir($imgdir); 
    601592                    return 0; 
     
    606597                ($score,$dinfo) = check_image_hash_db($digest, $whash, $$pic{fname}, $$pic{ctype}, $$pic{ftype}); 
    607598                if ($score > 0) { 
    608                     debuglog("Image in KNOWN_GOOD. Skipping OCR checks..."); 
     599                    infolog("Image in KNOWN_GOOD. Skipping OCR checks..."); 
    609600                    next IMAGE; 
    610601                } 
    611602            } 
    612603            if ($digest eq '') { 
    613                 debuglog("Empty Hash, skipping..."); 
     604                infolog("Empty Hash, skipping..."); 
    614605                next IMAGE; 
    615606            } 
    616607        } else { 
    617             debuglog("Image hashing disabled in configuration, skipping..."); 
     608            infolog("Image hashing disabled in configuration, skipping..."); 
    618609        } 
    619610        my @ocr_results = (); 
     
    623614        my %words = %$wref; 
    624615        foreach my $scanset (@$scansets) { 
     616            my $scanlabel = $scanset->{label}; 
     617            my $scancmd   = $scanset->{command}; 
     618            if ($scancmd =~ m/^\$/) { 
     619                infolog("Skipping $scanlabel, invalid command '$scancmd'"); 
     620                next; 
     621            } 
    625622            my $cmcnt = 0; 
    626623            my @cfound; 
    627             my $scancmd = $scanset->{ocr_command}; 
    628             my $scanlabel = $scanset->{label}; 
     624            if (defined $scanset->{args}) { 
     625                $scancmd .= ' ' . $scanset->{args}; 
     626            } 
    629627            printf RAWERR qq(## $scancmd\n) if ($haserr>0); 
    630628            my ($retcode, @result) = $scanset->run($pfile); 
    631629            if ($retcode<0) { 
    632                 debuglog("Timeout: \"$scancmd\" took more than $conf->{focr_timeout} sec."); 
    633                 debuglog("Skipping scanset \"$scanlabel\" due to timeout, trying next..."); 
    634                 printf RAWERR qq(Timeout[$conf->{focr_timeout}]: $scancmd\n) if ($haserr>0); 
     630                if ($retcode == -1) { 
     631                    printf RAWERR qq(Timeout[$conf->{focr_timeout}]: $scancmd\n) if ($haserr>0); 
     632