Changeset 70
- Timestamp:
- 30.11.2006 18:47:07 (2 years ago)
- Files:
-
- trunk/devel/FuzzyOcr.cf (modified) (5 diffs)
- trunk/devel/FuzzyOcr.pm (modified) (43 diffs)
- trunk/devel/FuzzyOcr.preps (modified) (1 diff)
- trunk/devel/FuzzyOcr.scansets (modified) (1 diff)
- trunk/devel/FuzzyOcr/Config.pm (modified) (26 diffs)
- trunk/devel/FuzzyOcr/Deanimate.pm (modified) (6 diffs)
- trunk/devel/FuzzyOcr/Hashing.pm (modified) (25 diffs)
- trunk/devel/FuzzyOcr/Logging.pm (modified) (2 diffs)
- trunk/devel/FuzzyOcr/Misc.pm (modified) (7 diffs)
- trunk/devel/FuzzyOcr/Preprocessor.pm (modified) (3 diffs)
- trunk/devel/FuzzyOcr/Scanset.pm (modified) (6 diffs)
- trunk/devel/FuzzyOcr/Scoring.pm (modified) (3 diffs)
Legend:
- Unmodified
- Added
- Removed
- Modified
- Copied
- Moved
trunk/devel/FuzzyOcr.cf
r67 r70 1 # Syntax: 2 # loadplugin <Plugin_Name> <Location> 3 # <Location> path where Plugin resides. 1 4 loadplugin FuzzyOcr FuzzyOcr.pm 2 body FUZZY_OCR eval:fuzzyocr_check() 3 describe FUZZY_OCR Mail contains an image with common spam text inside 4 body FUZZY_OCR_WRONG_CTYPE eval:dummy_check() 5 describe FUZZY_OCR_WRONG_CTYPE Mail contains an image with wrong content-type set 6 body FUZZY_OCR_CORRUPT_IMG eval:dummy_check() 7 describe FUZZY_OCR_CORRUPT_IMG Mail contains a corrupted image 8 body FUZZY_OCR_KNOWN_HASH eval:dummy_check() 9 describe FUZZY_OCR_KNOWN_HASH Mail contains an image with known hash 10 11 priority FUZZY_OCR 900 12 13 ########### Plugin Configuration ############# 14 15 #### Logging options ##### 16 # Verbosity level (see manual) Attention: Don't set to 0, but to 0.0 for quiet operation. (Default value: 1) 17 #focr_verbose 2 18 # 19 # When focr_verbose>1 you can log to stderr by setting this variable to 1 (Default value: 0) 5 6 body FUZZY_OCR eval:fuzzyocr_check() 7 body FUZZY_OCR_WRONG_CTYPE eval:dummy_check() 8 body FUZZY_OCR_CORRUPT_IMG eval:dummy_check() 9 body FUZZY_OCR_KNOWN_HASH eval:dummy_check() 10 11 describe FUZZY_OCR Mail contains an image with common spam text inside 12 describe FUZZY_OCR_WRONG_CTYPE Mail contains an image with wrong content-type set 13 describe FUZZY_OCR_CORRUPT_IMG Mail contains a corrupted image 14 describe FUZZY_OCR_KNOWN_HASH Mail contains an image with known hash 15 16 priority FUZZY_OCR 900 17 18 ### 19 ### Plugin Configuration 20 ### 21 22 ### 23 ### Logging options 24 ### 25 26 # Verbosity level (see manual) 27 # Attention: Don't set to 0, but to 0.0 for quiet operation. 28 # Default value: 1 29 #focr_verbose 3 30 31 # Send logging output to stderr. 32 # Default value: 0 20 33 #focr_log_stderr 1 21 # 22 # Logfile (make sure it is writable by the plugin) (Default value: undefined (means no logfile if you don't set one!) 
23 #focr_logfile /etc/mail/spamassassin/FuzzyOcr.log 24 ########################## 25 26 ##### Wordlists ##### 27 # Here we defined the words to scan for (Default value: /etc/mail/spamassassin/FuzzyOcr.words) 34 35 # Logfile (make sure it is writable by the plugin) 36 # Default value: none 37 #focr_logfile /tmp/FuzzyOcr.log 38 39 ### 40 ### Wordlists 41 ### 42 43 # Here we defined the words to scan for 44 # Default value: /etc/mail/spamassassin/FuzzyOcr.words 28 45 #focr_global_wordlist /etc/mail/spamassassin/FuzzyOcr.words 29 46 # 30 # This is the path RELATIVE to the respektive home directory for the personalized list 31 # This list is merged with the global word list on execution (Default value: .spamassassin/fuzzyocr.words) 32 # If focr_personal_wordlist begins with '/', treats option as fixed path and does not search HOME 33 #focr_personal_wordlist .spamassassin/fuzzyocr.words 34 ##################### 47 # This is the path RELATIVE to the respective home directory 48 # for the personalized list. This list is merged with the global 49 # word list on execution. 50 # Default value: ~/.spamassassin/fuzzyocr.words 51 # If value begins with '/', it is treated as fixed path. 52 #focr_personal_wordlist fuzzyocr.words 53 # 54 55 ### 56 ### Helper Applications 57 ### 35 58 36 59 # These parameters can be used to change other detection settings 37 60 # If you leave these commented out, the defaults will be used. 38 61 # Do not use " " around any parameters! 39 # 40 ##### Location of helper applications (path + binary) (Default values: /usr/bin/<app>) ##### 62 63 ### 64 ### Step 1: 65 ### Inform the plugin which helper apps are required. 
66 ### 67 68 # The following are already included by default: 69 # 70 #focr_bin_helper gifsicle, giffix, giftext, gifinter, giftopnm 71 #focr_bin_helper jpegtopnm, pngtopnm, bmptopnm, tifftopnm, ppmhist 72 #focr_bin_helper gocr, ocrad 73 74 # Include additional scanner/preprocessor commands here: 75 # 76 focr_bin_helper pnmnorm, pnminvert, convert 77 focr_bin_helper tesseract 78 79 ### 80 ### Step 2: 81 ### Inform the plugin of the search path to find all helper apps. 82 ### Only the first match will be considered, so the order is important. 83 ### 84 85 # Search path for locating helper applications 86 #focr_path_bin /usr/local/netpbm/bin:/usr/local/bin:/usr/bin 87 88 ### 89 ### Step 3: 90 ### You can optionally define a helper application location, bypassing 91 ### the search path algorithm. Please note that if the helper app is not 92 ### previously defined, it will generate an error: 93 41 94 #focr_bin_gifsicle /usr/bin/gifsicle 42 95 #focr_bin_giffix /usr/bin/giffix … … 51 104 #focr_bin_gocr /usr/bin/gocr 52 105 #focr_bin_ocrad /usr/bin/ocrad 53 # 54 #focr_path_bin /usr/local/netpbm/bin:/usr/local/bin:/usr/bin 55 # 56 ############################################################################################ 57 58 59 ##### Scansets ##### 60 # 61 ##Paths to the files containing Scansets and Preprocessors used in Scansets. 106 107 #focr_bin_pnmnorm /usr/bin/pnmnorm 108 #focr_bin_pnminvert /usr/bin/pnminvert 109 #focr_bin_convert /usr/bin/convert 110 111 ### 112 ### Scansets 113 ### 114 115 # Paths to the files containing Scansets and Preprocessors definitions 62 116 # 63 117 #focr_preprocessor_file /etc/mail/spamassassin/FuzzyOcr.preps 64 118 #focr_scanset_file /etc/mail/spamassassin/FuzzyOcr.scansets 65 # 66 ## Setting this to 1 will cause FuzzyOcr to skip all other scansets, if a scanset has 67 ## reached the amount of hits specified in focr_counts_required. (i.e. if the image is detected as spam). 
68 ## This saves resources, but lowers the scores because not the best, but the first best scanset is taken as result. 69 #focr_minimal_scanset 0.0 70 # 71 ## This option is only used when focr_minimal_scanset is enabled. Basically, this counts the effectiveness 72 ## of a scanset on the current mail traffic and resorts the scansets with the most effective first. 73 ## This saves unnecessary scanner passes and saves resources. The default is 1. 74 #focr_autosort_scanset 1 75 # 76 ## This is a parameter for the focr_autosort_scanset function, and specifies the maximum value of the effectiveness 77 ## counter used in each scanset. If you increase this, it will take longer until the autosort function adapts to new 78 ## types of spam, setting it too low will lower the effectiveness of the function. Default is 10. 119 120 # Setting this to 1 will cause FuzzyOcr to skip all other scansets, 121 # if a scanset has reached the amount of hits specified in 122 # focr_counts_required. (i.e. if the image is detected as spam). 123 # This saves resources, but lowers the scores because not the best, 124 # but the first best scanset is taken as result. 125 # Default value: 0 126 #focr_minimal_scanset 1 127 128 # This option is only used when focr_minimal_scanset is enabled. 129 # Basically, this counts the effectiveness of a scanset on the current 130 # mail traffic and resorts the scansets with the most effective first. 131 # This saves unnecessary scanner passes and saves resources. 132 # Default value: 1. 133 #focr_autosort_scanset 0 134 135 # This is a parameter for the focr_autosort_scanset function, and specifies 136 # the maximum value of the effectiveness counter used in each scanset. If you 137 # increase this, it will take longer until the autosort function adapts to new 138 # types of spam, setting it too low will lower the effectiveness of the 139 # function. 140 # Default value: 10. 
79 141 #focr_autosort_buffer 10 80 # 81 ######### 82 83 #################### 84 85 ##### Various Score/Scan settings ##### 86 # Timeout for the plugin, in seconds. (Default value: 10) 87 #focr_timeout 15 88 # 89 # Configures whether the timeout is global or per helper application (Default value: 0) 90 #focr_global_timeout 1 91 # 92 # Maximum file size for different formats in byte, bigger pictures will not be scanned (Default values: Unlimited) 142 143 ### 144 ### Scan Settings 145 ### 146 147 # Timeout for the plugin, in seconds. (Maximum runtime of the plugin) 148 # Default value: 10 149 focr_timeout 15 150 151 # Use a global timeout value instead of per helper application. 152 # Default value: 0 153 focr_global_timeout 1 154 155 # Maximum file size for different formats in byte, bigger pictures 156 # will not be scanned 157 # Default values: Unlimited) 93 158 #focr_max_size_gif 80000 94 159 #focr_max_size_jpeg 100000 … … 96 161 #focr_max_size_bmp 500000 97 162 #focr_max_size_tiff 500000 98 # 99 # Skip checking the following image types (default: 0 check all) 163 164 # Skip checking the following image types 165 # Default value: 0 (check image type) 100 166 #focr_skip_gif 1 101 167 #focr_skip_jpeg 1 … … 103 169 #focr_skip_bmp 1 104 170 #focr_skip_tiff 1 105 # 106 # Default detection treshold (see manual) (Default value: 0.3) (Can be changed on a per word basis in the wordlist). 107 #focr_threshold 0.3 108 # 109 # This is the score for a hit after focr_counts_required matches 110 #focr_base_score 5 111 # 112 # This is the additional score for every additional match after focr_counts_required matches (Default value: 1) 113 #focr_add_score 0.375 114 # 115 # This is the score to give for a wrong content-type (e.g. 
JPEG image but content type says GIF) (Default value: 1.5) 116 #focr_wrongctype_score 1.5 117 # 118 # This is the score to give for a corrupted image (This currently affects only GIF images) (Default value: 2.5) 119 #focr_corrupt_score 2.5 120 # 121 # This is the score to give for a corrupted unfixable image (This currently affects only GIF images) (Default value: 5) 122 #focr_corrupt_unfixable_score 5 123 # 124 # This is used to disable the OCR engine if the message has already more points than this value (Default value: 10) 125 #focr_autodisable_score 10 126 # 127 # This is used to disable the OCR engine if the message has less points than this value (Default value: -5) 128 #focr_autodisable_negative_score -5 129 # 171 172 # Default detection treshold (see manual) 173 # Default value: 0.25 (Can be changed on a per word basis in the wordlist). 174 #focr_threshold 0.20 175 130 176 # Number of minimum matches before the rule scores (Default value: 2) 131 177 #focr_counts_required 3 132 # 133 ####################################### 134 135 ##### Image Hash Database settings (Experimental, disabled by default) ##### 136 # 137 # Set this to 1 to enable the Image Hash database feature (Default value: 0.0) 138 # Value = 1 ... use digest_hash only 139 # Value = 2 ... use digest_db w/digest_hash import 140 # Value = 3 ... use mysql database 178 179 # This is the score for a hit after focr_counts_required matches 180 # Default value: 5 181 focr_base_score 5 182 183 # This is the additional score for every additional match after 184 # focr_counts_required matches 185 # Default value: 1 186 focr_add_score 0.375 187 188 # This is the score to give for a wrong content-type. 189 # e.g. JPEG image but content type says GIF 190 # Default value: 1.5 191 #focr_wrongctype_score 1.5 192 193 # This is the score to give for a corrupted image. 
194 # This currently affects only GIF images 195 # Default value: 2.5 196 #focr_corrupt_score 2.5 197 198 # This is the score to give for a corrupted unfixable image. 199 # This currently affects only GIF images. 200 # Default value: 5 201 #focr_corrupt_unfixable_score 5 202 203 # This is used to disable the OCR engine if the message has 204 # already more points than this value 205 # Default value: 10 206 #focr_autodisable_score 30 207 208 # This is used to disable the OCR engine if the message has 209 # already less points than this value 210 # Default value: -5 211 #focr_autodisable_negative_score -5 212 213 214 ### 215 ### Hashing Options (Optional) 216 ### 217 218 # Select which type of image hashing to use: 219 # Default value: 0 (disabled) 220 # Allowed values: 221 # 1 ... use digest_hash only (deprecated) 222 # 2 ... use digest_db w/digest_hash import (see requirements) 223 # 3 ... use mysql database (see requirements) 224 #-- 225 # The score is saved with the hash in the database, allowing the plugin to 226 # skip the scans when the image is found in the database, using the score 227 # from the previous scans. 228 #-- 141 229 #focr_enable_image_hashing 3 142 # 143 # The score is saved with the hash in the database, so no extra scoring for a db hit is required. 
144 # 145 # If the image hash database feature is enabled, specify the file here to use as database 146 # (Default value: /etc/mail/spamassassin/FuzzyOcr.hashdb) 147 focr_digest_db /etc/mail/spamassassin/FuzzyOcr.hashdb 148 # 149 # If the image hash db feature is enabled, specify the file here to use as database 150 # (Default value: ~/.spamassassin/FuzzyOcr.db) 151 focr_db_hash /etc/mail/spamassassin/FuzzyOcr.db 152 # 153 # If the image hash db feature is enabled, specify the file here to use as database 154 # (Default value: ~/.spamassassin/FuzzyOcr.safe.db) 155 focr_db_safe /etc/mail/spamassassin/FuzzyOcr.safe.db 156 # 157 # Expire records from focr_digest_db after (Default: 35) days 230 231 # Set this to skip updating the hashing database at startup 232 # Default value: 0 (update at startup) 233 #focr_skip_updates 1 234 235 # Automatically add hashes of spam images recognized by OCR to the Image 236 # Hash database, to disable, set to 0.0 237 # Default value: 1 (learn) 238 #focr_hashing_learn_scanned 1 239 240 # Score images whose global word count is below focr_counts_required using 241 # the following formula: (focr_add_score * word count) as score. 
242 # Default value: 0 (ignore images) 243 #focr_score_ham 1 244 245 # If the image hash database feature is enabled (Type 1 Hashing), 246 # specify the file to use as database 247 # Default value: /etc/mail/spamassassin/FuzzyOcr.hashdb 248 #focr_digest_db /etc/mail/spamassassin/FuzzyOcr.hashdb 249 250 # If the image hash db feature is enabled (Type 2 Hashing), 251 # specify the file to use as the SPAM database 252 # Default value: /etc/mail/spamassassin/FuzzyOcr.db 253 #focr_db_hash /etc/mail/spamassassin/FuzzyOcr.db 254 255 # If the image hash db feature is enabled (Type 2 Hashing), 256 # specify the file to use as the HAM database 257 # Default value: /etc/mail/spamassassin/FuzzyOcr.safe.db 258 #focr_db_safe /etc/mail/spamassassin/FuzzyOcr.safe.db 259 260 # Auto-prune: Expire records from hashing databases after this many days 261 # Default value: 35 158 262 focr_db_max_days 15 159 # 160 # MySQL options 263 264 ### 265 ### MySQL options 266 ### 267 161 268 #focr_mysql_db FuzzyOcr 162 269 #focr_mysql_hash Hash … … 164 271 #focr_mysql_user fuzzyocr 165 272 #focr_mysql_pass fuzzyocr 273 #focr_mysql_host localhost 166 274 #focr_mysql_port 3306 167 275 #focr_mysql_socket /tmp/mysql.sock 168 # 169 # Automatically add hashes of spam images recognized by OCR to the Image Hash database, to disable, set to 0.0 (Default value: 1) 170 #focr_hashing_learn_scanned 1 171 # 172 # Keep files that generate errors 173 # 0 = always cleanup 276 277 # If set, the database table is updated with different data from one of 278 # the following: 279 # + filename, 280 # + image-params, 281 # + content-type, 282 # + file-type, 283 # + score, 284 # + word-info 285 # Default value: 0 286 #focr_mysql_update_hash 1 287 288 ### 289 ### Miscellaneous Options 290 ### 291 292 # The plugin uses a temporary directory to store intermediate information. 
293 # In order to keep these files for debugging purposes, use any of these 294 # values: 295 # 0 = always cleanup (default value) 174 296 # 1 = keep only if error 175 297 # 2 = always keep 298 #-- 299 # Keeping these intermediate files could fill your HDD _very_ fast! 300 # Make sure you periodically empty your temp dir (usually: /tmp) or 301 # suffer the consequences. You've been warned!! 302 #-- 176 303 #focr_keep_bad_images 1 177 # 178 # Score images whose global word count is below focr_counts_required using focr_add_score * word count as score. 179 #focr_score_ham 1 180 ###################################################################### 181 304 305 ################################################################# 182 306 # DO NOT REMOVE THIS LINE, IT IS REQUIRED UNDER ALL CIRCUMSTANCES 183 307 focr_end_config trunk/devel/FuzzyOcr.pm
r69 r70 20 20 use POSIX; 21 21 22 use lib qw(. /etc/mail/spamassassin); # Allow placing of FuzzyOcr in siteconfigdir23 24 use FuzzyOcr::Logging qw(debuglog );22 use lib qw(.); # Allow placing of FuzzyOcr in siteconfigdir 23 24 use FuzzyOcr::Logging qw(debuglog errorlog infolog); 25 25 use FuzzyOcr::Config qw(kill_pid 26 26 get_tmpdir … … 70 70 if ($conf->{focr_global_timeout}) { 71 71 my $t = get_timeout(); 72 debuglog("Global Timeout set at ".$conf->{focr_timeout}." sec.");72 infolog("Global Timeout set at ".$conf->{focr_timeout}." sec."); 73 73 $t->run(sub { 74 74 $end = fuzzyocr_do( $self, $conf, $pms ); 75 75 }); 76 76 if ($t->timed_out()) { 77 debuglog("Scan timed out after $conf->{focr_timeout} seconds.");78 debuglog("Killing possibly running pid...");77 infolog("Scan timed out after $conf->{focr_timeout} seconds."); 78 infolog("Killing possibly running pid..."); 79 79 my ($ret, $pid) = kill_pid(); 80 80 if ($ret > 0) { 81 debuglog("Successfully killed PID $pid");81 infolog("Successfully killed PID $pid"); 82 82 } elsif ($ret < 0) { 83 debuglog("No processes left... exiting");83 infolog("No processes left... exiting"); 84 84 } else { 85 debuglog("Failed to kill PID $pid, stale process!");85 infolog("Failed to kill PID $pid, stale process!"); 86 86 } 87 87 return 0; … … 90 90 $end = fuzzyocr_do( $self, $conf, $pms ); 91 91 } 92 debuglog("Processed in ".92 infolog("Processed in ". 93 93 sprintf("%.6f",tv_interval($begin, [gettimeofday])) 94 94 ." 
sec."); … … 103 103 104 104 if ( $current_score > $score ) { 105 debuglog("Scan canceled, message has already more than $score points ($current_score).");105 infolog("Scan canceled, message has already more than $score points ($current_score)."); 106 106 return 0; 107 107 } … … 109 109 $score = $conf->{focr_autodiable_negative_score} || -100; 110 110 if ( $current_score < $score ) { 111 debuglog("Scan canceled, message has less than $score points ($current_score).");111 infolog("Scan canceled, message has less than $score points ($current_score)."); 112 112 return 0; 113 113 } 114 114 115 my $ddb;116 115 my $imgdir; 117 116 my %imgfiles = (); … … 123 122 124 123 debuglog("Starting FuzzyOcr..."); 125 debuglog("Attempting to load personal wordlist...");126 124 if ($conf->{focr_personal_wordlist} =~ m/^\//) { 125 debuglog("Attempting to load personal wordlist..."); 127 126 read_words( $conf->{focr_personal_wordlist} ); 128 } else {129 my $peruserlist = $main->sed_path($conf->{focr_personal_wordlist});130 if (-r $peruserlist) {131 read_words($peruserlist);132 } else {133 debuglog("Error getting personal wordlist, skipping...");134 }135 127 } 136 128 … … 153 145 154 146 if ($test == 0) { 155 debuglog("Skipping file with content-type=\"$ctype\" name=\"$fname\"");147 infolog("Skipping file with content-type=\"$ctype\" name=\"$fname\""); 156 148 next; 157 149 } … … 162 154 163 155 unless ($imgdir) { 164 debuglog("Scan canceled, cannot create Image TMPDIR.");156 errorlog("Scan canceled, cannot create Image TMPDIR."); 165 157 return 0; 166 158 } … … 194 186 $imgfiles{$imgfilename}{ftype} = 1; 195 187 ($w,$h) = unpack("vv",substr($pdata,6,4)); 196 debuglog("GIF: $imgfilename '${h}x${w}'");188 debuglog("GIF: [${h}x${w}] $imgfilename"); 197 189 $imgfiles{$imgfilename}{width} = $w; 198 190 $imgfiles{$imgfilename}{height} = $h; … … 204 196 my ($b,$m) = unpack("CC",substr($pdata,$pos,2)); $pos += 2; 205 197 if ($b != 0xff) { 206 debuglog("Invalid JPEG image");198 infolog("Invalid 
JPEG image"); 207 199 $pos = $pdatalen + 1; 208 200 last; … … 218 210 } 219 211 if ($pos > $pdatalen) { 220 debuglog("Cannot find image dimensions");212 errorlog("Cannot find image dimensions"); 221 213 } else { 222 214 ($h,$w) = unpack("nn",substr($pdata,$pos+3,4)); 223 debuglog("JPEG: $imgfilename '${h}x${w}'");215 debuglog("JPEG: [${h}x${w}] $imgfilename"); 224 216 $imgfiles{$imgfilename}{ftype} = 2; 225 217 $imgfiles{$imgfilename}{height} = $h; … … 232 224 $imgfiles{$imgfilename}{width} = $w; 233 225 $imgfiles{$imgfilename}{height} = $h; 234 debuglog("PNG: $imgfilename '${h}x${w}'");226 debuglog("PNG: [${h}x${w}] $imgfilename"); 235 227 } elsif ( substr($pdata,0,2) eq "BM" ) { 236 228 ## BMP File … … 239 231 $imgfiles{$imgfilename}{width} = $w; 240 232 $imgfiles{$imgfilename}{height} = $h; 241 debuglog("BMP: $imgfilename '${h}x${w}'");233 debuglog("BMP: [${h}x${w}] $imgfilename"); 242 234 } elsif ( 243 235 (substr($pdata,0,4) eq "\x4d\x4d\x00\x2a") or … … 250 242 my $add = 2 + ($n * 12); 251 243 my ($id,$tag,$cnt,$val) = unpack($worder?"vvVV":"nnNN",substr($pdata,$offset+$add,12)); 252 $h = $val if ($ tag== 256);253 $w = $val if ($ tag== 257);244 $h = $val if ($id == 256); 245 $w = $val if ($id == 257); 254 246 last if ($h != 0 and $w != 0); 255 247 } 256 debuglog("TIFF: $imgfilename ($worder) '${h}x${w}"); 257 debuglog("Cannot determite size of TIFF image, setting to '1x1'") 258 if ($h == 0 and $w == 0); 248 debuglog("TIFF: [${h}x${w}] $imgfilename ($worder)"); 249 debuglog("Cannot determite size of TIFF image, setting to '1x1'") if ($h == 0 and $w == 0); 259 250 $imgfiles{$imgfilename}{ftype} = 5; 260 251 $imgfiles{$imgfilename}{width} = $w ? 
$w : 1; … … 266 257 $imgfiles{$imgfilename}{fsize} = $pdatalen; 267 258 unless (open PICT, ">$imgfilename") { 268 debuglog("Cannot write \"$imgfilename\", skipping...");259 errorlog("Cannot write \"$imgfilename\", skipping..."); 269 260 delete $imgfiles{$imgfilename}; 270 261 next; … … 278 269 279 270 if ($cnt == 0) { 280 debuglog("Skipping OCR, no image files found...");271 infolog("Skipping OCR, no image files found..."); 281 272 removedir($imgdir) if (defined($imgdir) and ($conf->{focr_keep_bad_images}<2)); 282 273 return 0; 283 274 } 284 debuglog("Found: $cnt images"); $cnt = 0;275 infolog("Found: $cnt images"); $cnt = 0; 285 276 if ($conf->{focr_enable_image_hashing} == 3) { 286 $ ddb = $conf->{focr_ddb} = get_mysql_ddb();277 $conf->{focr_ddb} = get_mysql_ddb(); 287 278 } 288 279 … … 293 284 foreach my $file (keys %imgfiles) { 294 285 my $pic = $imgfiles{$file}; 295 debuglog("Analyzing file with content-type=\"$$pic{ctype}\"");286 infolog("Analyzing file with content-type=\"$$pic{ctype}\""); 296 287 my @used_scansets = (); 297 288 my $corrupt = 0; … … 304 295 305 296 if ( $$pic{ftype} == 1 ) { 306 debuglog("Found GIF header name=\"$$pic{fname}\"");297 infolog("Found GIF header name=\"$$pic{fname}\""); 307 298 if ($conf->{focr_skip_gif}) { 308 debuglog("Skipping image check");299 infolog("Skipping image check"); 309 300 next IMAGE; 310 301 } 311 302 if (defined($conf->{focr_max_size_gif}) and ($$pic{fsize} > $conf->{focr_max_size_gif})) { 312 debuglog("GIF file size ($$pic{fsize}) exceeds maximum file size for this format, skipping...");303 infolog("GIF file size ($$pic{fsize}) exceeds maximum file size for this format, skipping..."); 313 304 next; 314 305 } … … 322 313 foreach my $a (qw/gifsicle giftext giffix gifinter giftopnm/) { 323 314 unless (defined $conf->{"focr_bin_$a"}) { 324 debuglog("Cannot exec $a, skipping image");315 errorlog("Cannot exec $a, skipping image"); 325 316 next IMAGE; 326 317 } … … 336 327 if ($retcode<0) { # only care if we timed out 
337 328 chomp $retcode; 338 debuglog("$conf->{focr_bin_giftext} Timed out [$retcode], skipping...");329 errorlog("$conf->{focr_bin_giftext} Timed out [$retcode], skipping..."); 339 330 ++$imgerr if $conf->{focr_keep_bad_images}>0; next; 340 331 } … … 351 342 } 352 343 if ($interlaced_gif or ($image_count > 1)) { 353 debuglog("Image is interlaced or animated...");344 infolog("Image is interlaced or animated..."); 354 345 } 355 346 else { 356 debuglog("Image is single non-interlaced...");347 infolog("Image is single non-interlaced..."); 357 348 $tfile .= "-fixed.gif"; 358 349 printf RAWERR "## $conf->{focr_bin_giffix} $file >$tfile 2>>$efile\n" if ($haserr>0); … … 362 353 if ($retcode<0) { # only care if we timed out 363 354 chomp $retcode; 364 debuglog("$conf->{focr_bin_giffix}: Timed out [$retcode], skipping...");355 errorlog("$conf->{focr_bin_giffix}: Timed out [$retcode], skipping..."); 365 356 printf RAWERR "?? Timed out > $retcode\n" if ($haserr>0); 366 357 ++$imgerr if $conf->{focr_keep_bad_images}>0; next; … … 381 372 if ($corrupt) { 382 373 if ($interlaced_gif or ($image_count > 1)) { 383 debuglog("Skipping corrupted interlaced image...");374 infolog("Skipping corrupted interlaced image..."); 384 375 corrupt_img($conf->{focr_corrupt_unfixable_score}, $corrupt); 385 376 next; 386 377 } 387 378 if (-z $tfile) { 388 debuglog("Uncorrectable corruption detected, skipping non-interlaced image...");379 infolog("Uncorrectable corruption detected, skipping non-interlaced image..."); 389 380 corrupt_img($conf->{focr_corrupt_unfixable_score}, $corrupt); 390 381 next; 391 382 } 392 debuglog("Image is corrupt, but seems fixable, continuing...");383 infolog("Image is corrupt, but seems fixable, continuing..."); 393 384 corrupt_img($conf->{focr_corrupt_score}, $corrupt); 394 385 } 395 386 396 387 if ($image_count > 1) { 397 debuglog("File contains <$image_count> images, deanimating...");388 infolog("File contains <$image_count> images, deanimating..."); 398 389 $tfile = 
deanimate($tfile); 399 390 } 400 391 401 392 if ($interlaced_gif) { 402 debuglog("Processing interlaced_gif $tfile...");393 infolog("Processing interlaced_gif $tfile..."); 403 394 my $cfile = $tfile; 404 395 if ($tfile =~ m/\.gif$/i) { … … 414 405 chomp $retcode; 415 406 printf RAWERR "?? Timed out > $retcode\n" if ($haserr>0); 416 debuglog("$conf->{focr_bin_gifinter}: Timed out [$retcode], skipping...");407 errorlog("$conf->{focr_bin_gifinter}: Timed out [$retcode], skipping..."); 417 408 ++$imgerr if $conf->{focr_keep_bad_images}>0; next; 418 409 } elsif ($retcode>0) { 419 410 chomp $retcode; 420 411 printf RAWERR "?? [$retcode] returned from $conf->{focr_bin_gifinter}\n" if ($haserr>0); 421 debuglog("$conf->{focr_bin_gifinter}: Returned [$retcode], skipping...");412 errorlog("$conf->{focr_bin_gifinter}: Returned [$retcode], skipping..."); 422 413 ++$imgerr if $conf->{focr_keep_bad_images}>0; next; 423 414 } … … 431 422 chomp $retcode; 432 423 printf RAWERR "?? Timed out > $retcode\n" if ($haserr>0); 433 debuglog("$conf->{focr_bin_giftopnm}: Timed out [$retcode], skipping...");424 errorlog("$conf->{focr_bin_giftopnm}: Timed out [$retcode], skipping..."); 434 425 ++$imgerr if $conf->{focr_keep_bad_images}>0; next; 435 426 } elsif ($retcode>0) { 436 427 chomp $retcode; 437 428 printf RAWERR "?? 
[$retcode] returned from $conf->{focr_bin_giftopnm}\n" if ($haserr>0); 438 debuglog("$conf->{focr_bin_giftopnm}: Returned [$retcode], skipping...");429 errorlog("$conf->{focr_bin_giftopnm}: Returned [$retcode], skipping..."); 439 430 ++$imgerr if $conf->{focr_keep_bad_images}>0; next; 440 431 } 441 432 } 442 433 elsif ( $$pic{ftype} == 2 ) { 443 debuglog("Found JPEG header name=\"$$pic{fname}\"");434 infolog("Found JPEG header name=\"$$pic{fname}\""); 444 435 if ($conf->{focr_skip_jpeg}) { 445 debuglog("Skipping image check");436 infolog("Skipping image check"); 446 437 next IMAGE; 447 438 } 448 439 449 440 if (defined($conf->{focr_max_size_jpeg}) and ($$pic{fsize} > $conf->{focr_max_size_jpeg})) { 450 debuglog("JPEG file size ($$pic{fsize}) exceeds maximum file size for this format, skipping...");441 infolog("JPEG file size ($$pic{fsize}) exceeds maximum file size for this format, skipping..."); 451 442 next; 452 443 } … … 456 447 foreach my $a (qw/jpegtopnm/) { 457 448 unless (defined $conf->{"focr_bin_$a"}) { 458 debuglog("Cannot exec $a, skipping image");449 errorlog("Cannot exec $a, skipping image"); 459 450 next IMAGE; 460 451 } … … 466 457 chomp $retcode; 467 458 printf RAWERR "?? Timed out > $retcode\n" if ($haserr>0); 468 debuglog("$conf->{focr_bin_jpegtopnm}: Timed out [$retcode], skipping...");459 errorlog("$conf->{focr_bin_jpegtopnm}: Timed out [$retcode], skipping..."); 469 460 ++$imgerr if $conf->{focr_keep_bad_images}>0; next; 470 461 } elsif ($retcode>0) { 471 462 chomp $retcode; 472 463 printf RAWERR "?? 
[$retcode] returned from $conf->{focr_bin_jpegtopnm}\n" if ($haserr>0); 473 debuglog("$conf->{focr_bin_jpegtopnm}: Returned [$retcode], skipping...");464 errorlog("$conf->{focr_bin_jpegtopnm}: Returned [$retcode], skipping..."); 474 465 ++$imgerr if $conf->{focr_keep_bad_images}>0; next; 475 466 } 476 467 } 477 468 elsif ( $$pic{ftype} == 3 ) { 478 debuglog("Found PNG header name=\"$$pic{fname}\"");469 infolog("Found PNG header name=\"$$pic{fname}\""); 479 470 if ($conf->{focr_skip_png}) { 480 debuglog("Skipping image check");471 infolog("Skipping image check"); 481 472 next IMAGE; 482 473 } 483 474 if (defined($conf->{focr_max_size_png}) and ($$pic{fsize} > $conf->{focr__max_size_png})) { 484 debuglog("PNG file size ($$pic{fsize}) exceeds maximum file size for this format, skipping...");475 infolog("PNG file size ($$pic{fsize}) exceeds maximum file size for this format, skipping..."); 485 476 next; 486 477 } … … 490 481 foreach my $a (qw/pngtopnm/) { 491 482 unless (defined $conf->{"focr_bin_$a"}) { 492 debuglog("Cannot exec $a, skipping image");483 errorlog("Cannot exec $a, skipping image"); 493 484 next IMAGE; 494 485 } … … 501 492 chomp $retcode; 502 493 printf RAWERR "?? Timed out > $retcode\n" if ($haserr>0); 503 debuglog("$conf->{focr_bin_pngtopnm}: Timed out [$retcode], skipping...");494 errorlog("$conf->{focr_bin_pngtopnm}: Timed out [$retcode], skipping..."); 504 495 ++$imgerr if $conf->{focr_keep_bad_images}>0; next; 505 496 } elsif ($retcode>0) { 506 497 chomp $retcode; 507 498 printf RAWERR "?? 
[$retcode] returned from $conf->{focr_bin_pngtopnm}\n" if ($haserr>0); 508 debuglog("$conf->{focr_bin_pngtopnm}: Returned [$retcode], skipping...");499 errorlog("$conf->{focr_bin_pngtopnm}: Returned [$retcode], skipping..."); 509 500 ++$imgerr if $conf->{focr_keep_bad_images}>0; next; 510 501 } 511 502 } 512 503 elsif ( $$pic{ftype} == 4 ) { 513 debuglog("Found BMP header name=\"$$pic{fname}\"");504 infolog("Found BMP header name=\"$$pic{fname}\""); 514 505 if ($conf->{focr_skip_bmp}) { 515 debuglog("Skipping image check");506 infolog("Skipping image check"); 516 507 next IMAGE; 517 508 } 518 509 if (defined($conf->{focr_max_size_bmp}) and ($$pic{fsize} > $conf->{focr_max_size_bmp})) { 519 debuglog("BMP file size ($$pic{fsize}) exceeds maximum file size for this format, skipping...");510 infolog("BMP file size ($$pic{fsize}) exceeds maximum file size for this format, skipping..."); 520 511 next; 521 512 } … … 525 516 foreach my $a (qw/bmptopnm/) { 526 517 unless (defined $conf->{"focr_bin_$a"}) { 527 debuglog("Cannot exec $a, skipping image");518 errorlog("Cannot exec $a, skipping image"); 528 519 next IMAGE; 529 520 } … … 535 526 chomp $retcode; 536 527 printf RAWERR "?? Timed out > $retcode\n" if ($haserr>0); 537 debuglog("$conf->{focr_bin_bmptopnm}: Timed out [$retcode], skipping...");528 errorlog("$conf->{focr_bin_bmptopnm}: Timed out [$retcode], skipping..."); 538 529 ++$imgerr if $conf->{focr_keep_bad_images}>0; next; 539 530 } elsif ($retcode>0) { 540 531 chomp $retcode; 541 532 printf RAWERR "?? 
[$retcode] returned from $conf->{focr_bin_bmptopnm}\n" if ($haserr>0); 542 debuglog("$conf->{focr_bin_bmptopnm}: Returned [$retcode], skipping...");533 errorlog("$conf->{focr_bin_bmptopnm}: Returned [$retcode], skipping..."); 543 534 ++$imgerr if $conf->{focr_keep_bad_images}>0; next; 544 535 } 545 536 } 546 537 elsif ( $$pic{ftype} == 5 ) { 547 debuglog("Found TIFF header name=\"$$pic{fname}\"");538 infolog("Found TIFF header name=\"$$pic{fname}\""); 548 539 if ($conf->{focr_skip_tiff}) { 549 debuglog("Skipping image check");540 infolog("Skipping image check"); 550 541 next IMAGE; 551 542 } 552 543 if (defined($conf->{focr_max_size_tiff}) and ($$pic{fsize} > $conf->{focr_max_size_tiff})) { 553 debuglog("TIFF file size ($$pic{fsize}) exceeds maximum file size for this format, skipping...");544 infolog("TIFF file size ($$pic{fsize}) exceeds maximum file size for this format, skipping..."); 554 545 next; 555 546 } … … 560 551 foreach my $a (qw/tifftopnm/) { 561 552 unless (defined $conf->{"focr_bin_$a"}) { 562 debuglog("Cannot exec $a, skipping image");553 errorlog("Cannot exec $a, skipping image"); 563 554 next IMAGE; 564 555 } … … 570 561 chomp $retcode; 571 562 printf RAWERR "?? Timed out > $retcode\n" if ($haserr>0); 572 debuglog("$conf->{focr_bin_tifftopnm}: Timed out [$retcode], skipping...");563 errorlog("$conf->{focr_bin_tifftopnm}: Timed out [$retcode], skipping..."); 573 564 ++$imgerr if $conf->{focr_keep_bad_images}>0; next; 574 565 } elsif ($retcode>0) { 575 566 chomp $retcode; 576 567 printf RAWERR "?? [$retcode] returned from $conf->{focr_bin_tifftopnm}\n" if ($haserr>0); 577 debuglog("$conf->{focr_bin_tifftopnm}: Returned [$retcode], skipping...");568 errorlog("$conf->{focr_bin_tifftopnm}: Returned [$retcode], skipping..."); 578 569 ++$imgerr if $conf->{focr_keep_bad_images}>0; next; 579 570 } 580 571 } 581 572 else { 582 debuglog("Image type not recognized, unknown format. 
Skipping this image...");573 errorlog("Image type not recognized, unknown format. Skipping this image..."); 583 574 next; 584 575 } 585 576 586 577 if($conf->{focr_enable_image_hashing}) { 587 debuglog("Calculating the image hash: $pfile");578 infolog("Calculating image hash for: $pfile"); 588 579 ($corrupt, $digest) = calc_image_hash($pfile,$pic); 589 580 if ($corrupt) { 590 debuglog("Error calculating the image hash, skipping hash check...");581 infolog("Error calculating the image hash, skipping hash check..."); 591 582 } else { 592 583 my ($score, $dinfo, $whash); … … 597 588 if ($score > 0) { 598 589 known_img_hash($score,$dinfo); 599 debuglog("Message is SPAM. $dinfo") if ($conf->{focr_enable_image_hashing} < 3);590 infolog("Message is SPAM. $dinfo") if ($conf->{focr_enable_image_hashing} < 3); 600 591 removedir($imgdir); 601 592 return 0; … … 606 597 ($score,$dinfo) = check_image_hash_db($digest, $whash, $$pic{fname}, $$pic{ctype}, $$pic{ftype}); 607 598 if ($score > 0) { 608 debuglog("Image in KNOWN_GOOD. Skipping OCR checks...");599 infolog("Image in KNOWN_GOOD. Skipping OCR checks..."); 609 600 next IMAGE; 610 601 } 611 602 } 612 603 if ($digest eq '') { 613 debuglog("Empty Hash, skipping...");604 infolog("Empty Hash, skipping..."); 614 605 next IMAGE; 615 606 } 616 607 } else { 617 debuglog("Image hashing disabled in configuration, skipping...");608 infolog("Image hashing disabled in configuration, skipping..."); 618 609 } 619 610 my @ocr_results = (); … … 623 614 my %words = %$wref; 624 615 foreach my $scanset (@$scansets) { 616 my $scanlabel = $scanset->{label}; 617 my $scancmd = $scanset->{command}; 618 if ($scancmd} =~ m/^\$/) { 619 infolog("Skipping $scanlabel, invalid command '$scancmd'"); 620 next; 621 } 625 622 my $cmcnt = 0; 626 623 my @cfound; 627 my $scancmd = $scanset->{ocr_command}; 628 my $scanlabel = $scanset->{label}; 624 if (defined $scanset->{args}) { 625 $scancmd .= ' ' . 
$scanset->{args}; 626 } 629 627 printf RAWERR qq(## $scancmd\n) if ($haserr>0); 630 628 my ($retcode, @result) = $scanset->run($pfile); 631 629 if ($retcode<0) { 632 debuglog("Timeout: \"$scancmd\" took more than $conf->{focr_timeout} sec."); 633 debuglog("Skipping scanset \"$scanlabel\" due to timeout, trying next..."); 634 printf RAWERR qq(Timeout[$conf->{focr_timeout}]: $scancmd\n) if ($haserr>0); 630 if ($retcode == -1) { 631 printf RAWERR qq(Timeout[$conf->{focr_timeout}]: $scancmd\n) if ($haserr>0); 632
