root/trunk/devel/FuzzyOcr.cf

Revision 132, 11.3 kB (checked in by decoder, 1 year ago)

Added capabilities to scan PDF files (highly experimental and disabled by default)

  • config params: focr_scan_pdfs focr_pdf_maxpages
Line 
1 # Syntax:
2 # loadplugin <Plugin_Name> <Location>
3 #  <Location> path where Plugin resides.
4 loadplugin FuzzyOcr FuzzyOcr.pm
5
6 body     FUZZY_OCR                   eval:fuzzyocr_check()
7 body     FUZZY_OCR_WRONG_CTYPE       eval:dummy_check()
8 body     FUZZY_OCR_CORRUPT_IMG       eval:dummy_check()
9 body     FUZZY_OCR_WRONG_EXTENSION   eval:dummy_check()
10 body     FUZZY_OCR_KNOWN_HASH        eval:dummy_check()
11
12 describe FUZZY_OCR                   Mail contains an image with common spam text inside
13 describe FUZZY_OCR_WRONG_CTYPE       Mail contains an image with wrong content-type set
14 describe FUZZY_OCR_WRONG_EXTENSION   Mail contains an image with wrong file extension
15 describe FUZZY_OCR_CORRUPT_IMG       Mail contains a corrupted image
16 describe FUZZY_OCR_KNOWN_HASH        Mail contains an image with known hash
17
18 priority FUZZY_OCR 900
19
20 ###
21 ### Plugin Configuration
22 ###
23
24 ###
25 ### Logging options
26 ###
27
28 # Verbosity level (see manual)
29 # Level 0 - Errors only
30 # Level 1 - Errors and Warnings
31 # Level 2 - Errors, Warnings and Info Messages
32 # Level 3 - Full debug output
33 # Default value: 1
34 #focr_verbose 3
35
36 # Log Message-Id, From, To
37 # Default: 1
38 #focr_log_pmsinfo 0
39
40 # Send logging output to stderr.
41 # Default value: 1
42 #focr_log_stderr 0
43
44 # Logfile (make sure it is writable by the plugin)
45 # Default value: none
46 #focr_logfile /tmp/FuzzyOcr.log
47
48 ###
49 ### Wordlists
50 ###
51
52 # Here we define the words to scan for
53 # Default value: /etc/mail/spamassassin/FuzzyOcr.words
54 #focr_global_wordlist /etc/mail/spamassassin/FuzzyOcr.words
55 #
56 # This is the path RELATIVE to the respective home directory
57 # for the personalized list. This list is merged with the global
58 # word list on execution.
59 # Default value: ~/.spamassassin/fuzzyocr.words
60 # If value begins with '/', it is treated as fixed path.
61 #focr_personal_wordlist fuzzyocr.words
62 #
63 # This option allows you to disable the whole personalization stuff,
64 # i.e. FuzzyOcr will not call functions in SA that require home
65 # directories for your users. This is only required if you are running
66 # an environment where the users don't have home directories at all.
67 # Default value: 0
68 #
69 #focr_no_homedirs 1
70 #
71 ## Optionally, disable this option if you want to scan for numbers
72 ## Setting this to 0 will cause FuzzyOcr not to strip numbers from
73 ## both the wordlist and the OCR results
74 #
75 #focr_strip_numbers 1
76
77
78 ###
79 ### Helper Applications
80 ###
81
82 # These parameters can be used to change other detection settings
83 # If you leave these commented out, the defaults will be used.
84 # Do not use " " around any parameters!
85
86 ###
87 ### Step 1:
88 ### Inform the plugin which helper apps are required.
89 ###
90
91 # The following are already included by default:
92 #
93 #focr_bin_helper gifsicle, giffix, giftext, gifinter, giftopnm
94 #focr_bin_helper jpegtopnm, pngtopnm, bmptopnm, tifftopnm, ppmhist
95 #focr_bin_helper gocr, ocrad
96
97 # Include additional scanner/preprocessor commands here:
98 #
99 focr_bin_helper pnmnorm, pnminvert, pamthreshold, ppmtopgm, pamtopnm
100 focr_bin_helper tesseract
101
102 # These helpers must be defined before enabling PDF scanning
103 #focr_bin_helper pdfinfo, pdftops, pstopnm
104
105 ###
106 ### Step 2:
107 ### Inform the plugin of the search path to find all helper apps.
108 ### Only the first match will be considered, so the order is important.
109 ###
110
111 # Search path for locating helper applications
112 #focr_path_bin /usr/local/netpbm/bin:/usr/local/bin:/usr/bin
113
114 ###
115 ### Step 3:
116 ### You can optionally define a helper application location, bypassing
117 ### the search path algorithm. Please note that if the helper app is not
118 ### previously defined, it will generate an error:
119
120 #focr_bin_gifsicle /usr/bin/gifsicle
121 #focr_bin_giffix /usr/bin/giffix
122 #focr_bin_giftext /usr/bin/giftext
123 #focr_bin_gifinter /usr/bin/gifinter
124 #focr_bin_giftopnm /usr/bin/giftopnm
125 #focr_bin_jpegtopnm /usr/bin/jpegtopnm
126 #focr_bin_pngtopnm /usr/bin/pngtopnm
127 #focr_bin_bmptopnm /usr/bin/bmptopnm
128 #focr_bin_tifftopnm /usr/bin/tifftopnm
129 #focr_bin_ppmhist /usr/bin/ppmhist
130 #focr_bin_gocr /usr/bin/gocr
131 #focr_bin_ocrad /usr/bin/ocrad
132
133 #focr_bin_pnmnorm /usr/bin/pnmnorm
134 #focr_bin_pnminvert /usr/bin/pnminvert
135
136 #focr_bin_pdfinfo /usr/bin/pdfinfo
137 #focr_bin_pdftops /usr/bin/pdftops
138 #focr_bin_pstopnm /usr/bin/pstopnm
139
140 ###
141 ### Scansets
142 ###
143
144 # Paths to the files containing Scansets and Preprocessors definitions
145 #
146 #focr_preprocessor_file /etc/mail/spamassassin/FuzzyOcr.preps
147 #focr_scanset_file /etc/mail/spamassassin/FuzzyOcr.scansets
148
149 # Setting this to 1 will cause FuzzyOcr to skip all other scansets,
150 # if a scanset has reached the amount of hits specified in
151 # focr_counts_required. (i.e. if the image is detected as spam).
152 # This saves resources, but may lower the scores, because the first scanset
153 # that reaches the required hits is used as the result rather than the best one.
154 # Default value: 1
155 #focr_minimal_scanset 0
156
157 # This option is only used when focr_minimal_scanset is enabled.
158 # Basically, this counts the effectiveness of a scanset on the current
159 # mail traffic and re-sorts the scansets with the most effective first.
160 # This saves unnecessary scanner passes and saves resources.
161 # Default value: 1.
162 #focr_autosort_scanset 0
163
164 # This is a parameter for the focr_autosort_scanset function, and specifies
165 # the maximum value of the effectiveness counter used in each scanset. If you
166 # increase this, it will take longer until the autosort function adapts to new
167 # types of spam, setting it too low will lower the effectiveness of the
168 # function.
169 # Default value: 10
170 #focr_autosort_buffer 10
171
172 ###
173 ### Scan Settings
174 ###
175
176 # Timeout for the plugin, in seconds. (Maximum runtime of the plugin)
177 # Default value: 10
178 #focr_timeout 15
179
180 # Use a global timeout value instead of per helper application.
181 # Default value: 0
182 #focr_global_timeout 1
183
184 # Minimum image size to scan. Images with dimensions smaller than the
185 # ones specified here will be skipped:
186 # (This parameter does not apply to PDF files)
187 # Default: Height:4 Width:4
188 #
189 #focr_min_height 4
190 #focr_min_width 4
191
192 # Maximum image size to scan. Images with dimensions bigger than the
193 # ones specified here will be skipped:
194 # (This parameter does not apply to PDF files)
195 # Default: Height:800 Width:800
196 #
197 #focr_max_height 800
198 #focr_max_width 800
199
200
201 # Maximum file size for different formats in bytes; bigger pictures
202 # will not be scanned
203 # Default values: Unlimited
204 #focr_max_size_gif 80000
205 #focr_max_size_jpeg 100000
206 #focr_max_size_png 80000
207 #focr_max_size_bmp 500000
208 #focr_max_size_tiff 500000
209
210 # Skip checking the following image types
211 # Default value: 0 (check image type)
212 #focr_skip_gif 1
213 #focr_skip_jpeg 1
214 #focr_skip_png 1
215 #focr_skip_bmp 1
216 #focr_skip_tiff 1
217 #
218
219 # PDF specific options
220 # WARNING: Enable this at your own risk, this might lead to false positives and classify
221 #          important documents as spam. YOU HAVE BEEN WARNED.
222 #focr_scan_pdfs 0
223 # PDFs having more pages than this value will be skipped
224 #focr_pdf_maxpages 1
225
226 # Default detection threshold (see manual)
227 # Default value: 0.25 (Can be changed on a per word basis in the wordlist).
228 #focr_threshold 0.20
229
230 # Number of minimum matches before the rule scores (Default value: 2)
231 #focr_counts_required 3
232
233 # Setting this will cause every word to be matched only once per image (Default value: 0)
234 #focr_unique_matches 1
235
236 # This is the score for a hit after focr_counts_required matches
237 # Default value: 5
238 #focr_base_score 5
239
240 # This is the additional score for every additional match after
241 # focr_counts_required matches
242 # Default value: 1
243 #focr_add_score 0.375
244
245 # This option defines the factor, which is multiplied with the number
246 # of matches, that were made without stripping spaces. FuzzyOcr does two
247 # matching attempts on OCR results, one without space stripping and one with.
248 # To weight the first match type more, this factor is applied.
249 # Default value: 1.5
250 #focr_twopass_scoring_factor 1.5
251
252 # This is the score to give for a wrong content-type.
253 # e.g. JPEG image but content type says GIF
254 # Default value: 1.5
255 #focr_wrongctype_score 1.5
256
257 # This is the score to give for a wrong file extension.
258 # e.g. JPEG image but file extension says GIF
259 # Default value: 1.5
260 #focr_wrongext_score 1.5
261
262 # This is the score to give for a corrupted image.
263 # This currently affects only GIF images
264 # Default value: 2.5
265 #focr_corrupt_score 2.5
266
267 # This is the score to give for a corrupted unfixable image.
268 # This currently affects only GIF images.
269 # Default value: 5
270 #focr_corrupt_unfixable_score 5
271
272 # This is used to disable the OCR engine if the message already has
273 # more points than this value
274 # Default value: 10
275 #focr_autodisable_score 30
276
277 # This is used to disable the OCR engine if the message already has
278 # fewer points than this value
279 # Default value: -5
280 #focr_autodisable_negative_score -5
281
282
283 ###
284 ### Hashing Options (Optional)
285 ###
286
287 # Select which type of image hashing to use:
288 # Default value: 0 (disabled)
289 # Allowed values:
290 #  1 ... use digest_hash only (deprecated)
291 #  2 ... use digest_db w/digest_hash import (see requirements, recommended)
292 #  3 ... use mysql database (see requirements, experimental)
293 #--
294 # The score is saved with the hash in the database, allowing the plugin to
295 # skip the scans when the image is found in the database, using the score
296 # from the previous scans.
297 #--
298 #focr_enable_image_hashing 3
299
300 # Set this to skip updating the hashing database at startup
301 # Default value: 0 (update at startup)
302 #focr_skip_updates 1
303
304 # Automatically add hashes of spam images recognized by OCR to the Image
305 # Hash database, to disable, set to 0
306 # Default value: 1 (learn)
307 #focr_hashing_learn_scanned 1
308
309 # Score images whose global word count is below focr_counts_required using
310 # the following formula: (focr_add_score * word count) as score.
311 # Default value: 0 (ignore images)
312 #focr_score_ham 1
313
314 # If the image hash database feature is enabled (Type 1 Hashing),
315 # specify the file to use as database
316 # Default value: /etc/mail/spamassassin/FuzzyOcr.hashdb
317 #focr_digest_db /etc/mail/spamassassin/FuzzyOcr.hashdb
318
319 # If the image hash db feature is enabled (Type 2 Hashing),
320 # specify the file to use as the SPAM database
321 # Default value: /etc/mail/spamassassin/FuzzyOcr.db
322 #focr_db_hash /etc/mail/spamassassin/FuzzyOcr.db
323
324 # If the image hash db feature is enabled (Type 2 Hashing),
325 # specify the file to use as the HAM database
326 # Default value: /etc/mail/spamassassin/FuzzyOcr.safe.db
327 #focr_db_safe /etc/mail/spamassassin/FuzzyOcr.safe.db
328
329 # Auto-prune: Expire records from hashing databases after this many days
330 # Default value: 35
331 #focr_db_max_days 15
332
333 ###
334 ### MySQL options (Type 3 Hashing)
335 ###
336
337 #focr_mysql_db FuzzyOcr
338 #focr_mysql_hash Hash
339 #focr_mysql_safe Safe
340 #focr_mysql_user fuzzyocr
341 #focr_mysql_pass fuzzyocr
342 #focr_mysql_host localhost
343 #focr_mysql_port 3306
344 #focr_mysql_socket /tmp/mysql.sock
345
346 # If set, the database table is updated with different data from one of
347 # the following:
348 #  + filename,
349 #  + image-params,
350 #  + content-type,
351 #  + file-type,
352 #  + score,
353 #  + word-info
354 # Default value: 0
355 #focr_mysql_update_hash 1
356
357 ###
358 ### Miscellaneous Options
359 ###
360
361 # The plugin uses a temporary directory to store intermediate information.
362 # In order to keep these files for debugging purposes use any of these
363 # values:
364 #  0 = always cleanup (default value)
365 #  1 = keep only if error
366 #  2 = always keep
367 #--
368 # Keeping these intermediate files could fill your HDD _very_ fast!
369 # Make sure you periodically empty your temp dir (usually: /tmp) or
370 # suffer the consequences.  You've been warned!!
371 #--
372 #focr_keep_bad_images 1
373
374 #################################################################
375 # DO NOT REMOVE THIS LINE, IT IS REQUIRED UNDER ALL CIRCUMSTANCES
376 focr_end_config
Note: See TracBrowser for help on using the browser.