root/trunk/devel/FuzzyOcr.cf

Revision 132, 11.3 KB (checked in by decoder, 3 years ago)

Added capabilities to scan PDF files (highly experimental and disabled by default)

  • config params: focr_scan_pdfs focr_pdf_maxpages
Line 
1# Syntax:
2# loadplugin <Plugin_Name> <Location>
3#  <Location> path where Plugin resides.
4loadplugin FuzzyOcr FuzzyOcr.pm
5
6body     FUZZY_OCR                   eval:fuzzyocr_check()
7body     FUZZY_OCR_WRONG_CTYPE       eval:dummy_check()
8body     FUZZY_OCR_CORRUPT_IMG       eval:dummy_check()
9body     FUZZY_OCR_WRONG_EXTENSION   eval:dummy_check()
10body     FUZZY_OCR_KNOWN_HASH        eval:dummy_check()
11
12describe FUZZY_OCR                   Mail contains an image with common spam text inside
13describe FUZZY_OCR_WRONG_CTYPE       Mail contains an image with wrong content-type set
14describe FUZZY_OCR_WRONG_EXTENSION   Mail contains an image with wrong file extension
15describe FUZZY_OCR_CORRUPT_IMG       Mail contains a corrupted image
16describe FUZZY_OCR_KNOWN_HASH        Mail contains an image with known hash
17
18priority FUZZY_OCR 900
19
20###
21### Plugin Configuration
22###
23
24###
25### Logging options
26###
27
28# Verbosity level (see manual)
29# Level 0 - Errors only
30# Level 1 - Errors and Warnings
31# Level 2 - Errors, Warnings and Info Messages
32# Level 3 - Full debug output
33# Default value: 1
34#focr_verbose 3
35
36# Log Message-Id, From, To
37# Default: 1
38#focr_log_pmsinfo 0
39
40# Send logging output to stderr.
41# Default value: 1
42#focr_log_stderr 0
43
44# Logfile (make sure it is writable by the plugin)
45# Default value: none
46#focr_logfile /tmp/FuzzyOcr.log
47
48###
49### Wordlists
50###
51
52# Here we define the words to scan for
53# Default value: /etc/mail/spamassassin/FuzzyOcr.words
54#focr_global_wordlist /etc/mail/spamassassin/FuzzyOcr.words
55#
56# This is the path RELATIVE to the respective home directory
57# for the personalized list. This list is merged with the global
58# word list on execution.
59# Default value: ~/.spamassassin/fuzzyocr.words
60# If value begins with '/', it is treated as fixed path.
61#focr_personal_wordlist fuzzyocr.words
62#
63# This option allows you to disable the whole personalization stuff,
64# i.e. FuzzyOcr will not call functions in SA that require home
65# directories for your users. This is only required if you are running
66# an environment where the users don't have home directories at all.
67# Default value: 0
68#
69#focr_no_homedirs 1
70#
71## Optionally, disable this option if you want to scan for numbers
72## Setting this to 0 will cause FuzzyOcr not to strip numbers from
73## both the wordlist and the OCR results
74#
75#focr_strip_numbers 1
76
77
78###
79### Helper Applications
80###
81
82# These parameters can be used to change other detection settings
83# If you leave these commented out, the defaults will be used.
84# Do not use " " around any parameters!
85
86###
87### Step 1:
88### Inform the plugin which helper apps are required.
89###
90
91# The following are already included by default:
92#
93#focr_bin_helper gifsicle, giffix, giftext, gifinter, giftopnm
94#focr_bin_helper jpegtopnm, pngtopnm, bmptopnm, tifftopnm, ppmhist
95#focr_bin_helper gocr, ocrad
96
97# Include additional scanner/preprocessor commands here:
98#
99focr_bin_helper pnmnorm, pnminvert, pamthreshold, ppmtopgm, pamtopnm
100focr_bin_helper tesseract
101
102# These helpers must be defined before enabling PDF scanning
103#focr_bin_helper pdfinfo, pdftops, pstopnm
104
105###
106### Step 2:
107### Inform the plugin of the search path to find all helper apps.
108### Only the first match will be considered, so the order is important.
109###
110
111# Search path for locating helper applications
112#focr_path_bin /usr/local/netpbm/bin:/usr/local/bin:/usr/bin
113
114###
115### Step 3:
116### You can optionally define a helper application location, bypassing
117### the search path algorithm. Please note that if the helper app is not
118### previously defined, it will generate an error:
119
120#focr_bin_gifsicle /usr/bin/gifsicle
121#focr_bin_giffix /usr/bin/giffix
122#focr_bin_giftext /usr/bin/giftext
123#focr_bin_gifinter /usr/bin/gifinter
124#focr_bin_giftopnm /usr/bin/giftopnm
125#focr_bin_jpegtopnm /usr/bin/jpegtopnm
126#focr_bin_pngtopnm /usr/bin/pngtopnm
127#focr_bin_bmptopnm /usr/bin/bmptopnm
128#focr_bin_tifftopnm /usr/bin/tifftopnm
129#focr_bin_ppmhist /usr/bin/ppmhist
130#focr_bin_gocr /usr/bin/gocr
131#focr_bin_ocrad /usr/bin/ocrad
132
133#focr_bin_pnmnorm /usr/bin/pnmnorm
134#focr_bin_pnminvert /usr/bin/pnminvert
135
136#focr_bin_pdfinfo /usr/bin/pdfinfo
137#focr_bin_pdftops /usr/bin/pdftops
138#focr_bin_pstopnm /usr/bin/pstopnm
139
140###
141### Scansets
142###
143
144# Paths to the files containing Scansets and Preprocessors definitions
145#
146#focr_preprocessor_file /etc/mail/spamassassin/FuzzyOcr.preps
147#focr_scanset_file /etc/mail/spamassassin/FuzzyOcr.scansets
148
149# Setting this to 1 will cause FuzzyOcr to skip all other scansets,
150# if a scanset has reached the amount of hits specified in
151# focr_counts_required. (i.e. if the image is detected as spam).
152# This saves resources, but lowers the scores because the first
153# scanset that reaches the required hits, not the best one, is taken as result.
154# Default value: 1
155#focr_minimal_scanset 0
156
157# This option is only used when focr_minimal_scanset is enabled.
158# Basically, this counts the effectiveness of a scanset on the current
159# mail traffic and re-sorts the scansets with the most effective first.
160# This saves unnecessary scanner passes and saves resources.
161# Default value: 1.
162#focr_autosort_scanset 0
163
164# This is a parameter for the focr_autosort_scanset function, and specifies
165# the maximum value of the effectiveness counter used in each scanset. If you
166# increase this, it will take longer until the autosort function adapts to new
167# types of spam, setting it too low will lower the effectiveness of the
168# function.
169# Default value: 10
170#focr_autosort_buffer 10
171
172###
173### Scan Settings
174###
175
176# Timeout for the plugin, in seconds. (Maximum runtime of the plugin)
177# Default value: 10
178#focr_timeout 15
179
180# Use a global timeout value instead of per helper application.
181# Default value: 0
182#focr_global_timeout 1
183
184# Minimum image size to scan. Images with dimensions smaller than the
185# ones specified here will be skipped:
186# (This parameter does not apply to PDF files)
187# Default: Height:4 Width:4
188#
189#focr_min_height 4
190#focr_min_width 4
191
192# Maximum image size to scan. Images with dimensions bigger than the
193# ones specified here will be skipped:
194# (This parameter does not apply to PDF files)
195# Default: Height:800 Width:800
196#
197#focr_max_height 800
198#focr_max_width 800
199
200
201# Maximum file size for different formats in bytes; bigger pictures
202# will not be scanned
203# (Default values: Unlimited)
204#focr_max_size_gif 80000
205#focr_max_size_jpeg 100000
206#focr_max_size_png 80000
207#focr_max_size_bmp 500000
208#focr_max_size_tiff 500000
209
210# Skip checking the following image types
211# Default value: 0 (check image type)
212#focr_skip_gif 1
213#focr_skip_jpeg 1
214#focr_skip_png 1
215#focr_skip_bmp 1
216#focr_skip_tiff 1
217#
218
219# PDF specific options
220# WARNING: Enable this at your own risk, this might lead to false positives and classify
221#          important documents as spam. YOU HAVE BEEN WARNED.
222#focr_scan_pdfs 0
223# PDFs having more pages than this value will be skipped
224#focr_pdf_maxpages 1
225
226# Default detection threshold (see manual)
227# Default value: 0.25 (Can be changed on a per word basis in the wordlist).
228#focr_threshold 0.20
229
230# Number of minimum matches before the rule scores (Default value: 2)
231#focr_counts_required 3
232
233# Setting this will cause every word to be matched only once per image (Default value: 0)
234#focr_unique_matches 1
235
236# This is the score for a hit after focr_counts_required matches
237# Default value: 5
238#focr_base_score 5
239
240# This is the additional score for every additional match after
241# focr_counts_required matches
242# Default value: 1
243#focr_add_score 0.375
244
245# This option defines the factor, which is multiplied with the number
246# of matches, that were made without stripping spaces. FuzzyOcr does two
247# matching attempts on OCR results, one without space strippings and one with.
248# To weight the first match type more, this factor is applied.
249# Default value: 1.5
250#focr_twopass_scoring_factor 1.5
251
252# This is the score to give for a wrong content-type.
253# e.g. JPEG image but content type says GIF
254# Default value: 1.5
255#focr_wrongctype_score 1.5
256
257# This is the score to give for a wrong file extension.
258# e.g. JPEG image but file extension says GIF
259# Default value: 1.5
260#focr_wrongext_score 1.5
261
262# This is the score to give for a corrupted image.
263# This currently affects only GIF images
264# Default value: 2.5
265#focr_corrupt_score 2.5
266
267# This is the score to give for a corrupted unfixable image.
268# This currently affects only GIF images.
269# Default value: 5
270#focr_corrupt_unfixable_score 5
271
272# This is used to disable the OCR engine if the message has
273# already more points than this value
274# Default value: 10
275#focr_autodisable_score 30
276
277# This is used to disable the OCR engine if the message has
278# already fewer points than this value
279# Default value: -5
280#focr_autodisable_negative_score -5
281
282
283###
284### Hashing Options (Optional)
285###
286
287# Select which type of image hashing to use:
288# Default value: 0 (disabled)
289# Allowed values:
290#  1 ... use digest_hash only (deprecated)
291#  2 ... use digest_db w/digest_hash import (see requirements, recommended)
292#  3 ... use mysql database (see requirements, experimental)
293#--
294# The score is saved with the hash in the database, allowing the plugin to
295# skip the scans when the image is found in the database, using the score
296# from the previous scans.
297#--
298#focr_enable_image_hashing 3
299
300# Set this to skip updating the hashing database at startup
301# Default value: 0 (update at startup)
302#focr_skip_updates 1
303
304# Automatically add hashes of spam images recognized by OCR to the Image
305# Hash database, to disable, set to 0
306# Default value: 1 (learn)
307#focr_hashing_learn_scanned 1
308
309# Score images whose global word count is below focr_counts_required using
310# the following formula: (focr_add_score * word count) as score.
311# Default value: 0 (ignore images)
312#focr_score_ham 1
313
314# If the image hash database feature is enabled (Type 1 Hashing),
315# specify the file to use as database
316# Default value: /etc/mail/spamassassin/FuzzyOcr.hashdb
317#focr_digest_db /etc/mail/spamassassin/FuzzyOcr.hashdb
318
319# If the image hash db feature is enabled (Type 2 Hashing),
320# specify the file to use as the SPAM database
321# Default value: /etc/mail/spamassassin/FuzzyOcr.db
322#focr_db_hash /etc/mail/spamassassin/FuzzyOcr.db
323
324# If the image hash db feature is enabled (Type 2 Hashing),
325# specify the file to use as the HAM database
326# Default value: /etc/mail/spamassassin/FuzzyOcr.safe.db
327#focr_db_safe /etc/mail/spamassassin/FuzzyOcr.safe.db
328
329# Auto-prune: Expire records from hashing databases after this many days
330# Default value: 35
331#focr_db_max_days 15
332
333###
334### MySQL options (Type 3 Hashing)
335###
336
337#focr_mysql_db FuzzyOcr
338#focr_mysql_hash Hash
339#focr_mysql_safe Safe
340#focr_mysql_user fuzzyocr
341#focr_mysql_pass fuzzyocr
342#focr_mysql_host localhost
343#focr_mysql_port 3306
344#focr_mysql_socket /tmp/mysql.sock
345
346# If set, the database table is updated with different data from one of
347# the following:
348#  + filename,
349#  + image-params,
350#  + content-type,
351#  + file-type,
352#  + score,
353#  + word-info
354# Default value: 0
355#focr_mysql_update_hash 1
356
357###
358### Miscellaneous Options
359###
360
361# The plugin uses a temporary directory to store intermediate information.
362# In order to keep these files for debugging purposes use any of these
363# values:
364#  0 = always cleanup (default value)
365#  1 = keep only if error
366#  2 = always keep
367#--
368# Keeping these intermediate files could fill your HDD _very_ fast!
369# Make sure you periodically empty your temp dir (usually: /tmp) or
370# suffer the consequences.  You've been warned!!
371#--
372#focr_keep_bad_images 1
373
374#################################################################
375# DO NOT REMOVE THIS LINE, IT IS REQUIRED UNDER ALL CIRCUMSTANCES
376focr_end_config
Note: See TracBrowser for help on using the browser.