root/trunk/devel/Utils/fuzzy-find

Revision 133, 10.1 kB (checked in by decoder, 1 year ago)

Added License tags to all code files

Line 
1 #!/usr/local/bin/perl
2 #
3 # <@LICENSE>
4 # Licensed to the Apache Software Foundation (ASF) under one or more
5 # contributor license agreements.  See the NOTICE file distributed with
6 # this work for additional information regarding copyright ownership.
7 # The ASF licenses this file to you under the Apache License, Version 2.0
8 # (the "License"); you may not use this file except in compliance with
9 # the License.  You may obtain a copy of the License at:
10 #
11 #     http://www.apache.org/licenses/LICENSE-2.0
12 #
13 # Unless required by applicable law or agreed to in writing, software
14 # distributed under the License is distributed on an "AS IS" BASIS,
15 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 # See the License for the specific language governing permissions and
17 # limitations under the License.
18 # </@LICENSE>
19
20 use Getopt::Long;
21 use DBI;
22 use MLDBM qw(DB_File Storable);
# Database files that are tied through MLDBM when
# focr_enable_image_hashing is 2: db_hash receives the entries added with
# --learn-spam, db_safe the entries added with --learn-ham.
my %Files = (
    db_hash => '/etc/mail/spamassassin/FuzzyOcr.db',
    db_safe => '/etc/mail/spamassassin/FuzzyOcr.safe.db',
);

# MySQL defaults; any "focr_mysql_<name> <value>" line found in the
# configuration file overrides the entry of the same name.
my %MySQL = (
    db   => 'FuzzyOcr',
    hash => 'Hash',
    safe => 'Safe',
    user => 'fuzzyocr',
    pass => 'fuzzyocr',
    host => 'localhost',
    port => 3306,
);

# defaults
my $cfgfile = "/etc/mail/spamassassin/FuzzyOcr.cf";
my %App;
# Helper programs that are looked up in the search path when the
# configuration file does not name them explicitly.  tifftopnm is listed
# because the .tif/.tiff branch of the main loop reads $App{tifftopnm}.
my @bin_utils = qw/pamfile ppmhist jpegtopnm giftopnm pngtopnm bmptopnm tifftopnm/;

my $delete = 0;
my $verbose = 0;
my $learn_ham = 0;
my $learn_spam = 0;
my $score;    # stays undef unless --score is given
GetOptions(
    'verbose'    => \$verbose,
    'delete'     => \$delete,
    'config=s'   => \$cfgfile,
    'score=f'    => \$score,
    'learn-ham'  => \$learn_ham,
    'learn-spam' => \$learn_spam,
);
56
# Nothing left on the command line after option parsing: print the usage
# text and exit with status 1.
unless (@ARGV) {
    print
        "Usage: fuzzy-find.pl [Options] (imagehash|imagefile)\n",
        "\n",
        "Available options:\n",
        "--config=s   Specify location of FuzzyOcr.cf\n",
        "             Default: /etc/mail/spamassassin/FuzzyOcr.cf\n",
        "--delete     Removes the hash from the database\n",
        "--learn-ham  Add the hash as ham to the database\n",
        "--learn-spam Add the hash as spam to the database\n",
        # The option is declared as 'score=f', i.e. it takes a float.
        "--score=f    Score to use when adding ham/spam\n",
        "--verbose    Show more information\n",
        "\n";
    exit 1;
}
71
# Setup default score: when --score was not given, use 10 for --learn-ham
# and 0 in every other case.
if (!defined $score) {
    if ($learn_ham) {
        $score = 10;
    } else {
        $score = 0;
    }
}
76
# Read custom paths from FuzzyOcr.cf
#
# Recognised lines (each is "<name> <value>" at the start of a line):
#   focr_bin_<app> <path>          explicit path of a helper program
#   focr_path_bin <dir:dir:...>    replaces the default search path
#   focr_enable_image_hashing <n>  2 = DB hashing, 3 = MySQL hashing
#   focr_mysql_<option> <value>    overrides an entry of %MySQL
#   focr_threshold_max_hash <n>    overrides $App{max_hash}
my $app_path = q(/usr/local/netpbm/bin:/usr/local/bin:/usr/bin);
if (open(my $config, '<', $cfgfile)) {
    while (my $line = <$config>) {
        chomp $line;
        # Captures are copied into named variables right away and printed
        # with print, so a '%' inside a configured value is shown verbatim
        # instead of being treated as a printf conversion.
        if ($line =~ m/^focr_bin_(\w+) (.+)/) {
            my ($name, $path) = ($1, $2);
            $App{$name} = $path;
            print "Found custom path \"$path\" for application \"$name\"\n" if $verbose;
        }
        if ($line =~ m/^focr_path_bin (.+)/) {
            $app_path = $1;
            print "Found new path: \"$app_path\"\n" if $verbose;
        }
        if ($line =~ m/^focr_enable_image_hashing (\d)/) {
            my $type = $1;
            $App{hashing_type} = $type;
            print "Found DB Hashing\n" if ($verbose and $type == 2);
            print "Found MySQL Hashing\n" if ($verbose and $type == 3);
        }
        if ($line =~ m/^focr_mysql_(\w+) (.+)/) {
            my ($option, $value) = ($1, $2);
            $MySQL{$option} = $value;
            print "Found MySQL option $option => '$value'\n" if $verbose;
        }
        if ($line =~ m/^focr_threshold_max_hash (.+)/) {
            $App{max_hash} = $1;
            print "Updated Threshold{max_hash} = $App{max_hash}\n" if $verbose;
        }
    }
    close $config;
} else {
    # A missing or unreadable configuration file is not fatal: the built-in
    # defaults stay in effect.
    warn "Can't read configuration file \"$cfgfile\" ($!), using defaults...\n";
}
107
# make sure we have this threshold set
if (!defined $App{max_hash}) {
    $App{max_hash} = 5;
}

# Look up every helper program that the configuration file did not name,
# taking the first executable found while walking the colon-separated
# search path from left to right.
my @search_dirs = split(':', $app_path);
UTIL:
foreach my $util (@bin_utils) {
    next UTIL if defined $App{$util};
    foreach my $dir (@search_dirs) {
        my $candidate = "$dir/$util";
        next unless -x $candidate;
        $App{$util} = $candidate;
        next UTIL;
    }
}
121
# Open a DBI connection to the MySQL database described by %MySQL.
#
# The DSN uses $MySQL{socket} when it is defined; otherwise it uses
# $MySQL{host}, plus $MySQL{port} when that differs from 3306.
# Takes no arguments.  Returns the result of DBI->connect, which is false
# when the connection could not be made.
sub get_ddb {
    my %dopts = ( AutoCommit => 1 );
    my $dsn = "dbi:mysql:database=" . $MySQL{db};
    if (defined $MySQL{socket}) {
        $dsn .= ";mysql_socket=$MySQL{socket}";
    } else {
        $dsn .= ";host=$MySQL{host}";
        # Append to $dsn itself; a misspelled variable here would silently
        # drop the custom port from the connection string.
        $dsn .= ";port=$MySQL{port}" unless $MySQL{port} == 3306;
    }
    print "Connecting to: $dsn\n" if $verbose;
    return DBI->connect($dsn, $MySQL{user}, $MySQL{pass}, \%dopts);
}
134
# Main loop: handle every remaining command-line argument.
#
# An argument is one of:
#   * text containing "n:n:n:n"  - used directly as the basic image
#                                  information (size, height, width, colors)
#   * the literal ':::0'         - used directly as the key
#   * a readable image file      - converted with the netpbm tools to
#                                  compute the basic information and the key
# Unreadable files are skipped silently.
#
# With --learn-spam/--learn-ham the entry is stored in the configured
# backend (hashing_type 2 = MLDBM files, 3 = MySQL); otherwise the backend
# is searched for the entry, which is printed or, with --delete, removed.
while (@ARGV) {
    my $file  = shift @ARGV;
    my @data  = ();
    # Declared before the argument is classified so that the ':::0' branch
    # assigns to this variable and not to an unrelated global.
    my $key   = '';
    my $ctype = '';
    my $ftype = 0;
    my $hashing = defined $App{hashing_type} ? $App{hashing_type} : 0;

    if ($file =~ m/(\d+):(\d+):(\d+):(\d+)/) {
        push @data, $1, $2, $3, $4;
    } elsif ($file eq ':::0') {
        $key = $file;
        $data[3] = 0;
    } else {
        next unless -r $file;
    }

    unless (@data) {
        # Pick the converter and the content type from the file extension.
        my $app;
        if ($file =~ m/\.jpe?g$/i) {
            $app = $App{jpegtopnm};
            $ctype = "image/jpeg";
            $ftype = 2;
        } elsif ($file =~ m/\.png$/i) {
            $app = $App{pngtopnm};
            $ctype = "image/png";
            $ftype = 3;
        } elsif ($file =~ m/\.bmp$/i) {
            $app = $App{bmptopnm};
            $ctype = "image/bmp";
            $ftype = 4;
        } elsif ($file =~ m/\.tiff?$/i) {
            $app = $App{tifftopnm};
            $ctype = "image/tiff";
            $ftype = 5;
        } elsif ($file =~ m/\.gif$/i) {
            $app = $App{giftopnm};
            $ctype = "image/gif";
            $ftype = 1;
        } elsif ($file =~ m/\.pnm$/i) {
            $app = '/bin/cat';
            $ctype = "image/pnm";
        } else {
            print "Unknown extension given in \"$file\", aborting...\n";
            exit 1;
        }

        # Without a converter the shell command below would start with the
        # image path itself, so refuse to continue for this file.
        unless (defined $app and defined $App{ppmhist} and defined $App{pamfile}) {
            print "Required netpbm tools not found for \"$file\", skipping...\n";
            next;
        }

        # Single-quote the file name so that spaces and shell
        # metacharacters in it are passed through literally.
        (my $quoted = $file) =~ s/'/'\\''/g;
        my $convert = "$app '$quoted' 2>/dev/null";
        my @hist = `$convert |$App{ppmhist} -noheader -`;
        my @res  = `$convert |$App{pamfile} -`;

        my ($h, $w) = (0, 0);
        if (@res and $res[0] =~ m/(\d+) by (\d+)/) {
            $w = $1;
            $h = $2;
            print "Found ($h,$w)\n" if $verbose;
        }
        my $c = scalar(@hist);
        printf "Colors: %d\n", $c if $verbose;
        push @data, (stat($file))[7], $h, $w, $c;

        # Build the key from the leading histogram lines.  $hash is local
        # to this file so keys of earlier arguments cannot leak into it.
        my $hash = '';
        my $cnt  = 0;
        foreach my $line (@hist) {
            $line =~ s/ +/ /g;
            my @d = split(' ', $line);
            $hash .= sprintf("::%d:%d:%d:%d:%d", @d);
            # NOTE(review): 'ge' is a string comparison, so a max_hash of 10
            # or more stops earlier than a numeric test would.  It is left
            # unchanged because stored keys presumably have to match the
            # ones the FuzzyOcr plugin computes - confirm before changing.
            last if ($cnt++ ge $App{max_hash});
        }
        # Drop the leading '::'; an empty histogram leaves the key empty.
        $key = substr($hash, 2) if length $hash;
    }

    printf "Img = %9d %dx%dx%d\n", @data;
    printf "key = <%s>\n", $key if ($key);

    if ($learn_spam || $learn_ham) {
        # An entry can only be stored under a real key.
        if ($key eq '') {
            print "No key available for \"$file\", nothing to learn\n";
            next;
        }
        if ($hashing == 2) {
            my $ff = $learn_spam ? 'db_hash' : 'db_safe';
            my $dfscore = $learn_spam ? 5 : -5;
            my $use_score = $score ? $score : $dfscore;
            my %DB;
            tie %DB, 'MLDBM', $Files{$ff} or die "Can't open $ff";
            print "Adding key to database...\n";
            # MLDBM only stores what is assigned to the top-level element,
            # hence fetch the record, change it, and assign it back.
            my $dbm = $DB{$key};
            $dbm->{fname} = $file;
            $dbm->{ctype} = $ctype;
            $dbm->{dinfo} = "Manually added to the database\n";
            $dbm->{basic} = join(':', @data);
            $dbm->{score} = $use_score;
            $dbm->{input} = $dbm->{check} = time;
            $dbm->{match} = $learn_spam ? 0 : 1;
            $DB{$key} = $dbm;
            untie %DB;
            # Continue with the next argument instead of exiting, so every
            # file given on the command line is learned.
            next;
        } elsif ($hashing == 3) {
            my $ddb = get_ddb();
            unless ($ddb) {
                my $why = defined $DBI::errstr ? $DBI::errstr : 'unknown error';
                print "Cannot connect to database \"$MySQL{db}\": $why\n";
                exit 1;
            }
            my $now = time;
            my $tab = $learn_spam ? 'hash' : 'safe';
            # Column order follows the positions the search branch reads
            # back: basic=1, match=5, input=6, check=7, score=8.
            my @values = (
                $key,
                join(':', @data),
                $file,
                $ctype,
                $ftype,
                ($learn_spam ? 0 : 1),
                $now,
                $now,
                $score,
                "Manually added to the database\n",
            );
            # Placeholders keep quotes in file names or keys from breaking
            # the statement.
            my $sql = "INSERT INTO $MySQL{$tab} VALUES ("
                    . join(',', ('?') x scalar(@values)) . ")";
            $ddb->do($sql, undef, @values);
            $ddb->disconnect;
            next;
        }
    } else {
        if ($hashing == 2) {
            foreach my $ff (keys %Files) {
                my %DB;
                tie %DB, 'MLDBM', $Files{$ff} or next;
                print "Searching $Files{$ff}...\n";
                foreach my $kk (keys %DB) {
                    my $db = $DB{$kk};
                    # Stored keys may carry a leading element without ':'
                    # in front of the colour data; strip it before comparing.
                    my @dd = split('::', $kk);
                    shift @dd if ($dd[0] !~ m/:/);
                    my $dd = join('::', @dd);
                    if ($key eq '') {
                        # Only basic information was given: match on that.
                        next unless ($db->{basic} eq join(':', @data));
                    } else {
                        next unless ($dd eq $key);
                    }
                    printf "%s HASH\n", ($delete) ? 'Removing' : 'Found';
                    if ($delete) {
                        delete $DB{$kk};
                    } else {
                        printf "ImageInfo  : %9d:%d:%d:%d\n", split(':', $db->{basic});
                        printf "Matched    : %4d Time(s)\n", $db->{match};
                        printf "Calc.Score : %9.3f\n", $db->{score};
                        printf "in DB since: %s\n", scalar(localtime($db->{input}));
                        printf "Last Match : %s\n", scalar(localtime($db->{check}));
                    }
                }
                untie %DB;
            }
        } elsif ($hashing == 3) {
            my $ddb = get_ddb();
            if ($ddb) {
                foreach my $ff (sort keys %Files) {
                    # 'db_hash' / 'db_safe' select the 'hash' / 'safe'
                    # entries of %MySQL, which hold the real table names.
                    (my $tab = $ff) =~ s/db_//;
                    my $table = $MySQL{$tab};
                    if ($delete) {
                        $ddb->do("DELETE FROM $table WHERE $table.key=?", undef, $key);
                    } else {
                        my $sql = "SELECT * FROM $table where $table.key=?";
                        my @row = $ddb->selectrow_array($sql, undef, $key);
                        if (scalar(@row)) {
                            printf "ImageInfo  : %9d:%d:%d:%d\n", split(':', $row[1]);
                            printf "Matched    : %4d Time(s)\n", $row[5];
                            printf "Calc.Score : %9.3f\n", $row[8];
                            printf "in DB since: %s\n", scalar(localtime($row[6]));
                            printf "Last Match : %s\n", scalar(localtime($row[7]));
                        }
                    }
                }
                $ddb->disconnect;
            }
        }
    }
}
Note: See TracBrowser for help on using the browser.