root/trunk/devel/FuzzyOcr/Hashing.pm

Revision 133, 14.9 kB (checked in by decoder, 1 year ago)

Added License tags too all code files

Line 
1 # <@LICENSE>
2 # Licensed to the Apache Software Foundation (ASF) under one or more
3 # contributor license agreements.  See the NOTICE file distributed with
4 # this work for additional information regarding copyright ownership.
5 # The ASF licenses this file to you under the Apache License, Version 2.0
6 # (the "License"); you may not use this file except in compliance with
7 # the License.  You may obtain a copy of the License at:
8 #
9 #     http://www.apache.org/licenses/LICENSE-2.0
10 #
11 # Unless required by applicable law or agreed to in writing, software
12 # distributed under the License is distributed on an "AS IS" BASIS,
13 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 # See the License for the specific language governing permissions and
15 # limitations under the License.
16 # </@LICENSE>
17
18 use strict;
19 package FuzzyOcr::Hashing;
20
21 use base 'Exporter';
22 our @EXPORT_OK = qw(check_image_hash_db
23     add_image_hash_db
24     calc_image_hash);
25
26 use lib qw(..);
27 use FuzzyOcr::Config qw(get_thresholds get_config set_config get_tmpdir get_mysql_ddb);
28 use FuzzyOcr::Misc qw(save_execute);
29 use FuzzyOcr::Logging qw(debuglog errorlog warnlog infolog);
30 use Fcntl;
31 use Fcntl ':flock';
32
33 #Implements all functions related to Image Hashing
34
35 sub within_threshold {
36     my $thresref = get_thresholds();
37     my %Threshold = %$thresref;
38
39     my $digest = shift;
40     my $record = shift;
41
42     my ($dimg,$dkey) = split('::',$digest);
43     my ($rimg,$rkey) = split('::',$record);
44     my ($ds, $dh, $dw, $dcn) = split(':',$dimg);
45     my ($rs, $rh, $rw, $rcn) = split(':',$rimg);
46     return(0) unless $rs;
47     return(0) unless $rh;
48     return(0) unless $rw;
49     return(0) unless $rcn;
50     return(0) unless $rkey;
51     return(0) if ((abs($ds  - $rs ) / $rs ) > $Threshold{s});
52     return(0) if ((abs($dh  - $rh ) / $rh ) > $Threshold{h});
53     return(0) if ((abs($dw  - $rw ) / $rw ) > $Threshold{w});
54     return(0) if ((abs($dcn - $rcn) / $rcn) > $Threshold{cn});
55            
56     my @rcf = split('::',$rkey);
57     my @dcf = split('::',$dkey);
58
59     my (@dcfs, @rcfs);
60     foreach (@dcf) { push @dcfs,split(':',$_); }
61     foreach (@rcf) { push @rcfs,split(':',$_); }
62
63     my $total = scalar(@rcfs);
64     if ($total == scalar(@dcfs)) {
65         my $match = 0;
66         foreach (0 .. ($total-1)) {
67             $match++ if (abs($dcfs[$_] - $rcfs[$_]) <= $Threshold{c});
68         }
69         infolog("image matched <$match> of <$total> colors");
70         return(1) if ($match == $total);
71     }
72     return(0);
73 }
74
75 sub check_image_hash_db {
76     my $conf = get_config();
77     if ($conf->{focr_enable_image_hashing} == 0) {
78         warnlog("Image Hashing is disabled");
79         return (0,'');
80     }
81     my $digest = $_[0];
82     my $dbfile = $_[1] || $conf->{focr_db_hash};
83     my $fname  = $_[2];
84     my $ctype  = $_[3];
85     my $ftype  = $_[4] || 0;
86     my ($img, $key) = split('::', $digest,2);
87     return (0,'') unless defined $key;
88     my $now = time;
89     my $hash = $digest;
90     my $ret = 0; my $txt = 'Exact';
91     my $dinfo;
92
93     if ($conf->{focr_enable_image_hashing} == 3) {
94         unless (defined $conf->{focr_mysql_ddb}) {
95             warnlog("Connection to MySQL server unavailable");
96             return (0,'');
97         }
98         my $ddb   = $conf->{focr_mysql_ddb};
99         my $db    = $conf->{focr_mysql_db};
100         my $sql   = qq(select * from $db.$dbfile where $dbfile.key='$key');
101         my @data  = $ddb->selectrow_array($sql);
102         my $next  = 0;
103         my $when  = 0;
104         my $match = 0;
105         if ((scalar(@data)>0) and ($img eq $data[1])) {
106             $match++;
107             infolog("Found[$dbfile]: Score='$data[8]' Info: '$data[9]'");
108             $next  = $data[5]; $next++;
109             $when  = $data[7]; $data[8] += 0;
110             $ret   = $data[8] == 0 ? 0.001 : $data[8];
111             $dinfo = $data[9] || '';
112             if ($data[2] eq '') {
113                 infolog("Updating $txt info File-Name:'$fname'");
114                 $ddb->do(qq(update $db.$dbfile set $dbfile.fname=? where $dbfile.key='$key'),undef,$fname);
115             }
116             if ($data[3] eq '') {
117                 infolog("Updating $txt info Content-Type:'$ctype'");
118                 $ddb->do(qq(update $db.$dbfile set $dbfile.ctype=? where $dbfile.key='$key'),undef,$ctype);
119             }
120             if ($data[4] != $ftype) {
121                 infolog("Updating $txt info File-Type:'$ftype'");
122                 $ddb->do(qq(update $db.$dbfile set $dbfile.ftype=? where $dbfile.key='$key'),undef,$ftype);
123             }
124         }
125         unless ($match) {
126             my $then = time - ($conf->{focr_db_max_days}*86400);
127             $sql = qq(select * from $db.$dbfile order by $dbfile.check);
128             my $sth  = $ddb->prepare($sql); $sth->execute;
129             while (my @row = $sth->fetchrow_array) {
130                 my $hash2 = $row[1] || "0:0:0:0";
131                 $hash2 .= "::$row[0]";
132                 if (within_threshold($digest,$hash2)) {
133                     $txt   = 'Approx';
134                     $key   = $row[0];
135                     $next  = $row[5] + 1;
136                     $when  = $row[7] || $now;
137                     $ret   = $dbfile eq $conf->{focr_mysql_hash} ? $row[8] : $row[5];
138                     $dinfo = $row[9] || '';
139                     infolog("Found[$dbfile]: Score='$row[8]' Info: '$row[9]'");
140                     last;
141                 }
142             }
143             # Expire old records...
144             $sql = qq(delete from $db.$dbfile where $dbfile.check < $then);
145             debuglog($sql,2);
146             $ddb->do($sql);
147         }
148         if ($ret > 0) {
149             if ($dbfile eq $conf->{focr_mysql_hash}) {
150                 infolog("Found Score <$ret> for $txt Image Hash");
151             }
152             infolog("Matched [$next] time(s). Prev match: ".fmt_time($now - $when));
153             $sql = qq(update $db.$dbfile set $dbfile.match='$next',$dbfile.check='$now' where $dbfile.key='$key');
154             $ddb->do($sql);
155             debuglog($sql);
156         }
157         return ($ret,$dinfo);
158     }
159     elsif ($conf->{focr_enable_image_hashing} == 2) {
160         import MLDBM qw(DB_File Storable);
161         my %DB = (); my $dbm; my $sdbm;
162         $sdbm = tie %DB, 'MLDBM::Sync', $dbfile, O_CREAT|O_RDWR or $ret++;
163         if ($ret>0) {
164             warnlog("No Image Hash database found at \"$dbfile\", or permissions wrong.");
165             return (0,'');
166         }
167         $sdbm->Lock;
168         if (defined $DB{$key}) {
169             $dbm = $DB{$key};
170             if ($img eq $dbm->{basic}) {
171                 $ret = $dbm->{score} || 0.001;
172                 $dinfo = $dbm->{dinfo} || '';
173                 $dbm->{fname} = $fname;
174                 $dbm->{ctype} = $ctype;
175                 infolog("Updating $txt info File:'$fname' Type:'$ctype'");
176                 $DB{$key} = $dbm;
177             }
178         }
179         if ($ret == 0) {
180             my $then = time - ($conf->{focr_db_max_days}*86400);
181             foreach my $k (keys %DB) {
182                 $dbm  = $DB{$k};
183                 $hash = $dbm->{basic} ? $dbm->{basic} : "0:0:0:0::$k";
184                 if (within_threshold($digest,$hash)) {
185                     $ret  = $dbfile eq $conf->{focr_db_hash} ? $dbm->{score} : $dbm->{match};
186                     $txt  = 'Approx'; $dinfo = $dbm->{dinfo} || '';
187                     infolog("Found in: <$dbfile>");
188                     last;
189                 }
190                 # Has the record expired??
191                 $dbm->{check} = $now - 1 unless defined $dbm->{check};
192                 if ($dbm->{check} < $then) {
193                     infolog("Expiring <$k> older than $conf->{focr_db_max_days} days");
194                     delete $DB{$k};
195                 }
196             }
197         }
198         if ($ret>0) {
199             $dbm->{match}++;
200             if ($dbfile eq $conf->{focr_db_hash}) {
201                 $ret = sprintf("%0.3f",$dbm->{score});
202                 infolog("Found Score <$ret> for $txt Image Hash");
203             }
204             infolog("Matched [$dbm->{match}] time(s). Prev match: ".fmt_time(time - $dbm->{check}));
205             $dbm->{check} = time;
206             $DB{$key} = $dbm;
207         }
208         $sdbm->UnLock;
209         undef $sdbm;
210         untie %DB;
211         return ($ret,$dinfo);
212     }
213     elsif ($conf->{focr_enable_image_hashing} == 1) {
214         $ret = open HASH, $conf->{focr_digest_db};
215         unless($ret) {
216             warnlog("No Image Hash database found at \"$conf->{focr_digest_db}\", or permissions wrong.");
217             return (0,'');
218         }
219         while (<HASH>) {
220             chomp;
221             ($ret,$hash) = split('::',$_,2);
222             if (within_threshold($digest,$hash)) {
223                 infolog("Found Score <$ret> for Hash <$hash>");
224                 return ($ret,'');
225             }
226         }
227         close HASH;
228         return (0,'');
229     }
230 }
231
232 sub add_image_hash_db {
233     my $conf = get_config();
234     return if ($conf->{focr_enable_image_hashing} == 0);
235     my $digest = $_[0];
236     my $score  = $_[1];
237     my $ret = 0;
238
239     if ($conf->{focr_enable_image_hashing} == 3) {
240         unless (defined $conf->{focr_mysql_ddb}) {
241             warnlog("Connection to MySQL server unavailable");
242             return;
243         }
244         my $db    = $conf->{focr_mysql_db};
245         my $ddb   = $conf->{focr_mysql_ddb};
246         my $table = $_[2] || $conf->{focr_mysql_hash};
247         my $fname = $_[3] || '';
248         my $ctype = $_[4] || 'image';
249         my $ftype = $_[5] || 0;
250         my $dinfo = $_[6] || '';
251         infolog("Adding Hash to table: \"$db.$table\" with score \"$score\"");
252         my $sql;
253         my ($img,$key) = split('::',$digest,2);
254         if (defined $key) {
255             $sql = "select basic from $db.$table where $table.key='$key'";
256             my @data = $ddb->selectrow_array($sql);
257             if (scalar(@data)>0) {
258                 if ($conf->{focr_mysql_update_hash}) {
259                     infolog("Hash already in $db.$table updating...");
260                     $sql  = "update $db.$table set ";
261                     my @params;
262                     unless ($data[1] eq $img) {
263                         $sql .= "basic=?,"; push @params,$img;
264                     }
265                     unless ($data[2] eq $fname) {
266                         $sql .= "fname=?,"; push @params,$fname;
267                     }
268                     unless ($data[3] eq $ctype) {
269                         $sql .= "ctype=?,"; push @params,$ctype;
270                     }
271                     unless ($data[4] == $ftype) {
272                         $sql .= "ftype=?,"; push @params,$ftype;
273                     }
274                     unless ($data[8] == $score) {
275                         $sql .= "score=?,"; push @params,$score;
276                     }
277                     unless ($data[9] == $dinfo) {
278                         $sql .= "dinfo=?,"; push @params,$dinfo;
279                     }
280                     $sql  =~ s/,$//;
281                     $sql .= " where $table.key='$key'";
282                     $ddb->do($sql,undef,@params);
283                     foreach my $p (@params) { $sql =~ s/\?/$p/; }
284                     debuglog($sql);
285                 } else {
286                     infolog("Hash already in $db.$table skipping...");
287                 }
288             } else {
289                 my @params = (
290                     $key, $img, $fname, $ctype, $ftype,
291                     ($table eq $conf->{focr_mysql_hash} ? 0 : 1),
292                     time, time, $score, $dinfo);
293                 $sql = "insert into $db.$table values (?,?,?,?,?,?,?,?,?,?)";
294                 $ddb->do($sql,undef,@params);
295                 foreach my $p (@params) { $sql =~ s/\?/$p/; }
296                 debuglog($sql);
297             }
298         }
299     }
300     elsif ($conf->{focr_enable_image_hashing} == 2) {
301         import MLDBM qw(DB_File Storable);
302         my $dbfile = $_[2] || $conf->{focr_db_hash};
303         my %DB = (); my $sdbm;
304         $sdbm = tie %DB, 'MLDBM::Sync', $dbfile, O_CREAT|O_RDWR or $ret++;
305         if ($ret>0) {
306             warnlog("Unable to open/create Image Hash database at \"$dbfile\", check permissions.");
307             return;
308         }
309         $sdbm->Lock;
310         infolog("Adding Hash to \"$dbfile\" with score \"$score\"");
311         my ($img,$key) = split('::',$digest,2);
312         if (defined $key) {
313             my $dbm = $DB{$key};
314             $dbm->{fname} = $_[3];
315             $dbm->{ctype} = $_[4];
316             $dbm->{ftype} = $_[5];
317             $dbm->{dinfo} = $_[6];
318             $dbm->{basic} = $img;
319             $dbm->{score} = $score;
320             $dbm->{input} =
321             $dbm->{check} = time;
322             $dbm->{match} = $dbfile eq $conf->{focr_db_hash} ? 0 : 1;
323             $DB{$key} = $dbm;
324         }
325         $sdbm->UnLock;
326         undef $sdbm;
327         untie %DB;
328     }
329     elsif ($conf->{focr_enable_image_hashing} == 1) {
330         if (-e $conf->{focr_digest_db}) {
331             $ret = open DB, ">>$conf->{focr_digest_db}";
332         } else {
333             $ret = open DB,  ">$conf->{focr_digest_db}";
334         }
335         unless ($ret) {
336             warnlog("Unable to open/create Image Hash database at \"$conf->{focr_digest_db}\", check permissions.");
337             return;
338         }
339         infolog("Adding Hash to \"$conf->{focr_digest_db}\"");
340         flock( DB, LOCK_EX );
341         seek( DB, 0, 2 );
342         print DB "${score}::${digest}\n";
343         close(DB);
344     }
345     debuglog("Digest: $digest");
346 }
347
348 sub calc_image_hash {
349     my $conf = get_config();
350     my $imgdir = get_tmpdir();
351     my $thresref = get_thresholds();
352     my %Threshold = %$thresref;
353     my $pfile = $_[0];
354     my $pic   = $_[1];
355     my $hash;
356
357     foreach my $a (qw/ppmhist/) { #pamfile
358         unless (defined $conf->{"focr_bin_$a"}) {
359             errorlog("calc_image_hash cannot exec $a");
360             return (1, '');
361         }
362     }
363    
364     unless (-r $pfile) {
365         errorlog("Cannot read $pfile");
366         return(1, '');
367     }
368
369     my ($r, @stdout_data) = save_execute(
370         "$conf->{focr_bin_ppmhist} -noheader $pfile", undef,
371         ">$imgdir/ppmhist.info",
372         ">/dev/null", 1);
373
374     if ($r) {
375         chomp $r;
376         errorlog("$conf->{focr_bin_ppmhist}: ".
377             ($r<0) ? 'Timed out' : 'Error'
378             ." [$r], skipping...");
379         return (1, '');
380     }
381
382     my $cnt = 0;
383     my $c = scalar(@stdout_data);
384     my $s = (stat($pfile))[7] || 0;
385     $hash = sprintf "%d:%d:%d:%d",$s,
386         defined $pic->{height} ? $pic->{height} : 0,
387         defined $pic->{width}  ? $pic->{width}  : 0,
388         $c;
389     if ($Threshold{max_hash}) {
390         foreach (@stdout_data) {
391             $_ =~ s/ +/ /g;
392             my(@d) = split(' ', $_);
393             $hash .= sprintf("::%d:%d:%d:%d:%d",@d);
394             if ($cnt++ ge $Threshold{max_hash}) {
395                 last;
396             }
397         }
398     }
399     debuglog("Got: <$hash>");
400     return(0, $hash);
401 }
402
403 sub fmt_time {
404     my $when = $_[0];
405     my $ret;
406
407     if ($when>86400) {
408         my $d = int($when/86400);
409         $when -= $d*86400;
410         $ret = "$d days,";
411     }
412     if ($when>3600) {
413         my $h = int($when/3600);
414         $when -= $h*3600;
415         $ret .= " $h hrs.";
416     }
417     if ($when>60) {
418         my $m = int($when/60);
419         $when -= $m*60;
420         $ret .= " $m min.";
421     }
422     if ($when>0) {
423         $ret .= " $when sec.";
424     }
425     $ret .= " ago";
426     return $ret;
427 }
428
429 1;
Note: See TracBrowser for help on using the browser.