Changeset 98
- Timestamp:
- 09.12.2006 19:17:22 (2 years ago)
- Files:
-
- trunk/devel/FuzzyOcr.pm (modified) (10 diffs)
Legend:
- Unmodified
- Added
- Removed
- Modified
- Copied
- Moved
trunk/devel/FuzzyOcr.pm
r97 r98 41 41 use FuzzyOcr::Deanimate qw(deanimate); 42 42 use FuzzyOcr::Scoring qw(wrong_ctype wrong_extension corrupt_img known_img_hash); 43 use FuzzyOcr::Misc qw(max removedir s save_execute);43 use FuzzyOcr::Misc qw(max removedir removedirs save_execute); 44 44 45 45 our @ISA = qw(Mail::SpamAssassin::Plugin); … … 146 146 $fname =~ tr/\@\$\%\&/_/s; 147 147 } 148 my $test = 0; 149 $test++ if ($ctype =~ /image/i); 150 $test++ if ($fname =~ /\.(gif|jpe?g|png|bmp|tiff?)$/i); 151 152 if ($test == 0) { 153 infolog("Skipping file with content-type=\"$ctype\" name=\"$fname\""); 154 next; 155 } 156 157 $imgdir = Mail::SpamAssassin::Util::secure_tmpdir(); 158 159 set_tmpdir($imgdir); 160 161 unless ($imgdir) { 162 errorlog("Scan canceled, cannot create Image TMPDIR."); 163 return 0; 164 } 165 #keep raw email for debugging later 166 my $imgfilename = $imgdir . "/raw.eml"; 167 unless (-e $imgfilename) { 168 if (open RAW, ">$imgfilename") { 169 print RAW $pms->{msg}->get_pristine(); 170 close RAW; 171 debuglog("Saved: $imgfilename"); 172 } 173 } 174 175 $fname =~ tr{a-zA-Z0-9\.}{_}cs; 176 $imgfilename = Mail::SpamAssassin::Util::untaint_file_path( 177 $imgdir . "/" . $fname 178 ); 179 my $unique = 0; 180 while (-e $imgfilename) { 181 $imgfilename = Mail::SpamAssassin::Util::untaint_file_path( 182 $imgdir . "/" . chr(65+$unique) . "." . $fname 183 ); 184 $unique++; 185 } 186 148 149 my $filename = $fname; $filename =~ tr{a-zA-Z0-9\.}{_}cs; 187 150 my $pdata = $p->decode(); 188 151 my $pdatalen = length($pdata); … … 191 154 if ( substr($pdata,0,3) eq "\x47\x49\x46" ) { 192 155 ## GIF File 193 $imgfiles{$ imgfilename}{ftype} = 1;156 $imgfiles{$filename}{ftype} = 1; 194 157 ($w,$h) = unpack("vv",substr($pdata,6,4)); 195 infolog("GIF: [${h}x${w}] $ imgfilename");196 $imgfiles{$ imgfilename}{width} = $w;197 $imgfiles{$ imgfilename}{height} = $h;158 infolog("GIF: [${h}x${w}] $filename"); 159 $imgfiles{$filename}{width} = $w; 160 $imgfiles{$filename}{height} = $h; 198 161 } elsif ( substr($pdata,0,2) eq "\xff\xd8" ) { 199 162 ## JPEG File … … 220 183 } else { 221 184 ($h,$w) = unpack("nn",substr($pdata,$pos+3,4)); 222 infolog("JPEG: [${h}x${w}] $ imgfilename");223 $imgfiles{$ imgfilename}{ftype} = 2;224 $imgfiles{$ imgfilename}{height} = $h;225 $imgfiles{$ imgfilename}{width} = $w;185 infolog("JPEG: [${h}x${w}] $filename"); 186 $imgfiles{$filename}{ftype} = 2; 187 $imgfiles{$filename}{height} = $h; 188 $imgfiles{$filename}{width} = $w; 226 189 } 227 190 } elsif ( substr($pdata,0,4) eq "\x89\x50\x4e\x47" ) { 228 191 # PNG File 229 192 ($w,$h) = unpack("NN",substr($pdata,16,8)); 230 $imgfiles{$ imgfilename}{ftype} = 3;231 $imgfiles{$ imgfilename}{width} = $w;232 $imgfiles{$ imgfilename}{height} = $h;233 infolog("PNG: [${h}x${w}] $ imgfilename");193 $imgfiles{$filename}{ftype} = 3; 194 $imgfiles{$filename}{width} = $w; 195 $imgfiles{$filename}{height} = $h; 196 infolog("PNG: [${h}x${w}] $filename"); 234 197 } elsif ( substr($pdata,0,2) eq "BM" ) { 235 198 ## BMP File 236 199 ($w,$h) = unpack("NN",substr($pdata,18,8)); 237 $imgfiles{$ imgfilename}{ftype} = 4;238 $imgfiles{$ imgfilename}{width} = $w;239 $imgfiles{$ imgfilename}{height} = $h;240 infolog("BMP: [${h}x${w}] $ imgfilename");200 $imgfiles{$filename}{ftype} = 4; 201 $imgfiles{$filename}{width} = $w; 202 $imgfiles{$filename}{height} = $h; 203 infolog("BMP: [${h}x${w}] $filename"); 241 204 } elsif ( 205 ## TIFF File 242 206 (substr($pdata,0,4) eq "\x4d\x4d\x00\x2a") or 243 207 (substr($pdata,0,4) eq "\x49\x49\x2a\x00") … … 253 217 last if ($h != 0 and $w != 0); 254 218 } 255 infolog("TIFF: [${h}x${w}] $ imgfilename ($worder)");219 infolog("TIFF: [${h}x${w}] $filename ($worder)"); 256 220 infolog("Cannot determite size of TIFF image, setting to '1x1'") if ($h == 0 and $w == 0); 257 $imgfiles{$imgfilename}{ftype} = 5; 258 $imgfiles{$imgfilename}{width} = $w ? $w : 1; 259 $imgfiles{$imgfilename}{height} = $h ? $h : 1; 260 } 261 next unless defined $imgfiles{$imgfilename}{ftype}; 262 $imgfiles{$imgfilename}{fname} = $fname; 263 $imgfiles{$imgfilename}{ctype} = $ctype; 264 $imgfiles{$imgfilename}{fsize} = $pdatalen; 221 $imgfiles{$filename}{ftype} = 5; 222 $imgfiles{$filename}{width} = $w ? $w : 1; 223 $imgfiles{$filename}{height} = $h ? $h : 1; 224 } 225 226 #Skip unless we found the right header 227 unless (defined $imgfiles{$filename}{ftype}) { 228 infolog("Skipping file with content-type=\"$ctype\" name=\"$fname\""); 229 delete $imgfiles{$filename}; 230 next; 231 232 } 233 234 #Found Image!! Get a temporary dir to save image 235 $imgdir = Mail::SpamAssassin::Util::secure_tmpdir(); 236 unless ($imgdir) { 237 errorlog("Scan canceled, cannot create Image TMPDIR."); 238 return 0; 239 } 240 set_tmpdir($imgdir); 241 242 #Generete unique filename to store image 243 my $imgfilename = Mail::SpamAssassin::Util::untaint_file_path( 244 $imgdir . "/" . $fname 245 ); 246 my $unique = 0; 247 while (-e $imgfilename) { 248 $imgfilename = Mail::SpamAssassin::Util::untaint_file_path( 249 $imgdir . "/" . chr(65+$unique) . "." . $fname 250 ); 251 $unique++; 252 } 253 254 #Save important constants 255 $imgfiles{$filename}{fname} = $fname; 256 $imgfiles{$filename}{ctype} = $ctype; 257 $imgfiles{$filename}{fsize} = $pdatalen; 258 $imgfiles{$filename}{fpath} = $imgfilename; 259 260 #Save Image to disk. 265 261 unless (open PICT, ">$imgfilename") { 266 262 errorlog("Cannot write \"$imgfilename\", skipping..."); 267 delete $imgfiles{$imgfilename}; 263 delete $imgfiles{$filename}; 264 removedir($imgdir); 268 265 next; 269 266 } … … 272 269 close PICT; 273 270 debuglog("Saved: $imgfilename"); 271 272 #Increment valid image file counter 274 273 $cnt++; 274 275 #keep raw email for debugging later 276 my $rawfilename = $imgdir . "/raw.eml"; 277 if (open RAW, ">$imgfilename") { 278 print RAW $pms->{msg}->get_pristine(); 279 close RAW; 280 debuglog("Saved: $rawfilename"); 281 } 282 275 283 } 276 284 277 285 if ($cnt == 0) { 278 286 debuglog("Skipping OCR, no image files found..."); 279 removedirs(get_all_tmpdirs()) if (defined($imgdir) and ($conf->{focr_keep_bad_images}<2));280 287 return 0; 281 288 } … … 284 291 $conf->{focr_mysql_ddb} = get_mysql_ddb(); 285 292 } 286 287 my $haserr = open RAWERR, ">$imgdir/raw.err";288 debuglog("Errors to: $imgdir/raw.err") if ($haserr>0);289 293 290 294 # Try to load personal wordlist … … 304 308 305 309 IMAGE: 306 foreach my $file (keys %imgfiles) {307 my $pic = $imgfiles{$file };308 infolog("Analyzing file with content-type=\"$$pic{ctype}\"");310 foreach my $filename (keys %imgfiles) { 311 my $pic = $imgfiles{$filename}; 312 #infolog("Analyzing file with content-type=\"$$pic{ctype}\""); 309 313 my @used_scansets = (); 310 314 my $corrupt = 0; … … 312 316 my $generic_ctype = 0; 313 317 my $digest; 318 my $file = $$pic{fpath}; 314 319 my $tfile = $file; 315 320 my $pfile = $file . ".pnm"; … … 317 322 debuglog("pfile => $pfile"); 318 323 debuglog("efile => $efile"); 324 325 #Open ERRORLOG 326 my $haserr = $Mail::SpamAssassin::Logger::LOG_SA{level} == 3; 327 328 if ($haserr) { 329 $haserr = open RAWERR, ">$imgdir/raw.err"; 330 debuglog("Errors to: $imgdir/raw.err") if ($haserr>0); 331 } 319 332 320 333 my $mimetype = $$pic{ctype};
