Current Path : /usr/local/share/namazu/filter/ |
FreeBSD hs32.drive.ne.jp 9.1-RELEASE FreeBSD 9.1-RELEASE #1: Wed Jan 14 12:18:08 JST 2015 root@hs32.drive.ne.jp:/sys/amd64/compile/hs32 amd64 |
Current File : //usr/local/share/namazu/filter/pdf.pl |
# # -*- Perl -*- # $Id: pdf.pl,v 1.22.4.16 2006/04/30 08:27:51 opengl2772 Exp $ # Copyright (C) 1997-2000 Satoru Takabayashi , # 1999 NOKUBI Takatsugu , # 2000-2006 Namazu Project All rights reserved. # This is free software with ABSOLUTELY NO WARRANTY. # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either versions 2, or (at your option) # any later version. # # This program is distributed in the hope that it will be useful # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA # 02111-1307, USA # # This file must be encoded in EUC-JP encoding # package pdf; use strict; require 'util.pl'; require 'gfilter.pl'; my $pdfconvpath = undef; my $pdfinfopath = undef; my @pdfconvopts = undef; my @pdfinfoopts = undef; my $pdfconvver = 0; my $pdfinfover = 0; sub mediatype() { return ('application/pdf'); } sub status() { $pdfconvpath = util::checkcmd('pdftotext'); $pdfinfopath = util::checkcmd('pdfinfo'); if (defined $pdfconvpath) { my @cmd = ("$pdfconvpath"); my $result = ""; my $status = util::syscmd( command => \@cmd, option => { "stdout" => "/dev/null", "stderr" => \$result, "mode_stdout" => 'wt', "mode_stderr" => 'wt', }, ); if ($result =~ m/^pdftotext\s+version\s+([0-9]+\.[0-9]+)/m) { $pdfconvver = $1; } if (util::islang("ja")) { if ($pdfconvver >= 1.00) { @pdfconvopts = ('-q', '-raw', '-enc', 'EUC-JP'); } else { @pdfconvopts = ('-q', '-raw', '-eucjp'); } } else { @pdfconvopts = ('-q', '-raw'); } if (defined $pdfinfopath) { my @cmd = ("$pdfinfopath"); my $result = ""; my $status = util::syscmd( command => \@cmd, option => { "stdout" => "/dev/null", "stderr" => \$result, "mode_stdout" => 'wt', "mode_stderr" => 'wt', }, ); if ($result =~ /^pdfinfo\s+version\s+([0-9]+\.[0-9]+)/) { $pdfinfover = $1; } if (util::islang("ja")) { if ($pdfinfover >= 2.02) { @pdfinfoopts = ('-enc', 'EUC-JP'); } else { @pdfinfoopts = (); } } else { @pdfinfoopts = (); } } return 'yes'; } return 'no'; } sub recursive() { return 0; } sub pre_codeconv() { return 0; } sub post_codeconv () { return 0; } sub add_magic ($) { return; } sub filter ($$$$$) { my ($orig_cfile, $cont, $weighted_str, $headings, $fields) = @_; my $cfile = defined $orig_cfile ? $$orig_cfile : ''; util::vprint("Processing pdf file ... (using '$pdfconvpath')\n"); my $tmpfile = util::tmpnam('NMZ.pdf'); my $tmpfile2 = util::tmpnam('NMZ.pdf2'); { my $fh = util::efopen("> $tmpfile"); print $fh $$cont; util::fclose($fh); } my @cmd = ($pdfconvpath, @pdfconvopts, $tmpfile, $tmpfile2); my $status = util::syscmd( command => \@cmd, option => { "stdout" => "/dev/null", "stderr" => "/dev/null", }, ); unless (-e $tmpfile2) { unlink $tmpfile; return 'Unable to convert pdf file (maybe copying protection)'; } { my $fh = util::efopen("< $tmpfile2"); my $size = util::filesize($fh); if ($size == 0) { util::fclose($fh); unlink $tmpfile; unlink $tmpfile2; return "Unable to convert file ($pdfconvpath error occurred)"; } if ($size > $conf::FILE_SIZE_MAX) { util::fclose($fh); unlink $tmpfile; unlink $tmpfile2; return 'Too large pdf file'; } $$cont = util::readfile($fh); util::fclose($fh); } # codeconv::toeuc($cont); codeconv::codeconv_document($cont); if (defined $pdfinfopath) { my @cmd = ($pdfinfopath, @pdfinfoopts, $tmpfile); my $result = ""; my $status = util::syscmd( command => \@cmd, option => { "stdout" => \$result, "stderr" => "/dev/null", "mode_stdout" => 'wt', "mode_stderr" => 'wt', }, ); if ($result =~ /Title:\s+(.*)/) { # or /Subject:\s+(.*)/ $fields->{'title'} = $1; if ($fields->{'title'} =~ /<unicode>/) { delete $fields->{'title'}; } elsif ($fields->{'title'} =~ /^\s*$/) { delete $fields->{'title'}; } } if ($result =~ /Author:\s+(.*)/) { $fields->{'author'} = $1; if ($fields->{'author'} =~ /<unicode>/) { delete $fields->{'author'}; } elsif ($fields->{'author'} =~ /^\s*$/) { delete $fields->{'author'}; } } } unlink $tmpfile; unlink $tmpfile2; # Zenkaku-space handling bug fix (before pdftotext-0.90) if (util::islang("ja") && $pdfconvver <= 0.90) { $$cont =~ s/\xa1\xa0/\xa1\xa1/g; } gfilter::line_adjust_filter($cont); gfilter::line_adjust_filter($weighted_str); gfilter::white_space_adjust_filter($cont); $fields->{'title'} = gfilter::filename_to_title($cfile, $weighted_str) unless $fields->{'title'}; gfilter::show_filter_debug_info($cont, $weighted_str, $fields, $headings); return undef; } 1;