Current Path : /usr/local/share/namazu/pl/ |
FreeBSD hs32.drive.ne.jp 9.1-RELEASE FreeBSD 9.1-RELEASE #1: Wed Jan 14 12:18:08 JST 2015 root@hs32.drive.ne.jp:/sys/amd64/compile/hs32 amd64 |
Current File : //usr/local/share/namazu/pl/document.pl |
# # -*- Perl -*- # document.pl - class for document # # $Id: document.pl,v 1.4.2.3 2006/04/16 13:42:20 opengl2772 Exp $ # # Copyright (C) 2004 Yukio USUDA All rights reserved. # Copyright (C) 2000-2006 Namazu Project All rights reversed. # This is free software with ABSOLUTELY NO WARRANTY. # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either versions 2, or (at your option) # any later version. # # This program is distributed in the hope that it will be useful # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA # 02111-1307, USA package mknmz::document; use strict; use English; use File::Copy; require 'gfilter.pl'; require 'filter.pl'; require 'gettext.pl'; use File::MMagic; my $Magic = new File::MMagic; my $Filter = undef; sub new { my $self = {}; my $proto = shift @_; my $class = ref($proto) || $proto; bless($self, $class); return $self; } sub init { my $self = shift @_; unless ($Filter) { $Filter = mknmz::filter->new(); $Filter->init($Magic); } } sub init_doc { my $self = shift @_; $self->{'_uri'} = shift @_; $self->{'_orig_filename'} = shift @_; $self->{'_contentref'} = shift @_; $self->{'_mimetype'} = shift @_; $self->{'_filename'} = undef; $self->{'_filtered'} = undef; $self->{'_weighted_str'} = undef; $self->{'_headings'} = undef; $self->{'_errmsg'} = undef; $self->{'_field_info'} = { 'author' => '', 'title' => '' }; $self->_check_content(); $self->{'_mimetype'} = $self->get_mimetype(); } sub get_file_size { my $self = shift @_; return $self->{'_field_info'}{'size'}; } sub get_text_size { my $self = shift @_; unless ($self->{'_filtered'}) { $self->get_filtered_contentref(); } my $text_size = 0; if ($self->{'_contentref'}) { # Measure the text size after apply filter. $text_size = length(${$self->{'_contentref'}}) + length($self->{'_weighted_str'}); } return $text_size; } sub is_contain_kanji { my $self = shift @_; my $kanji = ${$self->{'_contentref'}} =~ tr/\xa1-\xfe/\xa1-\xfe/; $kanji += $self->{'_weighted_str'} =~ tr/\xa1-\xfe/\xa1-\xfe/; return $kanji; } sub get_field_info { my $self = shift @_; my $fieldname = shift @_; unless ($self->{'_filtered'}) { $self->get_filtered_contentref(); } return $self->{'_field_info'}{$fieldname}; } sub get_fields { my $self = shift @_; unless ($self->{'_filtered'}) { $self->get_filtered_contentref(); } return %{$self->{'_field_info'}}; } sub get_orig_filename { my $self = shift @_; return $self->{'_orig_filename'}; } sub get_filename { my $self = shift @_; unless ( $self->{'_filename'}) { my $tmp = ""; my $orig_filename = $self->get_orig_filename(); $self->{'_filename'} = gfilter::filename_to_title($orig_filename, $tmp) } return $self->{'_filename'}; } sub get_uri { my $self = shift @_; my $replace_uri = @_; unless ($self->{'_uri'}) { my $orig_filename = $self->get_orig_filename(); } return $self->{'_uri'}; } sub get_errmsg { my $self = shift @_; return $self->{'_errmsg'}; } sub get_filtered_contentref { my $self = shift @_; unless ($self->{'_filtered'}) { my $mtype = $self->get_mimetype(); my $content_encode = $self->get_content_encode(); my ($weighted_str, $headings) = $Filter->apply_filter(\$self->{'_orig_filename'}, $self->{'_contentref'}, $mtype, $content_encode, \$self->{'_field_info'}, ); if ($weighted_str eq 0){ ${$self->{'_contentref'}} = ""; $mtype = $headings; } $self->{'_weighted_str'} = $weighted_str; $self->{'_headings'} = $headings; $self->{'_filtered'} = 'yes'; $self->{'_errmsg'} = _check_file($self->{'_orig_filename'}, $self->get_file_size(), $self->get_text_size(), $mtype, $self->get_uri()); } return $self->{'_contentref'}; } sub get_mimetype { my $self = shift @_; unless ($self->{'_mimetype'}){ $self->{'_mimetype'} = _guess_mimetype($self->{'_orig_filename'}, $self->{'_contentref'}); } return $self->{'_mimetype'}; } sub get_weighted_str { my $self = shift @_; return $self->{'_weighted_str'}; } sub get_headings { my $self = shift @_; return $self->{'_headings'}; } sub get_mtime { my $self = shift @_; my $mtime = (stat($self->{'_orig_filename'}))[9] unless util::isurl($self->{'_orig_filename'}); $self->{'_field_info'}{'mtime'} = $mtime; return $self->{'_field_info'}{'mtime'}; } sub get_content_encode { my $self = shift @_; unless ($self->{'_content_encode'}){ my $mimetype = $self->get_mimetype; if ($mimetype eq "text/html"){ } } return $self->{'_content_encode'}; } sub get_filter { my $self = shift @_; return $Filter; } sub get_mmagic { my $self = shift @_; return $Magic; } sub get_info { my $self = shift @_; my $name = shift @_; if ($name eq 'mmagic_ver'){ return "$File::MMagic::VERSION" if $File::MMagic::VERSION; return ""; } } ############################################################# # # Decide the media type. # FIXME: Very ad hoc. It's just a compromise. -- satoru # sub _decide_type ($$) { my ($name, $cont) = @_; return $name if (!defined $cont || $name eq $cont); util::dprint("decide_type: name: $name, cont: $cont\n"); if ($cont =~ m!^text/plain! && $name =~ m!^text/plain!) { return $name; } elsif ($cont =~ m!^application/octet-stream! && $name !~ m!^text/!) { return $name; } elsif ($cont =~ m!^application/(excel|powerpoint|msword)! && $name !~ m!^application/octet-stream!) { # FIXME: Currently File::MMagic 1.02's checktype_data() # is unreliable for them. return $name; } elsif ($cont =~ m!^application/x-zip! && $name =~ m!^application/!) { # zip format is used other applications e.g. OpenOffice. # It is necessary to add to check extention. return $name; } return $cont; } sub _guess_mimetype ($$) { my ($cfile, $contref) = @_; my $mtype; my $called_dt = 0; { my $mtype_n = $Magic->checktype_byfilename($cfile); my $mtype_c = $Magic->checktype_data($$contref); my $mtype_m; $mtype_m = $Magic->checktype_magic($$contref) if ((!defined $mtype_c) || $mtype_c =~ /^(text\/html|text\/plain|application\/octet-stream)$/); $mtype_c = $mtype_m if (defined $mtype_m && $mtype_m !~ /^(text\/html|text\/plain|application\/octet-stream)$/); $mtype_c = 'text/plain' unless defined $mtype_c; if ($called_dt) { $mtype = $mtype_c; } else { $mtype = _decide_type($mtype_n, $mtype_c); $called_dt = 1; } } util::dprint(_("Detected type: ")."$mtype\n"); return $mtype; } sub _check_content { my $self = shift @_; my $file_size; if (!defined ${$self->{'_contentref'}}) { my $cfile = $self->{'_orig_filename'}; # for handling a filename which contains Shift_JIS code for Windows. # for handling a filename which contains including space. if (($cfile =~ /\s/) || ($English::OSNAME eq "MSWin32" && $cfile =~ /[\x81-\x9f\xe0-\xef][\x40-\x7e\x80-\xfc]|[\x20\xa1-\xdf]/) ) { my $tmpfile = util::tmpnam("NMZ.win32"); unlink $tmpfile if (-e $tmpfile); copy($self->{'_orig_filename'}, $tmpfile); $file_size = util::filesize($tmpfile); # not only file in feature. if ($file_size > $conf::FILE_SIZE_MAX) { $self->{'_errmsg'} = _("is larger than your setup before filtered, skipped: ") . 'conf::FILE_SIZE_MAX (' . $conf::FILE_SIZE_MAX . ') < '. $file_size ; $self->{'_mimetype'} = 'x-system/x-error; x-error=file_size_max'; unlink $tmpfile; return; } ${$self->{'_contentref'}} = util::readfile($tmpfile); unlink $tmpfile; } else { $file_size = util::filesize($cfile); # not only file in feature. if ($file_size > $conf::FILE_SIZE_MAX) { $self->{'_errmsg'} = _("is larger than your setup before filtered, skipped: ") . 'conf::FILE_SIZE_MAX (' . $conf::FILE_SIZE_MAX . ') < '. $file_size ; $self->{'_mimetype'} = 'x-system/x-error; x-error=file_size_max'; return; } ${$self->{'_contentref'}} = util::readfile($cfile); } } else { $file_size = length(${$self->{'_contentref'}}); } $self->{'_field_info'}{'size'} = $file_size; } # check the file. No $msg is good. # sub _check_file ($$$$$) { my ($cfile, $cfile_size, $text_size, $mtype, $uri) = @_; my $msg = undef; if ($mtype =~ /; x-system=unsupported$/) { $mtype =~ s/; x-system=unsupported$//; $msg = _("Unsupported media type ")."($mtype)"._(" skipped."); } elsif ($mtype =~ /; x-error=file_size_max/) { $msg = _("is larger than your setup before filtered, skipped: ") . 'conf::FILE_SIZE_MAX (' . $conf::FILE_SIZE_MAX . ') < '. $cfile_size ; } elsif ($mtype =~ /; x-error=.*$/) { $mtype =~ s/^.*; x-error=(.*)$/$1/; $msg = $mtype; } elsif ($mtype =~ /^x-system/) { $msg = _("system error occurred! ")."($mtype)"._(" skipped."); } elsif (! -e $cfile) { $msg = _("does NOT EXIST! skipped."); } elsif (! util::canopen($cfile)) { $msg = _("is NOT READABLE! skipped."); } elsif ($text_size == 0 || $cfile_size == 0) { $msg = _("is 0 size! skipped."); } elsif ($mtype =~ /^application\/octet-stream/) { $msg = _("may be a BINARY file! skipped."); } elsif ($cfile_size > $conf::FILE_SIZE_MAX) { $msg = _("is larger than your setup before filtered, skipped: ") . 'conf::FILE_SIZE_MAX (' . $conf::FILE_SIZE_MAX . ') < '. $cfile_size ; } elsif ($text_size > $conf::TEXT_SIZE_MAX) { $msg = _("is larger than your setup after filtered, skipped: ") . 'conf::TEXT_SIZE_MAX (' . $conf::TEXT_SIZE_MAX . ') < '. $text_size ; } return $msg; } 1;