Current Path : /compat/linux/proc/self/root/usr/local/share/namazu/pl/ |
FreeBSD hs32.drive.ne.jp 9.1-RELEASE FreeBSD 9.1-RELEASE #1: Wed Jan 14 12:18:08 JST 2015 root@hs32.drive.ne.jp:/sys/amd64/compile/hs32 amd64 |
Current File : //compat/linux/proc/self/root/usr/local/share/namazu/pl/wakati.pl |
# # -*- Perl -*- # $Id: wakati.pl,v 1.9.8.9 2006/03/10 16:25:32 opengl2772 Exp $ # Copyright (C) 1997-1999 Satoru Takabayashi All rights reserved. # Copyright (C) 2000-2006 Namazu Project All rights reserved. # This is free software with ABSOLUTELY NO WARRANTY. # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either versions 2, or (at your option) # any later version. # # This program is distributed in the hope that it will be useful # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA # 02111-1307, USA # # This file must be encoded in EUC-JP encoding # package wakati; use strict; # Do wakatigaki processing for a Japanese text. sub wakatize_japanese ($) { my ($content) = @_; my @tmp = wakatize_japanese_sub($content); # Remove words consists of only Hiragana characters # when -H option is specified. # Original of this code was contributed by <furukawa@tcp-ip.or.jp>. # [1997-11-13] # And do Okurigana processing. [1998-04-24] if ($var::Opt{'hiragana'} || $var::Opt{'okurigana'}){ for (my $ndx = 0; $ndx <= $#tmp; ++$ndx){ $tmp[$ndx] =~ s/(\s)/ $1/g; $tmp[$ndx] = ' ' . $tmp[$ndx]; if ($var::Opt{'okurigana'}) { $tmp[$ndx] =~ s/((?:[^\xa4][\xa1-\xfe])+)(?:\xa4[\xa1-\xf3])+ /$1 /g; } if ($var::Opt{'hiragana'}) { $tmp[$ndx] =~ s/ (?:\xa4[\xa1-\xf3])+ //g; } } } # Collect only noun words when -m option is specified. if ($var::Opt{'noun'}) { $$content = ""; $$content .= shift(@tmp) =~ /(.+ )\xcc\xbe\xbb\xec/ ? $1 : "" while @tmp; # noun (meisi) in Japanese "cc be bb ec" } else { $$content = join("\n", @tmp); } $$content =~ s/^\s+//gm; $$content =~ s/\s+$//gm; $$content =~ s/ +/ /gm; $$content .= "\n"; util::dprint(_("-- wakatized content --\n")."$$content\n"); } sub wakatize_japanese_sub ($) { my ($content) = @_; my $str = ""; my @tmp = (); if ($conf::WAKATI =~ /^module_(\w+)/) { my $module = $1; if ($module eq "kakasi") { $str = $$content; $str =~ s/([\x80-\xff]+)/{my $text = Text::Kakasi::do_kakasi($1); " $text ";}/ge; } elsif ($module eq "chasen") { if ($var::Opt{'noun'}) { $str = Text::ChaSen::sparse_tostr_long($$content); } else { $str = $$content; $str =~ s/([\x80-\xff]+)/{my $text = Text::ChaSen::sparse_tostr_long($1); " $text ";}/ge; } } elsif ($module eq "mecab") { use vars qw($t); if (!defined $t) { require MeCab; import MeCab; eval '$t = new MeCab::Tagger("-Owakati");' or $t = new MeCab::Tagger([qw(mecab -O wakati)]); } END { $t->DESTROY() if defined $t; }; $str = $$content; $str =~ s/([\x80-\xff]+)/{my $s = $1; my $text = $t->parse($s); " $text ";}/ge; } else { util::cdie(_("invalid wakati module: ")."$module\n"); } util::dprint(_("-- wakatized bare content --\n")."$str\n\n"); @tmp = split('\n', $str); } else { my $tmpfile = util::tmpnam("NMZ.wakati"); util::dprint(_("wakati: using ")."$conf::WAKATI\n"); # Don't use IPC::Open2 because it's not efficent. if ($var::Opt{'noun'}) { my $fh_wakati = util::efopen("|$conf::WAKATI > $tmpfile"); print $fh_wakati $$content; util::fclose($fh_wakati); } else { $str = $$content; my $redirect = ">"; while(1) { if ($str =~ s/^([\x80-\xff]+)//s) { my $fh_wakati = util::efopen("|$conf::WAKATI $redirect $tmpfile"); print $fh_wakati " $1\n"; util::fclose($fh_wakati); } elsif ($str =~ s/^([\x00-\x7f]+)//s) { my $fh_wakati = util::efopen("$redirect $tmpfile"); print $fh_wakati " $1 "; util::fclose($fh_wakati); } else { last; } $redirect = ">>"; } } { my $fh_wakati = util::efopen($tmpfile); @tmp = <$fh_wakati>; chomp @tmp; util::fclose($fh_wakati); } unlink $tmpfile; } return @tmp; } 1;