Current Path : /usr/local/bin/ |
FreeBSD hs32.drive.ne.jp 9.1-RELEASE FreeBSD 9.1-RELEASE #1: Wed Jan 14 12:18:08 JST 2015 root@hs32.drive.ne.jp:/sys/amd64/compile/hs32 amd64 |
Current File : //usr/local/bin/kwnmz |
#! /usr/local/bin/perl -w # -*- Perl -*- # # $Id: kwnmz.in,v 1.4.8.3 2005/09/24 12:25:07 opengl2772 Exp $ # # kwnmz - program to make NMZ.field.keywords # by furukawa@tcp-ip.or.jp # modified by osamu2001@livedoor.com # Copyright (C) 2001-2005 Namazu Project All rights reserved. # This is free software with ABSOLUTELY NO WARRANTY. # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either versions 2, or (at your option) # any later version. # # This program is distributed in the hope that it will be useful # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA # 02111-1307, USA # # This file must be encoded in EUC-JP encoding # use strict; my $PKGDATADIR = $ENV{'pkgdatadir'} || "/usr/local/share/namazu"; push(@INC, $PKGDATADIR . "/pl"); require 'nmzidx.pl'; my $rate = 0.5; my $num = 10; my $len = 16; my $e_pat = undef; my $e_flag = undef; my $j_pat = undef; my $j_flag = undef; my $backup = 1; my $meta = 1; my @tfidf; while (@ARGV && $ARGV[0] =~ s/^\-//){ my $argv = shift; &usage, exit if $argv eq '-help'; $backup = 0, next if $argv eq '-no-backup'; $meta = 0, next if $argv eq '-no-meta'; while ($argv =~ s/^(.)//){ $len = ($argv eq '')? shift: $argv, last if $1 eq 'l'; $rate = ($argv eq '')? shift: $argv, last if $1 eq 'r'; $num = ($argv eq '')? shift: $argv, last if $1 eq 'n'; $e_pat = ($argv eq '')? shift: $argv, last if $1 eq 'e'; $e_pat = '/^[\da-z_]{3}/' if $1 eq 'E'; $j_pat = ($argv eq '')? shift: $argv, last if $1 eq 'j'; $j_pat = '/(^(\xa5.){3})|(^([\xb0-\xf4].){2})/' if $1 eq 'J'; $backup = 0 if $1 eq 'b'; $meta = 0 if $1 eq 'm'; } } if (defined $e_pat){ $e_flag = ($e_pat =~ s/^\!\/(.*)\/$/$1/); $e_pat =~ s/^\/(.*)\/$/$1/; } if (defined $j_pat){ $j_flag = ($j_pat =~ s/^\!\/(.*)\/$/$1/); $j_pat =~ s/^\/(.*)\/$/$1/; } if (@ARGV){ for my $argv (@ARGV){ $argv =~ s/NMZ$// unless -d $argv; $argv = '.' if $argv eq ''; &kwnmz($argv, $num, $rate, \&judge, $meta, $backup); } }else{ &kwnmz('.', $num, $rate, \&judge, $meta, $backup); } sub judge{ my $word = shift; return 0 unless $len > length $word; if (0x80 & ord $word){ return 1 unless defined $j_pat; return ($word =~ /$j_pat/ xor $j_flag); }else{ return 1 unless defined $e_pat; return ($word =~ /$e_pat/ xor $e_flag); } } sub kwnmz{ my($dir, $num, $rate, $judge, $meta, $backup) = @_; $num = 10 unless defined $num; $rate = 0.5 unless defined $rate; my %table = (); if (! -f "$dir/NMZ.i") { print "Cannot open index. : $dir\n"; return; } my $nmzi = new nmzidx($dir, 'r'); defined($nmzi) or die("NMZ.lock2 found. Maybe this index is being updated by another process now."); my $nmzo = new nmzidx($dir, 'w'); { my $word; my %list_i; my $word_number = 0; my $word_hit_number = 0; my $number_of_document = $nmzi->open_flist->{'size'}; my $lim = $number_of_document * $rate; my $wh = $nmzi->open_word; exit 1 unless $wh->{'size'}; while (defined $wh->read(\$word, \%list_i)){ print "$word_hit_number/$word_number\n" unless ++$word_number % 10000; next if defined($judge) && !&$judge($word); my $hit = scalar keys %list_i; # a term hitting too much doesn't help for narrowing a search next if $lim && $hit > $lim; # a term hitting only once doesn't help for # extracting related documents because there are # no documents containing this term. next if $hit <= 1; ++$word_hit_number; $hit = log($number_of_document/$hit); for my $fileno (keys %list_i){ $tfidf[$fileno]{$word} = $list_i{$fileno} * $hit; if (scalar(keys %{$tfidf[$fileno]}) > ($num << 1)){ my @sorted_words = sort{ $tfidf[$fileno]->{$b} <=> $tfidf[$fileno]->{$a} or $a cmp $b} keys %{$tfidf[$fileno]}; for my $i ($num..$#sorted_words){ delete $tfidf[$fileno]{$sorted_words[$i]}; } } } } } print "NMZ.w search finished!\n"; { my $fi; my $fo = $nmzo->open_field('keywords'); $fi = $nmzi->open_field('keywords') if $meta; for my $tmp (@tfidf){ my $keywords; $keywords = $fi->{'keywords'}->getline if defined $fi->{'keywords'}; if (!defined($keywords) || $keywords =~ /^ /){ my $cnt = 0; $keywords = ''; for my $word (sort {$tmp->{$b} <=> $tmp->{$a} or $a cmp $b} keys %$tmp){ $keywords .= " $word"; last unless ++$cnt < $num; } $keywords .= "\n"; } $fo->{'keywords'}->putline($keywords); } } $nmzo->replace_db($backup); $nmzi->close; } sub usage{ print ("Usage: kwnmz [options] <target(s)>\n" . " --help show this help and exit.\n" . " -b, --no-backup do not backup original file.\n" . " -m, --no-meta do not protect meta tag field.\n" . " -n num number of keyword per file. (default: 10)\n" . "\n" . " -l MAX_LENGTH set MAX_LENGTH of valid word. (default: 16)\n" . " -r MAX_RATE set MAX_RATE (hit/total). (default: 0.5)\n" . "\n" . " -e PATTERN set PATTERN for ascii words which should be allowed.\n" . " -E same as '-e /^[\\da-z_]{3}/'\n" . " -j PATTERN set PATTERN for non-ascii words which should be allowed.\n" . " -J same as '-j /(^(\\xa5.){3})|(^([\\xb0-\\xf4].){2})/'\n" ); }