config root man

Current Path : /compat/linux/proc/68247/root/usr/local/bin/
FreeBSD hs32.drive.ne.jp 9.1-RELEASE FreeBSD 9.1-RELEASE #1: Wed Jan 14 12:18:08 JST 2015 root@hs32.drive.ne.jp:/sys/amd64/compile/hs32 amd64
Current File : //compat/linux/proc/68247/root/usr/local/bin/bsfilter
#! /usr/bin/env ruby
## -*-Ruby-*- $Id: bsfilter,v 1.85 2008/03/02 13:02:11 nabeken Exp $
## Copyright (C) 2003, 2004, 2005, 2006 NABEYA Kenichi
##
## This program is free software; you can redistribute it and/or modify
## it under the terms of the GNU General Public License as published by
## the Free Software Foundation; either version 2 of the License, or
## (at your option) any later version.
##
## This program is distributed in the hope that it will be useful,
## but WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
## GNU General Public License for more details.
##
## You should have received a copy of the GNU General Public License
## along with this program; if not, write to the Free Software
## Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA

require 'getoptlong'
require 'nkf'

class Bsfilter
  # Start with empty collections and every handle unset; the fields are
  # filled in later by code outside this method.
  def initialize
    @threads    = []
    @token_dbs  = nil
    @options    = {}
    @db_hash    = {}
    @jtokenizer = nil
  end
  attr_accessor :token_dbs

  # Release string taken from the RCS-style $Name$ keyword: the second
  # whitespace-separated word, with leading non-digits removed and
  # underscores turned into dots (e.g. "release_1_0_16" -> "1.0.16").
  Release = "$Name: release_1_0_16 $".split[1].sub(/\A[^\d]*/, '').gsub(/_/, '.')
  # When the keyword carries no digits the result is empty; use "-" instead.
  Release.concat("-") if (Release == "")
  # Revision keeps only the digits and dots of the $Revision$ keyword.
  Revision = "$Revision: 1.85 $".gsub(/[^\.\d]/, '')
  # Language identifiers known to the filter and the one used by default.
  Languages = ["C", "ja"]
  Default_Language = "C"

##  Options = Hash::new           # used like a global variable
##  DB = Hash::new
  
  # Default values. How they are wired into @options is outside this part
  # of the file; presumably each one backs the option of the same name.
  Default_header_prefix = "Abuse"
  Default_spam_subject_prefix = "[SPAM] "
  # Comma-separated list of header names.
  Default_refer_header = 
    ["Ufrom", "From", "To", "Cc", "Subject", "Reply-to", "Return-path", "Received",
     "Content-Transfer-Encoding", "Content-Type", "charset", "Content-Disposition"].join(",")
  
  Default_jtokenizer = "bigram"
  # Characters that are accepted inside a token in addition to word characters.
  Default_mark_in_token = "|!*'"
  Default_homedir = ".mf"
  Default_conf_file = "/usr/local/etc/bsfilter.conf"
  Default_pid_file = "bsfilter.pid"
  
  Default_method = "rf"           # Robinson Fisher
  Default_db = "sdbm"
  Default_max_mail = 10000
  Default_min_mail = 8000
  Default_max_line = 500
  
  Default_pop_proxy_if = "0.0.0.0"
  Default_pop_port = "110"
  Default_pop_proxy_port = "10110"
  Default_pop_max_size = 50000
  
  Default_imap_port = "143"
  Default_imap_auth = "auto"
  Default_imap_auth_preference = ["cram-md5", "login", "loginc"]

  Default_icon_number = 32512
  
  # File name suffixes for the per-category files and the lock file.
  Clean_ext = ".clean"
  Spam_ext = ".spam"
  Prob_ext = ".prob"
  Lock_ext = ".lock"
  
  # File name suffix for each supported database backend.
  SDBM_ext = ".sdbm"
  GDBM_ext = ".gdbm"
  BDB1_ext = ".bdb1"
  BDB_ext = ".bdb"
  QDBM_ext = ".qdbm"
  
  # Result codes; their consumers are outside this part of the file.
  EXIT_NORMAL = 0
  CODE_NORMAL = true
  CODE_SPAM = true
  CODE_CLEAN = false
  
  # Character set names handed to Iconv.
  CODESET_EUCJP = "eucJP"
  CODESET_LATIN = "ISO8859-1"
  CODESET_GB18030 = "GB18030"
  CODESET_UTF8 = "UTF-8"
  # Two consecutive three-byte sequences whose lead byte is 0xe0-0xef and
  # whose following bytes are 0x80-0xbf.
  PATTERN_UTF8 = '[\xe0-\xef][\x80-\xbf][\x80-\xbf][\xe0-\xef][\x80-\xbf][\x80-\xbf]'
  RE_UTF8 = Regexp.new(PATTERN_UTF8, 'n')
  
  # Alternation ("a|b|c") of the HTML element names recognised by
  # RE_ALL_TAGS.  Three misspelled entries were corrected: "spam" -> "span",
  # the duplicated "sub" -> "sup", and "nofrmaes" -> "noframes".
  ALL_TAGS = ["html", "head", "title", "meta", "body", "div", "span",
              "h1", "h2", "h3", "h4", "h5", "h6",
              "em", "strong", "font", "basefont", "big", "small",
              "b", "i", "s", "u", "tt", "sub", "sup",
              "rb", "rp", "rt","ruby",
              "blink", "marquee",
              "dfn", "cite", "abbr", "acronym",
              "blockquote", "q",
              "br", "pre", "ins", "del", "center", "style", "hr",
              "ul", "ol", "li", "dl", "dt", "dd",
              "table", "caption", "thead", "tbody", "tfoot",
              "colgroup", "col", "tr", "td", "th",
              "a", "link", "base", "img", "address",
              "form", "input", "select", "option", "textarea", "label",
              "fieldset", "legend", "optgroup",
              "frameset", "frame", "noframes", "iframe"].join('|')
  
  # Alternation of the tag names matched by RE_SPACE_TAGS.
  # NOTE(review): presumably these tags are turned into white space when
  # HTML is stripped -- confirm against the HTML handling code.
  SPACE_TAGS = "br|p|td|tr|table|ul|ol|dl|li|dt|dd"
  
  # Both patterns match "<" followed by one of the listed names and a word
  # boundary at the very start of a string, ignoring case.
  RE_ALL_TAGS = Regexp::compile('\A<(' + ALL_TAGS + ')\b', Regexp::IGNORECASE, 'n')
  RE_SPACE_TAGS = Regexp::compile('\A<(' + SPACE_TAGS + ')\b', Regexp::IGNORECASE, 'n')
  
  SOCKET_TIMEOUT = 30             # for single socket operation
  
  # Helpers that stamp filter results into a mail held as an array of lines
  # (each element is one line including its end-of-line characters).
  # The including object must provide @options.
  module Bsutil
    # Replace the first header field named +header+ with +content+, or
    # insert a new field at the end of the header section when none exists.
    #
    # buf::     array of lines, modified in place
    # header::  field name including the trailing colon, e.g. "X-Abuse-Flag:"
    # content:: field body
    #
    # Field names are compared case-insensitively (as append_header! does),
    # and a leading unix "From " line is recognised regardless of case (as
    # get_headers does).  The end-of-line style of the first line is reused.
    def insert_header!(buf, header, content)
      buf[0] =~ /([\r\n]*)\z/
      eol = $1
      new_line = "#{header} #{content}#{eol}"
      wanted = header.downcase

      (0 ... buf.length).each do |i|
        str = buf[i]
        if ((i == 0) && (str =~ /\A>?from\s+\S+/i)) # unix from line
          next
        elsif (str =~ /\A(.*?:)/)
          if ($1.downcase == wanted)
            buf[i] = new_line
            return
          end
        elsif (str =~ /\A\s+\S/)  # folded header
          next
        else
          # blank separator, or body text that starts without a separator
          buf[i, 0] = new_line
          return
        end
      end
      buf.push(new_line)
    end

    # Put +prefix+ in front of the body of every header field named
    # +header+ (case-insensitive).  When the header section contains no
    # such field, a new "header prefix" line is added at its end.
    #
    # Scanning stops at the end of the header section (blank line, or a
    # line that starts at column 0 without a field name), so body lines
    # that merely look like the header are left untouched.
    def append_header!(buf, header, prefix)
      buf[0] =~ /([\r\n]*)\z/
      eol = $1
      wanted = header.downcase
      append_done = false
      (0 ... buf.length).each do |i|
        str = buf[i]
        if (str =~ /\A(.*?:)(\s*)(.*?)([\r\n]*)\z/)
          h = $1
          org_content = $3
          if (h.downcase == wanted)
            buf[i] = "#{header} #{prefix}#{org_content}#{eol}"
            append_done = true
          end
        elsif (((str =~ /\A\S/) && (str !~ /\A\S+:/)) || # found body without separator
               (str =~ /\A[\r\n]*\z/)) # separator between header and body
          buf[i, 0] = "#{header} #{prefix}#{eol}" if (! append_done)
          append_done = true
          break
        end
      end
      buf.push("#{header} #{prefix}#{eol}") if (! append_done)
    end

    # Header field names built from the "header-prefix" option.
    def x_spam_flag
      return sprintf("X-%s-Flag:", @options["header-prefix"])
    end
    
    def x_spam_probability
      return sprintf("X-%s-Rate:", @options["header-prefix"])
    end
    
    def x_spam_revision
      return sprintf("X-%s-Revision:", @options["header-prefix"])
    end
    
    # Apply every header modification enabled in @options to +buf+.
    #
    # spam_flag::   true when the mail was judged spam
    # probability:: numeric spam probability, or nil when unknown
    #
    # Returns true when at least one of the relevant options is enabled,
    # even if that option ended up not changing the mail.
    def insert_headers!(buf, spam_flag, probability=nil)
      updated = false
      if (@options["insert-revision"])
        insert_header!(buf, x_spam_revision, "bsfilter release #{Release} revision #{Revision}")
        updated = true
      end
      if (@options["insert-flag"])
        updated = true
        # Only "Yes" is written; clean mail gets no flag header.
        insert_header!(buf, x_spam_flag, "Yes") if (spam_flag)
      end
      if (@options["insert-probability"] && probability)
        updated = true
        insert_header!(buf, x_spam_probability, sprintf("%f", probability))
      end
      if (@options["mark-spam-subject"])
        updated = true
        append_header!(buf, "Subject:", @options["spam-subject-prefix"]) if (spam_flag)
      end
      return updated
    end
  end                           # end of module

  include Bsutil

  # Output sink that accepts the calls made on a message file handle
  # (sync=, print, printf) and discards everything.
  class DevNull
    def sync=(*_ignored); end

    def print(*_ignored); end

    def printf(*_ignored); end
  end

  # Hash of counters that may nest (category => token => count) and knows
  # how to add and subtract another hash of the same shape.
  class DBHash < Hash
    # Walk the nested structure and yield every leaf as (joined_key, value),
    # where the keys along the path are joined with +magic+.
    # Note that this replaces Hash#flatten with a different contract.
    def flatten(magic="###", head="", &block)
      self.each do |k, v|
        path = (head == "") ? k : head + magic + k
        if (v.is_a?(DBHash))
          v.flatten(magic, path, &block)
        else
          yield path, v
        end
      end
    end
    
    # Add every entry of +hash+ to the receiver.  Nested hashes are merged
    # recursively; other values are combined with "+".
    #
    # Nested hashes that are new to the receiver are copied instead of
    # stored by reference, so a later add/sub on the receiver cannot modify
    # the hash that was passed in.  Plain Hash arguments are accepted too.
    def add(hash)
      hash.each do |k, v|
        cur = self[k]
        if (cur)
          if (cur.is_a?(DBHash) && v.is_a?(Hash))
            cur.add(v)
          else
            self[k] = cur + v
          end
        elsif (v.is_a?(Hash))
          copy = DBHash::new
          copy.add(v)
          self[k] = copy
        else
          self[k] = v
        end
      end
    end

    # Subtract every entry of +hash+ from the receiver.  A value that would
    # drop to zero or below is deleted, and so is a nested hash that
    # becomes empty.  Keys missing from the receiver are ignored.
    # Plain Hash arguments are accepted as well as DBHash.
    def sub(hash)
      hash.each do |k, v|
        cur = self[k]
        next if (! cur)
        if (cur.is_a?(DBHash) && v.is_a?(Hash))
          cur.sub(v)
          self.delete(k) if (cur.empty?)
        elsif (cur > v)
          self[k] = cur - v
        else
          self.delete(k)
        end
      end
    end
  end

  # Try to load +file+ and report whether it could be loaded:
  # true on success, false when a LoadError is raised.
  def safe_require(file)
    require file
    true
  rescue LoadError
    false
  end

  # Return a copy of +str+ in which selected high bytes are replaced by
  # ASCII characters: 0x92-0x94 become apostrophes, and the listed byte
  # ranges 0xc0-0xdc / 0xe0-0xfc become A/E/I/O/U and a/e/i/o/u.
  # NOTE(review): going by the method name these are accented ISO-8859-1
  # vowels; the byte-wise tr relies on single-byte string semantics.
  def latin2ascii(str)
    newstr = str.tr("\x92\x93\x94", "'''")
    # tr! returns nil when nothing changed, so its result is not used.
    newstr.tr!("\xc0-\xc5\xc8-\xcb\xcc-\xcf\xd2-\xd6\xd9-\xdc", "AAAAAAEEEEIIIIOOOOOUUUU")
    newstr.tr!("\xe0-\xe5\xe8-\xeb\xec-\xef\xf2-\xf6\xf9-\xfc", "aaaaaaeeeeiiiiooooouuuu")
    return newstr
  end

  # Add conversion helpers to Iconv as singleton methods.  Iconv must
  # already be loaded when this is called.
  def define_safe_iconv
    # Convert each string word by word so that one unconvertible word does
    # not abort the whole conversion; such a word is replaced by a single
    # space.  Returns one converted string per input string, in order.
    # (Previously every result was the concatenation of all inputs because
    # the input list was iterated a second time inside the map.)
    def Iconv.safe_iconv(tocode, fromcode, *strs)
      strs.map do |str|
        converted = str.split(/(\s+)/).map do |word|
          begin
            Iconv.iconv(tocode, fromcode, word)[0]
          rescue
            ' '                 # best effort: drop what cannot be converted
          end
        end
        converted.join
      end
    end
    # UTF-8 -> EUC-JP, then normalised by NKF with '-e -E -X -Z0'.
    def Iconv.u2eucjp(str)
      return NKF::nkf('-e -E -X -Z0', (Iconv.safe_iconv(CODESET_EUCJP, CODESET_UTF8, str))[0])
    end
    # UTF-8 -> ISO8859-1.
    def Iconv.u2latin(str)
      return (Iconv.safe_iconv(CODESET_LATIN, CODESET_UTF8, str))[0]
    end
    # GB18030 -> EUC-JP.
    def Iconv.gb180302eucjp(str)
      return (Iconv.safe_iconv(CODESET_EUCJP, CODESET_GB18030, str))[0]
    end
  end
  
  # Open +file+ for reading and yield a line source to the block.
  #
  # file:: "-" for STDIN, an Array of lines, or the path of a regular file
  #
  # An Array is given gets/readlines/eof? singleton methods so it can be
  # read like an IO.  A path is opened in binary mode and is always closed
  # again, even when the block raises.  Raises RuntimeError when the path
  # is not a regular file.
  def open_ro(file)
    if (file == "-")
      yield STDIN
    elsif (file.class == Array)
      file.instance_eval <<EOM
      @eof = false
      def gets
        @n = 0 if (! @n)
        if (@n >= self.length)
          nil
        else
          @n = @n + 1
          self[@n - 1]
        end
      end
      def readlines
        @eof = true
        self
      end
      def eof?
        (@eof || empty?)
      end
EOM
      yield file
    else
      if (! FileTest::file?(file))
        raise sprintf("%s is not file", file)
      end
      # File::open never interprets the name as a command, unlike Kernel#open.
      fh = File::open(file, "rb")
      begin
        yield fh
      ensure
        fh.close if (! fh.closed?)
      end
      nil                       # the file branch has always returned nil
    end
  end
  
  # Open +file+ for writing in binary mode ("-" means STDOUT).
  #
  # With a block the handle is yielded and a real file is closed afterwards,
  # also when the block raises; the result is nil.  STDOUT is never closed.
  # Without a block the open handle is returned and the caller owns it.
  def open_wo(file, &block)
    to_stdout = (file == "-")
    fh = to_stdout ? STDOUT : File::open(file, "wb")
    return fh if (! block)

    begin
      yield fh
    ensure
      fh.close if (! to_stdout && ! fh.closed?)
    end
    nil
  end
  
  # Number kept as mant * e**exp so that products of many small values can
  # be accumulated by adding exponents.
  class FLOAT
    attr_accessor :mant, :exp

    # f:: initial value, power:: exponent applied to it (see set_f)
    def initialize(f=0, power=1)
      @mant = 0
      @exp = 0
      set_f(f, power)
    end
    
    # Plain numeric value: mant * e**exp.
    def to_f
      @mant * Math::exp(@exp)
    end
    
    # Natural logarithm of the value.
    def ln
      Math::log(@mant) + @exp
    end
    
    # Product with another FLOAT (mantissas multiplied, exponents added) or
    # with a plain number (only the mantissa is scaled).  Returns a new FLOAT.
    def * (a)
      product = FLOAT::new
      if (a.class == FLOAT)
        product.mant = @mant * a.mant
        product.exp = @exp + a.exp
      else
        product.mant = @mant * a
        product.exp = @exp
      end
      product
    end

    # Store +a+ as sign and log(|a|) * power; zero is stored as mant 0,
    # exp 0.  Returns self.
    def set_f (a, power=1)
      @mant, magnitude =
        if (a > 0)
          [1, a]
        elsif (a < 0)
          [-1, -a]
        else
          [0, nil]
        end
      @exp = magnitude ? Math::log(magnitude) * power : 0
      self
    end
  end
  
  
  # Operations shared by the token stores.  The including class provides
  # each_ct (yields category and token as two values), value, set and
  # sub_scalar, plus @options, @file_count, @filename, @language and @dirty.
  module TokenAccess
    # Scale the store down when it holds more than +max_size+ mails: every
    # counter outside the ".internal" category is reduced proportionally and
    # the mail count is set to +min_size+.
    # Returns true when a reduction took place, false otherwise (also when
    # either limit is zero or negative).
    def check_size(max_size, min_size)
      if ((@file_count <= max_size) || (max_size <= 0) || (min_size <= 0))
        return false
      end
      old_count = @file_count
      if (@options["verbose"])
        @options["message-fh"].printf("reduce token database %s from %d to %d\n", @filename, old_count, min_size)
      end
      
      # key_cts is a snapshot, so entries may be deleted while iterating.
      key_cts.each do |category, token|
        next if (category == ".internal")
        v = value(category, token) || 0
        sub_scalar(category, token, (v * (old_count - min_size).to_f / old_count.to_f).ceil)
        if (@options["debug"] && ! value(category, token))
          @options["message-fh"].printf("deleted %s %s\n", category, token)
        end
      end
      @file_count = min_size    # sub_scalar changed the count; set the final value
      @dirty = true
      return true
    end
    
    # Look up +token+; when it is unknown and the "degeneration" option is
    # set, retry with progressively simplified spellings: last character
    # removed, "mark-in-token" characters removed, then downcased, upcased
    # and capitalized.  Returns the first value found, or nil.
    def value_with_degene(category, token)
      v = value(category, token)
      return v if (v)
      return nil if (! @options["degeneration"])

      v = value(category, token[0 .. -2]) # cut last char
      return v if (v)

      token = token.gsub(Regexp::compile("[#{@options['mark-in-token']}]"), '')
      v = value(category, token)
      return v if (v)

      [:downcase, :upcase, :capitalize].each do |change|
        token = token.send(change)
        v = value(category, token)
        return v if (v)
      end
      return nil
    end

    # Store +val+ for the token, replacing any previous value.
    # Increments the mail count.
    def set_scalar(category, token, val)
      @dirty = true
      @file_count += 1
      set(category, token, val)
    end
    
    # Add +val+ to the token's value (starting from +val+ when the token is
    # new).  Increments the mail count.
    def add_scalar(category, token, val)
      @dirty = true
      @file_count += 1
      if (v = value(category, token))
        set(category, token, v + val)
      else
        set(category, token, val)
      end
    end
    
    # Report every token of +db+ that is missing from, or zero in, this store.
    def show_new_token(db)
      db.each_ct do |category, token|
        v = value(category, token)
        if (! v || (v == 0))
          @options["message-fh"].printf("new %s %s\n", category, token)
        end
      end
    end
    
    # All stored values, in each_ct order.
    def values
      array = Array::new
      each_ct do |c, t|
        array.push(value(c, t))
      end
      return array
    end
    
    # All [category, token] pairs, in each_ct order.
    def key_cts
      array = Array::new
      each_ct do |c, t|
        array.push([c, t])
      end
      return array
    end
    
    # Write one "language category token value" line per stored token to +fh+.
    def export(fh)
      each_ct do |category, token|
        v = value(category, token)
        fh.printf("%s %s %s %g\n", @language, category, token, v) if (v)
      end
    end
  end
  
  # In-memory token store backed by a nested DBHash
  # (category => token => value).
  class TokenDB
    include TokenAccess
    
    def initialize(language=nil)
      @hash = DBHash::new
      @file_count = 0
      @language = language
      @message_id = "-"
      @probability = nil
      @spam_flag = nil
      @dirty = false
      @time = nil
      @filename = "-"
    end
    attr_accessor :hash, :file_count, :probability, :language, :spam_flag, :message_id, :time, :filename
    
    # Number of categories (top-level keys), not of tokens.
    def size
      @hash.size
    end
    
    # Yield every (category, token) pair as two values.
    def each_ct
      @hash.each_key do |category|
        @hash[category].each_key do |token|
          yield(category, token)
        end
      end
    end
    
    # Stored value for the token, or nil when the category or the token is
    # unknown (or the stored value is false/nil).
    def value(category, token)
      tokens = @hash[category]
      return nil if (! tokens)
      return tokens[token] || nil
    end
    
    # Store +v+ for the token, creating the category when needed.
    def set(category, token, v)
      @dirty = true
      @hash[category] = DBHash::new if (! @hash[category])
      @hash[category][token] = v
    end
    
    # Print every key of +hash+ in sorted order, repeated as many times as
    # its (integer-truncated) value, each batch preceded by +separator+.
    def print_keys_to_str(hash, separator, fh=STDOUT)
      hash.keys.sort.each do |k|
        v = hash[k]
        v = v.to_i
        fh.print separator
        fh.print(([k] * v).join(separator))
      end
    end
    
    # Drop all tokens and reset the mail count.
    def clear
      @dirty = true
      @file_count = 0
      @hash = DBHash::new
    end
    
    # Merge another TokenDB into this one; adopts its language when this
    # store has none.
    def add_db(db)
      @dirty = true
      @file_count += db.file_count
      if (! @language && db.language)
        @language = db.language
      end
      @hash.add(db.hash)
    end
    
    # Merge a nested hash of counters, counting it as one mail.
    def add_hash(hash)
      @dirty = true
      @file_count += 1
      @hash.add(hash)
    end
    
    # Subtract +val+ from a single token and decrement the mail count
    # (never below zero).
    #
    # The argument handed to DBHash#sub is built from DBHash instances:
    # DBHash#sub only descends into values whose class is DBHash, so a plain
    # nested Hash would be compared against the category hash itself.
    # The store is also marked dirty, as every other mutator does.
    def sub_scalar(category, token, val)
      @dirty = true
      if (@file_count > 0)
        @file_count -= 1
      end
      tokens = DBHash::new
      tokens[token] = val
      delta = DBHash::new
      delta[category] = tokens
      @hash.sub(delta)
    end
    
    # Subtract a nested hash of counters, counting it as one mail.
    def sub_hash(hash)
      @dirty = true
      if (@file_count > 0)
        @file_count -= 1
      end
      @hash.sub(hash)
    end
    
    # Subtract another TokenDB; the mail count is kept at 1 or above.
    def sub_db(db)
      @dirty = true
      @file_count -= db.file_count
      if (@file_count < 1)
        @file_count = 1
      end
      @hash.sub(db.hash)
    end
  end
  
  # Token store kept in a DBM-style file.  Keys are "category###token",
  # values are numbers stored as strings.  Subclasses set @filename and
  # @lockfile and implement open_dbm(filename, mode).
  class TokenDBM
    include TokenAccess
    MAGIC = "###"
    def initialize(options, language, ext)
      @options = options
      @dbm = nil                  # SDBM not Hash
      @dirty = nil                # not used. for TokenAccess
      @lockfh = nil
      @file_count = nil
      @language = language
    end
    attr_accessor :file_count
    
    def size
      @dbm.size
    end
    
    # Copy the whole file into a new in-memory TokenDB.  Values are copied
    # as stored (strings).  The mail count is copied once, so it is also
    # set when the file holds no entries.
    def to_db
      token_db = TokenDB::new(@language)
      separator = Regexp.new(MAGIC)
      @dbm.each do |ct, v|
        (category, token) = ct.split(separator, 2)
        token_db.set(category, token, v)
      end
      token_db.file_count = @file_count
      return token_db
    end
    
    def clear
      @dbm.clear
      @file_count = 0
      set(".internal", "file_count", 0)
    end
    
    # Yield every (category, token) pair; keys without the separator are
    # skipped.
    def each_ct
      separator = Regexp.new(MAGIC)
      @dbm.each_key do |ct|
        (category, token) = ct.split(separator, 2)
        yield(category, token) if (category && token)
      end
    end
    
    # Merge an in-memory TokenDB into the file.
    def add_db(token_db)
      add_hash(token_db.hash)
      @file_count += token_db.file_count
    end
    
    def add_hash(hash)
      @dirty = true
      hash.flatten(MAGIC) do |k, v|
        if (@dbm[k])
          @dbm[k] = (@dbm[k].to_f + v.to_f).to_s
        else
          @dbm[k] = v.to_s
        end
      end
    end
    
    # Subtract an in-memory TokenDB; the mail count never drops below zero.
    def sub_db(token_db)
      sub_hash(token_db.hash)
      if (@file_count > token_db.file_count)
        @file_count -= token_db.file_count
      else
        @file_count = 0
      end
    end

    # Subtract counters; an entry that would reach zero or less is deleted.
    def sub_hash(hash)
      @dirty = true
      hash.flatten(MAGIC) do |k, v|
        if (@dbm[k])
          if (@dbm[k].to_f > v.to_f)
            @dbm[k] = (@dbm[k].to_f - v.to_f).to_s
          else
            @dbm.delete(k)
          end
        end
      end
    end
    
    # Stored value as a Float, or nil when the key is absent.
    def value(category, token)
      v = @dbm[category + MAGIC + token]
      if (v)
        return v.to_f
      else
        return nil
      end
    end
    
    def set(category, token, v)
      @dirty = true
      @dbm[category + MAGIC + token] = v.to_s
    end
    
    def sub_scalar(category, token, v)
      @dirty = true
      if (@file_count > 0)
        @file_count -= 1
      end
      oldv = value(category, token)
      if (oldv)
        if (oldv > v)
          set(category, token, oldv - v)
        else
          @dbm.delete(category + MAGIC + token)
        end
      end
    end
    
    # Take the lock file (shared for "r", exclusive for "w"/"wr"/"rw"), open
    # the database and load the mail count from ".internal###file_count".
    # Raises RuntimeError for an unknown mode.  When locking or opening the
    # database fails, the lock file handle is closed before re-raising.
    def open(mode="r")
      @lockfh = File::open(@lockfile, "w+")
      begin
        case mode
        when "r"
          begin
            @lockfh.flock(File::LOCK_SH)
          rescue Errno::EINVAL ## Win9x doesn't support LOCK_SH
            @lockfh.flock(File::LOCK_EX)
          end
        when "w", "wr", "rw"
          @lockfh.flock(File::LOCK_EX)
        else
          raise "internal error: unknown mode #{mode}"
        end
      
        @dbm = open_dbm(@filename, 0600)
      rescue
        @lockfh.close             # closing the handle also drops the lock
        @lockfh = nil
        raise
      end
      
      if (v = value(".internal", "file_count"))
        @file_count = v.to_i
      else
        @file_count = 0
        set(".internal", "file_count", @file_count)
      end
      if (@options["verbose"])
        @options["message-fh"].printf("open %s %d tokens %d mails by %d.\n", @filename, @dbm.length, @file_count, Process::pid)
      end
      @dirty = false
    end
    
    # Write back the mail count when anything changed, close the database
    # and release the lock.  The lock is released even when writing or
    # closing the database raises.
    def close
      dirty = @dirty
      begin
        set(".internal", "file_count", @file_count) if (dirty)
        if (@options["verbose"])
          @options["message-fh"].printf("close %s %d tokens %d mails by %d.\n", @filename, @dbm.length, @file_count, Process::pid)
        end
        if (@options["debug"] && dirty)
          key_cts.sort.each do |(c, t)|
            @options["message-fh"].printf("%s %s %s %f\n", @filename, c, t, value(c, t))
          end
        end
        @dbm.close
      ensure
        @lockfh.flock(File::LOCK_UN)
        @lockfh.close
      end
      @dirty = false
    end
  end
  
  # TokenDBM stored with SDBM (a ".dir"/".pag" file pair).
  class TokenSDBM < TokenDBM
    def initialize(options, language, ext)
      base = options["homedir"] + language + ext + SDBM_ext
      @filename = base
      @lockfile = base + Lock_ext
      super
    end

    # Empty the store by closing it, removing both files and reopening.
    def clear
      @file_count = 0
      @dbm.close
      begin
        File::unlink(@filename + ".dir")
        File::unlink(@filename + ".pag")
      rescue
        # best effort: a failed removal is ignored
      end
      @dbm = open_dbm(@filename, 0600)
      @options["message-fh"].printf("reopen %s by %d.\n", @filename, Process::pid) if (@options["verbose"])
    end

    def open_dbm(filename, mode)
      SDBM::open(filename, mode)
    end
  end
  
  # TokenDBM stored with GDBM; GDBM's own locking is switched off
  # (GDBM::NOLOCK).
  class TokenGDBM < TokenDBM
    def initialize(options, language, ext)
      base = options["homedir"] + language + ext + GDBM_ext
      @filename = base
      @lockfile = base + Lock_ext
      super                       # stores options in @options
    end

    # Empty the store by closing it, removing the file and reopening.
    def clear
      @file_count = 0
      @dbm.close
      begin
        File::unlink(@filename)
      rescue
        # best effort: a failed removal is ignored
      end
      @dbm = open_dbm(@filename, 0600)
      @options["message-fh"].printf("reopen %s by %d.\n", @filename, Process::pid) if (@options["verbose"])
    end

    def open_dbm(filename, mode)
      GDBM::open(filename, mode, GDBM::NOLOCK)
    end
  end
  
  # TokenDBM stored in a BDB1 hash file.
  class TokenBDB1 < TokenDBM
    def initialize(options, language, ext)
      base = options["homedir"] + language + ext + BDB1_ext
      @filename = base
      @lockfile = base + Lock_ext
      super
    end

    # Empty the store by closing it, removing the file and reopening.
    def clear
      @file_count = 0
      @dbm.close
      begin
        File::unlink(@filename)
      rescue
        # best effort: a failed removal is ignored
      end
      @dbm = open_dbm(@filename, 0600)
      @options["message-fh"].printf("reopen %s by %d.\n", @filename, Process::pid) if (@options["verbose"])
    end

    def open_dbm(filename, mode)
      BDB1::Hash.open(filename, BDB1::CREATE | BDB1::WRITE, mode)
    end
  end

  # TokenDBM stored in a BDB hash file.
  class TokenBDB < TokenDBM
    def initialize(options, language, ext)
      base = options["homedir"] + language + ext + BDB_ext
      @filename = base
      @lockfile = base + Lock_ext
      super
    end

    # Empty the store by closing it, removing the file and reopening.
    def clear
      @file_count = 0
      @dbm.close
      begin
        File::unlink(@filename)
      rescue
        # best effort: a failed removal is ignored
      end
      @dbm = open_dbm(@filename, 0600)
      @options["message-fh"].printf("reopen %s by %d.\n", @filename, Process::pid) if (@options["verbose"])
    end

    def open_dbm(filename, mode)
      BDB::Hash.open(filename, nil, BDB::CREATE, mode)
    end
  end

  # TokenDBM stored with QDBM's Depot.  Depot raises DepotError_ENOITEM for
  # a missing key, so the lookups are wrapped accordingly.
  class TokenQDBM < TokenDBM
    def initialize(options, language, ext)
      base = options["homedir"] + language + ext + QDBM_ext
      @filename = base
      @lockfile = base + Lock_ext
      super
    end

    # Stored value as a Float, or nil when the key does not exist.
    def value(category, token)
      found = @dbm[category + MAGIC + token]
      found.to_f
    rescue DepotError_ENOITEM
      nil
    end

    # Add the counters of +hash+; a missing key starts from the new value.
    def add_hash(hash)
      @dirty = true
      hash.flatten(MAGIC) do |key, amount|
        begin
          current = @dbm[key]
          if (current)
            @dbm[key] = (@dbm[key].to_f + amount.to_f).to_s
          else
            ## never reached. DepotError_ENOITEM is raised when the key is missing
            @dbm[key] = amount.to_s
          end
        rescue DepotError_ENOITEM
          @dbm[key] = amount.to_s
        end
      end
    end

    # Empty the store by closing it, removing the file and reopening.
    def clear
      @file_count = 0
      @dbm.close
      begin
        File::unlink(@filename)
      rescue
        # best effort: a failed removal is ignored
      end
      @dbm = open_dbm(@filename, 0600)
      @options["message-fh"].printf("reopen %s by %d.\n", @filename, Process::pid) if (@options["verbose"])
    end

    # +mode+ is accepted for interface compatibility and not used.
    def open_dbm(filename, mode)
      Depot::open(filename, Depot::OWRITER | Depot::OCREAT)
    end
  end

  # Guess the language from the header values named in the "refer-header"
  # option.
  #
  # headers:: hash of header name => value
  #
  # Returns ["ja", nil] when a value contains a MIME charset marker for a
  # Japanese encoding, ["ja", "jis"] when it contains a JIS X 0208 escape
  # sequence, and nil when nothing matched.
  def get_lang_from_headers(headers)
    # "iso-2202-jp" is listed next to "iso-2022-jp"; presumably a misspelling seen in real mail.
    reg_char_ja = Regexp::compile('\?(iso-2022-jp|iso-2202-jp|x.sjis|shift.jis|euc.jp)\?', Regexp::IGNORECASE, 'n')
    reg_jis = Regexp::compile("\\x1b\\x24[\\x42\\x40]", nil, 'n') # escape sequence to jisx0208 new and old
    @options["refer-header"].keys.each do |header_name|
      str = headers[header_name]
      if (str)
        case str
        when reg_char_ja
          @options["message-fh"].printf("lang ja header char_ja\n") if (@options["debug"])
          return ["ja", nil]
        when reg_jis
          @options["message-fh"].printf("lang ja header jis\n") if (@options["debug"])
          return ["ja", "jis"]
        end
      end
    end
    return nil
  end

  # Same as get_lang, but html_flag has to be given explicitly.
  def get_lang_from_buf(buf, html_flag)
    get_lang(buf, html_flag)
  end

  # Guess language and encoding from the lines of +buf+.
  #
  # buf::       array of lines
  # html_flag:: when true, each line is first passed through
  #             decode_character_reference2u
  #
  # Each line is tested with white space removed.  The first line that
  # matches decides the result: ["ja", "utf8"], ["ja", "jis"],
  # ["ja", "sjis"], or for the EUC pattern ["ja", "gb18030"] when a byte in
  # 0x80-0x9f was seen in this or an earlier line, otherwise ["ja", "euc"].
  # Returns [nil, nil] when no line matches.
  def get_lang(buf, html_flag=false)
    reg_euc = Regexp::compile("[\xa1\xa2-\xa1\xbc\xa4\xa1-\xa4\xf3\xa5\xa1-\xa5\xf6]{4}", nil, 'e') # kana in euc-jp without zenkaku-space
    reg_sjis = Regexp::compile("[\x81\x40-\x81\x5b\x82\x9f-\x82\xf1\x83\x40-\x83\x96]{2}", nil, 's') # kana in shift-jis
    reg_utf8 = Regexp::compile("[\xe3\x80\x80-\xe3\x80\x82\xe3\x81\x81-\xe3\x82\x93\xe3\x82\xa1-\xe3\x83\xb6]{4}", nil, 'u') # kana in utf8
    reg_jis = Regexp::compile("\\x1b\\x24[\\x42\\x40]", nil, 'n') # escape sequence to jisx0208 new and old
    reg_gb18030_possible = Regexp::compile('[\x80-\x9f]', nil, 'n')
    
##    reg_char_utf8 = Regexp::compile('(^\w+: .*|charset="?)(utf-8)', Regexp::IGNORECASE, 'n')
##    reg_cte_bin = Regexp::compile('\Acontent-transfer-encoding\s*:\s*(base64|quoted-printable)', Regexp::IGNORECASE, 'n')
##    reg_c = Regexp::compile('(^\w+: .*|charset="?)(ks_c_5601|euc-kr|big5|gb2312)', Regexp::IGNORECASE, 'n')

    gb18030_possible = false
    buf.each do |str|
      if (html_flag)
        str = decode_character_reference2u(str)
      end
      # The flag is sticky: once set it stays set for all following lines.
      if (str =~ reg_gb18030_possible)
        gb18030_possible = true
      end

      # The order of the when clauses sets the priority between encodings.
      case str.gsub(/\s/, '')
      when reg_utf8
        @options["message-fh"].printf("lang ja utf8\n") if (@options["debug"])
        return ["ja", "utf8"]
      when reg_jis
        @options["message-fh"].printf("lang ja jis\n") if (@options["debug"])
        return ["ja", "jis"]
      when reg_sjis
        @options["message-fh"].printf("lang ja sjis\n") if (@options["debug"])
        return ["ja", "sjis"]
      when reg_euc
        if (gb18030_possible)
          @options["message-fh"].printf("lang ja gb18030\n") if (@options["debug"])
          return ["ja", "gb18030"]
        else
          @options["message-fh"].printf("lang ja euc\n") if (@options["debug"])
          return ["ja", "euc"]
        end
      end
    end
    return [nil, nil]
  end
  
  # Split a mail into its headers and body.
  #
  # buf::  array of lines (not modified; a copy is consumed)
  # lang:: passed through unchanged
  #
  # Returns [headers, body, lang].  headers is a DBHash keyed by lower-cased
  # field name; repeated fields are concatenated, except "received", which
  # keeps the last occurrence.  A unix "From " line is stored as "ufrom".
  # "boundary" and "charset" are extracted from Content-Type, which is then
  # cut at its first ";".  When the "max-line" option is positive the body
  # is limited to buf[0 .. max-line] (an inclusive range).
  #
  # When the body starts without a blank separator line, that first body
  # line is put back at the front of the body unchanged.  (It used to be
  # appended to the end of the body with its line ending removed.)
  def get_headers(buf, lang)
    headers = DBHash::new
    buf = buf.dup
    if ((buf[0] !~ /\A>?from\s+(\S+)/i) && # this isn't mail
        (buf[0] !~ /\A(\S+):/))
      if (@options["max-line"] <= 0)
        return [headers, buf, lang]
      else
        return [headers, buf[0 .. @options["max-line"]], lang]
      end
    end
    
    current = nil                 # name of the field a folded line belongs to
    while (line = buf.shift)
      str = line.chomp
      if (str =~ /\A(\S+?):\s*(.*)/)
        current = $1.downcase
        content = $2.sub(/[\r\n]*\z/, '')
        if (current == "received")
          headers[current] = content
        else
          headers[current] = (headers[current] || "") + " " + content
        end
      elsif (str =~ /\A>?from\s+(\S+)/i)
        headers["ufrom"] = $1
      elsif (str =~ /\A[\r\n]*\z/) # separator between header and body
        break
      elsif (str =~ /\A\S/)     # found body without separator
        buf.unshift(line)       # rewind
        break
      elsif (! current)
        break
      else
        # folded line: a continuation that starts an encoded word is joined
        # without a space, anything else with a single space
        folded = str.sub(/[\r\n]*\z/, '')
        if (str =~ /\A\s*=\?/)
          headers[current] += folded.sub(/\A\s*/, '')
        else
          headers[current] += folded.sub(/\A\s*/, ' ')
        end
      end
    end

    if ((headers["content-type"] =~ /\bboundary=\s*"(.*?)"/i) ||
        (headers["content-type"] =~ /\bboundary=\s*'(.*?)'/i) ||
        (headers["content-type"] =~ /\bboundary=([^\s;]+)/i))
      headers["boundary"] = $1
    end
    if (headers["content-type"] =~ /charset=([\'\"]*)([^\s\1\;]+)\1/i)
      headers["charset"] = $2
    end
    if (headers["content-type"] =~ /\A([^;]+)/)
      headers["content-type"] = $1
    end
    
    if (@options["max-line"] <= 0)
      return [headers, buf, lang]
    else
      return [headers, buf[0 .. @options["max-line"]], lang]
    end
  end
  
  
  # Splits Japanese text into tokens with one of several back ends.
  # The patterns below are compiled with the 'e' (EUC) flag, so input is
  # presumably EUC-JP and lengths are counted in bytes (two per character)
  # -- NOTE(review): confirm on the Ruby version in use.
  class Jtokenizer
    # method:: "bigram", "block", "mecab", "chasen" or "kakasi".
    # The mecab/chasen/kakasi libraries must already be loaded when chosen.
    # Raises RuntimeError for any other name.
    def initialize(method)
      case method
      when "bigram"
        @method = Proc::new {|s| bigram(s)}
      when "block"
        @method = Proc::new {|s| block(s)}
      when "mecab"
        @method = Proc::new {|s| mecab(s)}
        if (defined?(MeCab::VERSION)) # defined after 0.90
          @m = MeCab::Tagger.new("-Ochasen")
        else
          @m = MeCab::Tagger.new([$0, "-Ochasen"])
        end
      when "chasen"
        Chasen.getopt("-F", '%H %m\n', "-j")
        @method = Proc::new {|s| chasen(s)}
      when "kakasi"
        @method = Proc::new {|s| kakasi(s)}
      else
        raise "internal error: unknown method #{method}"
      end
    end

    # Tokenize +str+ with the back end chosen at construction time.
    # Returns an array of tokens.
    def split(str)
      @method.call(str)
    end
    
    # Byte-range patterns for runs of kanji, runs of katakana, and single
    # characters that are / are not kanji or katakana.
    Reg_kanji = Regexp::compile("[\xb0\xa1-\xf4\xa4]+", nil, 'e')
    Reg_katakana = Regexp::compile("[\xa1\xbc\xa5\xa1-\xa5\xf6]+", nil, 'e')
    Reg_kanji_katakana = Regexp::compile("[\xb0\xa1-\xf4\xa4\xa1\xbc\xa5\xa1-\xa5\xf6]", nil, 'e')
    Reg_not_kanji_katakana = Regexp::compile("[^\xb0\xa1-\xf4\xa4\xa1\xbc\xa5\xa1-\xa5\xf6]", nil, 'e')
    
    # Word-split with Kakasi; keep only the kanji/katakana part of each
    # word, and only words that contain kanji or are longer than 2.
    def kakasi(str)
      str = str.gsub(/[\x00-\x7f]/, ' ')  # blank out every ASCII byte
      if (str =~ /\A +\z/)
        return []
      end
      array = Array::new
      Kakasi::kakasi("-oeuc -w", str).scan(/\S+/).each do |token|
        token.gsub!(Reg_not_kanji_katakana, '')
        if ((token =~ Reg_kanji) || (token.length > 2))
          array.push(token)
        end
      end
      return array
    end
    
    # Tokenize with MeCab.  Both the old (getSurface/getFeature/hasNode)
    # and the newer (surface/feature) binding interfaces are handled.
    def mecab(str)
      str = str.gsub(/[\x00-\x7f]/, ' ')  # blank out every ASCII byte
      if (str =~ /\A +\z/)
        return []
      end
      array = Array::new
      node = @m.parseToNode(str)
      while (node &&
             (defined?(MeCab::VERSION) || (node.hasNode == 1)))
        if defined?(MeCab::VERSION)
          token = node.surface
          hinshi = node.feature.split(/,/)[0]
        else
          token = node.getSurface
          hinshi = node.getFeature.split(/,/)[0]
        end
        ##      print token, hinshi, "\n"
        # The literal is a part-of-speech name in raw bytes -- presumably
        # the EUC-JP encoding of the word for "noun"; tokens of that class
        # are kept whole.
        if (hinshi == "\xcc\xbe\xbb\xec")
          if ((token =~ Reg_kanji_katakana) || (token.length > 2))
            array.push(token)
          end
        else
          token.gsub!(Reg_not_kanji_katakana, '')
          if ((token =~ Reg_kanji) || (token.length > 2))
            array.push(token)
          end
        end
        node = node.next
      end
      return array
    end
    
    # Tokenize with ChaSen, whose output format was set to
    # "part-of-speech token" per line in initialize.
    def chasen(str)
      str = str.gsub(/[\x00-\x7f]/, ' ')  # blank out every ASCII byte
      if (str =~ /\A +\z/)
        return []
      end
      array = Array::new
      Chasen.sparse(str).split("\n").each do |hinshi_token|
        if (hinshi_token =~ /(.*) (.*)/)
          hinshi = $1
          token = $2
          # same part-of-speech literal as in mecab
          if (hinshi == "\xcc\xbe\xbb\xec")
            if ((token =~ Reg_kanji_katakana) || (token.length > 2))
              array.push(token)
            end
          else
            token.gsub!(Reg_not_kanji_katakana, '')
            if ((token =~ Reg_kanji) || (token.length > 2))
              array.push(token)
            end
          end
        end
      end
      return array
    end
    
    # Every run of kanji followed by every run of katakana.
    def block(str)
      tokens = str.scan(Reg_kanji)
      tokens.concat(str.scan(Reg_katakana))
      return tokens
    end
    
    # Runs of kanji of length 2 or 4 are kept whole; longer runs are cut
    # into overlapping 4-long slices taken every 2 positions.  Runs of
    # katakana are appended unsplit.
    def bigram(str)
      tokens = Array::new
      
      str.scan(Reg_kanji).each do |token|
        case token.length
        when 2, 4
          tokens.push(token)
        else
          l = token.length / 2 - 2
          for i in (0 .. l)
            tokens.push(token[i * 2, 4])
          end
        end
      end
      tokens.concat(str.scan(Reg_katakana))
      return tokens
    end
  end
  
  # Tokenize the headers named in the "refer-header" option.
  #
  # lang::    language of the mail, or nil to detect it from the headers
  # headers:: hash of lower-cased header name => value; the values of the
  #           processed headers are modified in place
  #
  # Returns a TokenDB holding one count per (header name, token) occurrence.
  # Word tokens of 20 or more characters are skipped.  For Japanese mail,
  # UTF-8 encoded words are decoded (or removed when the "utf-8" option is
  # off), the text is normalised to EUC-JP by NKF and additionally split by
  # the Japanese tokenizer.
  def tokenize_headers(lang, headers)
    (lang, code) = get_lang_from_headers(headers) if (! lang)

    head_db = TokenDB::new(lang)
    reg_token = Regexp::compile("\\b\\d[\\d\\.]+\\d\\b|[\\w#{@options['mark-in-token']}]+")
    
    if (headers["received"])
      # Keep only the part before "id"/";" and re-append the envelope
      # sender and the "for <address>" recipient when present.
      str = headers["received"] 
      str =~ /envelope\-from\s+([\w@\.\-]+)/
      efrom = $1
      str =~ /for\s+<([\w@\.\-]+)>/
      foraddress = $1
      str.sub!(/(\bid|;).*/im, '')
      str.sub!(/\(qmail[^\)]*\)/, '')
      str += " " + efrom if efrom
      str += " " + foraddress if foraddress
      headers["received"] = str
    end

#    if (headers["domainkey-signature"])
#      headers["domainkey-signature"] = headers["domainkey-signature"].sub(/b=[^:;\s]+/, '')
#    end

#    "authentication-results", "domainkey-signature"
    headers.each do |header, content|
      if (@options["refer-header"][header])
        if (lang == "ja")
          content.gsub!(/=\?utf\-8\?([bq])\?(\S*)\?=/i) do |s|
            b_or_q = $1
            encoded_str = $2
            if (@options["utf-8"])
              # unpack returns a one-element array; take the string itself.
              # (Array#to_s only joins the elements on Ruby 1.8; later
              # versions return the inspect form, brackets and quotes included.)
              if (b_or_q =~ /q/i)
                decoded_str = encoded_str.unpack("M*")[0]
              else
                decoded_str = encoded_str.unpack("m*")[0]
              end
              Iconv.u2eucjp(decoded_str)
            else
              ""
            end
          end
          content = NKF::nkf('-e -X -Z0', content.gsub(/\?(iso-2202-jp|shift-jis)\?/i, '?ISO-2022-JP?'))
        else
          content = latin2ascii(content)
        end
        content.scan(reg_token).each do |token|
          head_db.add_scalar(header, token, 1) if (token.length < 20)
          @options["message-fh"].printf("tokenizer %s %s\n", header, token) if (@options["debug"])
        end
        if (lang == "ja")
          @jtokenizer.split(content.gsub(/\s+/, '')).each do |token|
            head_db.add_scalar(header, token, 1)
            @options["message-fh"].printf("tokenizer %s %s\n", header, token) if (@options["debug"])
          end
        end
      end
    end
    return head_db
  end
  
  # Tokenize one complete message.
  #
  # buf:: the message as an array of lines
  #
  # When get_headers finds no headers, the input is not treated as a mail:
  # everything is tokenized as a single body.  Otherwise the main headers
  # are tokenized, then body parts and their headers are consumed
  # alternately until no lines are left.  Parts whose content-type matches
  # /rfc822/ have no body tokenized of their own.  HTML parts are merged
  # before plain parts; plain parts are left out when
  # @options["ignore-plain-text-part"] is set and at least one HTML part
  # was found.
  #
  # Returns a TokenDB with time set and language defaulting to
  # Default_Language; for mails file_count is set to 1 as well.
  def tokenize_buf(buf)
    separators = []
    delimiters = []
    headers, buf, lang = get_headers(buf, nil)   # lang is unknown at first

    if (headers.empty?)           # this is not a mail
      plain_db, buf = tokenize_body(lang, headers, buf, separators, delimiters)
      plain_db.time = Time::new
      plain_db.language ||= Default_Language
      return plain_db
    end

    body_db = TokenDB::new(lang)
    body_db.message_id = headers["message-id"] || "-"

    sub_head_db = TokenDB::new(lang)
    main_head_db = tokenize_headers(lang, headers)
    lang = main_head_db.language if main_head_db

    plain_bodies = []
    html_bodies = []

    until (buf.empty?)
      boundary = headers["boundary"]
      if (boundary)
        separators.push("--" + boundary)
        delimiters.push("--" + boundary + "--")
      end

      content_type = headers["content-type"]
      unless (content_type && (content_type =~ /rfc822/i))
        part_db, buf = tokenize_body(lang, headers, buf, separators, delimiters)
        lang = part_db.language
        if (content_type =~ /html/i)
          html_bodies.push(part_db)
        else
          plain_bodies.push(part_db)
        end
      end

      # headers of the next part
      headers, buf, lang = get_headers(buf, lang)
      sub_head_db.add_db(tokenize_headers(lang, headers))
    end

    html_bodies.each { |part| body_db.add_db(part) }
    unless (@options["ignore-plain-text-part"] && (! html_bodies.empty?))
      plain_bodies.each { |part| body_db.add_db(part) }
    end

    body_db.add_db(main_head_db)
    body_db.add_db(sub_head_db)
    body_db.file_count = 1
    body_db.time = Time::new
    body_db.language ||= Default_Language
    return body_db
  end
  
  # Pack the integer code point +i+ as a UTF-8 character and pass it through
  # Iconv.u2eucjp; returns whatever that conversion returns.
  def i2eucjp(i)
    utf8_char = [i].pack("U")
    Iconv.u2eucjp(utf8_char)
  end
  
  # Pack the integer code point +i+ as a UTF-8 character, convert it with
  # Iconv.u2latin and then fold the result with latin2ascii.
  def i2ascii(i)
    utf8_char = [i].pack("U")
    latin_char = Iconv.u2latin(utf8_char)
    latin2ascii(latin_char)
  end
  
  # Return the UTF-8 encoded character for the integer code point +i+.
  def i2u(i)
    code_points = [i]
    code_points.pack("U")
  end

  # Replace numeric character references (&#NNNNN; with up to 5 decimal
  # digits, or &#xHHHH; with up to 4 hex digits) in +str+.
  #
  # With @options["utf-8"] set, each reference becomes the UTF-8 character
  # produced by i2u; otherwise the references are simply removed.
  #
  # str:: text to decode (not modified)
  # Returns a new string.
  def decode_character_reference2u(str)
    reference = /\&\#(\d{1,5}|x[\da-f]{1,4});/i
    return str.gsub(reference, "") unless (@options["utf-8"])

    str.gsub(reference) do
      digits = $1
      code_point = (digits =~ /^x(.*)/i) ? $1.hex : digits.to_i
      i2u(code_point)
    end
  end

  # Replace numeric character references (&#NNNNN; with up to 5 decimal
  # digits, or &#xHHHH; with up to 4 hex digits) in +str+.
  #
  # With @options["utf-8"] set, each reference is converted with i2eucjp
  # when lang is "ja" and with i2ascii otherwise; without utf-8 support the
  # references are simply removed.
  #
  # str::  text to decode (not modified)
  # lang:: language code of the text
  # Returns a new string.
  def decode_character_reference(str, lang)
    reference = /\&\#(\d{1,5}|x[\da-f]{1,4});/i
    return str.gsub(reference, "") unless (@options["utf-8"])

    str.gsub(reference) do
      digits = $1
      code_point = (digits =~ /^x(.*)/i) ? $1.hex : digits.to_i
      (lang == "ja") ? i2eucjp(code_point) : i2ascii(code_point)
    end
  end
  
  # Count the tokens found in +str+.
  #
  # str::  text to tokenize; when lang is "ja" it is modified in place
  #        (cite marks, leading white space and newlines are removed)
  # lang:: language code; "ja" additionally runs @jtokenizer over the text
  #
  # A token that looks like a URL or a mail address (matches reg_url) is
  # split again with reg_token2 and its pieces are counted in the URL hash;
  # every other token is counted in the body hash.  In both hashes only
  # entries shorter than 20 are counted.  Tokens produced by @jtokenizer are
  # counted in the body hash without a length limit.
  #
  # Returns [body_hash, url_hash], two DBHash objects mapping token => count.
  def tokenize_str(str, lang)
    body_hash = DBHash::new(0)
    url_hash = DBHash::new(0)
    
    # (an unused local regexp that used to be compiled here on every call
    # has been dropped)
    reg_token = Regexp::compile("(?:http:|www)[\\w\\-\\.\\/@%:\?=]+|[\\w\\-\\.]+@[\\w\\-\\.]+|\\b\\d[\\d\\.]+\\d\\b|[\\w#{@options['mark-in-token']}]+")
    reg_url = Regexp::compile('(^http:|https:|^www|@)')
    reg_token2 = Regexp::compile('\b\d[\d\.]+\d\b|[\w%]+')
    
    str.scan(reg_token).each do |token|
      if (token =~ reg_url)
        token.scan(reg_token2).each do |token2|
          if (token2.length < 20)
            url_hash[token2] += 1 
            @options["message-fh"].printf("tokenizer %s %s\n", "url", token2) if (@options["debug"])
          end
        end
      elsif (token.length < 20)
        body_hash[token] += 1 
        @options["message-fh"].printf("tokenizer C %s %s\n", "body", token) if (@options["debug"])
      end
    end
    
    if (lang == "ja")
      str.gsub!(Regexp::compile("^[ -\\~]*[\|\>]+", nil, 'e'), '') # delete cite mark
      str.gsub!(Regexp::compile("^[ \\t\xa1\xa1]+", nil, 'e'), '') # delete white space
      str.gsub!(Regexp::compile("(\\r?\\n){2,}", nil, 'e'), ' ') # keep multiple newline as space
      str.gsub!(Regexp::compile("[\\r\\n]+", nil, 'e'), '') # delete newline
      str.split.each do |s|
        @jtokenizer.split(s).each do |token|
          body_hash[token] += 1
          @options["message-fh"].printf("tokenizer ja %s %s\n", "body", token) if (@options["debug"])
        end
      end
    end
    return [body_hash, url_hash]
  end
  
  # Heuristically decide whether the lines in +buf+ are base64 text.
  #
  # Only the first and the last significant line are inspected; lines made
  # up solely of dots and white space are skipped.  Each inspected line has
  # to consist of base64 alphabet characters (A-Z, a-z, 0-9, "+", "/", "=")
  # followed by optional white space.  A buffer without any significant
  # line counts as base64.
  #
  # The character class used to be written [A-z], which also accepts the
  # characters [ \ ] ^ _ and backtick that sit between "Z" and "a" in ASCII,
  # so ordinary text such as "snake_case" was taken for base64.
  #
  # buf:: array of line strings (not modified)
  # Returns true or false.
  def base64_encoded?(buf)
    ignorable = /\A[\.\s\r\n]*\z/
    base64_line = /\A[A-Za-z0-9=+\/]+[\s\r\n]*\z/

    [buf, buf.reverse].each do |lines|
      probe = lines.find { |line| line !~ ignorable }
      return false if (probe && probe !~ base64_line)
    end
    return true
  end

  # Extract and tokenize one body part of a message.
  #
  # lang::       language determined so far (may be nil); used as a fallback
  #              when get_lang_from_buf cannot determine one
  # headers::    headers of this part; "content-type" and
  #              "content-transfer-encoding" are consulted
  # body::       array of remaining message lines; a copy is consumed, the
  #              caller's array is left alone
  # separators:: stack of part separators; the top entry ends this part
  # delimiters:: stack of closing delimiters; when the top entry is met both
  #              stacks are popped (the caller's arrays are modified)
  #
  # Returns [db, rest]: db is a TokenDB with "body", "tag" and "url"
  # categories (as far as they are non-empty), rest holds the lines that
  # were not consumed.  For a part whose content-type does not match /text/
  # an empty db is returned.
  def tokenize_body(lang, headers, body, separators, delimiters)
    reg_return_codes = Regexp::compile('[\r\n]*\z')
    
    db = TokenDB::new(lang)
    body = body.dup
    
    buf = Array::new
    
    delimiter = delimiters.last
    separator = separators.last
    
    if (separators.empty?)
      # not inside a multipart: everything that is left belongs to this part
      buf = body
      body = Array::new
    else
      # collect lines up to the next separator or closing delimiter
      while (str = body.shift)
        str_noret = str.sub(reg_return_codes, '')
        case str_noret
        when separator
          break
        when delimiter
          # end of this multipart level: return to the enclosing level
          delimiters.pop
          separators.pop
          delimiter = delimiters.last
          separator = separators.last
          break
        else
          buf.push(str)
        end
      end
    end
    
    if (headers["content-type"] && headers["content-type"] !~ /text/i)
      return [db, body]           # skip non-text body
    end
    
    case headers["content-transfer-encoding"]
    when /base64/i
      # decode only when the part really looks like base64
      if (base64_encoded?(buf))
##        buf.map! {|str| str.unpack("m*").to_s}
        buf = buf.join.gsub(/[\r\n]/, '').unpack("m*")
      end
    when /quoted-printable/i
      buf.map! {|str| str.unpack("M*").to_s}
    end

    # guess language and character code from the content; keep the language
    # passed in when nothing can be determined
    lang_backup = lang
    if (headers["content-type"] =~ /html/i)
      (lang, code) = get_lang_from_buf(buf, true)
    else
      (lang, code) = get_lang_from_buf(buf, false)
    end
    if (! lang)
      lang = lang_backup
    end

    str = buf.join
    str.gsub!(/^begin[^\r\n]+(([\r\n]+M)([^\r\n]+))*/, '') # remove uuencoded lines

    # normalize the character code: Japanese text is converted to EUC-JP
    # (Iconv for utf8 / gb18030, NKF otherwise), other text goes through
    # latin2ascii
    if (lang == "ja")
      if (code == "utf8")
        if (@options["utf-8"])
          str = Iconv.u2eucjp(str)
        else
          lang = Default_Language              # can't use iconv / stop ja tokenizer
        end
      elsif (code == "gb18030")
        if (@options["utf-8"])
          str = Iconv.gb180302eucjp(str)
        else
          lang = Default_Language
        end
      else
        str = NKF::nkf('-e -X -Z0', str)
      end
    else
      str = latin2ascii(str)
    end

    tags = Array::new
    if (headers["content-type"] =~ /html/i)
      # remove salad at head of part
      if (str =~ Regexp::compile('\A[^<>]*?(<(\?xml|!doctype|html|body)\b.*)\z', Regexp::MULTILINE | Regexp::IGNORECASE, 'n'))
        str = $1
      end
      
      # remove salad in head, except style
      if (str =~ /\A(.*?)(<body.*)\z/im)
        before_body_tag = $1
        after_body_tag = $2
        before_body_tag.gsub!(/>[^<>]*<(?!\/style)/im, '><')
        str = before_body_tag + after_body_tag
      end
      
      # remove <p style="font-size:0px..>
      str.gsub!(/(<p[^<>]*font-size\s*:\s*[01]\s*(;|px)[^<>]*>)([^<>]*)(<\/p>)/im, '')
      str.gsub!(/(<font[^<>]*font-size\s*:\s*[01]\s*(;|px)[^<>]*>)([^<>]*)(<\/font>)/im, '')

      # remove <span style="DISPLAY: none..>
      str.gsub!(/(<span[^<>]*display\s*:\s*none[^>]*>)([^<>]*)(<\/span>)/im, '')
      
      if (@options["ignore-after-last-atag"])
        # keep only what precedes the last </a>
        if (str =~ /\A(.*)<\/a>/im)
          str = $1
        end
      end
      
      
      # remove salad after body or html
      if (str =~ Regexp::compile('\A(.*)</html>[^<>]*?\z', Regexp::MULTILINE | Regexp::IGNORECASE, 'n')) 
        str = $1
      end
      if (str =~ Regexp::compile('\A(.*)</body>[^<>]*?\z', Regexp::MULTILINE | Regexp::IGNORECASE, 'n')) 
        str = $1
      end
      # strip the tags out of the text; tags matching RE_ALL_TAGS are kept
      # aside for tokenizing, tags matching RE_SPACE_TAGS leave a space behind
      str.gsub!(Regexp::compile('<[^>]*>', Regexp::MULTILINE, 'n')) do |t|
        t = t.gsub(/\n/, '')
        if (t =~ RE_ALL_TAGS)     # end tags are thrown away
          tags.push(t)
        end
        
        if (t =~ RE_SPACE_TAGS)
          " "
        else
          ""
        end
      end
      body_str = decode_character_reference(str, lang) # out of tags
      tag_str = decode_character_reference(tags.join, lang) # in tags
    else                          # if plain text
      body_str = str
      tag_str = ""
    end
    (body_hash, url_body_hash) = tokenize_str(body_str, lang)
    (tag_hash, url_tag_hash) = tokenize_str(tag_str, lang)
    
    # body tokens are stored only when the "use-body" option is set
    if (! body_hash.empty? && @options["use-body"])
      db.add_hash({"body" => body_hash})
    end
    if (! tag_hash.empty?)
      db.add_hash({"tag" => tag_hash})
    end
    if (! url_body_hash.empty?)
      db.add_hash({"url" => url_body_hash})
    end
    if (! url_tag_hash.empty?)
      db.add_hash({"url" => url_tag_hash})
    end
    db.file_count = 1
    db.language = lang    
    return [db, body]
  end
  
  # Base class of the filtering methods; one instance exists per language.
  #
  # An instance owns three token stores (clean, spam and probability) whose
  # class is selected by @options["db"].  For an unknown db type the three
  # stores are left unset.
  class Probability               # for each lang
    attr_accessor :prob, :clean, :spam, :spam_cutoff, :language

    # options:: option hash; "homedir" and "db" are read here
    # lang::    language code this instance is responsible for
    def initialize(options, lang)
      @options = options
      @filename = @options["homedir"] + lang + Prob_ext

      store_class = case (@options["db"])
                    when "sdbm" then TokenSDBM
                    when "gdbm" then TokenGDBM
                    when "bdb1" then TokenBDB1
                    when "bdb"  then TokenBDB
                    when "qdbm" then TokenQDBM
                    end
      if (store_class)
        @clean = store_class.new(@options, lang, Clean_ext)
        @spam = store_class.new(@options, lang, Spam_ext)
        @prob = store_class.new(@options, lang, Prob_ext)
      end

      @language = lang
    end

    # Merge those entries of +token_dbs+ whose language equals this
    # instance's language into a fresh TokenDB and return it.
    def merge_dbs_of_lang(token_dbs)
      merged = TokenDB::new
      token_dbs.each do |candidate|
        merged.add_db(candidate) if (@language == candidate.language)
      end
      return merged
    end
  end
  
  # Filtering method selected by "--method g" (Paul Graham method, according
  # to the help text).
  #
  # Default spam cutoff is 0.9; a token without an entry in the probability
  # store is given the probability 0.4.
  class Graham < Probability
    # options:: option hash, passed on to Probability
    # lang::    language code, passed on to Probability
    def initialize(options, lang)
      @spam_cutoff = 0.9
      @default_probability = 0.4
      super
    end

    # Multiply all elements of +a+, skipping elements equal to 0.
    # Returns the integer 1 for an empty array.
    def product(a)
      n = 1
      a.each do |v|
        n = n * v if (v != 0)
      end
      return n
    end

    # Compute the spam probability of one tokenized message.
    #
    # token_db:: TokenDB of the message; its probability and spam_flag
    #            attributes are set here
    # Returns token_db.
    def get_combined_probability(token_db)
      prob_db = TokenDB::new      # temporary
      
      # look up every token, falling back to the default probability
      token_db.each_ct do |(category, token)|
        probability = @prob.value_with_degene(category, token)
        if (probability)
          prob_db.set_scalar(category, token, probability)
        else
          prob_db.set_scalar(category, token, @default_probability) # 0.4
        end
      end
      
      # keep the (at most) 15 probabilities that are farthest from 0.5
      probs = prob_db.values.sort {|a, b| (b - 0.5).abs <=> (a - 0.5).abs}[0, 15]
      
      if (@options["debug"])
        # print the same 15 tokens, highest probability first
        prob_array = Array::new
        prob_db.each_ct do |c, t|
          prob_array.push([[c, t], prob_db.value(c, t)])
        end
        prob_array.sort! {|a, b| (b[1] - 0.5).abs <=> (a[1] - 0.5).abs}
        prob_array = prob_array[0, 15]
        prob_array.sort! {|a, b| b[1] <=> a[1]}
        prob_array.each do |k, v|
          @options["message-fh"].printf("word probability %s %s %f\n", k[0], k[1], v)
        end
      end
      
      # combined = prod(p) / (prod(p) + prod(1 - p))
      # NOTE(review): with no tokens at all both products are the integer 1,
      # so this is an integer division yielding 0 -- confirm that is intended
      prod = product(probs)
      token_db.probability = prod / (prod + product(probs.map {|x| 1 - x}))
      if (token_db.probability > @spam_cutoff)
        token_db.spam_flag = true
      else
        token_db.spam_flag = false
      end
      return token_db
    end
    
    # Recompute token probabilities from the clean and spam stores.
    #
    # token_dbs:: when empty, the probability store is cleared and rebuilt
    #             for every token known to the clean or spam store; otherwise
    #             only the tokens of the given dbs (restricted to this
    #             language) are updated
    def update_probability(token_dbs)
      c_count = [@clean.file_count, 1].max
      s_count = [@spam.file_count, 1].max
      
      if (token_dbs.empty?)
        incremental = false
        target_cts = @clean.key_cts | @spam.key_cts
        @prob.open("w")
        @prob.clear
      else
        incremental = true
        merged_db = merge_dbs_of_lang(token_dbs)
        target_cts = merged_db.key_cts
        return if (target_cts.empty?)
        @prob.open("rw")
      end
      old_file_count = @prob.file_count
      new_file_count = 0
      
      # numbers of clean / spam mails as floats (at least 1.0)
      cnum = c_count.to_f
      snum = s_count.to_f
      
      target_cts.each do |(category, token)|
        # from here on c_count / s_count are the per-token counts
        c_count = @clean.value(category, token) || 0
        s_count = @spam.value(category, token) || 0
        update = false            # never read afterwards
        if (incremental && @prob.value(category, token))
          @prob.sub_scalar(category, token, 1.0) # 1.0 is big enough for delete
          new_file_count -= 1
        end
        if (c_count == 0)
          # seen in spam only: fixed high probability, if seen often enough
          if (s_count > 10)
            new_file_count += 1
            @prob.set_scalar(category, token, 0.9999)
          elsif (s_count > 5)
            new_file_count += 1
            @prob.set_scalar(category, token, 0.9998)
          end
        elsif (s_count == 0)
          # seen in clean mail only: fixed low probability, if seen often enough
          if (c_count > 10)
            new_file_count += 1
            @prob.set_scalar(category, token, 0.0001)
          elsif (c_count > 5)
            new_file_count += 1
            @prob.set_scalar(category, token, 0.0002)
          end
        elsif (c_count + s_count > 5)
          # seen in both: clean occurrences count double, both ratios are
          # capped at 1.0 and the result is clamped to 0.0001 .. 0.9999
          c = c_count * 2
          s = s_count
          p = [[[s / snum, 1.0].min / ([c / cnum, 1.0].min + [s / snum, 1.0].min),
                0.9999].min,
               0.0001].max
          new_file_count += 1
          @prob.set_scalar(category, token, p)
        end
      end
      @prob.file_count = new_file_count + old_file_count if (incremental)
      @prob.close
    end
  end
  
  # Filtering method selected by "--method r" (Gary Robinson method,
  # according to the help text).  Default spam cutoff is 0.582.
  class Robinson < Probability
    # options:: option hash, passed on to Probability
    # lang::    language code, passed on to Probability
    def initialize(options, lang)
      @robx_max = 1               # tokens seen at most this often feed the robx average (0: all tokens)
      @min_dev = 0.1              # tokens within this distance of @center are ignored when scoring
      @spam_cutoff = 0.582
      @center = 0.5
      @robs = 0.001               # from bogofilter/robinson.h
      @default_robx = 0.415	# from bogofilter/robinson.h / not used
      super
    end
    
    # NOTE(review): +pw+ is not defined inside this method, so a call would
    # raise NameError unless a method named pw exists elsewhere -- this
    # looks like a leftover stub; confirm that nothing calls it.
    def get_pw(category, token, g, b)
      return pw
    end
    
    
    # Recompute token probabilities from the clean and spam stores.
    #
    # token_dbs:: when empty, the probability store is cleared and rebuilt
    #             for every token known to the clean or spam store; otherwise
    #             only the tokens of the given dbs (restricted to this
    #             language) are updated
    def update_probability(token_dbs)
      pwdb = TokenDB::new
      c_count = [@clean.file_count, 1].max
      s_count = [@spam.file_count, 1].max
      
      if (token_dbs.empty?)
        incremental = false
        target_cts = @clean.key_cts | @spam.key_cts
      else
        incremental = true
        merged_db = merge_dbs_of_lang(token_dbs)
        target_cts = merged_db.key_cts
        return if (target_cts.empty?)      
      end
      
      ## loop1
      ## get pw and robx(average of pw)
      count = 0
      pw_sum = 0.0
      
      good_mail = [1, @clean.file_count].max.to_f
      bad_mail = [1, @spam.file_count].max.to_f
      target_cts.each do |(category, token)|
        # per-token counts, capped at the number of mails
        g = [@clean.value(category, token) || 0, c_count].min
        b = [@spam.value(category, token) || 0, s_count].min
        n = g + b
        if (n == 0)
          pwdb.set_scalar(category, token, nil) # need to delete this token from prob.db
        else
          pw = (b / bad_mail) / (b / bad_mail + g / good_mail)
          if ((@robx_max == 0) || (n <= @robx_max))
            pw_sum += pw
            count += 1
          end
          pwdb.set_scalar(category, token, pw)
        end
      end
      
      if (incremental)
        @prob.open("rw")
        old_file_count = @prob.file_count
        old_robx = @prob.value(".internal", "robx") || @default_robx
        # blend the previous robx (weighted by the old file_count) with the
        # new pw values
        # NOTE(review): the divisor is 0 when count and old_file_count are
        # both 0 -- confirm this cannot happen
        robx = (pw_sum + old_file_count * old_robx) / (count + old_file_count)
        robs = @robs
      else
        @prob.open("w")
        @prob.clear
        if (count != 0)
          robx = pw_sum / count
        else
          robx = @default_robx
        end
        robs = @robs
      end
      ## loop2
      ## get fw from pw
      new_file_count = 0
      pwdb.key_cts.each do |(category, token)|
        g = [@clean.value(category, token) || 0, c_count].min
        b = [@spam.value(category, token) || 0, s_count].min
        n = g + b
        pw = pwdb.value(category, token)
        if (incremental && @prob.value(category, token))
          new_file_count -= 1
          @prob.sub_scalar(category, token, 1.0) # 1.0 is big enough for delete        
        end
        if (pw)
          new_file_count += 1
          @prob.set_scalar(category, token, (robs * robx + n * pw) / (robs + n)) # fw
        end
      end
      @prob.set_scalar(".internal", "robx", robx)
      @prob.file_count = new_file_count + old_file_count if (incremental)
      @prob.close
    end
    
    # Combine the accumulated products into a score.
    #
    # pminus:: FLOAT product of (1 - probability) over the scored tokens
    # qminus:: FLOAT product of probability over the scored tokens
    # count::  number of scored token occurrences; the products are raised to
    #          the power 1 / count (count is treated as at least 1)
    # Returns (1 + (p - q) / (p + q)) / 2.
    # NOTE(review): p + q == 0 is not guarded against.
    def get_probability(pminus, qminus, count)
      r = 1.0 / [1, count].max
      p = 1.0 - Math::exp(pminus.ln * r)
      q = 1.0 - Math::exp(qminus.ln * r)
      s = (1.0 + (p - q) / (p + q)) / 2.0
      return s
    end
    
    # Compute the spam probability of one tokenized message.
    #
    # token_db:: TokenDB of the message; its probability and spam_flag
    #            attributes are set here
    # Returns token_db.
    def get_combined_probability(token_db)
      robx = @prob.value(".internal", "robx") || @default_robx
      
      count = 0
      pminus = FLOAT::new(1)
      qminus = FLOAT::new(1)
      token_db.each_ct do |(category, token)|
        # unknown tokens get robx
        probability = @prob.value_with_degene(category, token) || robx
        # only tokens far enough from the center take part
        if ((probability - @center).abs > @min_dev)
          # keep the probability strictly between 0 and 1
          if (probability <= 0.0)
            probability = 0.0000001
          elsif (probability >= 1.0)
            probability = 0.9999999
          end
          # c is the value stored for the token in the message db; it is
          # passed to FLOAT::new as second argument (presumably an exponent
          # -- confirm against the FLOAT class)
          c = token_db.value(category, token)
          count += c
          pminus = pminus * FLOAT::new(1.0 - probability, c)
          qminus = qminus * FLOAT::new(probability, c)
          @options["message-fh"].printf("word probability %s %s %d %f\n", category, token, c, probability) if (@options["debug"])
        end
      end
      
      if (count == 0)
        token_db.probability = 0.0
      else
        token_db.probability = get_probability(pminus, qminus, count)
      end
      if (token_db.probability > @spam_cutoff)
        token_db.spam_flag = true
      else
        token_db.spam_flag = false
      end
      return token_db
    end
  end
  
  
  # Filtering method selected by "--method rf" (Robinson-Fisher method,
  # according to the help text).  Inherits everything from Robinson but
  # combines the token products through chi2q and uses 0.95 as the default
  # spam cutoff.
  class RobinsonFisher < Robinson
    # options:: option hash, passed on to Robinson
    # lang::    language code, passed on to Robinson
    def initialize(options, lang)
      super
      @spam_cutoff = 0.95
    end

    # Sum the series exp(-m) * (1 + m/1 + m^2/2! + ...) with m = x2 / 2 over
    # v / 2 terms, capped at 1.0.  The running term is kept in a FLOAT.
    #
    # x2:: chi-square value
    # v::  degrees of freedom
    def chi2q(x2, v)
      half = x2 / 2.0
      negated_half = 0.0 - half
      total = Math::exp(negated_half)

      running = FLOAT::new
      running.exp = negated_half
      running.mant = 1

      1.upto((v / 2) - 1) do |step|
        running = running * FLOAT::new(half / step)
        total += running.to_f
      end
      return (total < 1.0) ? total : 1.0
    end

    # Combine the accumulated products into a score.
    #
    # pminus:: FLOAT product of (1 - probability) over the scored tokens
    # qminus:: FLOAT product of probability over the scored tokens
    # count::  number of scored token occurrences
    # Returns (1 + p - q) / 2 with p and q derived from chi2q.
    def get_probability(pminus, qminus, count)
      p_side = 1 - chi2q(-2.0 * pminus.ln, 2 * count)
      q_side = 1 - chi2q(-2.0 * qminus.ln, 2 * count)
      return (1.0 + p_side - q_side) / 2.0
    end
  end
  
  # Create +dir+ with mode 0700 unless it already is a directory.
  def init_dir(dir)
    Dir.mkdir(dir, 0700) unless (FileTest::directory?(dir))
  end
  
  # Report a usage error and terminate with exit status 2.
  #
  # str:: optional message, written to STDERR ahead of the hint line
  def soft_raise(str=nil)
    program = File.basename($0)
    STDERR.puts(str) if (str)
    STDERR.puts("Try `#{program} --help' for more information.")
    exit(2)
  end

  # Print the help text (name, synopsis, description, commands, options,
  # examples, license, references, release and revision) to standard output.
  # The text interpolates File.basename($0), several Default_* constants,
  # Release and Revision.
  def usage
    
    print <<EOM

NAME
	#{File.basename($0)} - bayesian spam filter

SYNOPSIS
	#{File.basename($0)} [options] [commands] < MAIL
	#{File.basename($0)} [options] [commands] MAIL ...


DESCRIPTION
	filter spam.
	If commands are specified, bsfilter is in maintenance mode, otherwise it is in filtering mode.
	If bsfilter does not find spam in filtering mode, exit status is 1.
	If bsfilter runs with --pipe option or finds spam, exit status is 0.

COMMANDS
	--add-clean|-c
		add mails into the clean token database

	--add-spam|-s
		add mails into the spam token database

	--sub-clean|-C
		subtract mails from the clean token database

	--sub-spam|-S
		subtract mails from the spam token database

	--update|-u
		update the probability table from clean and spam token databases

	--export-clean
		export the clean token database

	--export-spam
		export the spam token database

	--import-clean
		import the clean token database

	--import-spam
		import the spam token database

	--export-probability
		export the probability database (for debugging purpose)
OPTIONS
        --homedir directory
		specify the name of the bsfilter\'s home directory
		If this option is not used, a directory specified with the environment variable "BSFILTERHOME" is used
		If the variable "BSFILTERHOME" is not defined, ".bsfilter" directory under your home is used
		If the variable "HOME" is not defined, a directory which bsfilter is located at is used

	--config-file file
		specify the name of the bsfilter\'s configuration file
		"bsfilter.conf" in bsfilter\'s home directory is used by default

        --max-line number
		check and/or study the first number of lines
		default is #{Default_max_line}. 0 means all

	--db sdbm|gdbm|bdb1|bdb|qdbm
		specify the name of database type
		"sdbm" by default

        --jtokenizer|-j bigram|block|mecab|chasen|kakasi
		specify algorithm of a tokenizer for Japanese language
		"bigram" by default

	--list-clean
		print filename of clean mail

	--list-spam
		print filename of spam

	--imap
		access IMAP server

	--imap-server hostname
		specify hostname of IMAP server

	--imap-port number
		specify port number of IMAP server. default is #{Default_imap_port}

	--imap-auth method
		specify authorization method. default is "auto"
		"cram-md5"	use "AUTHENTICATE CRAM-MD5" command
		"login"		use "AUTHENTICATE LOGIN" command
		"loginc"	use "LOGIN" command
		"auto"		try #{Default_imap_auth_preference.join(', ')} in this order. 

	--imap-user name
		specify user name of IMAP server

	--imap-password password
		specify password of imap-user

	--imap-folder-clean folder
		specify destination folder for clean mails. "inbox.clean" for example

	--imap-folder-spam folder
		specify destination folder for spams. "inbox.spam" for example

	--imap-fetch-unseen
		filter or study mails without SEEN flag

	--imap-fetch-unflagged
		filter or study mails without "X-Spam-Flag" header

	--imap-reset-seen-flag
		reset SEEN flag when bsfilter moves or modifies mails

	--pop
		work as POP proxy

	--pid-file file
		specify filename for logging process ID of bsfilter
		"bsfilter.pid" in bsfilter\'s home directory is used by default		
                this function is valid when "--pop" is specified

	--tasktray
		sit in tasktray
		this is valid with "--pop" on VisualuRuby

	--pop-server hostname
		specify hostname of POP server

	--pop-port number
		specify port number of POP server. default is #{Default_pop_port}

	--pop-proxy-if address
		specify address of interface which bsfilter listens at
		default is 0.0.0.0 and all interfaces are active
		
	--pop-proxy-port number
		specify port number which bsfilter listens at. default is #{Default_pop_proxy_port}

	--pop-user name
		optional. specify username of POP server.
		bsfilter checks match between value of this options and a name which MUA sends.
		in case of mismatch, bsfilter closes sockets.

	--pop-proxy-set set[,set...]        
		specify rules of pop proxy.
		alternative way of pop-server, pop-port, pop-proxy-port and pop-user option.
		format of "set" is "pop-server:pop-port:[proxy-interface]:proxy-port[:pop-user]"
		If proxy-interface is specified and isn\'t 0.0.0.0 , other interfaces are not used.
		"--pop-proxy-set 192.168.1.1:110::10110" is equivalent with
		"--pop-server 192.168.1.1 --pop-port 110 --pop-proxy-port 10110" 		

	--pop-max-size number
		When mail is longer than the specified number, the mail is not filtered.
		When 0 is specified, all mails are tested and filtered.
		unit is byte. default is #{Default_pop_max_size}

	--ssl
		use POP over SSL with --pop option
		use IMAP over SSL with --imap option

	--ssl-cert filename|dirname
		specify a filename of a certificate of a trusted CA or
		a name of a directory of certificates

	--method|-m g|r|rf
		specify filtering method. "rf" by default
		"g" means Paul Graham method,
		"r" means Gary Robinson method,
		and "rf" means Robinson-Fisher method

	--spam-cutoff number
		specify spam-cutoff value
		0.9 by default for Paul Graham method
		0.582 by default for Gary Robinson method
		0.95 by default for Robinson-Fisher method

	--auto-update|-a
		recognize mails, add them into clean or spam token database
		and update the probability table

        --disable-degeneration|-D
                disable degeneration during probability table lookup

        --disable-utf-8
                disable utf-8 support

	--refer-header header[,header...]
		refer specified headers of mails
		"#{Default_refer_header}"
		by default

	--ignore-header|-H
		ignore headers of mails
		same as --refer-header ""

	--ignore-body|-B
		ignore body of mails, except URL or mail address

	--ignore-plain-text-part
		ignore plain text part if html part is included in the mail

	--ignore-after-last-atag
		ignore text after last "A" tag

        --mark-in-token "characters"
		specify characters which are allowable in a token
		"#{Default_mark_in_token}" by default

	--show-process
		show summary of execution

	--show-new-token
		show tokens which are newly added into the token database

	--mbox
		use "unix from" to divide mbox format file

	--max-mail number
		reduce token database when the number of stored mails is larger than this one
		#{Default_max_mail} by default

	--min-mail number
		reduce token database as if this number of mails are stored
		#{Default_min_mail} by default

	--pipe
		write a mail to stdout.
		this options is invalid when "--imap" or "--pop" is specified

	--insert-revision
		insert "X-#{Default_header_prefix}-Revision: bsfilter release..." into a mail

	--insert-flag
		insert "X-#{Default_header_prefix}-Flag: Yes" or "X-#{Default_header_prefix}-Flag: No" into a mail

	--insert-probability
		insert "X-#{Default_header_prefix}-Rate: number" into a mail

	--header-prefix string
		valid with --insert-flag and/or --insert-probability option
		insert "X-specified_string-..." headers, instead of "#{Default_header_prefix}"

	--mark-spam-subject
		insert "#{Default_spam_subject_prefix}" at the beginning of Subject header

	--spam-subject-prefix string
		valid with --mark-spam-subject option
		insert specified string, instead of "#{Default_spam_subject_prefix}"

	--show-db-status
		show numbers of tokens and mails in databases and quit

        --help|-h
		help

	--quiet|-q
		quiet mode

	--verbose|-v
		verbose mode

	--debug|-d
		debug mode

EXAMPLES

% bsfilter -s ~/Mail/spam/*			## add spam
% bsfilter -u -c ~/Mail/job/* ~/Mail/private/*	## add clean mails and update probability table
% bsfilter ~/Mail/inbox/1			## show spam probability

## recipe of procmail (1)
:0 HB
* ? bsfilter -a
spam/.

## recipe of procmail (2)
:0 fw
| bsfilter -a --pipe --insert-flag --insert-probability

:0
* ^X-Spam-Flag: Yes
spam/.

LICENSE
	this file is distributed under GPL version2 and might be compiled by Exerb with VisualuRuby

SEE ALSO
	http://bsfilter.org/
	http://sourceforge.jp/projects/bsfilter/
	http://exerb.sourceforge.jp/
	http://www.osk.3web.ne.jp/~nyasu/software/vrproject.html
	http://www.ruby-lang.org/

RELEASE
	#{Release}

REVISION
	#{Revision}
EOM
  end

  # Reader that hands out the mails of one input stream, one per call.
  #
  # Without the "mbox" option the whole stream is a single mail.  With it,
  # lines matching /^From / start a new mail.
  class Mbox
    # options:: option hash; only "mbox" is consulted (by #read)
    # fh::      object responding to readlines; it is read completely here
    def initialize(options, fh)
      @options = options
      @buf = fh.readlines
      # Mac style EOL: the whole input came back as one line ending in "\r"
      only_line = (@buf.length == 1) ? @buf.last : nil
      @buf = only_line.scan(/.*?\r/) if (only_line =~ /\r\z/)
    end

    # Return the lines of the next mail as an Array, or nil at EOF.
    def read
      return nil if (@buf.empty?) # EOF

      unless (@options["mbox"])   # one file == one mail
        whole = @buf.dup
        @buf.clear
        return whole
      end

      ##    reg_ufrom = Regexp::compile('^From .*@.* \d{2}:\d{2}:\d{2} ')
      # The first pending line always belongs to this mail; the mail ends
      # right before the next "From " line, or at the end of the input.
      next_head = (1 ... @buf.length).find { |index| @buf[index] =~ /^From / }
      return @buf.slice!(0, next_head || @buf.length)
    end
  end
  
  # Apply the maintenance commands to the token stores for one tokenized
  # mail.
  #
  # db::      TokenDB of the mail; its language selects the entry of @db_hash
  # command:: hash of command flags ("add-clean", "add-spam", "sub-clean",
  #           "sub-spam", "import-clean", "import-spam"); defaults to
  #           @options
  #
  # "show-process" and "show-new-token" are always read from @options, not
  # from +command+.
  def update_token_db_one(db, command=@options)
    letters = [["add-clean", "c"], ["add-spam", "s"], ["sub-clean", "C"], ["sub-spam", "S"]]
    maintenance_command = letters.map { |key, letter| command[key] ? letter : "" }.join
    maintenance_command = "-" if (maintenance_command.empty?)

    show_process(db, maintenance_command) if (@options["show-process"])

    if (command["add-clean"] || command["import-clean"])
      clean_store = @db_hash[db.language].clean
      clean_store.show_new_token(db) if (@options["show-new-token"])
      clean_store.add_db(db)
    end
    if (command["add-spam"] || command["import-spam"])
      spam_store = @db_hash[db.language].spam
      spam_store.show_new_token(db) if (@options["show-new-token"])
      spam_store.add_db(db)
    end
    @db_hash[db.language].clean.sub_db(db) if (command["sub-clean"])
    @db_hash[db.language].spam.sub_db(db) if (command["sub-spam"])
  end
  
  # Read an exported token database in text form.
  #
  # fh:: object responding to gets; every line is expected to hold
  #      "<lang> <category> <token> <value>", lines starting with "#"
  #      (after optional white space) are skipped
  #
  # One TokenDB is prepared for each entry of @options["languages"].  The
  # line ".internal file_count" adds its value to the file_count of the db;
  # other ".internal" lines are ignored.  Every other line is added with
  # add_scalar, after which file_count is decremented by one (presumably to
  # undo a change made by add_scalar -- confirm against TokenDB).
  #
  # Returns a DBHash mapping language => TokenDB.
  def read_exported_text(fh)
    dbs = DBHash::new
    @options["languages"].each do |lang|
      dbs[lang] = TokenDB::new(lang)
      dbs[lang].time = Time::new
    end

    while (line = fh.gets)
      line.chomp!
      next if (line =~ /^\s*#/)

      lang, category, token, raw_value = line.split
      amount = raw_value.to_f.to_i
      if (category != ".internal")
        lang_db = dbs[lang]
        lang_db.add_scalar(category, token, amount)
        lang_db.file_count = lang_db.file_count - 1
      elsif (token == "file_count")
        lang_db = dbs[lang]
        lang_db.file_count = lang_db.file_count + amount
      end
    end
    return dbs
  end
  
  # Learn from (or unlearn) every message found in +files+.
  #
  # files :: IMAP mailbox specs when @options["imap"] is set; otherwise
  #          paths handed to open_ro, read as exported token text when an
  #          import option is set and as mbox/mail text when not.
  #
  # Returns the token sets collected from mail files. The array stays empty
  # for IMAP and import input, and is emptied when check_size reports a
  # change on any database.
  def update_token_dbs(files)
    learned = Array::new
    @options["languages"].each do |lang|
      @db_hash[lang].clean.open("rw")
      @db_hash[lang].spam.open("rw")
    end

    if (@options["imap"])
      if (@options["ssl"])
        verify_mode = @options["ssl-cert"] ? OpenSSL::SSL::VERIFY_PEER : nil
        imap = Net::IMAP::new(@options["imap-server"], @options["imap-port"], @options["ssl"], @options["ssl-cert"], verify_mode)
      else
        imap = Net::IMAP::new(@options["imap-server"], @options["imap-port"])
      end
      imap.auto_authenticate(@options, @options["imap-auth"], @options["imap-user"], @options["imap-password"], @options["imap-auth-preference"])

      files.each do |mailbox|
        # Where the message ends up: the spam folder wins over the clean
        # folder when both additions are requested.
        destination = mailbox
        destination = @options["imap-folder-clean"] if (@options["add-clean"] && @options["imap-folder-clean"])
        destination = @options["imap-folder-spam"] if (@options["add-spam"] && @options["imap-folder-spam"])

        imap_get_target_uids(imap, mailbox).each do |uid|
          message = IMAPMessage::new(@options, imap, uid)
          message.fetch_rfc822
          db = tokenize_buf(message.buf)
          db.filename = uid
          update_token_db_one(db)
          if (message.insert_rfc822_headers!((@options["add-spam"] || @options["sub-clean"]), nil))
            # headers changed: upload the new text and drop the old message
            message.append(destination)
            message.set_delete_flag
          elsif (destination != mailbox)
            # text unchanged: a server side copy is enough
            message.copy(destination)
            message.set_delete_flag
          end
        end
        imap.close
      end
      imap.logout
    else
      files.each do |file|
        open_ro(file) do |fh|
          if (@options["import-clean"] || @options["import-spam"])
            read_exported_text(fh).each do |lang, db|
              update_token_db_one(db)
            end
          else
            mbox = Mbox::new(@options, fh)
            while (mail = mbox.read)
              db = tokenize_buf(mail)
              db.filename = file
              learned.push(db)
              if (@options["pipe"])
                insert_headers!(mail, (@options["add-spam"] || @options["sub-clean"]), nil)
                @options["pipe-fh"].print mail
              end
              update_token_db_one(db)
            end
          end
        end
      end
    end

    # Every database is size-checked and closed, even after one reported a
    # change.
    shrunk = false
    @options["languages"].each do |lang|
      tables = [@db_hash[lang].clean, @db_hash[lang].spam]
      tables.each do |table|
        shrunk = true if (table.check_size(@options["max-mail"], @options["min-mail"]))
      end
      tables.each do |table|
        table.close
      end
    end
    learned.clear if (shrunk)   # disable incremental
    return learned
  end
  
  # Feed already judged messages back into the token databases: a message
  # whose spam_flag is set is added as spam, any other as clean. The
  # probability databases of the affected languages are then refreshed.
  #
  # token_dbs :: array of judged token sets. It is emptied in place when
  #              check_size reports a change on any database.
  def auto_update(token_dbs)
    langs = token_dbs.map { |token_db| token_db.language }.uniq
    langs.each do |lang|
      @db_hash[lang].clean.open("rw")
      @db_hash[lang].spam.open("rw")
    end

    # Only the two add flags vary per message; everything else stays off.
    command = {
      "sub-clean" => false,
      "sub-spam" => false,
      "import-clean" => false,
      "import-spam" => false
    }
    token_dbs.each do |token_db|
      spam = token_db.spam_flag ? true : false
      command["add-clean"] = (! spam)
      command["add-spam"] = spam
      update_token_db_one(token_db, command)
    end

    shrunk = false
    langs.each do |lang|
      [@db_hash[lang].clean, @db_hash[lang].spam].each do |table|
        shrunk = true if (table.check_size(@options["max-mail"], @options["min-mail"]))
      end
    end
    token_dbs.clear if (shrunk) # can't use incremental mode

    langs.each do |lang|
      @db_hash[lang].update_probability(token_dbs)
    end

    langs.each do |lang|
      @db_hash[lang].clean.close
      @db_hash[lang].spam.close
    end
  end
  
  # Turn a configuration file into a list of command line words.
  #
  # Every line that is neither blank nor a "#" comment is read as an option
  # name, optionally followed by whitespace and a value. The name gets "--"
  # prepended; the value, when present, follows as a separate word and may
  # itself contain whitespace.
  #
  # file :: path of the configuration file.
  #
  # Returns an array such as ["--spam-cutoff", "0.9", "--verbose"].
  # Raises the usual Errno errors when the file cannot be opened.
  def read_config_file(file)
    configs = Array::new

    # File.open, not Kernel#open: the latter runs a command when the path
    # starts with "|".
    File.open(file) do |fh|
      while (str = fh.gets)
        next if ((str =~ /\A\s*#/) || (str =~ /\A\s*\z/))
        str.chomp!
        str.sub!(/\s+\z/, '')
        str.sub!(/\A\s+/, '')
        tokens = str.split(/\s+/, 2) # name, rest of line
        if (! tokens.empty?)
          tokens[0] = "--" + tokens[0]
          configs.concat(tokens)
        end
      end
    end
    return configs
  end
  
  # Select a mailbox and return the UIDs of the messages to process.
  #
  # imap    :: connected IMAP session; select and uid_search are used.
  # mailbox :: "name" or "name/set". Everything after the last "/" is
  #            passed to the server as an extra search key.
  #
  # The search is "UNSEEN" and/or the given set, or "ALL" when neither
  # applies. With @options["imap-fetch-unflagged"] the messages whose spam
  # flag header already reads Yes or No are removed from the result.
  def imap_get_target_uids(imap, mailbox)
    if (mailbox =~ /(.*)\/(.*)/)
      mailbox = $1
      seqs = $2
    else
      seqs = nil
    end
    imap.select(mailbox)

    search_keys = Array::new
    search_keys.push("UNSEEN") if (@options["imap-fetch-unseen"])
    search_keys.push(seqs) if (seqs)
    search_keys.push("ALL") if (search_keys.empty?)
    uids = imap.uid_search(search_keys)

    if (@options["imap-fetch-unflagged"])
      header = x_spam_flag.sub(/:$/, '')
      if (@options["verbose"])
        # Diagnostic only, so the extra round trip is paid in verbose mode
        # alone. The result is deliberately not subtracted: Sendmail
        # Advanced Message Server seems to return all mails when the search
        # string is zero-length.
        null = imap.uid_search(["HEADER", header, ""])
      end
      yes = imap.uid_search(["HEADER", header, "Yes"])
      no = imap.uid_search(["HEADER", header, "No"])
      @options["message-fh"].printf("imap-fetch-unflagged working original %d null %d Yes %d No %d\n",
                                    uids.length, null.length, yes.length, no.length) if (@options["verbose"])
      uids = uids - yes - no
      @options["message-fh"].printf("imap-fetch-unflagged worked %d\n",
                                    uids.length) if (@options["verbose"])
    end
    return uids
  end
  
  # One message on an IMAP server, addressed by UID, together with a local
  # copy of its text (an array of lines).
  class IMAPMessage
    include Bsutil

    attr_accessor :seqno, :uid, :imap, :buf, :seen

    # options :: global option hash
    # imap    :: connected IMAP session
    # uid     :: UID of the message within the selected mailbox
    def initialize(options, imap, uid=nil)
      @options = options
      @imap = imap
      @uid = uid
      @seqno = nil
      @seen = nil
      @buf = Array::new
    end

    # Download text and flags. When the message did not carry \Seen, the
    # flag is removed again after the fetch.
    def fetch_rfc822
      item = @imap.uid_fetch(@uid, ["RFC822", "FLAGS"])[0]
      @seqno = item.seqno
      @buf = item.attr["RFC822"].split("\n")
      @seen = item.attr["FLAGS"].include?(:Seen)
      @imap.uid_store(@uid, "-FLAGS", [:Seen]) unless @seen
    end

    # Forward to insert_headers! (not defined in this class) with the local
    # copy of the text as first argument.
    def insert_rfc822_headers!(*args)
      return insert_headers!(@buf, *args)
    end

    # Forward to insert_header! for a single header of the local copy.
    def insert_rfc822_header!(header, content)
      insert_header!(@buf, header, content)
    end

    # Upload the local copy to +mailbox+, keeping the \Seen state. Every
    # line of the buffer is rewritten in place to end in CRLF first.
    def append(mailbox)
      @buf.map! { |line| line.sub(/[\r\n]*\z/, "\r\n") }
      flags = @seen ? [:Seen] : []
      @imap.append(mailbox, @buf.join, flags)
    end

    # Server side copy of the unmodified message to +mailbox+.
    def copy(mailbox)
      @imap.uid_copy(@uid, mailbox)
    end

    # Mark the original message \Deleted.
    def set_delete_flag
      @imap.uid_store(@uid, "+FLAGS", [:Deleted])
    end

    # Forget the \Seen state, locally and on the server.
    def reset_seen_flag
      @seen = false
      @imap.uid_store(@uid, "-FLAGS", [:Seen])
    end
  end                           # end of class IMAPMessage
  
  # Send an optional POP3 command over +socket+ and collect the reply.
  #
  # command :: line to send, or nil to only read one line (used to greet
  #            the server and to read the next command of the MUA).
  # socket  :: object offering write_timeout and gets_timeout.
  #
  # Returns the received lines as an array. A "+OK" reply to RETR, TOP,
  # CAPA, or to UIDL/LIST without a message number, is read up to and
  # including the terminating ".\r\n" line; otherwise the array holds a
  # single line.
  def socket_send_rec(command, socket)
    debug = @options["debug"]
    if (command)
      # never log a password
      @options["message-fh"].printf("send %s %s", socket, command.sub(/\APASS.*/i, "PASS ********")) if (debug)
      socket.write_timeout(command)
    end

    first = socket.gets_timeout
    lines = [first]
    @options["message-fh"].printf("resp %s %s", socket, first.sub(/\APASS.*/i, "PASS ********")) if (debug)

    multiline = (first =~ /\A\+OK/) &&
      ((command =~ /\A(RETR|TOP|CAPA)/i) || (command =~ /\A(UIDL|LIST)[^\d]*\z/i))
    if (multiline)
      last = first
      until (last == ".\r\n")
        last = socket.gets_timeout
        lines.push(last)
      end
    end
    return lines
  end
  
  # Start one proxy thread per entry of +pop_proxy_sets+ and wait for all
  # of them.
  #
  # pop_proxy_sets :: comma separated entries of the form
  #                   "server:port:proxy_if:proxy_port:user"; missing or
  #                   empty port, proxy_if and proxy_port fields fall back
  #                   to the Default_pop_* constants.
  #
  # SIGINT kills the threads started here. Returns 0 once every thread,
  # including those started by the proxy threads, has finished.
  def pop_proxy_multi(pop_proxy_sets)
    trap("SIGINT") do
      @options["message-fh"].printf("SIGINT received\n") if (@options["verbose"])
      @threads.each { |thread| Thread::kill(thread) } # kill child threads
    end

    pop_proxy_sets.split(/,/).each do |entry|
      server, port, proxy_if, proxy_port, user = entry.split(/:/)
      port = Default_pop_port if (port.nil? || port == '')
      proxy_if = Default_pop_proxy_if if (proxy_if.nil? || proxy_if == '')
      proxy_port = Default_pop_proxy_port if (proxy_port.nil? || proxy_port == '')
      # child thread
      @threads.push(Thread::start { pop_proxy_one(server, port, proxy_if, proxy_port, user) })
    end

    @threads.each { |child| child.join }
    # whatever is still running was started by a child (grandchild threads)
    (Thread::list - [Thread::current]).each { |other| other.join }
    return 0
  end
  
  # Relay a RETR exchange line by line without inspecting the message; the
  # caller uses it for mails larger than @options["pop-max-size"].
  #
  # command          :: the RETR line received from the MUA
  # pop_socket       :: connection to the POP server
  # pop_proxy_socket :: connection to the MUA
  #
  # Both sockets must offer write_timeout and gets_timeout. Those helpers
  # bound each read and write by SOCKET_TIMEOUT and raise when the server
  # closes the connection. The previous plain gets/write loop spun forever
  # in that case, because gets kept returning nil, which never equals the
  # terminator.
  #
  # Returns nil.
  def pop_bypass_large_mail(command, pop_socket, pop_proxy_socket)
    pop_socket.write_timeout(command)   # RETR to server
    str = pop_socket.gets_timeout       # response from server
    pop_proxy_socket.write_timeout(str) # forward
    return if (str =~ /\A-ERR/)         # no message body follows an error

    until (str == ".\r\n")
      str = pop_socket.gets_timeout
      pop_proxy_socket.write_timeout(str) # forward
    end
    return
  end
  
  # Extract message sizes from the reply to a POP3 LIST command.
  #
  # strs :: reply lines as returned by socket_send_rec.
  #
  # Returns a DBHash mapping message number (a string) to size (an
  # integer). A first line of the form "+OK number size" yields just that
  # pair; otherwise every line starting with "number size" is collected.
  def snoop_list_response(strs)
    sizes = DBHash::new
    if (strs[0] =~ /\A\+OK\s*(\d+)\s+(\d+)/)
      sizes[$1] = $2.to_i
      return sizes
    end

    strs.each do |line|
      next unless (line =~ /^(\d+)\s+(\d+)/)
      sizes[$1] = $2.to_i
    end
    return sizes
  end
  
  # Run one POP3 proxy: listen on pop_proxy_if:pop_proxy_port and serve
  # every accepted MUA connection in its own thread, relaying commands to
  # pop_server:pop_port. Replies to TOP and RETR are tokenized, judged and
  # get the configured headers inserted before they are passed on. This
  # method loops forever.
  #
  # pop_server, pop_port         :: real POP server
  # pop_proxy_if, pop_proxy_port :: local address to listen on
  # pop_user                     :: when set, the only user name accepted
  #                                 in a USER command
  #
  # Changes from the earlier version:
  # * QUIT is matched at the start of the command only. The unanchored
  #   pattern ended the session on any command containing "quit", such as
  #   "USER quitman".
  # * The USER check ignores case like every other command match, so
  #   "user name" can no longer slip past the registered-user restriction.
  # * Replies are joined before they are written. Handing the array to
  #   write sends its inspect form on Ruby 1.9 and later.
  # * TCPServer, TCPSocket, Timeout.timeout and Timeout::Error replace
  #   their deprecated aliases.
  def pop_proxy_one(pop_server, pop_port, pop_proxy_if, pop_proxy_port, pop_user)
    gs = TCPServer.open(pop_proxy_if, pop_proxy_port)
    addr = gs.addr
    addr.shift
    @options["message-fh"].printf("pop_proxy is on %s\n", addr.join(":")) if (@options["verbose"])
    while true
      Thread::start(gs.accept) do |pop_proxy_socket| # start grandchild threads
        @options["message-fh"].print(pop_proxy_socket, " is accepted\n") if (@options["verbose"])
        begin
          pop_socket = nil
          Timeout.timeout(SOCKET_TIMEOUT) do
            pop_socket = TCPSocket.open(pop_server, pop_port)
          end
          @options["message-fh"].print(pop_socket, " is connected\n") if (@options["verbose"])

          pop_socket = get_ssl_socket(pop_socket, @options["ssl-cert"]) if (@options["ssl"])

          # pass the greeting on, tagged with the proxy name
          hello = socket_send_rec(nil, pop_socket)[0]
          hello.sub!(/(.*)\r/, "\\1(pop_proxy by bsfilter)\r")
          pop_proxy_socket.write(hello)

          sizes = DBHash::new     # message number => size, learned from LIST
          while (command = socket_send_rec(nil, pop_proxy_socket)[0]) # get command from MUA
            if (command =~ /\ARETR\s+(\d+)/i)
              n = $1
              if (sizes[n] &&
                  (0 < @options["pop-max-size"]) && (@options["pop-max-size"] < sizes[n]))
                # too large to judge: relay untouched
                pop_bypass_large_mail(command, pop_socket, pop_proxy_socket)
                next
              end
            end
            response = socket_send_rec(command, pop_socket)
            if (command =~ /\ALIST/i)
              sizes.update(snoop_list_response(response))
            elsif ((command =~ /\A(TOP|RETR)/i) && (response[0] =~ /\A\+OK/))
              buf = response[1..-1].dup
              token_db = tokenize_buf(buf)
              @db_hash[token_db.language].prob.open("r")
              @db_hash[token_db.language].get_combined_probability(token_db)
              @db_hash[token_db.language].prob.close
              if (@options["auto-update"])
                auto_update([token_db])
              elsif (@options["show-process"])
                show_process(token_db, "-")
              end
              @options["message-fh"].printf("combined probability %f\n", token_db.probability) if (@options["verbose"])
              insert_headers!(buf, token_db.spam_flag, token_db.probability)
              response[1..-1] = buf
            end
            # don't use elsif
            if (command =~ /\AQUIT/i)
              @options["message-fh"].printf("send %s %s", pop_proxy_socket, response[0]) if (@options["debug"])
              pop_proxy_socket.write(response.join) # return response to MUA
              break
            elsif ((command =~ /\AUSER\s*(\S*)\r/i) &&
                   (pop_user && pop_user != $1))
              @options["message-fh"].printf("username unmatch error\n")
              pop_proxy_socket.write("-ERR unregistered user\r\n") # return response to MUA
              break
            else
              @options["message-fh"].printf("send %s %s", pop_proxy_socket, response[0]) if (@options["debug"])
              pop_proxy_socket.write(response.join) # return response to MUA
            end
          end
        rescue Timeout::Error
          @options["message-fh"].printf("Timeout error %s %s %s\n", pop_server, pop_port, pop_proxy_port) if (@options["verbose"])
        rescue
          @options["message-fh"].printf("pop exception caught %s %s %s\n", pop_server, pop_port, pop_proxy_port) if (@options["verbose"])
          p "#{$!}" if (@options["verbose"])
          p "#{$@}" if (@options["debug"])
        ensure
          # both ends are closed however the session finished
          if (pop_proxy_socket && ! pop_proxy_socket.closed?)
            @options["message-fh"].print(pop_proxy_socket, " is gone\n") if (@options["verbose"])
            pop_proxy_socket.close
          end
          if (pop_socket && ! pop_socket.closed?)
            @options["message-fh"].print(pop_socket, " is gone\n") if (@options["verbose"])
            pop_socket.close
          end
        end
      end                         # thread end
    end
  end
    
  # Fill in defaults for the POP proxy options and make sure a server is
  # configured.
  #
  # options :: option hash, modified in place. "icon_number" (with an
  #            underscore) and "pop-max-size" are stored as integers.
  #
  # Loads the VisualuRuby tray libraries when "tasktray" is set. Reports a
  # missing server through soft_raise. Returns nil.
  def check_options_for_pop!(options)
    options["icon_number"] = (options["icon-number"] || Default_icon_number).to_i
    options["pop-port"] ||= Default_pop_port
    options["pop-proxy-if"] ||= Default_pop_proxy_if
    options["pop-proxy-port"] ||= Default_pop_proxy_port
    options["pop-max-size"] = (options["pop-max-size"] || Default_pop_max_size).to_i

    if (options["tasktray"])
      require('vr/vrcontrol')
      require('vr/vrtray')
    end

    # either a proxy set or a single server must be given
    soft_raise("#{$0}: pop-server unspecified") unless (options["pop-proxy-set"] || options["pop-server"])

    return
  end
  
  # Fill in the default IMAP port and verify the mandatory IMAP options.
  #
  # options :: option hash, modified in place.
  #
  # Prints one "specify <name>" line on standard output per missing option
  # and then raises RuntimeError. Returns nil when all are present.
  def check_options_for_imap!(options)
    options["imap-port"] ||= Default_imap_port

    missing = ["imap-server", "imap-auth", "imap-user", "imap-password"].reject do |name|
      options[name]
    end
    missing.each do |name|
      printf("specify %s\n", name)
    end

    raise "error found in imap options" if (! missing.empty?)
    return
  end
  
  # Judge the messages of the given IMAP mailboxes and file them.
  #
  # command_line_args :: mailbox specs understood by imap_get_target_uids
  # token_dbs         :: array that receives the token set of every
  #                      judged message
  #
  # A message is moved to "imap-folder-spam" or "imap-folder-clean" when
  # the matching option is set; when headers were inserted, the modified
  # text is uploaded in place of the original.
  #
  # Returns CODE_SPAM when at least one message was judged spam, otherwise
  # CODE_CLEAN.
  def do_imap(command_line_args, token_dbs)
    ret_code = CODE_CLEAN
    if (@options["ssl"])
      verify_mode = @options["ssl-cert"] ? OpenSSL::SSL::VERIFY_PEER : nil
      imap = Net::IMAP::new(@options["imap-server"], @options["imap-port"], @options["ssl"], @options["ssl-cert"], verify_mode)
    else
      imap = Net::IMAP::new(@options["imap-server"], @options["imap-port"])
    end
    imap.auto_authenticate(@options, @options["imap-auth"], @options["imap-user"], @options["imap-password"], @options["imap-auth-preference"])

    # selecting the destination folders up front is only a check
    ["imap-folder-clean", "imap-folder-spam"].each do |folder_option|
      imap.select(@options[folder_option]) if (@options[folder_option])
    end

    command_line_args.each do |mailbox|
      imap_get_target_uids(imap, mailbox).each do |uid|
        message = IMAPMessage::new(@options, imap, uid)
        message.fetch_rfc822
        token_db = tokenize_buf(message.buf)
        token_db.filename = uid
        @db_hash[token_db.language].get_combined_probability(token_db)
        token_dbs.push(token_db)
        @options["message-fh"].printf("combined probability %s %d %f\n", mailbox, message.seqno, token_db.probability) if (@options["verbose"])

        is_spam = token_db.spam_flag
        ret_code = CODE_SPAM if (is_spam)
        folder_option = is_spam ? "imap-folder-spam" : "imap-folder-clean"
        destination = @options[folder_option] || mailbox

        if (message.insert_rfc822_headers!(is_spam, token_db.probability))
          # text changed: upload the new version, drop the old one
          message.reset_seen_flag if (@options["imap-reset-seen-flag"])
          message.append(destination)
          message.set_delete_flag
        elsif (destination != mailbox)
          # text unchanged: a server side copy is enough
          message.reset_seen_flag if (@options["imap-reset-seen-flag"])
          message.copy(destination)
          message.set_delete_flag
        end
      end
      imap.close
    end
    imap.logout
    return ret_code
  end
  
  
  # Write the databases selected by the export options as text.
  #
  # command_line_args :: the first element names the output file; "-" is
  #                      used when the list is empty.
  #
  # For each of "export-clean", "export-spam" and "export-probability"
  # that is set, the output is opened through open_wo and the matching
  # database of every language is exported, provided its file_count is
  # positive.
  def do_export(command_line_args)
    file = command_line_args.empty? ? "-" : command_line_args[0]

    result = nil
    [["export-clean", :clean],
     ["export-spam", :spam],
     ["export-probability", :prob]].each do |option, kind|
      result = nil
      next if (! @options[option])
      result = open_wo(file) do |fh|
        @options["languages"].each do |lang|
          table = @db_hash[lang].send(kind)
          table.open("r")
          table.export(fh) if (table.file_count > 0)
          table.close
        end
      end
    end
    return result
  end
  
  # Add auto_authenticate(options, auth, user, password, auth_list=[]) to
  # Net::IMAP.
  #
  # auth "loginc" uses the LOGIN command; "auto" walks auth_list and takes
  # the first entry that is "loginc" or that the server advertises as
  # AUTH=<entry>; any other value is handed to AUTHENTICATE as is. The
  # mechanism name is compared case-insensitively. Raises RuntimeError when
  # "auto" finds no usable entry.
  def setup_imap
    Net::IMAP.class_eval <<EOM
      def auto_authenticate(options, auth, user, password, auth_list=[])
        mechanism = auth.downcase
        if (mechanism == "loginc")
          if (options["verbose"])
            options["message-fh"].printf("try to login imap server for %s with login command\n", user)
          end
          return login(user, password)
        end
        if (mechanism != "auto")
          if (options["verbose"])
            options["message-fh"].printf("try to login imap server for %s with authenticate %s\n", user, auth)
          end
          return authenticate(auth, user, password)
        end
        capa = capability
        auth_list.each do |candidate|
          if (candidate == "loginc")
            return auto_authenticate(options, "loginc", user, password)
          elsif (capa.include?("AUTH=" + candidate.upcase))
            return auto_authenticate(options, candidate, user, password)
          end
        end
        raise sprintf("can't login imap server for %s with %s", user, auth_list)
      end
EOM
  end
  
  # Add write_timeout(str) and gets_timeout to TCPSocket. Both wrap the
  # plain call in a SOCKET_TIMEOUT second limit; gets_timeout additionally
  # raises RuntimeError when the peer has closed the connection (gets
  # returned nil).
  #
  # The generated code calls Timeout.timeout explicitly instead of the
  # deprecated Kernel#timeout alias, so it works with timeout libraries
  # that no longer provide the alias. The timeout library must be loaded
  # before the methods are used.
  def setup_socket_timeout
    TCPSocket.class_eval <<EOM
      def write_timeout(str)
        Timeout.timeout(SOCKET_TIMEOUT) do
          return self.write(str)
        end
      end
      def gets_timeout
        Timeout.timeout(SOCKET_TIMEOUT) do
          s = self.gets
          if (s == nil)
            raise "socket.gets returned nil"
          else
            return s
          end
        end
      end
EOM
  end

  # Add write_timeout(str) and gets_timeout to OpenSSL::SSL::SSLSocket,
  # mirroring the helpers installed on TCPSocket. Both wrap the plain call
  # in a SOCKET_TIMEOUT second limit; gets_timeout raises RuntimeError when
  # gets returns nil.
  #
  # The generated code calls Timeout.timeout explicitly instead of the
  # deprecated Kernel#timeout alias, so it works with timeout libraries
  # that no longer provide the alias. The timeout library must be loaded
  # before the methods are used.
  def setup_ssl_socket_timeout
    OpenSSL::SSL::SSLSocket.class_eval <<EOM
      def write_timeout(str)
        Timeout.timeout(SOCKET_TIMEOUT) do
          return self.write(str)
        end
      end
      def gets_timeout
        Timeout.timeout(SOCKET_TIMEOUT) do
          s = self.gets
          if (s == nil)
            raise "ssl_socket.gets returned nil"
          else
            return s
          end
        end
      end
EOM
  end

  # Wrap a connected socket in an SSL/TLS client session.
  #
  # socket :: connected socket to the server
  # cert   :: optional CA file or CA directory; when given, the server
  #           certificate is verified (VERIFY_PEER)
  #
  # Returns the connected OpenSSL::SSL::SSLSocket. sync_close is enabled,
  # so closing the returned object also closes +socket+; callers that keep
  # only the SSL socket no longer leave the TCP connection open.
  # Raises whatever SSLSocket#connect raises when the handshake fails.
  def get_ssl_socket(socket, cert=nil)
    context = OpenSSL::SSL::SSLContext::new()

    if (cert)
      if (FileTest::file?(cert))
        @options["message-fh"].print(cert, " is used for SSL ca_file\n") if (@options["verbose"])
        context.ca_file = cert
      elsif (FileTest::directory?(cert))
        @options["message-fh"].print(cert, " is used for SSL ca_path\n") if (@options["verbose"])
        context.ca_path = cert
      end
      context.verify_mode = OpenSSL::SSL::VERIFY_PEER
    end
    ssl = OpenSSL::SSL::SSLSocket::new(socket, context)
    ssl.sync_close = true
    ssl.connect
    # progress messages go to the message stream, not to standard output
    @options["message-fh"].print(ssl, " is connected\n") if (@options["verbose"])
    return ssl
  end

  # Show a tray icon with an "exit" menu and run the GUI message loop.
  #
  # The form class is defined through eval of the text below, so VRForm,
  # VRTrayiconFeasible, VRMenuUseable and Win32API are only looked up when
  # this method runs. Release and Revision are interpolated into the icon
  # title while the text is built.
  # NOTE(review): these names presumably come from the vr/* libraries
  # required in check_options_for_pop! when "tasktray" is set -- confirm.
  #
  # Once the message loop returns, every thread recorded in @threads is
  # killed.
  def setup_tasktray
    eval <<EOM
    class MyForm < VRForm
      include VRTrayiconFeasible
      include VRMenuUseable
      LoadIcon = Win32API.new("user32", "LoadIcon", "II", "I")
      
      def construct
        @traymenu = newPopupMenu
        @traymenu.set([
                       ["exit", "exit"]
                     ])
        @mytrayicon=0
      end
      def self_trayrbuttonup(iconid)
        showPopup @traymenu
      end
      def into_trayicon(icon_number)
        create_trayicon(LoadIcon.call(0, icon_number),
                        "bsfilter release #{Release} revision #{Revision}", @mytrayicon)
        myexstyle = self.exwinstyle
        myexstyle.ws_ex_toolwindow = true
        myexstyle.ws_ex_appwindow = false
      end
      
      def exit_clicked
        delete_trayicon(@mytrayicon)
        self.close
      end
    end
EOM
    frm = VRLocalScreen.newform(nil, nil, MyForm)
    frm.create
    # "icon_number" (underscore) is the integer stored by check_options_for_pop!
    frm.into_trayicon(@options["icon_number"])
    VRLocalScreen.messageloop   # returns when the GUI loop ends
    @threads.each do |thread|   # kill child threads
      Thread::kill(thread)
    end
  end
    
  # Entry point of POP proxy mode.
  #
  # Builds the proxy set description, either from "pop-proxy-set" with all
  # whitespace removed or from the single server options joined with ":",
  # and hands it to pop_proxy_multi. With "tasktray" the tray icon runs in
  # a thread of its own. Exceptions in any thread abort the process.
  #
  # Returns the value of pop_proxy_multi.
  def do_pop
    Thread.abort_on_exception = true
    verbose = @options["verbose"]
    @options["message-fh"].print "pop mode start ", Time::new.to_s, "\n" if (verbose)

    Thread::start { setup_tasktray } if (@options["tasktray"])

    if (@options["pop-proxy-set"])
      proxy_sets = @options["pop-proxy-set"].gsub(/\s/, '')
    else
      fields = ["pop-server", "pop-port", "pop-proxy-if", "pop-proxy-port", "pop-user"]
      proxy_sets = fields.map { |name| @options[name] }.join(":")
    end
    ret_code = pop_proxy_multi(proxy_sets)

    # never reached
    @options["message-fh"].print "pop mode end ", Time::new.to_s, "\n" if (verbose)
    return ret_code
  end
    
  # Write the process id of this process, followed by a newline, to +file+,
  # replacing any previous content.
  #
  # File.open is used rather than Kernel#open so that a path starting with
  # "|" names a file and is not executed as a command.
  def write_pid_file(file)
    File.open(file, "w") do |fh|
      fh.print Process::pid, "\n"
    end
  end
    
  # Parse the leading options of ARGV.
  #
  # Parsing stops at the first non-option word (REQUIRE_ORDER); consumed
  # words are removed from ARGV by GetoptLong. Option names are stored
  # without the leading "--". Options without an argument map to "". A
  # repeated "pop-proxy-set" is accumulated as a comma separated list; any
  # other repeated option keeps the last value.
  #
  # Returns a DBHash of option name => argument. Parse errors are reported
  # through soft_raise.
  def parse_command_line
    options = DBHash::new

    req = GetoptLong::REQUIRED_ARGUMENT
    flag = GetoptLong::NO_ARGUMENT
    specs = [
      ["--icon-number", req],
      ["--ssl", flag],
      ["--ssl-cert", req],
      ["--pop", flag],
      ["--tasktray", flag],
      ["--pop-proxy-set", req],
      ["--pop-server", req],
      ["--pop-port", req],
      ["--pop-proxy-if", req],
      ["--pop-proxy-port", req],
      ["--pop-user", req],
      ["--pop-max-size", req],
      ["--imap", flag],
      ["--imap-server", req],
      ["--imap-port", req],
      ["--imap-auth", req],
      ["--imap-user", req],
      ["--imap-password", req],
      ["--imap-folder-clean", req],
      ["--imap-folder-spam", req],
      ["--imap-fetch-unseen", flag],
      ["--imap-fetch-unflagged", flag],
      ["--imap-reset-seen-flag", flag],
      ["--homedir", req],
      ["--config-file", req],
      ["--pid-file", req],
      ["--db", req],
      ## ["--unified-db", flag],
      ["--max-line", req],
      ["--export-clean", flag],
      ["--export-spam", flag],
      ["--export-probability", flag],
      ["--import-clean", flag],
      ["--import-spam", flag],
      ["--mbox", flag],
      ["--jtokenizer", "-j", req],
      ["--method", "-m", req],
      ["--spam-cutoff", req],
      ["--mark-in-token", req],
      ["--max-mail", req],
      ["--min-mail", req],
      ["--show-new-token", flag],
      ["--auto-update", "-a", flag],
      ["--update", "-u", flag],
      ["--add-clean", "-c", flag],
      ["--add-spam", "-s", flag],
      ["--sub-clean", "-C", flag],
      ["--sub-spam", "-S", flag],
      ["--disable-degeneration", "-D", flag],
      ["--disable-utf-8", flag],
      ["--ignore-body", "-B", flag],
      ["--refer-header", req],
      ["--ignore-header", "-H", flag],
      ["--ignore-plain-text-part", flag],
      ["--ignore-after-last-atag", flag],
      ["--pipe", flag],
      ["--insert-revision", flag],
      ["--insert-flag", flag],
      ["--insert-probability", flag],
      ["--header-prefix", req],
      ["--mark-spam-subject", flag],
      ["--spam-subject-prefix", req],
      ["--list-clean", flag],
      ["--list-spam", flag],
      ["--show-db-status", flag],
      ["--show-process", flag],
      ["--help", "-h", flag],
      ["--revision", flag],
      ["--quiet", "-q", flag],
      ["--debug", "-d", flag],
      ["--verbose", "-v", flag]
    ]

    parser = GetoptLong.new
    parser.ordering = GetoptLong::REQUIRE_ORDER
    parser.set_options(*specs)
    parser.quiet = true         # errors are reported below, not by the parser

    allow_multi = {"pop-proxy-set" => true}
    begin
      parser.each_option do |name, arg|
        key = name.sub(/^--/, '')
        if (options[key] && allow_multi[key])
          options[key] = options[key] + "," + arg
        else
          options[key] = arg.dup
        end
      end
    rescue
      soft_raise(sprintf("#{$0}: %s", parser.error_message))
    end
    return options
  end
    
    
  # Build the complete option hash from the configuration file and the
  # command line, fill in defaults, and load the libraries the selected
  # features need.
  #
  # The command line is parsed twice: once to find --config-file and
  # --homedir, and, when the configuration file exists, once more over
  # "configuration words + original command line", so that command line
  # options come last. ARGV is rewritten for the second pass.
  #
  # Side effects: may print usage or the revision and exit; requires
  # database, tokenizer, socket, IMAP and SSL libraries as selected; sets
  # @jtokenizer. Invalid values are reported through soft_raise.
  #
  # Returns the option hash.
  def get_options
    argv_backup = Marshal::load(Marshal::dump(ARGV)) # shallow copy is enough?
    options = parse_command_line
    
    # an explicitly named configuration file must exist
    if (options["config-file"] && (! File::file?(options["config-file"])))
      soft_raise(sprintf("#{$0}: can't open config file `%s'. check argument of --config-file\n", options["config-file"]))
    end
    
    # home directory: option, $BSFILTERHOME, $HOME/<default>, directory of
    # the executable -- in that order
    if (! options["homedir"])
      if (ENV["BSFILTERHOME"])
        options["homedir"] = ENV["BSFILTERHOME"]
      elsif (ENV["HOME"])
        options["homedir"] = ENV["HOME"] + "/" + Default_homedir
      elsif (defined?(Exerb) && Exerb.runtime?)
        options["homedir"] = File.dirname(Exerb.filepath)
      else
        options["homedir"] = File.dirname($0)
      end
    end
    
    if (! options["config-file"])
    #  options["config-file"] = options["homedir"] + "/" + Default_conf_file
      options["config-file"] = Default_conf_file
    end
    # second pass: configuration file words first, original command line after
    if (options["config-file"] && File::file?(options["config-file"]))
      ARGV.clear
      argv_config = read_config_file(options["config-file"])
      (argv_config + argv_backup).reverse.each do |argv|
        ARGV.unshift(argv)
      end
      options.update(parse_command_line)
    end
    
    if (options["help"])
      usage
      exit 0
    end
    if (options["revision"])
      print "bsfilter release #{Release} revision #{Revision}\n"
      exit 0
    end
    
    # normalize to exactly one trailing slash
    options["homedir"] = options["homedir"].sub(/\/*$/, '') + "/"
    
    if (options["method"])
      if (options["method"] !~ /\A(g|r|rf)\z/)
        soft_raise(sprintf("#{$0}: unsupported method `%s' for --method or -m\n", options["method"]))
      end
    else
      options["method"] = Default_method
    end
    
    options["header-prefix"] = Default_header_prefix if (! options["header-prefix"])
    options["spam-subject-prefix"] = Default_spam_subject_prefix if (! options["spam-subject-prefix"])
    
    # load only the database binding that was selected
    options["db"] = Default_db if (! options["db"])
    case options["db"]
    when "sdbm"
      require 'sdbm'
    when "gdbm"
      require 'gdbm'
    when "bdb1"
      require 'bdb1'
    when "bdb"
      require 'bdb'
    when "qdbm"
      require 'depot'
    else
      soft_raise(sprintf("#{$0}: unsupported argument `%s' for --db\n", options["db"]))
    end
    
    # tokenizer for Japanese text; bigram and block need no extra library
    if (options["jtokenizer"])
      options["jtokenizer"].downcase!
    else
      options["jtokenizer"] = Default_jtokenizer
    end
    case options["jtokenizer"]
    when "bigram"
    when "block"
    when "mecab"
      require 'MeCab'
    when "chasen"
      require 'chasen.o'
    when "kakasi"
      require 'kakasi'
    else
      soft_raise(sprintf("#{$0}: unsupported argument `%s' for --jtokenizer or -j\n", options["jtokenizer"]))
    end
    @jtokenizer = Jtokenizer::new(options["jtokenizer"])

##    if (options["unified-db"])
##      options["languages"] = [Default_Language]
##    else
##      options["languages"] = Languages
##    end

    options["languages"] = Languages

    # numeric options are stored as integers from here on
    options['mark-in-token'] = Default_mark_in_token if (! options['mark-in-token'])
    options['mark-in-token'].gsub!(/\s/, '')
    options["max-line"] = (options["max-line"] || Default_max_line).to_i
    options["max-mail"] = (options["max-mail"] || Default_max_mail).to_i
    options["min-mail"] = (options["min-mail"] || Default_min_mail).to_i
    
    options["degeneration"] = options["disable-degeneration"] ? false : true

    # "refer-header" is turned from a comma separated list into a lookup
    # hash keyed by lower-cased header name; --ignore-header empties it
    # unless --refer-header is given as well
    if (options["refer-header"])
      array = options["refer-header"].downcase.split(',')
    elsif (options["ignore-header"])
      array = Array::new
    else
      array = Default_refer_header.downcase.split(',')
    end
    options["refer-header"] = Hash::new
    array.each do |header|
      options["refer-header"][header] = true
    end

    options["use-body"] = options["ignore-body"] ? false : true
    
    options["pid-file"] = options["homedir"] + Default_pid_file if (! options["pid-file"]) 
    
    options["imap-auth"] = options["imap-auth"] || Default_imap_auth
    options["imap-auth-preference"] = Default_imap_auth_preference # can't modify with command line option
    
    # UTF-8 handling is enabled only when it is not disabled and
    # safe_require("iconv") returns a true value
    if ((! options["disable-utf-8"]) &&
        safe_require("iconv"))
      options["utf-8"] = true
      define_safe_iconv if (! defined?(Iconv.safe_iconv))
    else
      options["utf-8"] = false
    end
    
    # network features: validate their options, load the libraries, then
    # install the helper methods on the library classes
    if (options["pop"])
      check_options_for_pop!(options)
      require 'timeout' 
      require 'socket'
      setup_socket_timeout
    end
    if (options["imap"])
      check_options_for_imap!(options)
      require 'net/imap'
      setup_imap
    end
    if (options["ssl"])
      if (options["ssl-cert"])
        if (! File::readable?(options["ssl-cert"]))
          soft_raise(sprintf("#{$0}: can't read %s. check --ssl-cert option", options["ssl-cert"]))
        end
      end
      require "openssl"
      setup_ssl_socket_timeout
    end
    return options
  end
    
  # Write one "db ..." status line per configured language to the message
  # stream (@options["message-fh"]).
  #
  # Each line carries, in order: the language, the clean database's size and
  # file_count, the spam database's size and file_count, and the probability
  # database's size.  The three databases are opened with mode "r" for the
  # duration of the report and closed again in reverse order.
  def show_db_status
    out = @options["message-fh"]
    @options["languages"].each do |lang|
      dbs = @db_hash[lang]
      clean_db = dbs.clean
      spam_db = dbs.spam
      prob_db = dbs.prob

      [clean_db, spam_db, prob_db].each { |db| db.open("r") }

      counts = [clean_db.size, clean_db.file_count,
                spam_db.size, spam_db.file_count,
                prob_db.size]
      out.printf("db %s %d %d %d %d %d\n", lang, *counts)

      # close in the reverse of the opening order
      [prob_db, spam_db, clean_db].each { |db| db.close }
    end
  end
    
  # Write a one-line processing record for token_db to the message stream
  # (@options["message-fh"]).
  #
  # token_db::            object providing spam_flag, language, time,
  #                       message_id and filename
  # maintenance_command:: text printed verbatim as the fourth field
  #
  # Fields, space separated: protocol ("pop", "imap" or "file", chosen from
  # @options), language, filter result ("-" for a nil spam_flag, "spam" for
  # true, "clean" for false), maintenance_command, the time formatted as
  # YYYYMMDDHHMMSS, message id and file name.
  #
  # Raises RuntimeError when spam_flag is none of nil, true or false.
  def show_process(token_db, maintenance_command)
    # first enabled protocol option wins; plain files are the fallback
    prot = %w[pop imap].find { |name| @options[name] } || "file"

    labels = { nil => "-", true => "spam", false => "clean" }
    filter_result = labels.fetch(token_db.spam_flag) do
      raise "internal error: unknown spam_flag"
    end

    fields = [prot,
              token_db.language,
              filter_result,
              maintenance_command,
              token_db.time.strftime("%Y%m%d%H%M%S"),
              token_db.message_id,
              token_db.filename]
    @options["message-fh"].printf("%s %s %s %s %s %s %s\n", *fields)
  end

  # Return the spam_flag of the last entry in @token_dbs.
  def spam?
    latest = @token_dbs.last
    latest.spam_flag
  end

  # Return the probability of the last entry in @token_dbs.
  def probability
    latest = @token_dbs.last
    latest.probability
  end
    
  # Parse options and prepare this instance for a subsequent call to run.
  #
  # command_line_options:: Array of command line words; may be ARGV itself.
  #
  # get_options is called with ARGV temporarily replaced by
  # command_line_options.  Whatever is left in ARGV afterwards is returned
  # (presumably the non-option arguments -- that depends on get_options).
  # ARGV's previous contents are put back before returning and also when
  # option parsing or database setup raises, so a failed setup no longer
  # leaves the process-wide ARGV clobbered.
  #
  # Side effects: resets @options and @db_hash, selects the message/pipe
  # output streams, switches STDIN (and in some modes STDOUT) to binary mode,
  # calls init_dir on the home directory and creates one filter object per
  # language in @db_hash.
  #
  # Raises RuntimeError for an unknown @options["method"].
  def setup(command_line_options)
    @options.clear
    @db_hash.clear

    # copy first: command_line_options may be ARGV itself, which is replaced below
    command_line_options_backup = command_line_options.dup
    argv_backup = ARGV.dup
    rest_options = nil
    begin
      ARGV.replace(command_line_options_backup)

      @options.update(get_options)

      STDIN::binmode
      if (@options["quiet"])
        @options["message-fh"] = DevNull::new
        @options["pipe-fh"] = DevNull::new
      elsif (((@options["export-clean"] || @options["export-spam"] || @options["export-probability"]) &&
           ((ARGV.length == 0) || (ARGV[0] == "-"))) || # export to stdout
          @options["list-clean"] || @options["list-spam"] || @options["pipe"])
        # STDOUT carries data in these modes, so messages go to STDERR
        @options["message-fh"] = STDERR
        @options["pipe-fh"] = STDOUT
        STDOUT::binmode
      else
        @options["message-fh"] = STDOUT
        @options["pipe-fh"] = STDOUT
        # keep STDOUT in text mode
        @options["message-fh"].sync = true
      end

      # the mark characters are stored regexp-escaped from here on
      @options['mark-in-token'] = Regexp::quote(@options['mark-in-token'])

      init_dir(@options["homedir"])

      @options["languages"].each do |lang|
        case @options["method"]
        when 'rf'
          @db_hash[lang] = RobinsonFisher::new(@options, lang)
        when 'r'
          @db_hash[lang] = Robinson::new(@options, lang)
        when 'g'
          @db_hash[lang] = Graham::new(@options, lang)
        else
          raise sprintf("internal error: unknown method %s", @options["method"])
        end
        @db_hash[lang].spam_cutoff = @options["spam-cutoff"].to_f if (@options["spam-cutoff"])
      end

      rest_options = ARGV.dup
    ensure
      # always hand the caller's ARGV back, even when setup fails part-way
      ARGV.replace(argv_backup)
    end

    return rest_options
  end

  # Execute the action selected by @options (as prepared by setup).
  #
  # command_line_args:: Array of remaining command line arguments, used as
  #                     file names; "-" is substituted when it is empty
  #                     (except in IMAP mode).
  #
  # Modes, checked in this order:
  # * show-db-status: print database statistics and return EXIT_NORMAL.
  # * pop: write the pid file, run do_pop, remove the pid file and return
  #   EXIT_NORMAL.  The pid file is removed even when do_pop raises.
  # * import-*/add-*/sub-*: update the token databases via update_token_dbs.
  # * export-*: hand the arguments to do_export.
  # * update: recompute probabilities for every language.
  # * otherwise (filtering mode): judge every message of every file, or
  #   delegate to do_imap in IMAP mode.
  #
  # Returns EXIT_NORMAL for the first two modes.  Otherwise returns
  # CODE_NORMAL, except in filtering mode where the result is do_imap's
  # return value (IMAP) or, without --pipe, CODE_SPAM when any message was
  # judged spam and CODE_CLEAN when none was.
  def run(command_line_args)
    @options["message-fh"].print "start ", Time::new.to_s, "\n" if (@options["verbose"])
    if (@options["show-db-status"])
      show_db_status
      return EXIT_NORMAL
    end
    
    if (@options["pop"])
      pid_file = @options["pid-file"]
      write_pid_file(pid_file)
      begin
        do_pop
      ensure
        # do not leave a stale pid file behind when do_pop is aborted by an
        # exception; the existence check keeps a missing file from masking
        # the original error
        File::unlink(pid_file) if (File::exist?(pid_file))
      end
      return EXIT_NORMAL
    end
    
    filtering_mode = true
    
    token_dbs = Array::new
    # NOTE(review): @token_dbs keeps pointing at this array; the maintenance
    # branch below rebinds only the local token_dbs -- confirm that is intended.
    @token_dbs = token_dbs
    if (@options["import-clean"] ||
        @options["import-spam"] ||
        @options["add-clean"] ||
        @options["add-spam"] ||
        @options["sub-clean"] ||
        @options["sub-spam"])
      filtering_mode = false
      if (command_line_args.empty? && ! @options["imap"])
        token_dbs = update_token_dbs(["-"])
      else
        token_dbs = update_token_dbs(command_line_args)
      end
    end
    
    if (@options["export-clean"] || @options["export-spam"] || @options["export-probability"])
      filtering_mode = false
      do_export(command_line_args)
    end
    
    if (@options["update"])
      filtering_mode = false
      @options["languages"].each do |lang|
        @db_hash[lang].clean.open("r")
        @db_hash[lang].spam.open("r")
        @db_hash[lang].update_probability(token_dbs) # dbs = Array of TokenDB for -c, -s
        @db_hash[lang].clean.close
        @db_hash[lang].spam.close
      end
    end
    
    ret_code = CODE_NORMAL
    if (filtering_mode)
      # the probability databases stay open for the whole filtering pass
      @options["languages"].each do |lang|
        @db_hash[lang].prob.open("r")
      end
      if (@options["imap"])
        ret_code = do_imap(command_line_args, token_dbs)
      else
        if (command_line_args.empty?)
          command_line_args = ["-"]
        end
        # without --pipe the result starts as "clean" and flips to "spam"
        # as soon as one message is judged spam
        ret_code = CODE_CLEAN if (! @options["pipe"])
        command_line_args.each do |file|
          open_ro(file) do |fh|
            number = 1          # position of the message within this file
            mbox = Mbox::new(@options, fh)
            while (buf = mbox.read)
              token_db = tokenize_buf(buf)
              token_db.filename = file
              @db_hash[token_db.language].get_combined_probability(token_db)
              insert_headers!(buf, token_db.spam_flag, token_db.probability) 
              if (@options["pipe"])
                @options["pipe-fh"].print buf
              end
              printf("%s\n", file) if (token_db.spam_flag && @options["list-spam"])
              printf("%s\n", file) if (! token_db.spam_flag && @options["list-clean"])
              ret_code = CODE_SPAM if (token_db.spam_flag && (! @options["pipe"]))
              token_dbs.push(token_db)
              # report only for handles that provide a path method
              if (defined?(fh.path))
                @options["message-fh"].printf("combined probability %s %d %f\n", 
                                             fh.path, number, token_db.probability) 
              end
              number += 1
            end
          end
        end
      end
      @options["languages"].each do |lang|
        @db_hash[lang].prob.close
      end
      STDOUT::flush
      if (@options["auto-update"])
        auto_update(token_dbs) 
      elsif (@options["show-process"])
        token_dbs.each do |token_db|
          show_process(token_db, "-")
        end
      end
    end
    @options["message-fh"].print "end ", Time::new.to_s, "\n" if (@options["verbose"])
    
    return ret_code
  end
end

# Script entry point: executed only when this file is run directly, not when
# it is loaded as a library.  The process exits with status 0 when run
# returns a true value and with status 1 otherwise.
if $0 == __FILE__
  filter = Bsfilter::new
  remaining_args = filter.setup(ARGV)
  exit(filter.run(remaining_args) ? 0 : 1)
end
Man Man