Return to Snippet

Revision: 3718
at September 5, 2007 21:09 by ctran


Updated Code
require 'unicode'

# Normalizes token text to lower case.
# Token filter that lower-cases the text of every token it passes along,
# delegating the case conversion to Unicode.downcase.
class UnicodeLowerCaseFilter
  # token_stream: the upstream stream this filter pulls its tokens from.
  def initialize(token_stream)
    @input = token_stream
  end

  # Forwards new source text to the wrapped stream.
  def text=(text)
    @input.text = text
  end

  # Pulls the next token from the wrapped stream and lower-cases its text in
  # place. Returns the same token object, or nil once the wrapped stream
  # returns nil.
  def next
    token = @input.next
    return nil if token.nil?

    token.tap { |tok| tok.text = Unicode.downcase(tok.text) }
  end
end

# Ferret analyzer for Vietnamese text. Tokens are lower-cased and then have
# their accented Vietnamese letters folded to plain ASCII base letters, so
# accented and unaccented spellings are indexed (and searched) the same way.
class VietnameseAnalyzer < Ferret::Analysis::Analyzer
  include Ferret::Analysis

  # Maps each group of lower-case accented letters to its ASCII base letter.
  # Only lower-case forms are listed because token_stream lower-cases every
  # token before this mapping is applied.
  CHARACTER_MAPPINGS = {
    ['á','à','ạ','ả','ã','ă','ắ','ằ','ặ','ẳ','ẵ','â','ấ','ầ','ậ','ẩ','ẫ'] => 'a',
    # Was the mis-encoded two-character sequence "Ä‘", which never matched.
    ['đ'] => 'd',
    ['é','è','ẹ','ẻ','ẽ','ê','ế','ề','ệ','ể','ễ'] => 'e',
    ['í','ì','ị','ỉ','ĩ'] => 'i',
    # 'ỏ' (o with hook above) belongs here; 'ủ' was listed by mistake.
    ['ó','ò','ọ','ỏ','õ','ơ','ớ','ờ','ợ','ở','ỡ','ô','ố','ồ','ộ','ổ','ỗ'] => 'o',
    # 'ủ' (u with hook above) is the Vietnamese letter. 'ů' is not Vietnamese
    # and is kept only so input that used to fold to 'u' still does.
    ['ú','ù','ụ','ủ','ů','ũ','ư','ứ','ừ','ự','ử','ữ'] => 'u',
    ['ý','ỳ','ỵ','ỷ','ỹ'] => 'y',
  } unless defined?(CHARACTER_MAPPINGS)

  # Builds the analysis chain for +str+:
  # StandardTokenizer -> UnicodeLowerCaseFilter -> MappingFilter.
  #
  # field:: name of the field being analyzed; not used by this analyzer.
  # str::   the text to tokenize.
  #
  # Returns the outermost filter (the MappingFilter).
  def token_stream(field, str)
    ts = StandardTokenizer.new(str)
    ts = UnicodeLowerCaseFilter.new(ts)
    MappingFilter.new(ts, CHARACTER_MAPPINGS)
  end
end

Revision: 3717
at September 5, 2007 19:26 by ctran


Initial Code
# Ferret analyzer for Vietnamese text. Tokens are lower-cased, passed through
# HyphenFilter, and then have their accented Vietnamese letters folded to
# plain ASCII base letters so they can be indexed and searched as ASCII.
class VietnameseAnalyzer < Ferret::Analysis::Analyzer
  include Ferret::Analysis

  # Maps each group of lower-case accented letters to its ASCII base letter.
  # Only lower-case forms are listed; upper-case accented letters are folded
  # only if the lower-casing step has already converted them.
  CHARACTER_MAPPINGS = {
    ['á','à','ạ','ả','ã','ă','ắ','ằ','ặ','ẳ','ẵ','â','ấ','ầ','ậ','ẩ','ẫ'] => 'a',
    # Was the mis-encoded two-character sequence "Ä‘", which never matched.
    ['đ'] => 'd',
    ['é','è','ẹ','ẻ','ẽ','ê','ế','ề','ệ','ể','ễ'] => 'e',
    ['í','ì','ị','ỉ','ĩ'] => 'i',
    # 'ỏ' (o with hook above) belongs here; 'ủ' was listed by mistake.
    ['ó','ò','ọ','ỏ','õ','ơ','ớ','ờ','ợ','ở','ỡ','ô','ố','ồ','ộ','ổ','ỗ'] => 'o',
    # 'ủ' (u with hook above) is the Vietnamese letter. 'ů' is not Vietnamese
    # and is kept only so input that used to fold to 'u' still does.
    ['ú','ù','ụ','ủ','ů','ũ','ư','ứ','ừ','ự','ử','ữ'] => 'u',
    ['ý','ỳ','ỵ','ỷ','ỹ'] => 'y',
  } unless defined?(CHARACTER_MAPPINGS)

  # Builds the analysis chain for +str+:
  # StandardTokenizer -> LowerCaseFilter -> HyphenFilter -> MappingFilter.
  #
  # field:: name of the field being analyzed; not used by this analyzer.
  # str::   the text to tokenize.
  #
  # Returns the outermost filter (the MappingFilter).
  #
  # NOTE(review): whether LowerCaseFilter lower-cases non-ASCII capitals such
  # as 'Đ' or 'Á' appears to depend on Ferret and the process locale -- confirm.
  def token_stream(field, str)
    tokens     = StandardTokenizer.new(str)
    lowercased = LowerCaseFilter.new(tokens)
    dehyphened = HyphenFilter.new(lowercased)
    MappingFilter.new(dehyphened, CHARACTER_MAPPINGS)
  end
end

Initial URL

                                

Initial Description
Convert Vietnamese characters into ASCII so they can be indexed and searched.

Initial Title
VietnameseAnalyzer.rb

Initial Tags

                                

Initial Language
Ruby