Revision: 3718
Updated Code
at September 5, 2007 21:09 by ctran
Updated Code
require 'unicode'
# Token filter that lower-cases the text of every token it passes along,
# using Unicode.downcase.
#
# It wraps another token stream: +text=+ is forwarded to that stream and
# +next+ pulls its tokens from it.
class UnicodeLowerCaseFilter
  # token_stream:: the upstream stream whose tokens will be lower-cased.
  def initialize(token_stream)
    @input = token_stream
  end

  # Hands a new text over to the wrapped stream.
  def text=(text)
    @input.text = text
  end

  # Returns the next upstream token with its text lower-cased in place,
  # or nil once the wrapped stream returns nil.
  def next
    token = @input.next
    return nil if token.nil?

    token.text = Unicode.downcase(token.text)
    token
  end
end
# Ferret analyzer that folds Vietnamese letters to plain ASCII so the text
# can be indexed and searched without diacritics.
class VietnameseAnalyzer < Ferret::Analysis::Analyzer
  include Ferret::Analysis

  # Maps each lower-case Vietnamese letter carrying a diacritic to its bare
  # ASCII base letter. Only lower-case forms are listed because token_stream
  # lower-cases every token before this mapping is applied.
  #
  # The table used to contain mojibake (UTF-8 bytes decoded as a single-byte
  # code page), which left several keys truncated to the same garbage string
  # and placed the bytes of "ủ" in the 'o' list. The letters below are the
  # restored characters; "ủ" now maps to 'u' and "ỏ" to 'o'.
  CHARACTER_MAPPINGS = {
    %w[á à ạ ả ã ă ắ ằ ặ ẳ ẵ â ấ ầ ậ ẩ ẫ] => 'a',
    %w[đ] => 'd',
    %w[é è ẹ ẻ ẽ ê ế ề ệ ể ễ] => 'e',
    %w[í ì ị ỉ ĩ] => 'i',
    %w[ó ò ọ ỏ õ ơ ớ ờ ợ ở ỡ ô ố ồ ộ ổ ỗ] => 'o',
    # "ů" is not a Vietnamese letter; it is kept only because the original
    # table already mapped it to 'u'.
    %w[ú ù ụ ủ ũ ư ứ ừ ự ử ữ ů] => 'u',
    %w[ý ỳ ỵ ỷ ỹ] => 'y'
  }.freeze unless defined?(CHARACTER_MAPPINGS)

  # Builds the token pipeline for +str+: StandardTokenizer, then
  # UnicodeLowerCaseFilter, then MappingFilter with CHARACTER_MAPPINGS.
  #
  # field:: accepted but not used here.
  # str::   the text to tokenize.
  #
  # Returns the MappingFilter at the end of the chain.
  def token_stream(field, str)
    tokens = StandardTokenizer.new(str)
    lowered = UnicodeLowerCaseFilter.new(tokens)
    MappingFilter.new(lowered, CHARACTER_MAPPINGS)
  end
end
Revision: 3717
Initial Code
Initial URL
Initial Description
Initial Title
Initial Tags
Initial Language
at September 5, 2007 19:26 by ctran
Initial Code
# Ferret analyzer that folds Vietnamese letters to plain ASCII so the text
# can be indexed and searched without diacritics.
class VietnameseAnalyzer < Ferret::Analysis::Analyzer
  include Ferret::Analysis

  # Maps each lower-case Vietnamese letter carrying a diacritic to its bare
  # ASCII base letter. Only lower-case forms are listed because token_stream
  # runs a LowerCaseFilter before this mapping is applied.
  #
  # The table used to contain mojibake (UTF-8 bytes decoded as a single-byte
  # code page), which left several keys truncated to the same garbage string
  # and placed the bytes of "ủ" in the 'o' list. The letters below are the
  # restored characters; "ủ" now maps to 'u' and "ỏ" to 'o'.
  CHARACTER_MAPPINGS = {
    %w[á à ạ ả ã ă ắ ằ ặ ẳ ẵ â ấ ầ ậ ẩ ẫ] => 'a',
    %w[đ] => 'd',
    %w[é è ẹ ẻ ẽ ê ế ề ệ ể ễ] => 'e',
    %w[í ì ị ỉ ĩ] => 'i',
    %w[ó ò ọ ỏ õ ơ ớ ờ ợ ở ỡ ô ố ồ ộ ổ ỗ] => 'o',
    # "ů" is not a Vietnamese letter; it is kept only because the original
    # table already mapped it to 'u'.
    %w[ú ù ụ ủ ũ ư ứ ừ ự ử ữ ů] => 'u',
    %w[ý ỳ ỵ ỷ ỹ] => 'y'
  }.freeze unless defined?(CHARACTER_MAPPINGS)

  # Builds the token pipeline for +str+: StandardTokenizer, then
  # LowerCaseFilter, then HyphenFilter, then MappingFilter with
  # CHARACTER_MAPPINGS.
  #
  # field:: accepted but not used here.
  # str::   the text to tokenize.
  #
  # Returns the MappingFilter at the end of the chain.
  # NOTE(review): whether LowerCaseFilter lower-cases non-ASCII capitals
  # (e.g. "Đ") is not visible from this code - confirm against Ferret.
  def token_stream(field, str)
    tokens = StandardTokenizer.new(str)
    lowered = LowerCaseFilter.new(tokens)
    dehyphenated = HyphenFilter.new(lowered)
    MappingFilter.new(dehyphenated, CHARACTER_MAPPINGS)
  end
end
Initial URL
Initial Description
Convert Vietnamese characters into ASCII so they can be indexed and searched.
Initial Title
VietnameseAnalyzer.rb
Initial Tags
Initial Language
Ruby