Published in: Ruby
Convert Vietnamese characters into ASCII so they can be indexed and searched.
Expand |
Embed | Plain Text
Copy this code and paste it in your HTML
require 'unicode'

# Token filter that normalizes token text to lower case.
#
# Wraps another token stream and lower-cases the text of every token it
# yields via `Unicode.downcase` (from the `unicode` gem).
class UnicodeLowerCaseFilter
  # token_stream - the upstream token stream whose tokens are lower-cased.
  def initialize(token_stream)
    @input = token_stream
  end

  # Forwards new input text to the wrapped token stream.
  def text=(text)
    @input.text = text
  end

  # Returns the next token from the wrapped stream with its text
  # lower-cased, or nil once the wrapped stream returns nil.
  def next
    token = @input.next
    return nil if token.nil?

    token.text = Unicode.downcase(token.text)
    token
  end
end

# Ferret analyzer that converts Vietnamese characters into ASCII so they can
# be indexed and searched.
class VietnameseAnalyzer < Ferret::Analysis::Analyzer
  include Ferret::Analysis

  # Standard character mappings to remove all special characters so only
  # default ASCII characters get indexed. Each key lists the accented
  # lower-case forms of the base letter given as the value; only lower-case
  # forms are needed because token_stream lower-cases before mapping.
  #
  # NOTE(review): these keys were restored from a mis-encoded copy (UTF-8
  # bytes shown as Windows-1252); truncated entries were rebuilt from the
  # tone-mark order of each row. "ỏ" now sits in the 'o' row and "ủ" in the
  # 'u' row (the damaged copy had "ủ" under 'o'). "ů" is not a Vietnamese
  # letter but is kept so input that relied on it still maps to 'u'.
  CHARACTER_MAPPINGS = {
    ['á', 'à', 'ạ', 'ả', 'ã', 'ă', 'ắ', 'ằ', 'ặ', 'ẳ', 'ẵ', 'â', 'ấ', 'ầ', 'ậ', 'ẩ', 'ẫ'] => 'a',
    ['đ'] => 'd',
    ['é', 'è', 'ẹ', 'ẻ', 'ẽ', 'ê', 'ế', 'ề', 'ệ', 'ể', 'ễ'] => 'e',
    ['í', 'ì', 'ị', 'ỉ', 'ĩ'] => 'i',
    ['ó', 'ò', 'ọ', 'ỏ', 'õ', 'ơ', 'ớ', 'ờ', 'ợ', 'ở', 'ỡ', 'ô', 'ố', 'ồ', 'ộ', 'ổ', 'ỗ'] => 'o',
    ['ú', 'ù', 'ụ', 'ủ', 'ů', 'ũ', 'ư', 'ứ', 'ừ', 'ự', 'ử', 'ữ'] => 'u',
    ['ý', 'ỳ', 'ỵ', 'ỷ', 'ỹ'] => 'y',
  } unless defined?(CHARACTER_MAPPINGS)

  # Builds the token stream for a field's text: tokenize, lower-case, then
  # apply CHARACTER_MAPPINGS.
  #
  # field - the field name; not used by this analyzer.
  # str   - the text to analyze.
  #
  # Returns the MappingFilter at the end of the filter chain.
  def token_stream(field, str)
    tokens = StandardTokenizer.new(str)
    lowercased = UnicodeLowerCaseFilter.new(tokens)
    MappingFilter.new(lowercased, CHARACTER_MAPPINGS)
  end
end
Comments
Subscribe to comments
