/ Published in: Python
Expand |
Embed | Plain Text
Copy this code and paste it in your HTML
# -*- coding: utf-8 -*-
"""Convert HTML entity and numeric character references to plain text."""
import re

try:  # Python 2 module name
    import htmlentitydefs
except ImportError:  # Python 3 moved the table to html.entities
    import html.entities as htmlentitydefs

try:  # Python 2 needs unichr() to build a unicode character
    _unichr = unichr
except NameError:  # Python 3: chr() already returns a text character
    _unichr = chr

# One reference: "&#x<hex>;", "&#<decimal>;" or "&<name>;".
# The three forms are spelled out separately so that a malformed token such
# as "&#12ab;" is not treated as a reference at all, and names may contain
# digits after the first letter (e.g. "sup2", "frac12").
_REFERENCE_RE = re.compile(
    u'&(#x[0-9a-f]+|#[0-9]+|[a-z][a-z0-9]*);', re.IGNORECASE)


def _decode_reference(match):
    """Return the character for one matched reference, or its source text.

    The matched text is returned unchanged when the name is not a key of
    ``htmlentitydefs.name2codepoint`` or when the code point cannot be
    turned into a character, so input is never silently dropped.
    """
    name = match.group(1)
    try:
        if name[0] != u'#':
            # Named entity; the table lookup is case-sensitive.
            return _unichr(htmlentitydefs.name2codepoint[name])
        if name[1] in u'xX':
            # Hexadecimal character reference: skip the leading "#x".
            return _unichr(int(name[2:], 16))
        # Decimal character reference: skip the leading "#".
        return _unichr(int(name[1:]))
    except (KeyError, ValueError, OverflowError):
        # Unknown name, or a code point chr()/unichr() rejects.
        return match.group(0)


def htmlentity2unicode(text):
    """Replace entity and character references in *text* with characters.

    Handles named entities listed in ``htmlentitydefs.name2codepoint``
    (``&amp;``), decimal references (``&#38;``) and hexadecimal references
    (``&#x26;``). Each reference must end with ``;``. The text is scanned
    once, so ``&amp;lt;`` becomes ``&lt;`` rather than ``<``.

    Args:
        text: The text to decode.

    Returns:
        The decoded text. References that cannot be resolved are left
        exactly as they appeared in *text*.
    """
    pieces = []
    pos = 0
    for match in _REFERENCE_RE.finditer(text):
        # Copy the literal run before the reference, then the decoded form.
        pieces.append(text[pos:match.start()])
        pieces.append(_decode_reference(match))
        pos = match.end()
    pieces.append(text[pos:])
    # Joining on a unicode separator keeps the result a unicode string.
    return u''.join(pieces)


# Demo
# NOTE(review): this sample literal looks mis-encoded and contains no actual
# references; it is kept unchanged here - confirm against the original.
text = u"文字参照 & 実体参照 を通常ã®æ–‡å—ã«æˆ»ã—ã¾ã™ã€‚"
print(htmlentity2unicode(text))