数値文字参照から文字へ変換

Revision: 10974

at January 20, 2009 08:00 by tamuratetsuya

Initial Code

#!-*- coding:utf-8 -*-
import htmlentitydefs
import re

# Convert entity references and character references back to ordinary characters
def htmlentity2unicode(text):
    """Replace HTML entity and numeric character references with characters.

    Recognised forms are named entities (``&amp;``), decimal references
    (``&#25991;``) and hexadecimal references (``&#x53C2;``).  The ``x``
    marker and the hex digits are matched case-insensitively; entity names
    are looked up exactly as written in ``htmlentitydefs.name2codepoint``.

    A reference that cannot be decoded (an unknown entity name, or a code
    point that ``unichr`` rejects) is copied to the output unchanged
    instead of being dropped.

    Args:
        text: String to scan for references.

    Returns:
        A unicode string with every decodable reference replaced.
    """
    # Hex and decimal forms are separate alternatives so that "&#12ab;" or
    # "&#abc;" is not treated as a reference at all.  Entity names may
    # contain digits after the first letter (e.g. "sup2", "frac12").
    reference_regex = re.compile(
        u'&(#x[0-9a-f]+|#[0-9]+|[a-z][a-z0-9]*);', re.IGNORECASE)

    parts = []
    pos = 0
    for match in reference_regex.finditer(text):
        # Copy the literal text between the previous reference and this one.
        parts.append(text[pos:match.start()])
        pos = match.end()
        name = match.group(1)
        try:
            if not name.startswith(u'#'):
                # Named entity reference.
                codepoint = htmlentitydefs.name2codepoint[name]
            elif name[1] in u'xX':
                # Hexadecimal character reference.
                codepoint = int(name[2:], 16)
            else:
                # Decimal character reference.
                codepoint = int(name[1:])
            parts.append(unichr(codepoint))
        except (KeyError, ValueError, OverflowError):
            # Unknown entity name or code point outside the range unichr
            # accepts: keep the original text rather than losing it.
            parts.append(match.group(0))
    parts.append(text[pos:])

    # Joining on a unicode separator keeps the result type unicode.
    return u''.join(parts)

# Test code
text = u"&#25991;&#23383;&#x53C2;&#x7167; &amp; &#x5B9F;&#x4F53;&#21442;&#29031; ã‚’é€šå¸¸ã®æ–‡å—ã«æˆ»ã—ã¾ã™ã€‚";
print htmlentity2unicode(text)

Initial URL

Initial Description

Initial Title

数値文字参照から文字へ変換

Initial Tags

python

Initial Language

Python

Choose a language for easy browsing: