/ Published in: Python
Expand |
Embed | Plain Text
#!/usr/bin/python
# -*- coding: utf-8 -*-

# Fields requested from MeCab through the -F output format (see Tagger below):
#   %m  surface form of the token
#   %h  POS ID; IDs 36-47 are treated as nouns
#   %s  node type; 0: known word, 1: unknown word
#
# Tasks:
#   1. create a set to avoid noun duplication
#   2. set rules to avoid url tokenization
#   3. sort noun set to prioritize type:1 nouns

import sys

import MeCab


class MeCabTagger:
    """Thin wrapper around ``MeCab.Tagger`` that collects nouns from a text.

    Attributes:
        m: the underlying ``MeCab.Tagger`` configured with a custom format.
        posList: one ``[surface, pos_id, node_type]`` list of strings per
            token from the most recent ``parseText`` call.
        nouns: surfaces selected by the most recent ``showNouns`` call.
    """

    # Inclusive POS-ID range regarded as "noun".  The upper bound was lost in
    # the published snippet and is restored from the header comment
    # ("POS ID 36-47 is noun") -- NOTE(review): confirm against the dictionary
    # in use, since POS IDs are dictionary-specific.
    NOUN_POS_ID_MIN = 36
    NOUN_POS_ID_MAX = 47

    def __init__(self, text=""):
        """Create the tagger and immediately parse ``text``."""
        # Raw string: the backslashes must reach MeCab untouched, where "\s"
        # is MeCab's own escape for a space in the output format.
        self.m = MeCab.Tagger(r'-F%m\s%h\s%s\s')
        self.nouns = []
        print("Creating MeCabTagger instance.")
        self.parseText(text)

    def parseText(self, text):
        """Tokenize ``text`` and store the per-token fields in ``posList``."""
        self.posList = []
        tagged = self.m.parse(text)
        fields = tagged.split(" ")
        # Every token contributes three space-terminated fields; the single
        # leftover element is the end-of-sentence marker, hence the "- 1".
        # Floor division keeps the count an int on both Python 2 and 3.
        token_count = (len(fields) - 1) // 3
        self.posList = [fields[3 * i:3 * (i + 1)] for i in range(token_count)]

    def showNouns(self):
        """Print the noun surfaces of the parsed text, space-separated.

        Prints "No text parsed." and leaves ``nouns`` untouched when the last
        parse produced no tokens.
        """
        if not self.posList:
            print("No text parsed.")
            return
        self.nouns = [
            pos[0]
            for pos in self.posList
            if self.NOUN_POS_ID_MIN <= int(pos[1]) <= self.NOUN_POS_ID_MAX
        ]
        print(" ".join(self.nouns))


mecab = MeCabTagger("20代も後半に入ったので、年相応の行動を心掛けよう、というのが最近の私らの口癖です。")
mecab.showNouns()
You need to login to post a comment.
