Posted By

haruo666 on 09/02/11


Tagged


Versions (?)

mecab sample


 / Published in: Python
 

  1. #!/usr/bin/python
  2. # -*- coding: utf-8 -*-
  3.  
  4. #%h POS ID 36-47 is noun (%m is the token text itself)
  5. #%s 0: known 1: unknown
  6.  
  7. #tasks
  8. #1 create a set to avoid noun duplication
  9. #2 set rules to avoid url tokenization
  10. #3 sort noun set to prioritize type:1 nouns
  11.  
  12. import sys
  13. import MeCab
  14.  
  15. class MeCabTagger:
  16.  
  17. def __init__(self, text = ""):
  18. self.m = MeCab.Tagger ('-F%m\s%h\s%s\s')
  19. self.nouns = []
  20. print "Creating MeCabTagger instance."
  21. self.parseText(text)
  22.  
  23. def parseText(self, text):
  24. self.posList = []
  25. taggedSpeach = self.m.parse (text)
  26. taggedSpeachList = taggedSpeach.split(" ")
  27. lenOfSpeach = (len(taggedSpeachList) - 1)/3
  28. for i in range(lenOfSpeach):
  29. self.posList.append(taggedSpeachList[3*i:3*(i+1)])
  30. #print " ".join(taggedSpeachList)
  31.  
  32. def showNouns(self):
  33. if len(self.posList) == 0:
  34. print "No text parsed."
  35. return
  36. self.nouns = []
  37. for pos in self.posList:
  38. if int(pos[1]) &gt;=36 and int(pos[1]) self.nouns.append(pos[0])
  39. print " ".join(self.nouns)
  40.  
  41. mecab = MeCabTagger("20代も後半に入ったので、年相応の行動を心掛けよう、というのが最近の私らの口癖です。")
  42. mecab.showNouns()

Report this snippet  

You need to login to post a comment.