Posted By

eiger_824 on 12/09/15


Tagged

asdfasdfasdfasdf


Versions (?)

Lab3Python


 / Published in: Python
 

URL: asdfdfsdfasd

fsdfasdfasdfa

  1. # -*- coding: utf-8 -*-
  2. """
  3. Created on Tue Nov 03 09:56:05 2015
  4.  
  5. @author: emewa519
  6. """
  7. import nltk
  8. from bs4 import BeautifulSoup #Better Call Scrapy!
  9. import re
  10. #Assignment 1)
  11. from urllib import *
  12. a = urlopen('http://www.liu.se/forskning/reportage/han-ska-losa-spinnandets-gata?l=en')
  13. html = a.read()
  14. soup = BeautifulSoup(html)
  15. texts = soup.findAll(text=True)
  16. #Up to now, this was the COLLECTION step on the text mining
  17. def visible(element):
  18. if element.parent.name in ['style', 'script', '[document]', 'head', 'title']:
  19. return False
  20. '''elif re.match('<!--.*-->', str(element)):
  21. return False'''
  22. return True
  23.  
  24. visible_texts = filter(visible, texts)
  25. #The UNFORMATTING was just applied to the mined text
  26.  
  27. i1=visible_texts.index('Solving the mystery of purring')
  28. j1=visible_texts.index('Photos:')
  29. j1 = j1-2
  30.  
  31. extracted_text = visible_texts[i1-1:j1-1] #Our text in list format
  32. merged_extText = "".join(extracted_text) #Our text in string format
  33.  
  34. re.sub(r"\s+"," ",merged_extText) #FURTHER UNFORMATTING: matches white space, set of \t\n\r\f\v
  35.  
  36. def mapWords2Position(string):
  37. cnt=0
  38. for i in string.split():
  39. print i.__add__(" : ".__add__(str(cnt)))
  40. cnt=cnt+1
  41.  
  42. mapWords2Position(merged_extText)
  43.  
  44. #Assignment 2)
  45. word_list = merged_extText.split() #Approach 1: split text into smaller units(split at blank space)
  46. tokenized_text = nltk.word_tokenize(merged_extText)#Approach 2: split text into smaller units - words (more accurate)
  47. tokenized_norm = [word.lower() for word in tokenized_text] #Normalization of words in the text
  48. #Assignment 3)
  49. pos_taggedText = nltk.pos_tag(tokenized_text) #Part-of-speech tagger from NLTK. Results on report
  50. #nltk.help.upenn_tagset('CC') #Help & info for CC tag (same for rest of tags)
  51.  
  52. #Assignment 4)
  53. named_entities = nltk.ne_chunk(pos_taggedText, binary=True)
  54. #Comments on the report

Report this snippet  

You need to login to post a comment.