Posted By

scrapy on 09/01/12


Tagged

javascript rendered scrapy webdrivers


Versions (?)

rendered javascript with webdrivers


 / Published in: Python
 

  # This is a piece of code that uses webdrivers to load & render a page with Scrapy and Selenium.
  #
  # This work is based on the snippets [wynbennett](http://snippets.scrapy.org/users/wynbennett/) [posted here](http://snippets.scrapy.org/snippets/21/) some time ago

  from scrapy.contrib.spiders import CrawlSpider, Rule
  from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
  from scrapy.selector import HtmlXPathSelector
  from scrapy.http import Request
  from myItem.items import myItem
  from selenium import webdriver
  from selenium.webdriver.firefox.firefox_profile import FirefoxProfile

  import time
  import pprint


  class WebDriverSpider(CrawlSpider):
      """Crawl spider that also loads each matched page in Firefox via Selenium.

      Links matching ``rules`` are handed to ``parse_page``, which reads the
      title from the regular Scrapy response and then opens the same URL in a
      Selenium-driven Firefox so that content created by JavaScript can be
      queried through the webdriver.
      """

      name = "WebDriverSpider"
      start_urls = ["http://yourDomain.com/yourUrl.html"]

      rules = (
          Rule(
              # Raw string so the backslash reaches the regex engine untouched.
              SgmlLinkExtractor(allow=(r'\.html', ), allow_domains=('yourDomain.com', )),
              callback='parse_page',
              follow=False,
          ),
      )

      # Seconds to pause after the browser loads a page, giving its
      # JavaScript time to run. Override on a subclass or instance to tune it.
      render_wait = 2.5

      def __init__(self, *args, **kwargs):
          """Initialise the spider and start the Firefox webdriver.

          Positional and keyword arguments are forwarded unchanged to
          ``CrawlSpider.__init__``.
          """
          CrawlSpider.__init__(self, *args, **kwargs)
          self.verificationErrors = []
          # Use a pre-built Firefox profile (e.g. one with specific add-ons
          # installed) and start Firefox with it.
          profile = FirefoxProfile(profile_directory="/home/yourUser/.mozilla/firefox/selenium/")
          self.selenium = webdriver.Firefox(profile)

      def __del__(self):
          """Quit the browser and print any collected verification errors."""
          # __init__ may have failed before these attributes were assigned,
          # so read them defensively instead of raising inside a finalizer.
          driver = getattr(self, 'selenium', None)
          if driver is not None:
              driver.quit()
          # Parenthesised single-argument form prints the same on Python 2 and 3.
          print(getattr(self, 'verificationErrors', []))
          # Only chain to the base class when it actually defines a finalizer;
          # calling a missing __del__ would raise AttributeError.
          parent_del = getattr(CrawlSpider, '__del__', None)
          if parent_del is not None:
              parent_del(self)

      def parse_page(self, response):
          """Build a ``myItem`` from the Scrapy response and the rendered page.

          :param response: the Scrapy response for a page matched by ``rules``.
          :returns: a ``myItem`` with ``url``, ``title`` and ``thatDiv`` set.
          """
          # Normal Scrapy result (static HTML as downloaded).
          hxs = HtmlXPathSelector(response)

          # Webdriver-rendered page: load the same URL in the real browser.
          browser = self.selenium
          browser.get(response.url)

          # Wait for JavaScript to finish in Selenium. time.sleep blocks the
          # calling thread for the whole interval.
          time.sleep(self.render_wait)

          # Do some crawling of JavaScript-created content with Selenium.
          item = myItem()
          item['url'] = response.url
          item['title'] = hxs.select('//title/text()').extract()

          # Something you can do only with webdrivers.
          # NOTE(review): this stores the WebElement object itself, not its
          # text or HTML -- confirm that is what downstream code expects.
          item['thatDiv'] = browser.find_element_by_id("thatDiv")

          # Hand the populated item back to Scrapy; without this the callback
          # produces nothing.
          return item

  # Snippet imported from snippets.scrapy.org (which no longer works)
  # author: rollsappletree
  # date  : Aug 25, 2011

Report this snippet  

You need to login to post a comment.