Posted By

richyeung on 05/31/13


Tagged

python scraping crawling scrapy


Versions (?)

Pandora for Food – Crawl Yelp for personalized recommendations


 / Published in: Python
 

Web crawling Yelp for personalized food recommendations

  1. # This is just a fun little script that acts like a Pandora for food. Its
  2. # implementation is simplistic. You choose a set of restaurants on Yelp that you
  3. # like, and the script finds all reviewers that gave these restaurants 5 stars. You
  4. # trust these reviewers because they share your awesome taste in food. The script
  5. # then spits out all restaurants that these "trusted reviewers" also reviewed, and
  6. # their rating for each review.
  7.  
  8. # You would need a few additional lines of code to turn the scrapy output into a
  9. # sorted list of restaurants. For example, the code below will sort restaurants by
  10. # number of 5-star reviews from "trusted reviewers":
  11. # import pandas
  12. # reviews = pandas.read_csv('scrapy_output.csv')
  13. # fiveStarReviews = reviews[reviews['rating']==5]
  14. # fiveStarReviews.restaurant.value_counts()
  15.  
  16. # There are countless ways you can improve on this. One obvious one is you would
  17. # want to normalize by total restaurant reviews. You would probably also want to
  18. # pull in restaurant category information.
  19.  
  20. # Happy food hunting!
  21.  
  22. from scrapy.spider import BaseSpider
  23. from scrapy.selector import HtmlXPathSelector
  24. from scrapy.http import Request
  25. import re
  26.  
  27. from pandoraFood.items import Review
  28.  
  29. # url string components for reviewer pages
  30. URL_BASE = 'http://www.yelp.com/user_details_reviews_self?userid='
  31. FILTER_SETTINGS = '&review_filter=category&category_filter=restaurants'
  32.  
  33. # yelp unique url endings for each restaurant
  34. RESTAURANTS = ['z-and-y-restaurant-san-francisco', \
  35. 'koi-palace-daly-city', \
  36. 'ino-sushi-san-francisco', \
  37. 'blackwood-san-francisco-3']
  38.  
  39. def createRestaurantPageLinks(self, response):
  40. reviewsPerPage = 40
  41. hxs = HtmlXPathSelector(response)
  42. totalReviews = int(hxs.select('//h2[@id="total_reviews"]/text()').extract()[0].strip().split(' ')[0])
  43. pages = [Request(url=response.url + '?start=' + str(reviewsPerPage*(n+1)), \
  44. callback=self.parse) \
  45. for n in range(totalReviews/reviewsPerPage)]
  46. return pages
  47.  
  48. def createReviewerPageLinks(self, response):
  49. reviewsPerPage = 10
  50. hxs = HtmlXPathSelector(response)
  51. totalReviews = int(hxs.select('//div[@id="review_lister_header"]/em/text()').extract()[0].split(' ')[0])
  52. pages = [Request(url=response.url + '&rec_pagestart=' + str(reviewsPerPage*(n+1)), \
  53. callback=self.parseReviewer) \
  54. for n in range(totalReviews/reviewsPerPage)]
  55. return pages
  56.  
  57. class RestaurantSpider(BaseSpider):
  58. name = 'crawlRestaurants'
  59. allowed_domains = ['yelp.com']
  60. start_urls = [ 'http://www.yelp.com/biz/%s' % s for s in RESTAURANTS]
  61.  
  62. # default parse used for the landing page for each start_url
  63. def parse(self, response):
  64. requests = []
  65.  
  66. # extract all reviews from the page and return a list of requests for the 5 star reviewers' profiles
  67. hxs = HtmlXPathSelector(response)
  68. userIDs = [userUrl.split('?userid=')[1] for \
  69. userUrl in hxs.select('//li[@class="user-name"]/a/@href').extract()]
  70. ratings = hxs.select('//div[@id="reviews-other"]//meta[@itemprop="ratingValue"]/@content').extract()
  71.  
  72. for i in range(len(ratings)):
  73. if float(ratings[i]) == 5:
  74. requests.append(Request(url=URL_BASE + userIDs[i] + FILTER_SETTINGS, \
  75. callback=self.parseReviewer))
  76.  
  77. # request additional pages if we are on page 1 of the restaurant
  78. if response.url.find('?start=') == -1:
  79. requests += createRestaurantPageLinks(self, response)
  80.  
  81. return requests
  82.  
  83. # parse a given reviewer
  84. def parseReviewer(self, response):
  85. hxs = HtmlXPathSelector(response)
  86. restaurantUrls = hxs.select('//div[@class="review clearfix"]/ \
  87. div[@class="biz_info"]/h4/a/@href').extract()
  88. restaurants = [re.search(r'(?<=/biz/)[^#]*', rest).group() for rest in restaurantUrls]
  89. reviewerName = hxs.select('//title/text()').extract()[0].split('|')[0].replace('\'s Profile','').strip()
  90. reviewerUserID = re.search(r'(?<=userid=)[^&]*', response.url).group()
  91. ratingText = hxs.select('//div[@class="rating"]/i/@title').extract()
  92. ratings = [s.replace(' star rating','') for s in ratingText]
  93.  
  94. reviews = []
  95. for i in range(len(restaurants)):
  96. review = Review()
  97. review['restaurant'] = restaurants[i]
  98. review['reviewerName'] = reviewerName
  99. review['reviewerUserID'] = reviewerUserID
  100. review['rating'] = float(ratings[i])
  101. reviews.append(review)
  102.  
  103. # request additional pages if we are on page 1 of the reviewer
  104. additionalPages = []
  105. if response.url.find('&rec_pagestart=') == -1:
  106. additionalPages = createReviewerPageLinks(self, response)
  107.  
  108. return reviews + additionalPages

Report this snippet  

Comments

RSS Icon Subscribe to comments
Posted By: RedHog on August 7, 2013

from pandoraFood.items import Review

error - no items.py file =(

Posted By: RedHog on August 7, 2013

show please example of this file

You need to login to post a comment.