/ Published in: Ruby
Reads paths from pages.txt (one per line; each is appended to the site's base URL), fetches every page and checks it for its title and h1 tag, then writes the results to output.csv (path, title, headline, http_status).
Expand |
Embed | Plain Text
require 'rubygems'
require 'open-uri'
require 'scrapi'
require 'cgi'
require 'csv'

# Base URL that every entry of pages.txt is appended to.
BASE_URL = 'http://www.bikeshd.co.uk/'

# Scraper that pulls the <h1> text and the <title> text out of a page.
# Built once here instead of on every checkPage call.
PAGE_SCRAPER = Scraper.define do
  array :items
  process "html", :items => Scraper.define {
    process "h1", :headline => :text
    process "title", :title => :text
    result :headline, :title
  }
  result :items
end

# Functions

# Fetches BASE_URL + page once and extracts its title and first-level headline.
#
# @param page [String] path relative to BASE_URL, without surrounding whitespace
# @return [Array<Hash>] a one-element array wrapping a hash with the string
#   keys "path" (the full URL), "title", "headline" and "status". On an HTTP
#   error response "title" and "headline" are empty strings and "status" is
#   the code reported by the server.
# @raise any non-HTTP failure (invalid URI, DNS, refused connection, ...) is
#   not rescued here and propagates to the caller.
def self.checkPage(page)
  url = "#{BASE_URL}#{page}"
  status = nil
  html = nil

  # URI#open (mixed in by open-uri) works on every open-uri version, whereas
  # Kernel#open no longer accepts URLs since Ruby 3.0. The block form closes
  # the response, and the page is downloaded only once.
  URI.parse(url).open do |response|
    status = response.status[0]
    html = response.read
  end

  # Guard against the scraper returning nothing for the document.
  entry = (PAGE_SCRAPER.scrape(html) || []).first

  [{
    "path" => url,
    "title" => entry ? entry.title : '',
    "headline" => entry ? entry.headline : '',
    "status" => status
  }]
rescue OpenURI::HTTPError => the_error
  # 4xx/5xx answer: keep the row, record the status code the server sent.
  [{ "path" => url, "title" => '', "headline" => '', "status" => the_error.io.status[0] }]
end

# Execution
results = []
checked = 0

puts "GETTING PAGES -----------------------------"
File.foreach("pages.txt").with_index do |line, i|
  # Drop the trailing newline (it would otherwise end up inside the URL)
  # and ignore blank lines.
  page = line.strip
  next if page.empty?

  puts "#{i} >>> #{page}"
  results << checkPage(page).first

  # Be polite to the server: pause after every 50 requests.
  checked += 1
  sleep 3 if (checked % 50).zero?
end

puts "WRITING CSV -------------------------------"
CSV.open("output.csv", "wb") do |csv|
  results.each do |row|
    csv << row.values_at("path", "title", "headline", "status")
  end
end
You need to login to post a comment.
