Posted By

shadevampire on 05/29/11


Tagged

parse parser pastebin


Versions (?)

pastebin parser


 / Published in: Python
 

URL: http://www.michielovertoom.com/python/pastebin-abused/

"This is the source code to the program I used to scrape these 'public pastes' from pastebin.com. Use at your own peril!" - from the mentioned link

  1. import BeautifulSoup
  2. import urllib2
  3. import time
  4. import Queue
  5. import threading
  6. import sys
  7. import datetime
  8. import random
  9. import os
  10.  
  11. pastesseen = set()
  12. pastes = Queue.Queue()
  13.  
  14. def downloader():
  15. while True:
  16. paste = pastes.get()
  17. fn = "pastebins/%s-%s.txt" % (paste, datetime.datetime.today().strftime("%Y-%m-%d"))
  18. content = urllib2.urlopen("http://pastebin.com/raw.php?i=" + paste).read()
  19. if "requesting a little bit too much" in content:
  20. print "Throttling... requeuing %s" % paste
  21. pastes.put(paste)
  22. time.sleep(0.1)
  23. else:
  24. f = open(fn, "wt")
  25. f.write(content)
  26. f.close()
  27. delay = 1.1 # random.uniform(1, 3)
  28. sys.stdout.write("Downloaded %s, waiting %f sec\n" % (paste, delay))
  29. time.sleep(delay)
  30. pastes.task_done()
  31.  
  32. def scraper():
  33. scrapecount = 0
  34. while scrapecount < 10:
  35. html = urllib2.urlopen("http://www.pastebin.com").read()
  36. soup = BeautifulSoup.BeautifulSoup(html)
  37. ul = soup.find("ul", "right_menu")
  38. for li in ul.findAll("li"):
  39. href = li.a["href"]
  40. if href in pastesseen:
  41. sys.stdout.write("%s already seen\n" % href)
  42. else:
  43. href = href[1:] # chop off leading /
  44. pastes.put(href)
  45. pastesseen.add(href)
  46. sys.stdout.write("%s queued for download\n" % href)
  47. delay = 12 # random.uniform(6,10)
  48. time.sleep(delay)
  49. scrapecount += 1
  50.  
  51. num_workers = 1
  52. for i in range(num_workers):
  53. t = threading.Thread(target=downloader)
  54. t.setDaemon(True)
  55. t.start()
  56.  
  57. if not os.path.exists("pastebins"):
  58. os.mkdir("pastebins") # Thanks, threecheese!
  59.  
  60. s = threading.Thread(target=scraper)
  61. s.start()
  62. s.join()

Report this snippet  

You need to log in to post a comment.