Posted By

scrapy on 09/01/12


Tagged

spider scrapy file-upload uberupload-upload-file-mime


Versions (?)

Upload file (uber upload)


 / Published in: Python
 

  1. # This is an example of file uploading using scrapy to a server that uses uberuploader.
  2. # The spider logs in to the page before uploading; some sites don't require a login in order to upload.
  3. # IMPORTANT: You should increase DOWNLOAD_TIMEOUT in settings.py, but at the time this snippet was written that setting wasn't working properly, so I recompiled the whole of Scrapy with the 3-minute default changed.
  4. # Observations about my snippet:
  5. # This may not be the best code; please comment with corrections.
  6. # Could or should this be implemented in a downloader middleware or a pipeline?
  7. # It doesn't show the upload progress.
  8. # The MIME message creation could or should live somewhere else.
  9.  
  class fileUploadSpider(CrawlSpider):
      """Log in to a site and upload one zip file per DataObject via uberuploader.

      Request flow: ``parse`` submits the login form, ``after_login`` requests
      an upload ticket for every DataObject, ``get_id_upload`` extracts the
      upload id from the ticket response and POSTs the multipart body, and
      ``lastcall`` post-processes the page returned after the upload.

      NOTE(review): this snippet relies on names that must be imported
      elsewhere in the module (CrawlSpider, Request, FormRequest,
      HtmlXPathSelector, log, settings, DataObject, re, mimetypes) -- confirm
      they are in scope.
      NOTE(review): CrawlSpider normally reserves ``parse`` for its rule
      handling; no rules are defined here, so overriding it looks harmless,
      but a plain spider base class may be the better fit -- confirm.
      """

      name = "spidertrigger.upload"
      allowed_domains = ["uploadhost.com"]
      start_urls = [
          "http://www.uploadhost.com/url_to_login_page",
      ]

      def parse(self, response):
          """Submit the login form found in the start page."""
          return [FormRequest.from_response(
              response,
              formdata={'user': 'username', 'password': 'secret'},
              callback=self.after_login,
          )]

      def after_login(self, response):
          """Yield one upload-ticket request per DataObject once logged in."""
          # The login page is served again when authentication fails.
          if "Log in to your account" in response.body:
              self.log("Login Failed", level=log.ERROR)
              return

          dataObjects = DataObject.objects.all()  # Django ORM query
          for data in dataObjects:
              # The URL must point to ubr_link_upload.php, which hands out the
              # ticket needed to upload a file. rnd_id is hard-coded here but
              # could be generated in code.
              yield Request(
                  url='http://upload.uploadhost.com/upload/ubr_link_upload.php?rnd_id=1280793046605',
                  callback=self.get_id_upload,
                  meta={'data': data},
              )

      def get_id_upload(self, response):
          """Extract the upload id from the ticket response and POST the file."""
          hxs = HtmlXPathSelector(response)
          data = response.request.meta['data']
          # The zip file is expected to exist already; a missing file is
          # reported below instead of crashing the callback.
          file_name = settings.IMAGES_STORE + '/' + data.path + '.zip'

          # The upload id is the first double-quoted word in the page body.
          bodies = hxs.select('/html/body').extract()
          match = re.search(r'"(\w+)"', bodies[0]) if bodies else None
          if match is None:
              self.log("Upload id not found in ticket response", level=log.ERROR)
              return
          upload_id = match.group(1)

          # Form fields sent alongside the file.
          fields = {
              'title': data.nombre,
              'adpaid': '0',
              'private': 'no',
              'category[]': '1',
              'fontcolor': 'black',
              'helpbox': 'Font size: [size=50%]small text[/size]',
              'textarea': '',
              'fontsize': '',
              'compare': '14936',
          }
          files = {'upfile_0': file_name}
          try:
              headers, body = self.get_mime(fields, files)
          except (IOError, OSError) as err:
              self.log("Cannot read %s: %s" % (file_name, err), level=log.ERROR)
              return

          print('Iniciando Request POST')
          # The URL must point to cgi-bin/ubr_upload.pl with the upload id
          # obtained above.
          yield FormRequest(
              url='http://upload.uploadhost.com/cgi-bin/ubr_upload.pl?upload_id=' + upload_id,
              method='POST',
              body=body,
              meta={'data': data},
              headers=headers,
              callback=self.lastcall,
          )

      def lastcall(self, response):
          """Post-process the upload response: print the id of the uploaded object.

          This is an artificial example; the id is taken as the first run of
          digits in the link found under the ``col2contentright`` div.
          """
          hxs = HtmlXPathSelector(response)
          links = hxs.select('//div[@id=\'col2contentright\']/p/strong/a/@href').extract()
          match = re.search(r'\d+', links[0]) if links else None
          if match is None:
              self.log("Uploaded object id not found in response", level=log.ERROR)
              return
          idUploaded = match.group(0)
          print("Success Uploaded " + idUploaded)

      @staticmethod
      def _to_bytes(text):
          """Return *text* unchanged if it is already bytes, else UTF-8 encode it."""
          return text if isinstance(text, bytes) else text.encode('utf-8')

      def get_mime(self, fields, files):
          """Build a multipart/form-data request body.

          fields -- mapping of form field name to text value.
          files  -- mapping of form field name to the path of the file to send.

          Returns a ``(headers, body)`` tuple: ``headers`` is a dict holding the
          Content-Type header with the boundary, ``body`` is the encoded payload.
          Raises IOError/OSError when one of the files cannot be read.
          """
          BOUNDARY = '----------BOUNDARY_$'
          crlf = b'\r\n'
          delimiter = self._to_bytes('--' + BOUNDARY)

          # Every chunk is bytes, so binary file content is never mixed with
          # text strings while the body is assembled.
          parts = []
          for key, value in fields.items():
              parts.extend([
                  delimiter,
                  self._to_bytes('Content-Disposition: form-data; name="%s"' % key),
                  b'',
                  self._to_bytes(value),
              ])
          for key, filename in files.items():
              with open(filename, 'rb') as handle:
                  payload = handle.read()
              parts.extend([
                  delimiter,
                  # The file name reported to the server is fixed to full.zip.
                  self._to_bytes('Content-Disposition: form-data; name="%s"; filename="%s"' % (key, 'full.zip')),
                  self._to_bytes('Content-Type: %s' % self.get_content_type(filename)),
                  b'',
                  payload,
              ])
          # Closing delimiter followed by an empty line.
          parts.extend([delimiter + b'--', b''])
          body = crlf.join(parts) + crlf

          content_type = {'Content-Type': 'multipart/form-data; boundary=%s' % BOUNDARY}
          return content_type, body

      def get_content_type(self, filename):
          """Guess the MIME type from *filename*, defaulting to application/octet-stream."""
          return mimetypes.guess_type(filename)[0] or 'application/octet-stream'
  112.  
  113. # Snippet imported from snippets.scrapy.org (which no longer works)
  114. # author: llazzaro
  115. # date : Aug 15, 2010
  116.  

Report this snippet  

You need to login to post a comment.