Revision: 17728
Updated Code
at May 8, 2010 11:12 by manatlan
Updated Code
import urllib
import urllib2
from contextlib import closing
from gzip import GzipFile
from StringIO import StringIO


def getContent(url, data=None, max_bytes=1000000):
    """Fetch *url* with browser-like headers and return the body, gunzipped.

    Targets Python 2.6+ (``urllib2``).

    url       -- address to request.
    data      -- optional dict of posted vars; when given (and non-empty) it
                 is url-encoded and the request is sent as a POST, otherwise
                 a GET is issued.
    max_bytes -- upper bound on the number of bytes read from the response,
                 and again on the number of bytes produced when the payload
                 is gunzipped.

    Returns the (possibly truncated) body as a byte string.  On
    ``urllib2.HTTPError`` / ``urllib2.URLError`` a message is printed and
    ``None`` is returned.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (X11; U; Linux i686; fr-FR; rv:1.7.5) Gecko/20041108 Firefox/1.0",
        "Accept": "text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5",
        "Accept-Language": "fr,fr-fr;q=0.8,en-us;q=0.5,en;q=0.3",
        "Accept-Charset": "utf-8;q=0.7,*;q=0.7",
        "Keep-Alive": "300",
        "Proxy-Connection": "keep-alive",
        "Cookie": "",
        # The HTTP request header is spelled "Referer"; a "http-referer"
        # key is not a header that servers look at.
        "Referer": "http://www.google.com/",
    }

    if data:
        data = urllib.urlencode(data)
    request = urllib2.Request(url, data, headers)

    try:
        # closing() guarantees the connection is released even if read()
        # raises.
        with closing(urllib2.urlopen(request)) as response:
            html = response.read(max_bytes)
    except urllib2.HTTPError as exc:
        print("HTTP error %d : %s" % (exc.code, exc.msg))
        return None
    except urllib2.URLError as exc:
        print("URL Error : %s" % exc.reason)
        return None

    # Only attempt decompression when the payload starts with the gzip
    # magic number; anything else is returned untouched.
    if html[:2] == b"\x1f\x8b":
        try:
            html = GzipFile(fileobj=StringIO(html), mode="rb").read(max_bytes)
        except Exception:
            # Best effort: the data merely looked like gzip, or the stream
            # was cut short by the read cap.  Keep the raw bytes rather
            # than failing the whole fetch.
            pass
    return html
Revision: 17727
Updated Code
at September 12, 2009 18:17 by manatlan
Updated Code
import urllib
import urllib2
from contextlib import closing


def getContent(url, data=None, max_bytes=500000):
    """Fetch *url* with browser-like headers and return the response body.

    Targets Python 2.6+ (``urllib2``).

    url       -- address to request.
    data      -- optional dict of posted vars; when given (and non-empty) it
                 is url-encoded and the request is sent as a POST, otherwise
                 a GET is issued.
    max_bytes -- upper bound on the number of bytes read from the response.

    Returns the (possibly truncated) body as a byte string.  On
    ``urllib2.HTTPError`` / ``urllib2.URLError`` a message is printed and
    ``None`` is returned.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (X11; U; Linux i686; fr-FR; rv:1.7.5) Gecko/20041108 Firefox/1.0",
        "Accept": "text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5",
        "Accept-Language": "fr,fr-fr;q=0.8,en-us;q=0.5,en;q=0.3",
        "Accept-Charset": "utf-8;q=0.7,*;q=0.7",
        "Keep-Alive": "300",
        "Proxy-Connection": "keep-alive",
        "Cookie": "",
        # The HTTP request header is spelled "Referer"; a "http-referer"
        # key is not a header that servers look at.
        "Referer": "http://www.google.com/",
    }

    if data:
        data = urllib.urlencode(data)
    request = urllib2.Request(url, data, headers)

    try:
        # closing() guarantees the connection is released even if read()
        # raises.
        with closing(urllib2.urlopen(request)) as response:
            return response.read(max_bytes)
    except urllib2.HTTPError as exc:
        print("HTTP error %d : %s" % (exc.code, exc.msg))
    except urllib2.URLError as exc:
        print("URL Error : %s" % exc.reason)
    # Reached only after one of the errors above was reported.
    return None
Revision: 17726
Initial Code
Initial URL
Initial Description
Initial Title
Initial Tags
Initial Language
at September 12, 2009 17:19 by manatlan
Initial Code
import urllib
import urllib2
from contextlib import closing


def getContent(url, data=None):
    """Fetch *url* with browser-like headers and return the body as unicode.

    Targets Python 2.6+ (``urllib2``).

    url  -- address to request.
    data -- optional dict of posted vars; when given (and non-empty) it is
            url-encoded and the request is sent as a POST, otherwise a GET
            is issued.

    The body is decoded with the charset declared in the response's
    Content-Type header, falling back to UTF-8 when none is declared or
    the declared name is not a codec Python knows.

    Errors from ``urllib2.urlopen`` and ``UnicodeDecodeError`` from the
    decode step are not caught here and propagate to the caller.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (X11; U; Linux i686; fr-FR; rv:1.7.5) Gecko/20041108 Firefox/1.0",
        "Accept": "text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5",
        "Accept-Language": "fr,fr-fr;q=0.8,en-us;q=0.5,en;q=0.3",
        "Accept-Charset": "utf-8;q=0.7,*;q=0.7",
        "Keep-Alive": "300",
        "Proxy-Connection": "keep-alive",
        "Cookie": "",
        # The HTTP request header is spelled "Referer"; a "http-referer"
        # key is not a header that servers look at.
        "Referer": "http://www.google.com/",
    }

    if data:
        data = urllib.urlencode(data)
    request = urllib2.Request(url, data, headers)

    # closing() guarantees the connection is released even if read() raises.
    with closing(urllib2.urlopen(request)) as response:
        charset = response.info().getparam("charset") or "utf_8"
        body = response.read()

    try:
        return body.decode(charset)
    except LookupError:
        # The server advertised an encoding name Python has no codec for;
        # fall back to the UTF-8 assumption.
        return body.decode("utf_8")
Initial URL
Initial Description
Initial Title
Get content of a page (with headers, GET, POST), and ensure ungzipped
Initial Tags
python, web
Initial Language
Python