Get the content of a page (with headers, GET or POST), and ensure it is un-gzipped

Revision: 17728

at May 8, 2010 11:12 by manatlan

Updated Code

import urllib,urllib2
from gzip import GzipFile
from StringIO import StringIO

def getContent(url,data=None):
    """Fetch *url* with browser-like headers and return the un-gzipped body.

    url  -- address to open.
    data -- optional dict of variables to post. When it is non-empty it is
            url-encoded and sent as the request body; when it is None or
            empty no body is sent.

    Returns the body as a byte string. At most 1000000 bytes are read from
    the server, and at most 1000000 bytes are returned after decompression.
    If the body starts with the gzip magic number it is decompressed; if
    decompression fails the raw body is returned unchanged.

    On urllib2.HTTPError or urllib2.URLError a message is printed and None
    is returned. Other exceptions propagate to the caller.
    """
    # Local imports: only needed to name the errors gzip decoding can raise.
    import struct
    import zlib

    headers = {
        "User-Agent": "Mozilla/5.0 (X11; U; Linux i686; fr-FR; rv:1.7.5) Gecko/20041108 Firefox/1.0",
        "Accept": "text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5",
        "Accept-Language": "fr,fr-fr;q=0.8,en-us;q=0.5,en;q=0.3",
        "Accept-Charset": "utf-8;q=0.7,*;q=0.7",
        "Keep-Alive": "300",
        "Proxy-Connection": "keep-alive",
        'Cookie': '',
        # The HTTP header is spelled "Referer"; "http-referer" is not a
        # header name and would be ignored by servers.
        "Referer": "http://www.google.com/",
    }

    # An empty dict must not reach Request(): any non-None data turns the
    # call into a POST, and an unencoded dict is not a valid body.
    data = urllib.urlencode(data) if data else None
    request = urllib2.Request(url, data, headers)

    try:
        response = urllib2.urlopen(request)
        try:
            html = response.read(1000000)
        finally:
            # Release the connection even if read() fails.
            response.close()
    except urllib2.HTTPError as exc:
        print("HTTP error %d : %s" % (exc.code, exc.msg))
        return None
    except urllib2.URLError as exc:
        print("URL Error : %s" % exc.reason)
        return None

    # Only attempt to gunzip data that carries the gzip magic number, and
    # only swallow the errors a damaged or truncated gzip stream can raise.
    if html[:2] == b"\x1f\x8b":
        try:
            html = GzipFile(fileobj=StringIO(html), mode='rb').read(1000000)
        except (IOError, EOFError, zlib.error, struct.error):
            pass  # best effort: fall back to the raw body
    return html

Revision: 17727

at September 12, 2009 18:17 by manatlan

Updated Code

import urllib,urllib2

def getContent(url,data=None):
    """Fetch *url* with browser-like headers and return the raw body.

    url  -- address to open.
    data -- optional dict of variables to post. When it is non-empty it is
            url-encoded and sent as the request body; when it is None or
            empty no body is sent.

    Returns at most 500000 bytes of the response body as a byte string.
    On urllib2.HTTPError or urllib2.URLError a message is printed and None
    is returned. Other exceptions propagate to the caller.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (X11; U; Linux i686; fr-FR; rv:1.7.5) Gecko/20041108 Firefox/1.0",
        "Accept": "text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5",
        "Accept-Language": "fr,fr-fr;q=0.8,en-us;q=0.5,en;q=0.3",
        "Accept-Charset": "utf-8;q=0.7,*;q=0.7",
        "Keep-Alive": "300",
        "Proxy-Connection": "keep-alive",
        'Cookie': '',
        # The HTTP header is spelled "Referer"; "http-referer" is not a
        # header name and would be ignored by servers.
        "Referer": "http://www.google.com/",
    }

    # An empty dict must not reach Request(): any non-None data turns the
    # call into a POST, and an unencoded dict is not a valid body.
    data = urllib.urlencode(data) if data else None
    request = urllib2.Request(url, data, headers)

    try:
        response = urllib2.urlopen(request)
        try:
            return response.read(500000)
        finally:
            # Release the connection even if read() fails.
            response.close()
    except urllib2.HTTPError as exc:
        print("HTTP error %d : %s" % (exc.code, exc.msg))
        return None
    except urllib2.URLError as exc:
        print("URL Error : %s" % exc.reason)
        return None

Revision: 17726

at September 12, 2009 17:19 by manatlan

Initial Code

import urllib,urllib2

def getContent(url,data=None):
    """Fetch *url* with browser-like headers and return the body as text.

    url  -- address to open.
    data -- optional dict of variables to post. When it is non-empty it is
            url-encoded and sent as the request body; when it is None or
            empty no body is sent.

    Returns the whole response body decoded as UTF-8. Errors raised by
    urllib2.urlopen (and decoding errors) propagate to the caller.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (X11; U; Linux i686; fr-FR; rv:1.7.5) Gecko/20041108 Firefox/1.0",
        "Accept": "text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5",
        "Accept-Language": "fr,fr-fr;q=0.8,en-us;q=0.5,en;q=0.3",
        "Accept-Charset": "utf-8;q=0.7,*;q=0.7",
        "Keep-Alive": "300",
        "Proxy-Connection": "keep-alive",
        'Cookie': '',
        # The HTTP header is spelled "Referer"; "http-referer" is not a
        # header name and would be ignored by servers.
        "Referer": "http://www.google.com/",
    }

    # An empty dict must not reach Request(): any non-None data turns the
    # call into a POST, and an unencoded dict is not a valid body.
    data = urllib.urlencode(data) if data else None
    request = urllib2.Request(url, data, headers)

    response = urllib2.urlopen(request)
    try:
        body = response.read()
    finally:
        # Release the connection even if read() fails.
        response.close()
    return body.decode("utf_8")

Initial URL

Initial Description

Initial Title

Get content of a page (with headers, GET, POST), and ensure ungzipped

Initial Tags

python, web

Initial Language

Python

Choose a language for easy browsing: