Official ObjectGraph Blog
Tuesday, September 25, 2007
PyScraper: The Python Screen Scraper
I wrote a lightweight, Python-based screen scraper, which seems to be working well.
Some of the features:
- Session Management when using cookies
- Separate functions for GET, POST, and downloading large files
- Automatic handling of redirections
Here is the code for it.
import httplib
import os
import random
import re
import sys
import urllib
from urlparse import urljoin, urlparse
class PyScraper:
    """Minimal HTTP(S) screen scraper with cookie-based session support.

    Each request sends the cookies collected so far, records any new cookies
    from the response, and follows 301/302 redirects automatically.

    Attributes:
        cookie: Value sent in the ``Cookie`` request header, kept as
            ``name=value`` pairs joined by ``"; "``. Empty until a server
            sets a cookie.
        currenturl: URL of the most recent request issued.
        urlhist: Every URL requested so far, in order, including each
            redirect hop.
    """

    # Status codes that are followed as redirects.
    REDIRECT_STATUSES = (301, 302)

    # Upper bound on consecutive redirects, so a redirect loop fails fast
    # instead of recursing/looping forever.
    MAX_REDIRECTS = 10

    # Set-Cookie attribute names; these describe a cookie and must not be
    # echoed back to the server in the Cookie header.
    COOKIE_ATTRIBUTES = frozenset([
        'expires', 'max-age', 'domain', 'path', 'secure', 'httponly',
        'samesite', 'version', 'comment',
    ])

    # File name used by download() when the URL path has no final component
    # (for example "http://host/").
    DEFAULT_FILENAME = 'index.html'

    def __init__(self):
        self.cookie = ""
        self.currenturl = ""
        self.urlhist = []

    def __str__(self):
        """Return the request history as ``url->url->...`` (trailing arrow kept)."""
        return ''.join([item + '->' for item in self.urlhist])

    def download(self, url, localfolder):
        """Stream ``url`` into ``localfolder`` and show a progress bar on stdout.

        Redirects are followed *before* anything is written, so the file on
        disk holds the final resource rather than a redirect page.

        Args:
            url: Absolute http/https URL of the resource.
            localfolder: Existing directory to save the file into.

        Returns:
            The path of the file that was written.

        Raises:
            httplib.HTTPException: If more than MAX_REDIRECTS redirects occur.
        """
        bufsize = 1024
        for _ in range(self.MAX_REDIRECTS + 1):
            conn, resp = self._send('GET', url)
            try:
                self._store_cookies(resp)
                target = self._redirect_target(
                    url, resp.status, resp.getheader('location'))
                if target is None:
                    return self._save_body(resp, url, localfolder, bufsize)
                resp.read()  # discard the redirect body before moving on
            finally:
                conn.close()
            url = target
        raise httplib.HTTPException('too many redirects, last URL: %s' % url)

    def get(self, url):
        """Fetch ``url`` with GET, following redirects, and return the body.

        Args:
            url: Absolute http/https URL.

        Returns:
            The response body of the final (non-redirect) response.

        Raises:
            httplib.HTTPException: If more than MAX_REDIRECTS redirects occur.
        """
        return self._fetch('GET', url, None, None)

    def post(self, url, data):
        """Send ``data`` to ``url`` with POST and return the response body.

        On a 301/302 the *original* form data is posted again to the new
        location.

        Args:
            url: Absolute http/https URL.
            data: URL-encoded form payload (e.g. from ``urllib.urlencode``).

        Returns:
            The response body of the final (non-redirect) response.

        Raises:
            httplib.HTTPException: If more than MAX_REDIRECTS redirects occur.
        """
        return self._fetch('POST', url, data,
                           'application/x-www-form-urlencoded')

    def _fetch(self, method, url, payload, content_type):
        """Issue requests until a non-redirect response arrives; return its body."""
        for _ in range(self.MAX_REDIRECTS + 1):
            conn, resp = self._send(method, url, payload, content_type)
            try:
                body = resp.read()
            finally:
                conn.close()
            self._store_cookies(resp)
            target = self._redirect_target(
                url, resp.status, resp.getheader('location'))
            if target is None:
                return body
            url = target
        raise httplib.HTTPException('too many redirects, last URL: %s' % url)

    def _send(self, method, url, payload=None, content_type=None):
        """Open a connection, send one request, and return ``(conn, resp)``.

        The caller owns the connection and must close it.
        """
        self.urlhist.append(url)
        self.currenturl = url
        scheme, hostname, path = self._split_url(url)
        headers = {}
        if self.cookie:
            headers['Cookie'] = self.cookie
        if content_type is not None:
            headers['Content-Type'] = content_type
        if scheme == 'https':
            conn = httplib.HTTPSConnection(hostname)
        else:
            conn = httplib.HTTPConnection(hostname)
        try:
            conn.request(method, path, payload, headers)
            resp = conn.getresponse()
        except Exception:
            # Do not leak the socket when the request itself fails.
            conn.close()
            raise
        return conn, resp

    def _split_url(self, url):
        """Return ``(scheme, hostname, request_path)`` for ``url``.

        The request path includes ``;params`` and ``?query`` and defaults to
        ``/`` so that a bare ``http://host`` still yields a valid request line.
        """
        scheme, hostname, path, params, query, _fragment = urlparse(url)
        path = path or '/'
        if params:
            path = path + ';' + params
        if query:
            path = path + '?' + query
        return scheme, hostname, path

    def _redirect_target(self, url, status, location):
        """Return the absolute redirect URL, or None if this is not a redirect.

        A redirect status without a Location header is treated as a final
        response. Relative locations are resolved against ``url``.
        """
        if status not in self.REDIRECT_STATUSES or not location:
            return None
        return urljoin(url, location)

    def _store_cookies(self, resp):
        """Merge the response's Set-Cookie header, if any, into self.cookie."""
        header = resp.getheader('set-cookie')
        if header is not None:
            self.cookie = self._merge_cookies(self.cookie, header)

    def _merge_cookies(self, current, set_cookie):
        """Return a Cookie header value combining ``current`` and ``set_cookie``.

        Only ``name=value`` pairs are kept; attributes such as Path, Domain
        and Expires are dropped. A cookie that is set again replaces the
        earlier value. Several Set-Cookie headers joined with commas are
        handled; as a consequence, a cookie value that itself contains a
        comma is not supported.
        """
        names = []
        values = {}
        for source in (current, set_cookie):
            for segment in source.split(';'):
                for piece in segment.split(','):
                    name, sep, value = piece.strip().partition('=')
                    name = name.strip()
                    # Pieces without '=' are flags (Secure) or the tail of
                    # an Expires date, which contains a comma.
                    if not sep or not name:
                        continue
                    if name.lower() in self.COOKIE_ATTRIBUTES:
                        continue
                    if name not in values:
                        names.append(name)
                    values[name] = value.strip()
        return '; '.join(['%s=%s' % (name, values[name]) for name in names])

    def _save_body(self, resp, url, localfolder, bufsize):
        """Write the response body to ``localfolder`` and return the file path."""
        fname = os.path.split(urlparse(url)[2])[1] or self.DEFAULT_FILENAME
        target = os.path.join(localfolder, fname)
        try:
            total = int(resp.getheader('content-length'))
        except (TypeError, ValueError):
            # Header missing or malformed (e.g. chunked transfer encoding).
            total = 0
        sofar = 0
        f = open(target, 'wb')
        try:
            while True:
                data = resp.read(bufsize)
                if not data:
                    break
                f.write(data)
                sofar += len(data)
                if total > 0:
                    perc = min(float(sofar) / float(total), 1.0)
                    sys.stdout.write("\r%-30s|%-20s|%3d percent"
                                     % (fname, '#' * int(perc * 20), perc * 100))
                else:
                    # Size unknown: report the byte count instead of a bar.
                    sys.stdout.write("\r%-30s|%d bytes" % (fname, sofar))
                sys.stdout.flush()
        finally:
            f.close()
        return target
Here is a snippet of code showing how to use it.
from pyscraper import PyScraper
p=PyScraper()
data=p.get("http://www.yahoo.com/")
print data
posted by gavi at 9:31 AM
2 Comments:
This doesn't handle cookies properly: it just sends back the full text of the Set-Cookie header, e.g. "attribute=value; domain=blah; expires=blah". That isn't right, because the browser should strip off everything except the name=value pair when returning the cookie to the web server.
By
Anonymous, at 11:15 AM
You are absolutely correct.
I should really clean this code up. I am hard-coding too many parameters, and I will look at the cookie handling as well.
By
gavi, at 4:04 PM
Post a Comment
<< Home