Official ObjectGraph Blog
Tuesday, September 25, 2007
PyScraper: The Python Screen Scraper
I wrote a lightweight, Python-based screen scraper, which seems to be working great.
Some of the features:
- Session Management when using cookies
- Separate functions for GET, POST, and downloading large files
- Automatic handling of redirections
Here is the code for it.
import os
import random
import re
import sys
import urllib

try:  # Python 2 module names, as used when this was first written
    import httplib
    from urlparse import urljoin, urlparse
except ImportError:  # Python 3 renamed both modules
    import http.client as httplib
    from urllib.parse import urljoin, urlparse


class PyScraper:
    """Small HTTP client that remembers a cookie and follows redirects.

    Attributes:
        cookie: Raw value of the most recent ``Set-Cookie`` response header.
            It is sent back verbatim as the ``Cookie`` header of every
            later request.
        currenturl: Initialised to ``""``; no method in this class updates it.
        urlhist: Every URL requested so far, in order, including each
            redirect hop.
    """

    # Response statuses for which the Location header is followed.
    _REDIRECT_STATUSES = (301, 302)

    def __init__(self):
        self.cookie = ""
        self.currenturl = ""
        self.urlhist = []

    def __str__(self):
        """Return the URL history as ``'url1->url2->'`` (trailing arrow kept)."""
        return ''.join(item + '->' for item in self.urlhist)

    def _open(self, method, url, body=None, extra_headers=None):
        """Record *url*, send one request and return ``(conn, resp, path)``.

        The caller owns the returned connection and must close it.
        """
        self.urlhist.append(url)
        scheme, hostname, path, params, query, fragment = urlparse(url)
        # An empty path ("http://host") would produce an invalid request line.
        target = path or '/'
        if query != '':
            target = target + "?" + query
        if scheme == 'https':
            conn = httplib.HTTPSConnection(hostname)
        else:
            conn = httplib.HTTPConnection(hostname)
        headers = {'Cookie': self.cookie}
        if extra_headers:
            headers.update(extra_headers)
        try:
            conn.request(method, target, body, headers)
            resp = conn.getresponse()
        except Exception:
            # Do not leak the socket when the request itself fails.
            conn.close()
            raise
        return conn, resp, path

    def _store_cookie(self, resp):
        """Remember the response's Set-Cookie header, if it sent one."""
        fresh = resp.getheader('set-cookie')
        if fresh is not None:
            self.cookie = fresh

    def _redirect_target(self, url, resp):
        """Return the absolute URL to follow, or None if *resp* is final."""
        if resp.status not in self._REDIRECT_STATUSES:
            return None
        location = resp.getheader('location')
        if not location:
            return None
        # Location may be relative; resolve it against the requested URL.
        return urljoin(url, location)

    @staticmethod
    def _report_progress(fname, sofar, total):
        """Write a one-line, carriage-return-refreshed progress display."""
        if total:
            perc = float(sofar) / float(total)
            count = int(perc * 20)
            sys.stdout.write("\r%-30s|%-20s|%3d percent"
                             % (fname, '#' * count, perc * 100))
        else:
            # No usable Content-Length: show a byte counter instead of
            # dividing by an unknown or zero total.
            sys.stdout.write("\r%-30s|%d bytes" % (fname, sofar))
        sys.stdout.flush()

    def download(self, url, localfolder):
        """Stream *url* into *localfolder*, printing progress to stdout.

        The file is named after the last component of the URL path of the
        final (post-redirect) URL. Redirects are followed before anything
        is written, so the redirect response body is never saved.

        Args:
            url: Absolute http/https URL to fetch.
            localfolder: Existing directory to write the file into.

        Returns:
            The final, empty chunk read from the response. This mirrors what
            the method has always returned; the useful result is the file
            on disk.
        """
        bufsize = 1024
        chunk = ''
        conn, resp, path = self._open('GET', url)
        try:
            self._store_cookie(resp)
            target = self._redirect_target(url, resp)
            if target is None:
                fname = os.path.split(path)[1]
                length = resp.getheader('content-length')
                total = int(length) if length is not None else None
                sofar = 0
                with open(os.path.join(localfolder, fname), 'wb') as f:
                    while True:
                        chunk = resp.read(bufsize)
                        f.write(chunk)
                        sofar += len(chunk)
                        self._report_progress(fname, sofar, total)
                        if len(chunk) == 0:
                            break
        finally:
            conn.close()
        if target is not None:
            return self.download(target, localfolder)
        return chunk

    def get(self, url):
        """GET *url*, following 301/302 redirects, and return the body.

        Args:
            url: Absolute http/https URL to fetch.

        Returns:
            The response body of the final URL, as returned by ``read()``.
        """
        conn, resp, _ = self._open('GET', url)
        try:
            body = resp.read()
            self._store_cookie(resp)
        finally:
            conn.close()
        target = self._redirect_target(url, resp)
        if target is not None:
            return self.get(target)
        return body

    def post(self, url, data):
        """POST url-encoded *data* to *url* and return the response body.

        On a 301/302 the same *data* is posted again to the redirect
        target.

        Args:
            url: Absolute http/https URL to post to.
            data: Already url-encoded form payload.

        Returns:
            The response body of the final URL, as returned by ``read()``.
        """
        conn, resp, _ = self._open(
            'POST', url, data,
            {'Content-Type': 'application/x-www-form-urlencoded'})
        try:
            # Keep the response in its own name: reusing ``data`` would make
            # a redirected POST send the previous response as its payload.
            body = resp.read()
            self._store_cookie(resp)
        finally:
            conn.close()
        target = self._redirect_target(url, resp)
        if target is not None:
            return self.post(target, data)
        return body
Here is a snippet of code showing how to use it.
# Minimal usage: fetch one page and print its body.
from pyscraper import PyScraper

scraper = PyScraper()
page = scraper.get("http://www.yahoo.com/")
print(page)
posted by gavi at 9:31 AM | 2 comments |