Official ObjectGraph Blog

Tuesday, September 25, 2007

PyScraper: The Python Screen Scraper

I wrote a lightweight Python-based screen scraper, which seems to be working great.

Some of the features:

  • Session Management when using cookies
  • Separate functions for GET, POST, and downloading large files
  • Automatic handling of redirections

Here is the code for it.

import httplib
import os
import random
import re
import sys
import urllib
from urlparse import urljoin, urlparse


class PyScraper:
    """Small HTTP client that replays one cookie and follows redirects.

    Instance fields:
      cookie     -- raw value of the last Set-Cookie header received, or ""
                    if none was seen.  It is replayed as-is in the Cookie
                    request header; it is not parsed.
      currenturl -- URL of the most recent request ("" before the first).
      urlhist    -- every URL requested so far, in order, redirect hops
                    included.
    """

    # Status codes that get(), post() and download() follow automatically.
    _REDIRECTS = (301, 302)

    def __init__(self):
        self.cookie = ""
        self.currenturl = ""
        self.urlhist = []

    def __str__(self):
        """Return the visited URLs in order, each one followed by '->'."""
        return ''.join([item + '->' for item in self.urlhist])

    def _connect(self, scheme, hostname):
        """Return a new connection to hostname (TLS when scheme is https)."""
        if scheme == 'https':
            return httplib.HTTPSConnection(hostname)
        return httplib.HTTPConnection(hostname)

    def _open(self, method, url, body=None, headers=None):
        """Send one request and return (connection, response).

        Records url in the history, sends the stored cookie (if any) and
        stores a new one when the response carries Set-Cookie.  The caller
        owns the returned connection and must close it.
        """
        self.urlhist.append(url)
        self.currenturl = url
        scheme, hostname, path, params, query, fragment = urlparse(url)

        # An empty path ("http://host") would produce a malformed request
        # line, so fall back to the root resource.
        target = path or '/'
        if params != '':
            target = target + ';' + params
        if query != '':
            target = target + '?' + query

        send = {}
        if headers:
            send.update(headers)
        if self.cookie:
            send['Cookie'] = self.cookie

        conn = self._connect(scheme, hostname)
        try:
            conn.request(method, target, body, send)
            resp = conn.getresponse()
        except Exception:
            # Do not leak the socket when the request itself fails.
            conn.close()
            raise

        new_cookie = resp.getheader('set-cookie')
        if new_cookie is not None:
            self.cookie = new_cookie
        return conn, resp

    def _redirect_target(self, url, resp):
        """Return the absolute URL resp redirects to, or None if it doesn't.

        A relative Location header is resolved against url.  A redirect
        status without a Location header is treated as a normal response.
        """
        if resp.status not in self._REDIRECTS:
            return None
        location = resp.getheader('location')
        if location is None:
            return None
        return urljoin(url, location)

    def _progress(self, fname, sofar, total):
        """Redraw the one-line progress display on stdout."""
        if total:
            perc = float(sofar) / float(total)
            count = int(perc * 20)
            sys.stdout.write("\r%-30s|%-20s|%3d percent"
                             % (fname, '#' * count, perc * 100))
        else:
            # Size unknown (no usable Content-Length): show a byte counter.
            sys.stdout.write("\r%-30s|%d bytes" % (fname, sofar))
        sys.stdout.flush()

    def _save(self, resp, fname, localfolder, bufsize):
        """Stream the body of resp into localfolder/fname.

        Returns the final (empty) chunk read from the response.
        """
        total = None
        length = resp.getheader('content-length')
        if length is not None:
            try:
                total = int(length)
            except ValueError:
                total = None

        sofar = 0
        f = open(os.path.join(localfolder, fname), 'wb')
        try:
            while 1:
                data = resp.read(bufsize)
                if len(data) == 0:
                    break
                f.write(data)
                sofar += len(data)
                self._progress(fname, sofar, total)
        finally:
            f.close()
        return data

    def download(self, url, localfolder, bufsize=1024):
        """Fetch url with GET and stream the body to a file in localfolder.

        The file is named after the last component of the URL path
        ('index.html' when the path ends in '/').  Progress is written to
        stdout.  301/302 redirects are followed and the final target is
        what gets saved.

        url         -- absolute http or https URL
        localfolder -- existing directory to write into
        bufsize     -- number of bytes to read per chunk

        Returns an empty string: the last chunk read, as the original
        implementation did.
        """
        conn, resp = self._open('GET', url)
        try:
            target = self._redirect_target(url, resp)
            if target is None:
                fname = os.path.split(urlparse(url)[2])[1] or 'index.html'
                data = self._save(resp, fname, localfolder, bufsize)
        finally:
            conn.close()
        if target is not None:
            return self.download(target, localfolder, bufsize)
        return data

    def get(self, url):
        """Fetch url with GET and return the response body.

        301/302 redirects are followed; the body of the final response is
        returned.
        """
        conn, resp = self._open('GET', url)
        try:
            data = resp.read()
        finally:
            conn.close()
        target = self._redirect_target(url, resp)
        if target is not None:
            return self.get(target)
        return data

    def post(self, url, data):
        """POST data (a form-urlencoded string) to url; return the body.

        On a 301/302 the same form data is posted again to the redirect
        target, and the body of the final response is returned.
        """
        headers = {'Content-Type': 'application/x-www-form-urlencoded'}
        conn, resp = self._open('POST', url, data, headers)
        try:
            # Keep the response in its own name: the original form data is
            # still needed if the server redirects.
            body = resp.read()
        finally:
            conn.close()
        target = self._redirect_target(url, resp)
        if target is not None:
            return self.post(target, data)
        return body

Here is a short snippet showing how to use it.

from pyscraper import PyScraper

# Fetch a page with a fresh scraper and print the raw response body.
scraper=PyScraper()
page=scraper.get("http://www.yahoo.com/")
print(page)



posted by gavi at 9:31 AM | 2 comments |