'''
Created on Aug 28, 2009

@author: jpd

Post a company-name search to the CRO search page and print every
company record found in the returned HTML.
'''
import os
import sys
import subprocess
import logging
from optparse import OptionParser
import urllib

try:
    # Python 2 module layout.
    import HTMLParser
    from urllib import urlencode, urlopen
except ImportError:
    # Python 3 moved the parser and split urllib into submodules.
    import html.parser as HTMLParser
    from urllib.parse import urlencode
    from urllib.request import urlopen


SEARCH_URL = "http://www.cro.ie/search/companyindexe.asp"


def setupLogging(name, level = logging.DEBUG):
    """Attach a console handler to the logger called *name* and return it.

    The logger itself is set to DEBUG; *level* is applied to the new
    console handler only.  Each call adds another handler, so call this
    once per logger name.
    """
    logger = logging.getLogger(name)
    logger.setLevel(logging.DEBUG)

    # Console handler filtered at the requested level.
    ch = logging.StreamHandler()
    ch.setLevel(level)
    ch.setFormatter(
        logging.Formatter("%(asctime)s: %(name)s : %(levelname)s : %(message)s"))

    logger.addHandler(ch)
    return logger


class Company(dict):
    """Dictionary holding one company record; always created empty."""

    def __init__(self):
        pass


class CROParser(HTMLParser.HTMLParser):
    """Collect company records from the HTML of a search result page.

    The parser is a small state machine driven by three counters:

    * ``_companyCount`` - records still expected, read from the
      ``Found: <n>`` text in the page;
    * ``_tableCount``   - ``<table>`` tags seen since the last record
      (only counted while records are still expected);
    * ``_dataCount``    - text chunks seen inside the current record.

    Text is only treated as record data once two tables have been opened.
    """

    # Position of each field among the text chunks of one record.  The
    # first chunk is counted but not stored; the last position completes
    # the record.
    _FIELD_BY_POSITION = {2: 'Name', 3: 'Number', 4: 'Address', 5: 'Type'}
    _LAST_POSITION = 5

    def __init__(self):
        HTMLParser.HTMLParser.__init__(self)
        self._log = logging.getLogger("cro")
        self._companyList = []
        self._tableCount = 0
        self._dataCount = 0
        self._companyCount = 0
        self._company = Company()

    def parse(self, s):
        """Parse the given string 's' and flush the parser."""
        self.feed(s)
        self.close()

    def handle_starttag(self, tag, attrs):
        """Count ``<table>`` tags while company records are still expected."""
        self._log.debug("start tag : %s", tag)
        if (tag == "table") and (self._companyCount > 0):
            self._tableCount = self._tableCount + 1
            self._log.debug("Table Count : %i", self._tableCount)

    def handle_data(self, data):
        """Read the result count and accumulate the fields of each record.

        Raises ValueError if text starting with ``Found: `` is not followed
        by an integer.
        """
        self._log.debug("Handle_data : '%s'", data)

        if data.startswith('Found: '):
            self._companyCount = int(data.split(": ")[1])
            self._log.debug("Found %i companies", self._companyCount)

        # Record data only appears inside the second table, and the
        # 'Home' text is not part of any record.
        in_record = ((data != 'Home')
                     and (self._tableCount == 2)
                     and (self._companyCount > 0))
        if not in_record:
            return

        self._dataCount = self._dataCount + 1
        self._log.debug("Parsing : %s, dataCount : %i", data, self._dataCount)

        field = self._FIELD_BY_POSITION.get(self._dataCount)
        if field is not None:
            self._company[field] = data

        if self._dataCount == self._LAST_POSITION:
            # Record complete: store a snapshot (the working record is
            # reused for the next company) and wait for the next tables.
            self._dataCount = 0
            self._companyCount = self._companyCount - 1
            self._companyList.append(self._company.copy())
            self._tableCount = 0
            self._log.debug("parsed : %s ", self._companyList[-1])

    def handle_endtag(self, tag):
        """End tags carry no information for this parser."""
        pass

    def companies(self):
        """Return the list of company records (plain dicts) parsed so far."""
        return self._companyList


def main(argv = None):
    """Run the command-line search and print one record per line.

    argv -- argument list without the program name; defaults to
            ``sys.argv[1:]`` when None.
    """
    setupLogging("cro")

    option_parser = OptionParser(usage = "%prog ", version = "%prog release: 0.1")
    option_parser.add_option("-c", "--company", dest = "company",
                             default = "PutPlace",
                             help = "return information on company named ")
    option_parser.add_option("-d", "--debug", dest = "debug",
                             action = "store_true", default = False,
                             help = "turn on debugging")
    (options, _args) = option_parser.parse_args(argv)

    if options.debug:
        logging.getLogger("cro").setLevel(logging.DEBUG)
    else:
        logging.getLogger("cro").setLevel(logging.INFO)

    request = {
        'radOption' : 'COMPANY',
        'name' : options.company,
        'radSearch' : 'CONTAINSWORDS',
        'address' : '',
        'number' : '',
        'alpha' : '',
        'submit2' : 'Submit',
        'radPrevOption' : 'EXISTING',
    }

    params = urlencode(request)
    if not isinstance(params, bytes):
        # Python 3: urlopen() only accepts a bytes POST body.
        params = params.encode("ascii")

    response = urlopen(SEARCH_URL, params)
    try:
        body = response.read()
        if not isinstance(body, str):
            # Python 3: decode the bytes payload before feeding the parser.
            # latin-1 is the fallback because it can decode any byte string.
            charset = response.headers.get_content_charset() or "latin-1"
            body = body.decode(charset, "replace")
    finally:
        # Always release the HTTP connection.
        response.close()

    cro_parser = CROParser()
    cro_parser.parse(body)

    for company in cro_parser.companies():
        # Single-argument form prints identically on Python 2 and 3.
        print(company)


if __name__ == '__main__':
    main()