# iwannamassage.py v0.3
# Fetch and parse an HTML page into a DOM-like tree of elements.
# Uses my new favorite XML parser, ElementTree by the effbot.
# Rather than use ElementTree's interface to the Tidy command
# (which runs Tidy in a subshell), I'm using M.-A. Lemburg's version
# that turns Tidy into a Python extension.

import sys, os, types, tempfile
from urllib2 import urlopen, URLError

from mx.Tidy import Tidy
from elementtree import ElementTree, HTMLTreeBuilder

from fileadaptor import FileAdaptor
from toolbox import enumerate


class TidyError(Exception):
    """Raised when Tidy reports one or more errors for the input HTML.

    The exception argument, when present, is the error data returned by
    Tidy.tidy() (presumably the diagnostic text -- confirm against the
    mx.Tidy documentation).
    """
    pass


class TidyFilter:
    """Read-only, file-like view of a document after it has been run
    through Tidy and converted to XHTML.

    The conversion happens once, in the constructor; read() then serves
    the cleaned-up output from a temporary file.
    """

    def __init__(self, src):
        """Run Tidy over 'src' and buffer the XHTML result.

        src -- a true file object, or any other data source that
               FileAdaptor can turn into one.

        Raises TidyError if Tidy reports any errors.
        """
        self.src = src

        # mx.Tidy.Tidy must have a true file object for input and output.
        # If the data source is not a true file object, use FileAdaptor
        # to turn it into one.
        if not isinstance(self.src, types.FileType):
            self.inFileObj = FileAdaptor(self.src).file()
        else:
            self.inFileObj = self.src

        # Set up a temp file for output.
        self.outFileObj = os.tmpfile()

        # Use mx.Tidy.tidy to convert our input HTML into clean
        # output XHTML.
        nerrors, nwarnings, outputdata, errordata = Tidy.tidy(
            self.inFileObj, self.outFileObj, output_xhtml=1)

        # Raise an exception if Tidy got an error.  Release the temp file
        # first (nobody will read it) and pass Tidy's error data along so
        # the caller can see what went wrong.
        if nerrors:
            self.outFileObj.close()
            raise TidyError(errordata)

        # Make sure the temp output file is ready for input.
        self.outFileObj.flush()
        self.outFileObj.seek(0, 0)

    def read(self, size=None):
        """Return up to 'size' bytes of the tidied XHTML.

        With size omitted (or None) the rest of the output is returned.
        """
        # Python 2 file.read() does not accept None as a size, so the
        # "read everything" case has to be a call with no argument.
        if size is None:
            return self.outFileObj.read()
        return self.outFileObj.read(size)


class ApptSched:
    """Initialize an instance of this class with the object returned by
    ElementTree.  It will extract the schedule into a searchable form.
    It provides methods for getting the next available appointment slot."""

    def __init__(self, treeObj):
        """Extract date labels, time labels and slot states from 'treeObj'.

        The schedule table is located by fixed child positions, so a page
        with a different layout will typically fail with IndexError.
        """
        # First, do the very data-specific stuff needed to find the
        # appointment schedule table element.
        treeRoot = treeObj.getroot()
        bodyElem = treeRoot[1]
        outerTableElem = bodyElem[0]
        outerTBodyElem = outerTableElem[0]
        secondTrElem = outerTBodyElem[1]
        firstTdElem = secondTrElem[0]
        secondInnerTableElem = firstTdElem[3]
        secondTBodyElem = secondInnerTableElem[0]

        # Get a list of all the rows in the table.
        rows = secondTBodyElem.getchildren()

        # Some of the columns in the table contain row labels, which need
        # to be skipped.
        skipCols = (0, 4, 8)

        # Row 0 holds the date labels, and some blanks.
        self.dates = [
            dateElem[0].text
            for index, dateElem in enumerate(rows[0])
            if index not in skipCols
        ]

        # Column 0 contains the time labels.
        self.times = [row[0].text for row in rows[1:]]

        # Now pull out the appointment slots availability info.
        # Each slot element has a 'class' attribute, that can be one of
        # "taken", "avail" or "na" (meaning 'Not Available').
        # Here, we generate a two-dimensional array (actually, a list of
        # lists) to represent the appointment schedule slots.  Each element
        # in the array contains the text of the 'class' attribute, or None
        # when the cell has no such attribute.
        self.slots = [
            [
                col.get("class")
                for colIndex, col in enumerate(row.getchildren())
                if colIndex not in skipCols
            ]
            for row in rows[1:]
        ]

    def getNextAvail(self):
        """This method will return the next available appointment slot as
        '<date> at <time>', or the string 'None' if there is none.

        Slots are scanned time row by time row, and across the dates
        within each row.
        """
        for timeIndex, timeSlots in enumerate(self.slots):
            for dateIndex, dateSlot in enumerate(timeSlots):
                # A cell without a 'class' attribute yields None; treat it
                # as not available instead of crashing on .startswith().
                if dateSlot and dateSlot.startswith("avail"):
                    return "%s at %s" % (self.dates[dateIndex],
                                         self.times[timeIndex])
        return "None"


class MassageAppt:
    """This class will extract the appointment schedule table from the web
    page, and provides methods for getting the next available appointment
    slot, and making an appointment for that slot."""

    def __init__(self, url):
        """Fetch 'url', tidy it, parse it, and build the schedule.

        Raises URLError if the URL cannot be opened (after printing a
        message), and TidyError if Tidy rejects the page.
        """
        # Get an ElementTree HTML parser.
        htmlParserObj = HTMLTreeBuilder.TreeBuilder()

        # Open the URL, getting a file-like URL object.
        try:
            urlObj = urlopen(url)
        except URLError:
            print("Error opening URL '%s'" % url)
            raise

        # Tell ElementTree to parse our input XHTML source, using an
        # HTML parser.  This will give us a top-level ElementTree object.
        # The URL object is closed whether or not tidying/parsing succeeds.
        try:
            treeObj = ElementTree.parse(TidyFilter(urlObj),
                                        parser=htmlParserObj)
        finally:
            urlObj.close()

        # Use an instance of the ApptSched class to extract the appointment
        # schedule data from the tree of elements.
        self.apptSched = ApptSched(treeObj)

    def getNextAvail(self):
        """Return the next available slot, as reported by ApptSched."""
        return self.apptSched.getNextAvail()


if __name__ == "__main__":
    # Give a usage hint instead of an IndexError traceback when the URL
    # argument is missing.
    if len(sys.argv) < 2:
        sys.stderr.write("usage: %s URL\n" % sys.argv[0])
        sys.exit(2)
    print(MassageAppt(sys.argv[1]).getNextAvail())
    sys.exit(0)
# © Copyright 2003 Michael Kent. Last update: 6/26/2003; 12:14:26 PM.
# This theme is based on the SoundWaves (blue) Manila theme.