#!/usr/bin/env python

"""
HtmlDom.py - Fetch a URI to an (X)HTML resource and return its DOM

Fetching arbitrary HTML from the Web triggers a number of bugs and
limitations in PyXML's DOM parsers. This convenience function tries
to navigate around as many of them as possible, in order to return
a navigable DOM for arbitrary URIs that return HTML or XHTML 
representations.

Applications which use this library should set UAString appropriately,
so that they can be reliably identified.

Caveats:
  * This library has only been tested with PyXML 0.71; other versions
    may or may not work as designed.

  * PyXML's HTML parser returns a DOM with uppercased element names, 
    while XHTML uses lowercase element names. As a result, code which 
    uses the resultant DOM needs to be aware of its source; this can 
    be achieved by using the .isXml() and .isHtml() methods. For 
    example:
    
      d = fetch(uri)
      if d.isHtml():
        title = xml.xpath.Evaluate("/HTML/HEAD/TITLE/text()", d)
      elif d.isXml():
        title = xml.xpath.Evaluate("/html/head/title/text()", d)

  * HTML is transformed into a DOM, sometimes losing (usually
    unimportant) information, such as the DTD, the lang attribute of
    HTML, certain character references, etc. As a result, it is NOT
    recommended to create HTML for display from the returned DOM.    

  * If a URI returns an XHTML representation, it may be fetched from 
    the server twice.
"""

# THIS SOFTWARE IS SUPPLIED WITHOUT WARRANTY OF ANY KIND, AND MAY BE
# COPIED, MODIFIED OR DISTRIBUTED IN ANY WAY, AS LONG AS THIS NOTICE
# AND ACKNOWLEDGEMENT OF AUTHORSHIP REMAIN.


import signal, xml, urllib
from xml.dom.ext.reader import HtmlLib, Sgmlop, PyExpat


__version__ = "0.3"
UAString = "HtmlDom.py/%s (http://www.mnot.net/python/HtmlDom.py)" % __version__


def _read_with_alarm(reader, uri, timeout):
    """Run reader.fromUri(uri) with a SIGALRM armed for `timeout` seconds."""
    signal.alarm(timeout)
    try:
        return reader.fromUri(uri)
    finally:
        # Always cancel the pending alarm, whether the read succeeded,
        # failed or timed out, so it cannot fire later in unrelated code.
        signal.alarm(0)


def fetch(uri, timeout=60):
    """Fetch a URI to an (X)HTML resource and return the DOM.

    The URI is first read with the HTML reader. If that raises
    xml.dom.NamespaceErr the document is treated as XML and read again
    with the XHTML reader, which is why an XHTML resource may be fetched
    from the server twice.

    Arguments:
      uri     -- location of the document to fetch.
      timeout -- seconds allowed for each read attempt, enforced with
                 signal.alarm(); 0 means no alarm is scheduled.

    Returns whatever the reader's fromUri() returns (the document's DOM).

    Raises IOError('timeout') when a read attempt exceeds `timeout`; any
    other exception raised by the readers propagates unchanged.

    Side effects: urllib.URLopener.version is set to UAString. The SIGALRM
    handler is replaced for the duration of the call and the previously
    installed handler is restored before returning.
    """
    urllib.URLopener.version = UAString
    previous_handler = signal.signal(signal.SIGALRM, _alrm_handler)
    try:
        try:
            return _read_with_alarm(_HTMLReader(), uri, timeout)
        except xml.dom.NamespaceErr:   # it's XML
            return _read_with_alarm(_XHTMLReader(), uri, timeout)
    finally:
        # signal.signal() reports None when the prior handler was not
        # installed from Python; None cannot be passed back in, so fall
        # back to the default disposition in that case.
        if previous_handler is None:
            previous_handler = signal.SIG_DFL
        signal.signal(signal.SIGALRM, previous_handler)


def _alrm_handler(signum, frame):
    raise IOError, 'timeout'


class _XHTMLReader(PyExpat.Reader):
    """PyExpat-based reader that fetch() falls back to for XML documents.

    Nothing is added or overridden; the class only gives the fallback
    reader a module-local name.
    """


class _HTMLReader(HtmlLib.Reader):
    """HtmlLib reader whose parser is this module's _HtmlParser."""

    def __init__(self):
        # Note: HtmlLib.Reader.__init__ is not called here; the only state
        # set up is the parser attribute.
        html_parser = _HtmlParser()
        self.parser = html_parser
        
        
class _HtmlParser(Sgmlop.HtmlParser):
    """
    Override the handle_special method in HtmlParser so that we don't
    choke on HTML doctype declarations.

    Also, override handle_charref, as javascript seems to make later
    versions of PyXML extremely unhappy.
    """

    def handle_special(self, data):
        """Ignore the special markup passed in `data`; nothing is recorded."""
        return None

    def handle_charref(self, data):
        """Ignore the character reference passed in `data`; it is dropped."""
        return None


if __name__ == '__main__':
    # Command-line use: fetch the URI given as the first argument and
    # pretty-print the text node of the document's title.
    import sys, xml.xpath
    if len(sys.argv) < 2:
        sys.stderr.write("usage: %s <uri>\n" % sys.argv[0])
        sys.exit(1)
    d = fetch(sys.argv[1])
    # The HTML parser yields uppercased element names while XHTML keeps
    # them lowercase, so the XPath depends on which reader produced the DOM.
    if d.isHtml():
        title = xml.xpath.Evaluate("/HTML/HEAD/TITLE/text()", d)
    elif d.isXml():
        title = xml.xpath.Evaluate("/html/head/title/text()", d)
    else:
        # Neither kind of document: nothing to look up.
        title = []
    if not title:
        # Report a missing title explicitly instead of indexing an empty
        # result (or an unbound name) below.
        sys.stderr.write("no title found\n")
        sys.exit(1)
    xml.dom.ext.PrettyPrint(title[0])