#!/usr/bin/env python

"""
HtmlDom.py - Fetch a URI to an (X)HTML resource and return its DOM

Fetching arbitrary HTML from the Web triggers a number of bugs and
limitations in PyXML's DOM parsers. This convenience function tries to
navigate around as many of them as possible, in order to return a
navigable DOM for arbitrary URIs that return HTML or XHTML
representations.

Applications which use this library should set UAString appropriately, so
that they can be reliably identified.

Caveats:
 * This library has only been tested with PyXML 0.71; other versions may or
   may not work as designed.
 * PyXML's HTML parser returns a DOM with uppercased element names, while
   XHTML uses lowercase element names. As a result, code which uses the
   resultant DOM needs to be aware of its source; this can be achieved by
   using the .isXml() and .isHtml() methods. For example:

     d = fetch(uri)
     if d.isHtml():
         title = xml.xpath.Evaluate("/HTML/HEAD/TITLE/text()", d)
     elif d.isXml():
         title = xml.xpath.Evaluate("/html/head/title/text()", d)

 * HTML is transformed into a DOM, sometimes losing (usually unimportant)
   information, such as the DTD, the lang attribute of HTML, certain
   character references, etc. As a result, it is NOT recommended to
   create HTML for display from the returned DOM.
 * If a URI returns an XHTML representation, it may be fetched from the
   server twice.
"""

# THIS SOFTWARE IS SUPPLIED WITHOUT WARRANTY OF ANY KIND, AND MAY BE
# COPIED, MODIFIED OR DISTRIBUTED IN ANY WAY, AS LONG AS THIS NOTICE
# AND ACKNOWLEDGEMENT OF AUTHORSHIP REMAIN.
import signal
import xml
import urllib

from xml.dom.ext.reader import HtmlLib, Sgmlop, PyExpat

__version__ = "0.3"

# User-Agent string sent with requests. Applications should overwrite this
# module attribute before calling fetch() so that they can be identified.
UAString = (
    "HtmlDom.py/%s (http://www.mnot.net/python/HtmlDom.py)" % __version__
)


def fetch(uri, timeout=60):
    """Fetch a URI to an (X)HTML resource and return the DOM.

    The resource is first read with the lenient HTML reader. If that
    raises xml.dom.NamespaceErr the resource is taken to be XML, and it
    is fetched and parsed a second time with the expat-based reader.

    Parameters:
        uri     -- location of the resource, passed to the reader's
                   fromUri() method.
        timeout -- whole seconds allowed for each attempt; handed to
                   signal.alarm(), so 0 disables the timeout.

    Returns whatever the reader's fromUri() returns (the parsed DOM).

    Raises IOError('timeout') when an attempt exceeds the timeout; any
    other exception from the readers propagates unchanged.

    The timeout relies on SIGALRM. The handler that was installed before
    the call is put back on return, so a caller's own SIGALRM handling is
    not lost.
    """
    # Read UAString at call time so that an application's override applies.
    urllib.URLopener.version = UAString
    previous_handler = signal.signal(signal.SIGALRM, _alrm_handler)
    try:
        try:
            return _read_with_alarm(_HTMLReader(), uri, timeout)
        except xml.dom.NamespaceErr:
            # A NamespaceErr from the HTML reader is taken to mean that
            # the resource is XML (XHTML); retry with the XML reader.
            return _read_with_alarm(_XHTMLReader(), uri, timeout)
    finally:
        # signal.signal() reports None for a handler that was not
        # installed from Python; such a handler cannot be re-installed.
        if previous_handler is not None:
            signal.signal(signal.SIGALRM, previous_handler)


def _read_with_alarm(reader, uri, timeout):
    """Return reader.fromUri(uri), guarded by a SIGALRM timeout."""
    signal.alarm(timeout)
    try:
        return reader.fromUri(uri)
    finally:
        # Always cancel the pending alarm, whether or not the read worked.
        signal.alarm(0)


def _alrm_handler(signum, frame):
    """SIGALRM handler: abort the fetch in progress with IOError."""
    raise IOError('timeout')


class _XHTMLReader(PyExpat.Reader):
    """Reader for XHTML resources; PyExpat.Reader used unchanged."""
    pass


class _HTMLReader(HtmlLib.Reader):
    """HTML reader that parses with the tolerant _HtmlParser."""

    def __init__(self):
        # Only the parser is set up here; HtmlLib.Reader.__init__ is not
        # called. NOTE(review): presumably the base initializer would
        # only create the stock parser -- confirm against the PyXML
        # version in use.
        self.parser = _HtmlParser()


class _HtmlParser(Sgmlop.HtmlParser):
    """
    Override the handle_special method in HtmlParser so that we don't
    choke on HTML doctype declarations. Also, override handle_charref,
    as javascript seems to make later versions of PyXML extremely unhappy.
    """

    def handle_special(self, data):
        # Deliberately ignore the data handed to this hook.
        pass

    def handle_charref(self, data):
        # Deliberately ignore character references.
        pass


def _main(argv):
    """Command-line demo: print the title of the document at argv[1].

    Returns a process exit status: 0 on success, 1 when no title could
    be extracted, 2 when the URI argument is missing.
    """
    import sys
    import xml.xpath

    if len(argv) < 2:
        sys.stderr.write("usage: %s URI\n" % argv[0])
        return 2

    doc = fetch(argv[1])
    # The HTML parser yields uppercased element names, XHTML lowercase.
    if doc.isHtml():
        title_path = "/HTML/HEAD/TITLE/text()"
    elif doc.isXml():
        title_path = "/html/head/title/text()"
    else:
        sys.stderr.write("document is neither HTML nor XML\n")
        return 1

    title = xml.xpath.Evaluate(title_path, doc)
    if not title:
        sys.stderr.write("no title found\n")
        return 1
    xml.dom.ext.PrettyPrint(title[0])
    return 0


if __name__ == '__main__':
    import sys
    sys.exit(_main(sys.argv))