#!/usr/bin/env python
"""
HtmlDom.py - Fetch a URI to an (X)HTML resource and return its DOM
Fetching arbitrary HTML from the Web triggers a number of bugs and
limitations in PyXML's DOM parsers. This convenience function tries
to navigate around as many of them as possible, in order to return
a navigable DOM for arbitrary URIs that return HTML or XHTML
representations.
Applications which use this library should set UAString appropriately,
so that they can be reliably identified.
Caveats:
* This library has only been tested with PyXML 0.71; other versions
may or may not work as designed.
* PyXML's HTML parser returns a DOM with uppercased element names,
while XHTML uses lowercase element names. As a result, code which
uses the resultant DOM needs to be aware of its source; this can
be achieved by using the .isXml() and .isHtml() methods. For
example:
d = fetch(uri)
if d.isHtml():
title = xml.xpath.Evaluate("/HTML/HEAD/TITLE/text()", d)
elif d.isXml():
title = xml.xpath.Evaluate("/html/head/title/text()", d)
* HTML is transformed into a DOM, sometimes losing (usually
unimportant) information, such as the DTD, the lang attribute of
HTML, certain character references, etc. As a result, it is NOT
recommended to create HTML for display from the returned DOM.
* If a URI returns an XHTML representation, it may be fetched from
the server twice.
"""
# THIS SOFTWARE IS SUPPLIED WITHOUT WARRANTY OF ANY KIND, AND MAY BE
# COPIED, MODIFIED OR DISTRIBUTED IN ANY WAY, AS LONG AS THIS NOTICE
# AND ACKNOWLEDGEMENT OF AUTHORSHIP REMAIN.
import signal, xml, urllib
from xml.dom.ext.reader import HtmlLib, Sgmlop, PyExpat
# Release number of this module; interpolated into the default User-Agent.
__version__ = "0.3"
# Default User-Agent string; fetch() assigns it to urllib.URLopener.version.
# Applications using this library should replace it with their own value so
# that they can be reliably identified.
UAString = "HtmlDom.py/%s (http://www.mnot.net/python/HtmlDom.py)" % (__version__,)
def fetch(uri, timeout=60):
    """Fetch a URI to an (X)HTML resource and return the DOM.

    The URI is first read with the HTML reader. If that raises
    xml.dom.NamespaceErr the resource is treated as XML and read again
    with the XHTML reader, so such a URI may be fetched twice.

    Arguments:
    uri     -- the URI to fetch.
    timeout -- seconds allowed for each read attempt before SIGALRM
               fires (passed to signal.alarm).

    Returns the DOM produced by the reader's fromUri().

    Raises IOError('timeout') from the SIGALRM handler if a read attempt
    exceeds the timeout; any other reader exception propagates unchanged.

    Side effects: sets urllib.URLopener.version to UAString, and
    temporarily installs a SIGALRM handler for the duration of the call.
    """
    urllib.URLopener.version = UAString
    # Remember whatever handler the caller had so it is not clobbered for
    # the rest of the process once this call returns.
    previous_handler = signal.signal(signal.SIGALRM, _alrm_handler)
    try:
        html_reader = _HTMLReader()
        try:
            return _read_with_alarm(html_reader, uri, timeout)
        except xml.dom.NamespaceErr:  # it's XML
            return _read_with_alarm(_XHTMLReader(), uri, timeout)
    finally:
        # signal.signal() returns None when the previous handler was not
        # installed from Python; that value cannot be passed back in.
        if previous_handler is not None:
            signal.signal(signal.SIGALRM, previous_handler)


def _read_with_alarm(reader, uri, timeout):
    """Return reader.fromUri(uri) with a SIGALRM armed for timeout seconds."""
    signal.alarm(timeout)
    try:
        return reader.fromUri(uri)
    finally:
        # Always cancel the pending alarm, whether the read succeeded or not.
        signal.alarm(0)
def _alrm_handler(signum, frame):
raise IOError, 'timeout'
class _XHTMLReader(PyExpat.Reader):
    """Reader fetch() falls back to for XML input; adds nothing to PyExpat.Reader."""
class _HTMLReader(HtmlLib.Reader):
    """HtmlLib.Reader subclass whose parser attribute is a _HtmlParser."""

    def __init__(self):
        # NOTE: the base-class __init__ is not invoked here; the only state
        # set up is the parser attribute.
        tolerant_parser = _HtmlParser()
        self.parser = tolerant_parser
class _HtmlParser(Sgmlop.HtmlParser):
    """
    HtmlParser variant that ignores two kinds of input.

    handle_special is overridden so that we don't choke on HTML doctype
    declarations. handle_charref is overridden as well, because
    javascript seems to make later versions of PyXML extremely unhappy.
    """

    def handle_special(self, data):
        """Ignore special markup (e.g. a doctype declaration)."""
        return None

    def handle_charref(self, data):
        """Ignore character references."""
        return None
if __name__ == '__main__':
    # Command-line demo: fetch the URI given as the first argument and
    # pretty-print the text of its <title> element.
    import sys
    import xml.xpath
    if len(sys.argv) < 2:
        sys.stderr.write("Usage: %s <uri>\n" % sys.argv[0])
        sys.exit(1)
    d = fetch(sys.argv[1])
    # The HTML parser yields upper-cased element names, while XHTML keeps
    # them lower-case, so the XPath depends on which reader produced the DOM.
    if d.isHtml():
        title = xml.xpath.Evaluate("/HTML/HEAD/TITLE/text()", d)
    elif d.isXml():
        title = xml.xpath.Evaluate("/html/head/title/text()", d)
    else:
        # Neither kind of DOM: nothing to look up.
        title = []
    if not title:
        # Report a missing title instead of failing with IndexError.
        sys.stderr.write("No title found in %s\n" % sys.argv[1])
        sys.exit(1)
    xml.dom.ext.PrettyPrint(title[0])