"""Functions to run cherrypy.response through Tidy or NSGML.""" import cgi import os import StringIO import traceback import cherrypy def tidy(temp_dir, tidy_path, strict_xml=False, errors_to_ignore=None, indent=False, wrap=False, warnings=True): """Run cherrypy.response through Tidy. If either 'indent' or 'wrap' are specified, then response.body will be set to the output of tidy. Otherwise, only errors (including warnings, if warnings is True) will change the body. Note that we use the standalone Tidy tool rather than the python mxTidy module. This is because this module does not seem to be stable and it crashes on some HTML pages (which means that the server would also crash) """ response = cherrypy.response # the tidy tool, by its very nature it's not generator friendly, # so we just collapse the body and work with it. orig_body = response.collapse_body() fct = response.headers.get('Content-Type', '') ct = fct.split(';')[0] encoding = '' i = fct.find('charset=') if i != -1: encoding = fct[i + 8:] if ct == 'text/html': page_file = os.path.join(temp_dir, 'page.html') open(page_file, 'wb').write(orig_body) out_file = os.path.join(temp_dir, 'tidy.out') err_file = os.path.join(temp_dir, 'tidy.err') tidy_enc = encoding.replace('-', '') if tidy_enc: tidy_enc = '-' + tidy_enc strict_xml = ("", " -xml")[bool(strict_xml)] if indent: indent = ' -indent' else: indent = '' if wrap is False: wrap = '' else: try: wrap = ' -wrap %d' % int(tidyWrap) except: wrap = '' result = os.system('"%s" %s%s%s%s -f %s -o %s %s' % (tidy_path, tidy_enc, strict_xml, indent, wrap, err_file, out_file, page_file)) use_output = bool(indent or wrap) and not result if use_output: output = open(out_file, 'rb').read() new_errs = [] for err in open(err_file, 'rb').read().splitlines(): if (err.find('Error') != -1 or (warnings and err.find('Warning') != -1)): ignore = 0 for err_ign in errors_to_ignore or []: if err.find(err_ign) != -1: ignore = 1 break if not ignore: new_errs.append(err) if new_errs: response.body = wrong_content('
'.join(new_errs), orig_body) if response.headers.has_key("Content-Length"): # Delete Content-Length header so finalize() recalcs it. del response.headers["Content-Length"] return elif strict_xml: # The HTML is OK, but is it valid XML? # Use elementtree to parse XML from elementtree.ElementTree import parse tag_list = ['nbsp', 'quot'] for tag in tag_list: orig_body = orig_body.replace('&' + tag + ';', tag.upper()) if encoding: enctag = '' % encoding orig_body = enctag + orig_body f = StringIO.StringIO(orig_body) try: tree = parse(f) except: # Wrong XML body_file = StringIO.StringIO() traceback.print_exc(file = body_file) body_file = '
'.join(body_file.getvalue()) response.body = wrong_content(body_file, orig_body, "XML") if response.headers.has_key("Content-Length"): # Delete Content-Length header so finalize() recalcs it. del response.headers["Content-Length"] return if use_output: response.body = [output] if response.headers.has_key("Content-Length"): # Delete Content-Length header so finalize() recalcs it. del response.headers["Content-Length"] def html_space(text): """Escape text, replacing space with nbsp and tab with 4 nbsp's.""" return cgi.escape(text).replace('\t', ' ').replace(' ', ' ') def html_break(text): """Escape text, replacing newline with HTML br element.""" return cgi.escape(text).replace('\n', '
') def wrong_content(header, body, content_type="HTML"): output = ["Wrong %s:
%s
" % (content_type, html_break(header))] for i, line in enumerate(body.splitlines()): output.append("%03d - %s" % (i + 1, html_space(line))) return "
".join(output) def nsgmls(temp_dir, nsgmls_path, catalog_path, errors_to_ignore=None): response = cherrypy.response # the tidy tool, by its very nature it's not generator friendly, # so we just collect the body and work with it. orig_body = response.collapse_body() fct = response.headers.get('Content-Type', '') ct = fct.split(';')[0] encoding = '' i = fct.find('charset=') if i != -1: encoding = fct[i + 8:] if ct == 'text/html': # Remove bits of Javascript (nsgmls doesn't seem to handle # them correctly (for instance, if ', i) if j == -1: break orig_body = orig_body[:i] + orig_body[j+9:] page_file = os.path.join(temp_dir, 'page.html') open(page_file, 'wb').write(orig_body) err_file = os.path.join(temp_dir, 'nsgmls.err') command = ('%s -c%s -f%s -s -E10 %s' % (nsgmls_path, catalog_path, err_file, page_file)) command = command.replace('\\', '/') os.system(command) errs = open(err_file, 'rb').read() new_errs = [] for err in errs.splitlines(): ignore = False for err_ign in errors_to_ignore or []: if err.find(err_ign) != -1: ignore = True break if not ignore: new_errs.append(err) if new_errs: response.body = wrong_content('
'.join(new_errs), orig_body) if response.headers.has_key("Content-Length"): # Delete Content-Length header so finalize() recalcs it. del response.headers["Content-Length"]