#!/usr/bin/env python
"""Automatically maintain tables of contents (TOCs) in HTML documents.

For a similar idea restricted to a single file, see Andrew Hardwick's
HtmlHeadingsToTableOfContents.pl
  http://duramecho.com/ComputerPrograms/HtmlHeadingsToTableOfContents/
"""

import io
import os.path
import sys

from lxml import etree


def _warn(message):
    """Write a one-line status or diagnostic message to stderr."""
    sys.stderr.write(message + '\n')


def _as_text(data):
    """Return `data` as a native string so it can be mixed into messages.

    Page lines are held as bytes; on Python 2 that already is `str`, so
    the value is returned untouched there.
    """
    if isinstance(data, str):
        return data
    return data.decode('utf-8', 'replace')


class ToCTree (object):
    """An HTML page together with the pages listed beneath it.

    `path` is either a page or a directory; for a directory the page is
    `index_file` inside it.  Child pages are named, one per line, in the
    file `toc_file` located in the page's directory.  `stack` is the list
    of ancestor `ToCTree` instances (outermost first) and is filled in
    automatically for children.

    Constructing an instance reads and parses the page and recursively
    constructs every descendant.
    """
    def __init__(self, path, index_file='index.html', toc_file='.html_toc',
                 stack=None):
        # NOTE(review): both markers are empty strings in this copy of the
        # file, so the TOC is placed between the first two blank lines of
        # a page.  Presumably they were meant to be distinct marker lines;
        # confirm against the upstream source.
        self._begin_tag = b''
        self._end_tag = b''
        self._index_file = index_file
        self._toc_file = toc_file
        # Avoid a shared mutable default argument.
        self._stack = [] if stack is None else stack
        self._dir, self._path = self._normalize_path(path)
        try:
            # Page content is kept as bytes from read to write so the same
            # code path serves Python 2 and Python 3.
            with open(self._path, 'rb') as page:
                self._lines = [line.rstrip() for line in page]
        except IOError:
            # A missing/unreadable page is treated as an empty page.
            self._lines = []
        self._etree, self._prefix_lines = self._parse_page()
        self._title = self._page_title()
        self._descendants = list(self._get_descendants())

    def _normalize_path(self, path):
        """Return `(directory, page_path)` for `path`.

        A directory is mapped to its index file; anything else is taken
        to be the page itself.
        """
        if os.path.isdir(path):
            dir_ = path
            path = os.path.join(dir_, self._index_file)
        else:
            dir_ = os.path.dirname(path)
        return (dir_, path)

    def _parse_page(self):
        """Parse the page with lxml's recovering XML parser.

        Returns `(tree, prefix_line_count)` where the count is the number
        of lines prepended before parsing; subtract it from lxml line
        numbers to get line numbers in the page file.  Parse errors are
        reported on stderr and do not abort.
        """
        # NOTE(review): the wrapper lines are empty here; presumably they
        # once held markup enclosing the page fragment -- confirm.
        prefix = [b'', b'']
        lines = prefix + self._lines + [b'']
        parser = etree.XMLParser(resolve_entities=False, recover=True)
        tree = etree.parse(io.BytesIO(b'\n'.join(lines)), parser=parser)
        if len(parser.error_log) > 0:
            _warn('%s: parsing error' % self._path)
            for error in parser.error_log:
                # lxml line numbers are 1-based, `lines` is 0-based.
                index = error.line - 1
                if 0 <= index < len(lines):
                    source = _as_text(lines[index])
                else:
                    source = ''
                _warn(' ' + '\n '.join([
                    '%s (line %d)' % (error.message,
                                      error.line - len(prefix)),
                    ' ' + source[:error.column],
                    ' ' + ' ' * error.column + source[error.column:],
                ]))
        return (tree, len(prefix))

    def _page_title(self):
        """Return the text of the first `h1`, else the directory's name."""
        heading = self._etree.find('.//h1')
        if heading is None:
            return os.path.basename(self._dir)
        return heading.text

    def _heading_nodes(self, path=''):
        """Build TOC entries for the page's `h2`/`h3` headings.

        Returns a list of `li` elements, one per `h2`, each holding a
        link to `path#<id>` and, when `h3` headings lie between that
        `h2` and the next one, a nested `ul` with one `li` per `h3`.
        Returns an empty list when the page has no `h1` or no `h2`.

        Raises `ValueError` for more than one `h1`, for several headings
        sharing a source line (headings are ordered by line number), and
        for an `h3` that does not follow any `h2`.
        """
        h1s = self._etree.findall('.//h1')
        if len(h1s) == 0:
            return []
        if len(h1s) > 1:
            raise ValueError('multiple h1 headings in %s: %s'
                             % (self._path, [h.text for h in h1s]))
        h2s = self._etree.findall('.//h2')
        h3s = self._etree.findall('.//h3')
        if len(h2s) == 0:
            return []
        # Sentinel line just past the last heading, closing the final h2.
        end_line = max(h.sourceline for h in h2s + h3s) + 1

        seen = set()
        duplicated = set()
        for line in [h.sourceline for h in h1s + h2s + h3s]:
            if line in seen:
                duplicated.add(line)
            seen.add(line)
        if duplicated:
            dup_lines = sorted(duplicated)
            if len(dup_lines) > 1:
                location = 'lines %s' % ','.join(
                    [str(x - self._prefix_lines) for x in dup_lines])
            else:
                location = 'line %d' % (dup_lines[0] - self._prefix_lines)
            raise ValueError('multiple headings on %s of %s'
                             % (location, self._path))

        nodes = []
        for i2, h2 in enumerate(h2s):
            n2 = etree.Element('li')
            nodes.append(n2)
            n2a = etree.Element('a', href='%s#%s' % (path, h2.get('id')))
            n2a.text = h2.text
            n2.append(n2a)
            if i2 + 1 == len(h2s):
                next_h2_line = end_line
            else:
                next_h2_line = h2s[i2 + 1].sourceline
            _h3s = [h3 for h3 in h3s
                    if h2.sourceline < h3.sourceline < next_h2_line]
            if len(_h3s) > 0:
                n2ul = etree.Element('ul')
                n2.append(n2ul)
                for h3 in _h3s:
                    # Claimed h3s are removed; leftovers are misplaced.
                    h3s.remove(h3)
                    n3 = etree.Element('li')
                    n2ul.append(n3)
                    n3a = etree.Element(
                        'a', href='%s#%s' % (path, h3.get('id')))
                    n3a.text = h3.text
                    n3.append(n3a)
        if len(h3s) > 0:
            raise ValueError('misplaced h3 on line %d of %s'
                             % (h3s[0].sourceline - self._prefix_lines,
                                self._path))
        return nodes

    def _get_descendants(self):
        """Depth first traversal.

        Yields a `ToCTree` for every entry of this directory's TOC file,
        each followed by its own descendants.  Yields nothing when the
        TOC file cannot be read.
        """
        toc_file_path = os.path.join(self._dir, self._toc_file)
        try:
            with open(toc_file_path, 'r') as toc:
                children = toc.readlines()
        except IOError:
            return
        for entry in children:
            child_file_name = entry.strip()
            if not child_file_name:
                # A blank entry would resolve to this very directory and
                # recurse without end.
                continue
            child_tree = ToCTree(
                os.path.join(self._dir, child_file_name),
                index_file=self._index_file, toc_file=self._toc_file,
                stack=self._stack + [self])
            yield child_tree
            # The child's constructor already collected its descendants;
            # reuse them instead of parsing the whole subtree again.
            for c in child_tree._descendants:
                yield c

    def set_tocs(self, clean=False):
        """Update (or, with `clean`, empty) the TOC of this page and of
        every descendant page."""
        self._set_toc(clean=clean)
        for d in self._descendants:
            d._set_toc(clean=clean)

    def _set_toc(self, clean=False):
        """Replace the lines between the TOC markers of this page.

        The page file is rewritten only when its content changes; the
        outcome ("update", "no change" or "skip (no tags)") is reported
        on stderr.
        """
        lines = list(self._lines)
        try:
            begin = lines.index(self._begin_tag)
            # Search strictly after the begin marker so that identical
            # markers still delimit a region.
            end = lines.index(self._end_tag, begin + 1)
        except ValueError:
            _warn('%s\tskip (no tags)' % self._path)
            return
        toc_lines = [] if clean else self._toc_lines()
        lines = lines[:begin + 1] + toc_lines + lines[end:]
        if lines != self._lines:
            _warn('%s\tupdate' % self._path)
            with open(self._path, 'wb') as page:
                page.write(b'\n'.join(lines) + b'\n')
        else:
            _warn('%s\tno change' % self._path)

    def _toc_lines(self):
        """Return the serialized TOC (`ul` element) as a list of lines.

        The list starts with this page's own headings, followed by one
        nested entry per descendant page.
        """
        toc = etree.Element('ul')
        toc.extend(self._heading_nodes())
        # stack[k] is the `ul` that receives descendants k+1 levels below
        # this page.
        stack = [toc]
        base_depth = len(self._stack)
        for d in self._descendants:
            # Depth must be relative to this page, not to the tree root,
            # or TOCs of non-root pages nest siblings under each other.
            depth = len(d._stack) - base_depth
            while len(stack) > depth:
                stack.pop()
            node, node_ul = d._entry(from_=self)
            stack[-1].append(node)
            if len(d._descendants) > 0:
                stack.append(node_ul)
            elif len(node_ul) == 0:
                # Leaf page without headings: drop the empty list.
                node.remove(node_ul)
        text = etree.tostring(toc, pretty_print=True)
        return text.splitlines()

    def _entry(self, from_):
        """Return `(li, ul)` describing this page in the TOC of `from_`.

        `li` holds a link to this page's directory, relative to the
        directory of `from_` and always '/'-separated, followed by `ul`,
        which holds this page's heading entries.
        """
        path = os.path.relpath(self._dir, start=from_._dir) + os.path.sep
        if os.path.sep != '/':
            # URLs always use forward slashes.
            path = path.replace(os.path.sep, '/')
        node = etree.Element('li')
        link = etree.Element('a', href=path)
        link.text = self._title
        node.append(link)
        ul = etree.Element('ul')
        node.append(ul)
        ul.extend(self._heading_nodes(path=path))
        return (node, ul)


def main(argv=None):
    """Command-line entry point: build or clean the TOCs below ROOT.

    `argv` defaults to `sys.argv[1:]`.
    """
    import optparse

    p = optparse.OptionParser(usage='%prog [options] [ROOT]')
    p.add_option('-c', '--clean', default=False, action='store_true',
                 help="Clean TOCs instead of building them.")
    options, args = p.parse_args(argv)

    index_file = 'index.shtml'
    root = args[0] if args else '.'

    tree = ToCTree(path=root, index_file=index_file)
    tree.set_tocs(clean=options.clean)


if __name__ == '__main__':
    main()