#!/usr/bin/env python

"""Automatically maintain tables of contents (TOCs) in HTML documents.

For a similar idea restricted to a single file, see Andrew Hardwick's
  HtmlHeadingsToTableOfContents.pl
  http://duramecho.com/ComputerPrograms/HtmlHeadingsToTableOfContents/
"""

import os.path
from StringIO import StringIO
import sys

from lxml import etree


class ToCTree (object):
    """One HTML page plus every page listed (recursively) beneath it.

    A page's children are named, one per line, in a TOC file
    (`toc_file`) that lives in the page's directory.  Each entry is a
    path relative to that directory; an entry naming a directory stands
    for that directory's `index_file`.

    A page's table of contents is written between a line consisting of
    `<!--TableOfContents:Begin-->` and a later line consisting of
    `<!--TableOfContents:End-->`.  Pages without both marker lines are
    left untouched.

    The module is written for Python 2 (it relies on the `StringIO`
    module); the syntax used here needs Python 2.6 or later.
    """
    def __init__(self, path, index_file='index.html', toc_file='.html_toc',
                 stack=None):
        """Load `path` and, recursively, every page below it.

        path       -- page file, or a directory containing `index_file`
        index_file -- file name used when a path names a directory
        toc_file   -- name of the per-directory file listing children
        stack      -- ancestor `ToCTree`s, outermost first (internal)

        A page that cannot be read is treated as empty rather than as
        an error.
        """
        self._begin_tag = '<!--TableOfContents:Begin-->'
        self._end_tag = '<!--TableOfContents:End-->'
        self._index_file = index_file
        self._toc_file = toc_file
        # Copy so neither a shared default nor the caller's list is
        # ever aliased between instances.
        self._stack = list(stack) if stack is not None else []
        self._dir, self._path = self._normalize_path(path)
        try:
            with open(self._path, 'rb') as f:
                self._lines = [x.rstrip() for x in f.readlines()]
        except IOError:
            self._lines = []
        self._etree, self._prefix_lines = self._parse_page()
        self._title = self._page_title()
        self._descendants = list(self._get_descendants())

    def _normalize_path(self, path):
        """Return `(directory, page_file)` for `path`.

        A directory is mapped to its index file.  A bare file name gets
        `os.curdir` as its directory, because `os.path.relpath` (used
        when linking to this page) rejects an empty path.
        """
        if os.path.isdir(path):
            dir_ = path
            path = os.path.join(dir_, self._index_file)
        else:
            dir_ = os.path.dirname(path) or os.curdir
        return (dir_, path)

    def _parse_page(self):
        """Parse the page and return `(tree, prefix_line_count)`.

        The page is wrapped in a synthetic root element so that a
        fragment with several top-level elements still parses.  The
        wrapper adds `prefix_line_count` lines before the page's own
        first line; subtract it from parser line numbers to get line
        numbers in the file.  Parse errors are reported on stderr and
        the parser recovers as best it can.
        """
        lines = ['<?xml version="1.0" encoding="UTF-8"?>',
                 '<toc-wrapper>',
                 ] + self._lines + ['</toc-wrapper>']
        prefix_lines = 2
        parser = etree.XMLParser(resolve_entities=False, recover=True)
        tree = etree.parse(StringIO('\n'.join(lines)), parser=parser)
        if len(parser.error_log) > 0:
            sys.stderr.write('%s: parsing error\n' % self._path)
            for error in parser.error_log:
                # Parser line numbers are 1-based; `lines` is 0-based.
                index = error.line - 1
                if 0 <= index < len(lines):
                    source = lines[index]
                else:
                    source = ''
                # Show the offending line split at the error column.
                sys.stderr.write('  '+'\n  '.join([
                    '%s (line %d)' % (error.message,
                                      error.line - prefix_lines),
                    '  '+source[:error.column],
                    '  '+' '*error.column + source[error.column:],
                    ]) + '\n')
        return (tree, prefix_lines)

    def _page_title(self):
        """Return the text of the first h1, else the directory name."""
        heading = self._etree.find('.//h1')
        if heading is None:
            # abspath() resolves '.' and drops any trailing separator,
            # either of which would give a useless basename.
            return os.path.basename(os.path.abspath(self._dir))
        return heading.text

    def _heading_item(self, heading, path):
        """Return an `<li>` holding a link to `heading`'s id."""
        item = etree.Element('li')
        link = etree.Element(
            'a', href='%s#%s' % (path, heading.get('id')))
        link.text = heading.text
        item.append(link)
        return item

    def _heading_nodes(self, path=''):
        """Return `<li>` elements for this page's h2/h3 headings.

        `path` is prefixed to every `#id` link so the items can be
        embedded in another page's TOC.  Each h3 is nested under the h2
        that precedes it.  Headings are ordered by source line, which
        is why two headings may not share a line.

        Returns an empty list if the page has no h1 or no h2.

        Raises ValueError for multiple h1 headings, for several
        headings on one line, and for an h3 that belongs to no h2.
        """
        h1s = self._etree.findall('.//h1')
        if len(h1s) == 0:
            return []
        elif len(h1s) > 1:
            raise ValueError('multiple h1 headings in %s: %s'
                             % (self._path, [h.text for h in h1s]))
        h1 = h1s[0]
        h2s = self._etree.findall('.//h2')
        h3s = self._etree.findall('.//h3')
        if len(h2s) == 0:
            return []
        # Sentinel "next h2" line for the last h2: past every heading.
        iZ = max([h.sourceline for h in h2s+h3s]) + 1
        lines = [h.sourceline for h in [h1]+h2s+h3s]
        # Report each offending line once.
        dup_lines = sorted(set([x for x in lines if lines.count(x) > 1]))
        if dup_lines:
            if len(dup_lines) > 1:
                location = 'lines %s' % ','.join(
                    [str(x-self._prefix_lines) for x in dup_lines])
            else:
                location = 'line %d' % (dup_lines[0] - self._prefix_lines)
            raise ValueError('multiple headings on %s of %s'
                             % (location, self._path))
        nodes = []
        unclaimed = list(h3s)
        for i2, h2 in enumerate(h2s):
            n2 = self._heading_item(h2, path)
            nodes.append(n2)

            if i2+1 == len(h2s):
                next_h2_line = iZ
            else:
                next_h2_line = h2s[i2+1].sourceline
            owned = [h3 for h3 in unclaimed
                     if h2.sourceline < h3.sourceline < next_h2_line]
            if owned:
                n2ul = etree.Element('ul')
                n2.append(n2ul)
                for h3 in owned:
                    unclaimed.remove(h3)
                    n2ul.append(self._heading_item(h3, path))
        if unclaimed:
            # Whatever is left sits outside every h2's line range.
            raise ValueError('misplaced h3 on line %d of %s'
                             % (unclaimed[0].sourceline - self._prefix_lines,
                                self._path))
        return nodes

    def _get_descendants(self):
        """Yield every page below this one, depth first.

        Children come from this directory's TOC file; a missing or
        unreadable TOC file means the page has no children.
        """
        toc_file_path = os.path.join(self._dir, self._toc_file)
        try:
            with open(toc_file_path, 'r') as f:
                children = f.readlines()
        except IOError:
            return
        for child_file_name in children:
            child_file_name = child_file_name.strip()
            if not child_file_name:
                # A blank entry would resolve to this very directory
                # and recurse without end.
                continue
            child_path = os.path.join(self._dir, child_file_name)
            child_tree = ToCTree(
                child_path, index_file=self._index_file,
                toc_file=self._toc_file, stack=self._stack+[self])
            yield child_tree
            # The child collected its own subtree while initialising;
            # reuse it rather than re-reading every page below it.
            for c in child_tree._descendants:
                yield c

    def set_tocs(self, clean=False):
        """Rewrite the TOC of this page and of every descendant.

        With `clean` true the TOCs are emptied instead of rebuilt.
        """
        self._set_toc(clean=clean)
        for d in self._descendants:
            d._set_toc(clean=clean)

    def _set_toc(self, clean=False):
        """Replace the text between this page's TOC markers.

        The file is rewritten only if its content changes.  One status
        line per page (`skip (no tags)`, `update` or `no change`) is
        written to stderr.
        """
        lines = list(self._lines)
        try:
            begin = lines.index(self._begin_tag)
            end = lines.index(self._end_tag, begin)
        except ValueError:
            sys.stderr.write('%s\tskip (no tags)\n' % self._path)
            return
        toc_lines = [] if clean else self._toc_lines()
        lines = lines[:begin+1] + toc_lines + lines[end:]
        if lines != self._lines:
            sys.stderr.write('%s\tupdate\n' % self._path)
            with open(self._path, 'wb') as f:
                f.write('\n'.join(lines)+'\n')
        else:
            sys.stderr.write('%s\tno change\n' % self._path)

    def _toc_lines(self):
        """Return this page's TOC as a list of pretty-printed lines.

        The TOC is a `<ul>` with this page's own headings followed by
        an entry for every descendant, nested to mirror the page tree.
        """
        toc = etree.Element('ul')
        toc.extend(self._heading_nodes())
        # stack[k] is the <ul> that receives pages k+1 levels below us.
        stack = [toc]
        for d in self._descendants:
            # Depth relative to this page; d._stack counts ancestors
            # all the way up to the root of the whole tree.
            depth = len(d._stack) - len(self._stack)
            while len(stack) > depth:
                stack.pop()
            node, node_ul = d._entry(from_=self)
            stack[-1].append(node)
            if len(d._descendants) > 0:
                # _entry() already attached node_ul to node.
                stack.append(node_ul)
            elif len(node_ul) == 0:
                # No headings and no sub-pages: drop the empty list.
                node.remove(node_ul)
        text = etree.tostring(toc, pretty_print=True)
        return text.splitlines()

    def _relative_url(self, from_):
        """Return the '/'-separated URL of our directory from `from_`.

        The result always ends in '/'.
        """
        path = os.path.relpath(self._dir, start=from_._dir) + os.path.sep
        if os.path.sep != '/':
            # str.replace returns a new string; keep the result.
            path = path.replace(os.path.sep, '/')
        return path

    def _entry(self, from_):
        """Return `(li, ul)` describing this page in `from_`'s TOC.

        `li` links to this page's directory using its title; `ul` is
        already attached to `li` and holds this page's headings.
        """
        path = self._relative_url(from_)
        node = etree.Element('li')
        link = etree.Element('a', href=path)
        link.text = self._title
        node.append(link)
        ul = etree.Element('ul')
        node.append(ul)
        ul.extend(self._heading_nodes(path=path))
        return (node, ul)


if __name__ == '__main__':
    import optparse

    p = optparse.OptionParser(usage='%prog [options] [ROOT]')
    p.add_option('-c', '--clean', default=False, action='store_true',
                 help="Clean TOCs instead of building them.")
    options, args = p.parse_args()

    index_file = 'index.shtml'
    root = '.'
    if len(args) > 0:
        root = args[0]

    # Hand the index file name to the tree explicitly; the class does
    # not read this module-level variable.
    tree = ToCTree(path=root, index_file=index_file)
    tree.set_tocs(clean=options.clean)