#!/usr/bin/env python
# 
# Copyright (C) 2011 W. Trevor King <wking@drexel.edu>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as
# published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this program.  If not, see
# <http://www.gnu.org/licenses/>.

'Generate CSS mimicking a specified page'

from __future__ import with_statement  # support Python 2.5

from codecs import BOM_UTF8
from logging import CRITICAL, DEBUG, getLogger, StreamHandler, Formatter
from mimetypes import guess_extension
from os import mkdir
import os.path
try:  # Python 3
    from io import StringIO
except ImportError:
    from StringIO import StringIO
import sys
try:  # Python 3
    from urllib.request import urlopen
    from urllib.parse import urljoin
except ImportError:
    from urllib2 import urlopen
    from urlparse import urljoin

from lxml import etree
from cssutils import CSSParser, replaceUrls, resolveImports
from cssutils import log as _cssutils_log
import cssutils  # for MonkeyCSSParser


DATA_DIR = 'data'
DATA_URL = '/data'

LOG = getLogger('get_css')
LOG.setLevel(DEBUG)
_STREAM_HANDLER = StreamHandler()
_STREAM_HANDLER.setLevel(CRITICAL)
_STREAM_HANDLER.setFormatter(
    Formatter('%(levelname)s - %(message)s'))
LOG.addHandler(_STREAM_HANDLER)


def _standardize_text(text):
    # remove byte-order marker (BOM) if present
    # possible Python parsing bug.  See
    #   http://evanjones.ca/python-utf8.html#bom
    text = text.lstrip(unicode(BOM_UTF8, 'utf-8'))

    for nl in ['\r\n', '\r']:  # standardize newlines
        text = text.replace(nl, '\n')
    return text

def get_page(url, standardize_text=True):
    LOG.info('get %s' % url)
    f = urlopen(url)
    info = f.info()
    _url = f.geturl()
    if _url != url:
        LOG.info('%s redirected to %s' % (url, _url))
    ctype = f.headers['content-type']
    body = f.read()
    f.close()
    if info.getmaintype() == 'text' and standardize_text == True:
        try:
            _type,encoding = ctype.split('charset=')
        except ValueError:
            encoding = 'utf-8'
        body = unicode(body, encoding)
        body = _standardize_text(body)
    return (info, body)


def is_stylesheet(link):
    "Return `True` if the `etree._Element` `link` is a stylesheet."
    for attr,value in [('rel', 'stylesheet'), ('type', 'text/css')]:
        v = link.get(attr).lower()
        if v != value:
            return False
    return True

def get_css(url):
    "Return urls for all CSS linked to from the (X)HTML at `url`."
    info,body = get_page(url, standardize_text=False)
    assert info.getmaintype() == 'text', 'invalid type %s' % info.gettype()
    if info.getsubtype() == 'html':
        parser = etree.HTMLParser()
    elif info.getsubtype() == 'xhtml':
        parser = etree.XMLParser()
    else:
        raise ValueError('invalid page type %s' % info.gettype())
    x = etree.parse(StringIO(body), parser)
    for link in x.iterfind('.//link[@rel]'):
        if is_stylesheet(link):
            LOG.info('page %s links to %s' % (url, link.get('href')))
            yield urljoin(url, link.get('href'))


def _fetch_css(url):
    "Get CSS from `url`, check type, and print a log message."
    info,body = get_page(url)
    if info.gettype() != 'text/css':
        LOG.warn('invalid type for %s: %s' % (url, info.gettype()))
        return (None, None)
    LOG.info('returning CSS for %s (type %s)' % (url, type(body)))
    return (None, body)

class MonkeyCSSParser (CSSParser):
    """Fix issue 48.

    http://code.google.com/p/cssutils/issues/detail?id=48
    """
    def __init__(self, *args, **kwargs):
        super(MonkeyCSSParser, self).__init__(*args, **kwargs)
        self.__fetcher = kwargs['fetcher']

    def parseUrl(self, href, encoding=None, media=None, title=None):
        encoding, enctype, text = cssutils.util._readUrl(
            href, fetcher=self.__fetcher,
            overrideEncoding=encoding)
        if enctype == 5:
            # do not used if defaulting to UTF-8                                
            encoding = None

        if text is not None:
            return self.parseString(text, encoding=encoding,
                                    href=href, media=media, title=title)


class CSSReplacer (object):
    """Replace `url(...)` references in stylesheets with local values.

    Downloads the files, adjusting the extension if necessary, and
    update reference to point to the local copies.
    """
    _mime_overrides = {
        'image/jpeg': '.jpg',
        }

    def __init__(self, href=None, data_dir=None, data_url=None):
        self._href = href
        if data_dir == None:
            data_dir = DATA_DIR
        self._data_dir = data_dir
        if data_url == None:
            data_url = DATA_URL
        if not data_url.endswith('/'):
            data_url += '/'  # urlljoin needs trailing slash
        self._data_url = data_url

    def __call__(self, url):
        full_url = urljoin(self._href, url)
        _url = os.path.basename(url)
        root,ext = os.path.splitext(_url)
        info,data = get_page(full_url)
        _type = info.gettype()
        if _type in self._mime_overrides:
            expected_ext = self._mime_overrides[_type]
        else:
            expected_ext = guess_extension(_type)
        if expected_ext != ext:
            LOG.info('changing extension for %s from %s to %s'
                     % (full_url, ext, expected_ext))
        filename = root + expected_ext
        target = urljoin(self._data_url, filename)
        LOG.info('replace url %s -> %s' % (full_url, target))
        LOG.debug('download %s' % full_url)
        if not os.path.exists(self._data_dir):
            mkdir(self._data_dir)
        with open(os.path.join(self._data_dir, filename), 'wb') as f:
            f.write(data)
        return target


def _standardize_css(sheet, **kwargs):
    "Post-process `sheet` to adapt it to to the local environment."
    sheet = resolveImports(sheet)
    replaceUrls(sheet, CSSReplacer(href=sheet.href, **kwargs))
    return sheet

def consolidate_css(urls, parser=None, **kwargs):
    """Get a single, standardized stylesheet combining each URL in `urls`.

    Missing URLs are ignored.
    """
    if parser == None:
        parser = MonkeyCSSParser(fetcher=_fetch_css)
    lines = []
    for url in urls:
        sheet = parser.parseUrl(url)
        if sheet == None:
            continue
        sheet = _standardize_css(sheet, **kwargs)
        lines.extend(['/* %s */' % url, '', sheet.cssText, ''])
    return '\n'.join(lines)


if __name__ == '__main__':
    try:  # argparse code is untested
        from argparse import ArgumentParser

        p = ArgumentParser(description=__doc__)
        p.add_argument('-v', '--verbose', default=0)  # TODO: count
        p.add_argument(
            '-d', '--data-dir', default=DATA_DIR, dest='data_dir',
            help='path to downloaded image directory (%(default)).')
        p.add_argument(
            '-u', '--data-url', default=DATA_URL, dest='data_url',
            help='URL to downloaded image directory (%(default)).')
        p.add_argument(
            '-o', '--output',
            help='path to the consolidated output file (`stdout`)')
        p.add_argument('url', metavar='URL', help='page to mimic')
        args = p.parse_args()
    except ImportError:
        from optparse import OptionParser
        p = OptionParser(description=__doc__)
        p.add_option('-v', '--verbose', default=0, action='count')
        p.add_option(
            '-d', '--data-dir', default=DATA_DIR, dest='data_dir',
            help='path to downloaded images directory (%default).')
        p.add_option(
            '-u', '--data-url', default=DATA_URL, dest='data_url',
            help='URL to downloaded image directory (%default).')
        p.add_option(
            '-o', '--output',
            help='path to the consolidated output file (`stdout`)')
        options,args = p.parse_args()
        options.url = args[0]
        args = options

    log_level = CRITICAL - 10*args.verbose
    _STREAM_HANDLER.setLevel(log_level)
    _cssutils_log.setLevel(log_level)

    urls = get_css(args.url)
    full = consolidate_css(
        urls, data_dir=args.data_dir, data_url=args.data_url)
    bytes = full  #full.encode('utf-8')

    if args.output == None:
        sys.stdout.write(bytes)
    else:
        with open(args.output, 'w') as f:
            f.write(bytes)