#!/usr/bin/env python # # Copyright (C) 2011 W. Trevor King # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU Lesser General Public License as # published by the Free Software Foundation, either version 3 of the # License, or (at your option) any later version. # # This program is distributed in the hope that it will be useful, but # WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # Lesser General Public License for more details. # # You should have received a copy of the GNU Lesser General Public # License along with this program. If not, see # . 'Generate CSS mimicking a specified page' from __future__ import with_statement # support Python 2.5 from codecs import BOM_UTF8 from logging import CRITICAL, DEBUG, getLogger, StreamHandler, Formatter from mimetypes import guess_extension from os import mkdir import os.path try: # Python 3 from io import StringIO except ImportError: from StringIO import StringIO import sys try: # Python 3 from urllib.request import urlopen from urllib.parse import urljoin except ImportError: from urllib2 import urlopen from urlparse import urljoin from lxml import etree from cssutils import CSSParser, replaceUrls, resolveImports from cssutils import log as _cssutils_log import cssutils # for MonkeyCSSParser DATA_DIR = 'data' DATA_URL = '/data' LOG = getLogger('get_css') LOG.setLevel(DEBUG) _STREAM_HANDLER = StreamHandler() _STREAM_HANDLER.setLevel(CRITICAL) _STREAM_HANDLER.setFormatter( Formatter('%(levelname)s - %(message)s')) LOG.addHandler(_STREAM_HANDLER) def _standardize_text(text): # remove byte-order marker (BOM) if present # possible Python parsing bug. See # http://evanjones.ca/python-utf8.html#bom text = text.lstrip(unicode(BOM_UTF8, 'utf-8')) for nl in ['\r\n', '\r']: # standardize newlines text = text.replace(nl, '\n') return text def get_page(url, standardize_text=True): LOG.info('get %s' % url) f = urlopen(url) info = f.info() _url = f.geturl() if _url != url: LOG.info('%s redirected to %s' % (url, _url)) ctype = f.headers['content-type'] body = f.read() f.close() if info.getmaintype() == 'text' and standardize_text == True: try: _type,encoding = ctype.split('charset=') except ValueError: encoding = 'utf-8' body = unicode(body, encoding) body = _standardize_text(body) return (info, body) def is_stylesheet(link): "Return `True` if the `etree._Element` `link` is a stylesheet." for attr,value in [('rel', 'stylesheet'), ('type', 'text/css')]: v = link.get(attr).lower() if v != value: return False return True def get_css(url): "Return urls for all CSS linked to from the (X)HTML at `url`." info,body = get_page(url, standardize_text=False) assert info.getmaintype() == 'text', 'invalid type %s' % info.gettype() if info.getsubtype() == 'html': parser = etree.HTMLParser() elif info.getsubtype() == 'xhtml': parser = etree.XMLParser() else: raise ValueError('invalid page type %s' % info.gettype()) x = etree.parse(StringIO(body), parser) for link in x.iterfind('.//link[@rel]'): if is_stylesheet(link): LOG.info('page %s links to %s' % (url, link.get('href'))) yield urljoin(url, link.get('href')) def _fetch_css(url): "Get CSS from `url`, check type, and print a log message." info,body = get_page(url) if info.gettype() != 'text/css': LOG.warn('invalid type for %s: %s' % (url, info.gettype())) return (None, None) LOG.info('returning CSS for %s (type %s)' % (url, type(body))) return (None, body) class MonkeyCSSParser (CSSParser): """Fix issue 48. http://code.google.com/p/cssutils/issues/detail?id=48 """ def __init__(self, *args, **kwargs): super(MonkeyCSSParser, self).__init__(*args, **kwargs) self.__fetcher = kwargs['fetcher'] def parseUrl(self, href, encoding=None, media=None, title=None): encoding, enctype, text = cssutils.util._readUrl( href, fetcher=self.__fetcher, overrideEncoding=encoding) if enctype == 5: # do not used if defaulting to UTF-8 encoding = None if text is not None: return self.parseString(text, encoding=encoding, href=href, media=media, title=title) class CSSReplacer (object): """Replace `url(...)` references in stylesheets with local values. Downloads the files, adjusting the extension if necessary, and update reference to point to the local copies. """ _mime_overrides = { 'image/jpeg': '.jpg', } def __init__(self, href=None, data_dir=None, data_url=None): self._href = href if data_dir == None: data_dir = DATA_DIR self._data_dir = data_dir if data_url == None: data_url = DATA_URL if not data_url.endswith('/'): data_url += '/' # urlljoin needs trailing slash self._data_url = data_url def __call__(self, url): full_url = urljoin(self._href, url) _url = os.path.basename(url) root,ext = os.path.splitext(_url) info,data = get_page(full_url) _type = info.gettype() if _type in self._mime_overrides: expected_ext = self._mime_overrides[_type] else: expected_ext = guess_extension(_type) if expected_ext != ext: LOG.info('changing extension for %s from %s to %s' % (full_url, ext, expected_ext)) filename = root + expected_ext target = urljoin(self._data_url, filename) LOG.info('replace url %s -> %s' % (full_url, target)) LOG.debug('download %s' % full_url) if not os.path.exists(self._data_dir): mkdir(self._data_dir) with open(os.path.join(self._data_dir, filename), 'wb') as f: f.write(data) return target def _standardize_css(sheet, **kwargs): "Post-process `sheet` to adapt it to to the local environment." sheet = resolveImports(sheet) replaceUrls(sheet, CSSReplacer(href=sheet.href, **kwargs)) return sheet def consolidate_css(urls, parser=None, **kwargs): """Get a single, standardized stylesheet combining each URL in `urls`. Missing URLs are ignored. """ if parser == None: parser = MonkeyCSSParser(fetcher=_fetch_css) lines = [] for url in urls: sheet = parser.parseUrl(url) if sheet == None: continue sheet = _standardize_css(sheet, **kwargs) lines.extend(['/* %s */' % url, '', sheet.cssText, '']) return '\n'.join(lines) if __name__ == '__main__': try: # argparse code is untested from argparse import ArgumentParser p = ArgumentParser(description=__doc__) p.add_argument('-v', '--verbose', default=0) # TODO: count p.add_argument( '-d', '--data-dir', default=DATA_DIR, dest='data_dir', help='path to downloaded image directory (%(default)).') p.add_argument( '-u', '--data-url', default=DATA_URL, dest='data_url', help='URL to downloaded image directory (%(default)).') p.add_argument( '-o', '--output', help='path to the consolidated output file (`stdout`)') p.add_argument('url', metavar='URL', help='page to mimic') args = p.parse_args() except ImportError: from optparse import OptionParser p = OptionParser(description=__doc__) p.add_option('-v', '--verbose', default=0, action='count') p.add_option( '-d', '--data-dir', default=DATA_DIR, dest='data_dir', help='path to downloaded images directory (%default).') p.add_option( '-u', '--data-url', default=DATA_URL, dest='data_url', help='URL to downloaded image directory (%default).') p.add_option( '-o', '--output', help='path to the consolidated output file (`stdout`)') options,args = p.parse_args() options.url = args[0] args = options log_level = CRITICAL - 10*args.verbose _STREAM_HANDLER.setLevel(log_level) _cssutils_log.setLevel(log_level) urls = get_css(args.url) full = consolidate_css( urls, data_dir=args.data_dir, data_url=args.data_url) bytes = full #full.encode('utf-8') if args.output == None: sys.stdout.write(bytes) else: with open(args.output, 'w') as f: f.write(bytes)