1 """classes and functions used by cssutils scripts
2 """
3 __all__ = ['CSSCapture', 'csscombine']
4 __docformat__ = 'restructuredtext'
5 __version__ = '$Id: parse.py 1323 2008-07-06 18:13:57Z cthedot $'
6
7 import codecs
8 import errno
9 import HTMLParser
10 import logging
11 import os
12 import sys
13 import urllib2
14 import urlparse
15
16 import cssutils
17 try:
18 import cssutils.encutils as encutils
19 except ImportError:
20 try:
21 import encutils
22 except ImportError:
23 sys.exit("You need encutils from http://cthedot.de/encutils/")
24
25
# Type tags for the (type, data) entries collected while parsing an HTML
# document: LINK marks data taken from a <link> element, STYLE marks data
# taken from a <style> element (attributes plus the embedded CSS text).
LINK = 0
STYLE = 1
28
30 """CSSCapture helper: Parse given data for link and style elements"""
31 curtag = u''
32 sheets = []
33
35 return dict([(a.lower(), v.lower()) for a, v in atts])
36
51
53 if self.curtag == u'style':
54 self.sheets[-1][1][1] = data
55
59
63
64
66 """
67 Retrieve all CSS stylesheets including embedded for a given URL.
68 Optional setting of User-Agent used for retrieval possible
69 to handle browser sniffing servers.
70
71 raises urllib2.HTTPError
72 """
73 - def __init__(self, ua=None, log=None, defaultloglevel=logging.INFO):
74 """
75 initialize a new Capture object
76
77 ua
78 init User-Agent to use for requests
79 log
80 supply a log object which is used instead of the default
81 log which writes to sys.stderr
82 defaultloglevel
83 constant of logging package which defines the level of the
84 default log if no explicit log given
85 """
86 self._ua = ua
87
88 if log:
89 self._log = log
90 else:
91 self._log = logging.getLogger('CSSCapture')
92 hdlr = logging.StreamHandler(sys.stderr)
93 formatter = logging.Formatter('%(message)s')
94 hdlr.setFormatter(formatter)
95 self._log.addHandler(hdlr)
96 self._log.setLevel(defaultloglevel)
97 self._log.debug(u'Using default log')
98
99 self._htmlparser = CSSCaptureHTMLParser()
100 self._cssparser = cssutils.CSSParser(log = self._log)
101
103 """Do an HTTP request
104
105 Return (url, rawcontent)
106 url might have been changed by server due to redirects etc
107 """
108 self._log.debug(u' CSSCapture._doRequest\n * URL: %s' % url)
109
110 req = urllib2.Request(url)
111 if self._ua:
112 req.add_header('User-agent', self._ua)
113 self._log.info(' * Using User-Agent: %s', self._ua)
114
115 try:
116 res = urllib2.urlopen(req)
117 except urllib2.HTTPError, e:
118 self._log.critical(' %s\n%s %s\n%s' % (
119 e.geturl(), e.code, e.msg, e.headers))
120 return None, None
121
122
123 if url != res.geturl():
124 url = res.geturl()
125 self._log.info(' URL retrieved: %s', url)
126
127 return url, res
128
129 - def _createStyleSheet(self, href=None,
130 media=None,
131 parentStyleSheet=None,
132 title=u'',
133 cssText=None,
134 encoding=None):
155
157 """
158 parse text for stylesheets
159 fills stylesheetlist with all found StyleSheets
160
161 docurl
162 to build a full url of found StyleSheets @href
163 doctext
164 to parse
165 """
166
167 self._htmlparser.feed(doctext)
168
169 for typ, data in self._htmlparser.sheets:
170 sheet = None
171
172 if LINK == typ:
173 self._log.info(u'+ PROCESSING <link> %r' % data)
174
175 atts = data
176 href = urlparse.urljoin(docurl, atts.get(u'href', None))
177 sheet = self._createStyleSheet(href=href,
178 media=atts.get(u'media', None),
179 title=atts.get(u'title', None))
180 elif STYLE == typ:
181 self._log.info(u'+ PROCESSING <style> %r' % data)
182
183 atts, cssText = data
184 sheet = self._createStyleSheet(cssText=cssText,
185 href = docurl,
186 media=atts.get(u'media', None),
187 title=atts.get(u'title', None),
188 encoding=self.docencoding)
189 if sheet:
190 sheet._href = None
191 print sheet.cssText
192
193 if sheet:
194 self.stylesheetlist.append(sheet)
195 self._doImports(sheet, base=docurl)
196
197
198 - def _doImports(self, parentStyleSheet, base=None):
215
217 """
218 Capture all stylesheets at given URL's HTML document.
219 Any HTTPError is raised to caller.
220
221 url
222 to capture CSS from
223
224 Returns ``cssutils.stylesheets.StyleSheetList``.
225 """
226 self._log.info(u'\nCapturing CSS from URL:\n %s\n', url)
227 self._nonparsed = {}
228 self.stylesheetlist = cssutils.stylesheets.StyleSheetList()
229
230
231 scheme, loc, path, query, fragment = urlparse.urlsplit(url)
232 self._filename = os.path.basename(path)
233
234
235 url, res = self._doRequest(url)
236 if not res:
237 sys.exit(1)
238
239 rawdoc = res.read()
240
241 self.docencoding = encutils.getEncodingInfo(
242 res, rawdoc, log=self._log).encoding
243 self._log.info(u'\nUsing Encoding: %s\n', self.docencoding)
244
245 doctext = rawdoc.decode(self.docencoding)
246
247
248 self._findStyleSheets(url, doctext)
249
250 return self.stylesheetlist
251
252 - def saveto(self, dir, saveraw=False, minified=False):
253 """
254 saves css in "dir" in the same layout as on the server
255 internal stylesheets are saved as "dir/__INLINE_STYLE__.html.css"
256
257 dir
258 directory to save files to
259 saveparsed
260 save literal CSS from server or save the parsed CSS
261 minified
262 save minified CSS
263
264 Both parsed and minified (which is also parsed of course) will
265 loose information which cssutils is unable to understand or where
266 it is simple buggy. You might to first save the raw version before
267 parsing of even minifying it.
268 """
269 msg = 'parsed'
270 if saveraw:
271 msg = 'raw'
272 if minified:
273 cssutils.ser.prefs.useMinified()
274 msg = 'minified'
275
276 inlines = 0
277 for i, sheet in enumerate(self.stylesheetlist):
278 url = sheet.href
279 if not url:
280 inlines += 1
281 url = u'%s_INLINE_%s.css' % (self._filename, inlines)
282
283
284 scheme, loc, path, query, fragment = urlparse.urlsplit(url)
285
286 if path and path.startswith('/'):
287 path = path[1:]
288 path = os.path.normpath(path)
289 path, fn = os.path.split(path)
290 savepath = os.path.join(dir, path)
291 savefn = os.path.join(savepath, fn)
292 try:
293 os.makedirs(savepath)
294 except OSError, e:
295 if e.errno != errno.EEXIST:
296 raise e
297 self._log.debug(u'Path "%s" already exists.', savepath)
298
299 self._log.info(u'SAVING %s, %s %r' % (i+1, msg, savefn))
300
301 sf = open(savefn, 'wb')
302 if saveraw:
303 cssText = self._nonparsed[sheet]
304 uf = codecs.getwriter('css')(sf)
305 uf.write(cssText)
306 else:
307 sf.write(sheet.cssText)
308 sf.close()
309
310
def csscombine(proxypath, sourceencoding=None, targetencoding='utf-8',
               minify=True):
    """Combine sheets referred to by @import rules in given CSS proxy sheet
    into a single new sheet.

    :returns: combined cssText, normal or minified
    :Parameters:
        `proxypath`
            path to a CSSStyleSheet file which imports other sheets which
            are then combined into one sheet; the href of each @import is
            joined to the directory of this file to find the imported sheet
        `sourceencoding`
            encoding of the source sheets including the proxy sheet
        `targetencoding`
            encoding of the combined stylesheet, default 'utf-8'
        `minify`
            defines if the combined sheet should be minified, default True

    Only @import rules of the proxy sheet itself are resolved. @import
    rules found inside an imported sheet are logged and added to the
    combined sheet as they are.
    """
    log = cssutils.log

    log.info('Combining files in proxy %r' % proxypath, neverraise=True)

    if sourceencoding is not None:
        log.info('Using source encoding %r' % sourceencoding,
                 neverraise=True)

    src = cssutils.parseFile(proxypath, encoding=sourceencoding)
    srcpath = os.path.dirname(proxypath)
    combined = cssutils.css.CSSStyleSheet()
    for rule in src.cssRules:
        if rule.type != rule.IMPORT_RULE:
            # any other rule of the proxy sheet is taken over as is
            combined.add(rule)
            continue

        fn = os.path.join(srcpath, rule.href)
        log.info('Processing @import %r' % fn,
                 neverraise=True)
        importsheet = cssutils.parseFile(fn, encoding=sourceencoding)
        # the combined sheet gets its own encoding set below
        importsheet.encoding = None
        # keep a trace of the replaced @import rule as a comment
        combined.add(cssutils.css.CSSComment(cssText=u'/* %s */' %
                                             rule.cssText))
        for x in importsheet.cssRules:
            if x.type == x.IMPORT_RULE:
                log.info('Nested @imports are not combined: %s' % x.cssText,
                         neverraise=True)

            combined.add(x)

    log.info('Setting target encoding %r' % targetencoding, neverraise=True)
    combined.encoding = targetencoding

    if not minify:
        return combined.cssText

    # Serialize with a temporary minifying serializer. The previous global
    # serializer is restored even if serializing fails so a failure here
    # does not leave cssutils in minified mode for other callers.
    oldser = cssutils.ser
    cssutils.setSerializer(cssutils.serialize.CSSSerializer())
    try:
        cssutils.ser.prefs.useMinified()
        return combined.cssText
    finally:
        cssutils.setSerializer(oldser)
372