Tryag File Manager
Home
-
Turbo Force
Current Path :
/
usr
/
lib
/
python2.4
/
site-packages
/
lxml
/
Upload File :
New :
File
Dir
//usr/lib/python2.4/site-packages/lxml/cssselect.py
"""CSS Selectors based on XPath.

This module supports selecting XML/HTML tags based on CSS selectors.
See the `CSSSelector` class for details.
"""

import re
import sys
from lxml import etree

__all__ = ['SelectorSyntaxError', 'ExpressionError',
           'CSSSelector']


class SelectorSyntaxError(SyntaxError):
    """Raised when a CSS selector string cannot be tokenized or parsed."""
    pass


class ExpressionError(RuntimeError):
    """Raised when a parsed selector cannot be translated to XPath."""
    pass


class CSSSelector(etree.XPath):
    """A CSS selector.

    Usage::

        >>> from lxml import etree, cssselect
        >>> select = cssselect.CSSSelector("a tag > child")

        >>> root = etree.XML("<a><b><c/><tag><child>TEXT</child></tag></b></a>")
        >>> [ el.tag for el in select(root) ]
        ['child']
    """

    def __init__(self, css):
        # Translate once at construction; the original CSS text is kept
        # only for __repr__.
        path = css_to_xpath(css)
        etree.XPath.__init__(self, path)
        self.css = css

    def __repr__(self):
        return '<%s %s for %r>' % (
            self.__class__.__name__,
            hex(abs(id(self)))[2:],
            self.css)


##############################
## Token objects:

class _UniToken(unicode):
    """A unicode string that also remembers its position (``pos``) in the
    selector source it was tokenized from."""

    def __new__(cls, contents, pos):
        obj = unicode.__new__(cls, contents)
        obj.pos = pos
        return obj

    def __repr__(self):
        return '%s(%s, %r)' % (
            self.__class__.__name__,
            unicode.__repr__(self),
            self.pos)


class Symbol(_UniToken):
    pass


class String(_UniToken):
    pass


class Token(_UniToken):
    pass


############################################################
## Parsing
############################################################

##############################
## Syntax objects:

class Class(object):
    """
    Represents selector.class_name
    """

    def __init__(self, selector, class_name):
        self.selector = selector
        self.class_name = class_name

    def __repr__(self):
        return '%s[%r.%s]' % (
            self.__class__.__name__,
            self.selector,
            self.class_name)

    def xpath(self):
        """Return the selector's XPathExpr with a class-membership
        condition added."""
        sel_xpath = self.selector.xpath()
        # Pad both sides with spaces so the class name is matched as a
        # whole whitespace-separated word.
        sel_xpath.add_condition(
            "contains(concat(' ', normalize-space(@class), ' '), %s)"
            % xpath_repr(' '+self.class_name+' '))
        return sel_xpath


class Function(object):
    """
    Represents selector:name(expr)
    """

    unsupported = [
        'target', 'lang', 'enabled', 'disabled',]

    def __init__(self, selector, type, name, expr):
        self.selector = selector
        self.type = type
        self.name = name
        self.expr = expr

    def __repr__(self):
        return '%s[%r%s%s(%r)]' % (
            self.__class__.__name__,
            self.selector,
            self.type, self.name, self.expr)

    def xpath(self):
        """Dispatch to the ``_xpath_<name>`` method for this function.

        Raises ExpressionError when the name is listed in ``unsupported``
        or no matching method exists.
        """
        sel_path = self.selector.xpath()
        if self.name in self.unsupported:
            raise ExpressionError(
                "The psuedo-class %r is not supported" % self.name)
        method = '_xpath_' + self.name.replace('-', '_')
        if not hasattr(self, method):
            raise ExpressionError(
                "The psuedo-class %r is unknown" % self.name)
        method = getattr(self, method)
        return method(sel_path, self.expr)

    def _xpath_nth_child(self, xpath, expr, last=False,
                         add_name_test=True):
        """Add position conditions for an ``an+b`` series to ``xpath``.

        ``last`` selects the counting-from-the-end variants;
        ``add_name_test`` turns the element name into a ``name()``
        condition before the star prefix is added.
        """
        a, b = parse_series(expr)
        if not a and not b and not last:
            # a=0 means nothing is returned...
            xpath.add_condition('false() and position() = 0')
            return xpath
        if add_name_test:
            xpath.add_name_test()
        xpath.add_star_prefix()
        if a == 0:
            if last:
                b = 'last() - %s' % b
            xpath.add_condition('position() = %s' % b)
            return xpath
        if last:
            # FIXME: I'm not sure if this is right
            a = -a
            b = -b
        if b > 0:
            b_neg = str(-b)
        else:
            b_neg = '+%s' % (-b)
        if a != 1:
            expr = ['(position() %s) mod %s = 0' % (b_neg, a)]
        else:
            expr = []
        if b >= 0:
            expr.append('position() >= %s' % b)
        elif b < 0 and last:
            expr.append('position() < (last() %s)' % b)
        expr = ' and '.join(expr)
        if expr:
            xpath.add_condition(expr)
        return xpath
        # FIXME: handle an+b, odd, even
        # an+b means every-a, plus b, e.g., 2n+1 means odd
        # 0n+b means b
        # n+0 means a=1, i.e., all elements
        # an means every a elements, i.e., 2n means even
        # -n means -1n
        # -1n+6 means elements 6 and previous

    def _xpath_nth_last_child(self, xpath, expr):
        return self._xpath_nth_child(xpath, expr, last=True)

    def _xpath_nth_of_type(self, xpath, expr):
        if xpath.element == '*':
            raise NotImplementedError(
                "*:nth-of-type() is not implemented")
        return self._xpath_nth_child(xpath, expr, add_name_test=False)

    def _xpath_nth_last_of_type(self, xpath, expr):
        return self._xpath_nth_child(xpath, expr, last=True,
                                     add_name_test=False)

    def _xpath_contains(self, xpath, expr):
        # text content, minus tags, must contain expr
        if isinstance(expr, Element):
            expr = expr._format_element()
        xpath.add_condition('contains(css:lower-case(string(.)), %s)'
                            % xpath_repr(expr.lower()))
        # FIXME: Currently case insensitive matching doesn't seem to be happening
        return xpath

    def _xpath_not(self, xpath, expr):
        # everything for which not expr applies
        expr = expr.xpath()
        cond = expr.condition
        # FIXME: should I do something about element_path?
        xpath.add_condition('not(%s)' % cond)
        return xpath


def _make_lower_case(context, s):
    """XPath extension function registered below as ``css:lower-case``."""
    return s.lower()

ns = etree.FunctionNamespace('http://codespeak.net/lxml/css/')
ns.prefix = 'css'
ns['lower-case'] = _make_lower_case


class Pseudo(object):
    """
    Represents selector:ident
    """

    unsupported = ['indeterminate', 'first-line', 'first-letter',
                   'selection', 'before', 'after', 'link', 'visited',
                   'active', 'focus', 'hover']

    def __init__(self, element, type, ident):
        self.element = element
        assert type in (':', '::')
        self.type = type
        self.ident = ident

    def __repr__(self):
        return '%s[%r%s%s]' % (
            self.__class__.__name__,
            self.element,
            self.type, self.ident)

    def xpath(self):
        """Dispatch to the ``_xpath_<ident>`` method for this pseudo-class.

        Raises ExpressionError when the ident is listed in ``unsupported``
        or no matching method exists.
        """
        el_xpath = self.element.xpath()
        if self.ident in self.unsupported:
            raise ExpressionError(
                "The psuedo-class %r is unsupported" % self.ident)
        method = '_xpath_' + self.ident.replace('-', '_')
        if not hasattr(self, method):
            raise ExpressionError(
                "The psuedo-class %r is unknown" % self.ident)
        method = getattr(self, method)
        el_xpath = method(el_xpath)
        return el_xpath

    def _xpath_checked(self, xpath):
        # FIXME: is this really all the elements?
        xpath.add_condition("(@selected or @checked) and (name(.) = 'input' or name(.) = 'option')")
        return xpath

    def _xpath_root(self, xpath):
        # if this element is the root element
        raise NotImplementedError

    def _xpath_first_child(self, xpath):
        xpath.add_star_prefix()
        xpath.add_name_test()
        xpath.add_condition('position() = 1')
        return xpath

    def _xpath_last_child(self, xpath):
        xpath.add_star_prefix()
        xpath.add_name_test()
        xpath.add_condition('position() = last()')
        return xpath

    def _xpath_first_of_type(self, xpath):
        if xpath.element == '*':
            raise NotImplementedError(
                "*:first-of-type is not implemented")
        xpath.add_star_prefix()
        xpath.add_condition('position() = 1')
        return xpath

    def _xpath_last_of_type(self, xpath):
        if xpath.element == '*':
            raise NotImplementedError(
                "*:last-of-type is not implemented")
        xpath.add_star_prefix()
        xpath.add_condition('position() = last()')
        return xpath

    def _xpath_only_child(self, xpath):
        xpath.add_name_test()
        xpath.add_star_prefix()
        xpath.add_condition('last() = 1')
        return xpath

    def _xpath_only_of_type(self, xpath):
        if xpath.element == '*':
            raise NotImplementedError(
                "*:only-of-type is not implemented")
        xpath.add_condition('last() = 1')
        return xpath

    def _xpath_empty(self, xpath):
        xpath.add_condition("not(*) and not(normalize-space())")
        return xpath


class Attrib(object):
    """
    Represents selector[namespace|attrib operator value]
    """

    def __init__(self, selector, namespace, attrib, operator, value):
        self.selector = selector
        self.namespace = namespace
        self.attrib = attrib
        self.operator = operator
        self.value = value

    def __repr__(self):
        if self.operator == 'exists':
            return '%s[%r[%s]]' % (
                self.__class__.__name__,
                self.selector,
                self._format_attrib())
        else:
            return '%s[%r[%s %s %r]]' % (
                self.__class__.__name__,
                self.selector,
                self._format_attrib(),
                self.operator,
                self.value)

    def _format_attrib(self):
        if self.namespace == '*':
            return self.attrib
        else:
            return '%s|%s' % (self.namespace, self.attrib)

    def _xpath_attrib(self):
        # FIXME: if attrib is *?
        if self.namespace == '*':
            return '@' + self.attrib
        else:
            return '@%s:%s' % (self.namespace, self.attrib)

    def xpath(self):
        """Return the selector's XPathExpr with the attribute test for
        ``self.operator`` added as a condition."""
        path = self.selector.xpath()
        attrib = self._xpath_attrib()
        value = self.value
        if self.operator == 'exists':
            assert not value
            path.add_condition(attrib)
        elif self.operator == '=':
            path.add_condition('%s = %s' % (attrib,
                                            xpath_repr(value)))
        elif self.operator == '!=':
            # FIXME: this seems like a weird hack...
            if value:
                path.add_condition('not(%s) or %s != %s'
                                   % (attrib, attrib, xpath_repr(value)))
            else:
                path.add_condition('%s != %s'
                                   % (attrib, xpath_repr(value)))
        elif self.operator == '~=':
            path.add_condition("contains(concat(' ', normalize-space(%s), ' '), %s)" % (attrib, xpath_repr(' '+value+' ')))
        elif self.operator == '|=':
            # Weird, but true...
            path.add_condition('%s = %s or starts-with(%s, %s)' % (
                attrib, xpath_repr(value),
                attrib, xpath_repr(value + '-')))
        elif self.operator == '^=':
            path.add_condition('starts-with(%s, %s)' % (
                attrib, xpath_repr(value)))
        elif self.operator == '$=':
            # Oddly there is a starts-with in XPath 1.0, but not ends-with
            path.add_condition('substring(%s, string-length(%s)-%s) = %s'
                               % (attrib, attrib, len(value)-1, xpath_repr(value)))
        elif self.operator == '*=':
            # FIXME: case sensitive?
            path.add_condition('contains(%s, %s)' % (
                attrib, xpath_repr(value)))
        else:
            assert 0, ("Unknown operator: %r" % self.operator)
        return path


class Element(object):
    """
    Represents namespace|element
    """

    def __init__(self, namespace, element):
        self.namespace = namespace
        self.element = element

    def __repr__(self):
        return '%s[%s]' % (
            self.__class__.__name__,
            self._format_element())

    def _format_element(self):
        if self.namespace == '*':
            return self.element
        else:
            return '%s|%s' % (self.namespace, self.element)

    def xpath(self):
        if self.namespace == '*':
            el = self.element.lower()
        else:
            # FIXME: Should we lowercase here?
            el = '%s:%s' % (self.namespace, self.element)
        return XPathExpr(element=el)


class Hash(object):
    """
    Represents selector#id
    """

    def __init__(self, selector, id):
        self.selector = selector
        self.id = id

    def __repr__(self):
        return '%s[%r#%s]' % (
            self.__class__.__name__,
            self.selector, self.id)

    def xpath(self):
        path = self.selector.xpath()
        path.add_condition('@id = %s' % xpath_repr(self.id))
        return path


class Or(object):
    """Represents a comma-separated group of selectors."""

    def __init__(self, items):
        self.items = items

    def __repr__(self):
        return '%s(%r)' % (
            self.__class__.__name__,
            self.items)

    def xpath(self):
        paths = [item.xpath() for item in self.items]
        return XPathExprOr(paths)


class CombinedSelector(object):
    """Represents two selectors joined by a combinator
    (``' '``, ``>``, ``+`` or ``~``)."""

    _method_mapping = {
        ' ': 'descendant',
        '>': 'child',
        '+': 'direct_adjacent',
        '~': 'indirect_adjacent',
        }

    def __init__(self, selector, combinator, subselector):
        assert selector is not None
        self.selector = selector
        self.combinator = combinator
        self.subselector = subselector

    def __repr__(self):
        if self.combinator == ' ':
            comb = '<followed>'
        else:
            comb = self.combinator
        return '%s[%r %s %r]' % (
            self.__class__.__name__,
            self.selector,
            comb,
            self.subselector)

    def xpath(self):
        """Dispatch on the combinator; raises ExpressionError for a
        combinator missing from ``_method_mapping``."""
        if self.combinator not in self._method_mapping:
            raise ExpressionError(
                "Unknown combinator: %r" % self.combinator)
        method = '_xpath_' + self._method_mapping[self.combinator]
        method = getattr(self, method)
        path = self.selector.xpath()
        return method(path, self.subselector)

    def _xpath_descendant(self, xpath, sub):
        # when sub is a descendant in any way of xpath
        xpath.join('/descendant::', sub.xpath())
        return xpath

    def _xpath_child(self, xpath, sub):
        # when sub is an immediate child of xpath
        xpath.join('/', sub.xpath())
        return xpath

    def _xpath_direct_adjacent(self, xpath, sub):
        # when sub immediately follows xpath
        xpath.join('/following-sibling::', sub.xpath())
        xpath.add_name_test()
        xpath.add_condition('position() = 1')
        return xpath

    def _xpath_indirect_adjacent(self, xpath, sub):
        # when sub comes somewhere after xpath as a sibling
        xpath.join('/following-sibling::', sub.xpath())
        return xpath


##############################
## XPathExpr objects:

_el_re = re.compile(r'^\w+\s*$')
_id_re = re.compile(r'^(\w*)#(\w+)\s*$')
_class_re = re.compile(r'^(\w*)\.(\w+)\s*$')


def css_to_xpath(css_expr, prefix='descendant-or-self::'):
    """Translate a CSS selector into an XPath expression string.

    When ``css_expr`` is a string, three regex fast paths (bare element,
    ``elem#id``, ``elem.class``) return directly without parsing; any
    other string is parsed with ``parse()``.  A non-string ``css_expr``
    is used as-is and must provide an ``xpath()`` method.  ``prefix``,
    when truthy, is prepended to the resulting expression.
    """
    if isinstance(css_expr, basestring):
        match = _el_re.search(css_expr)
        if match is not None:
            return '%s%s' % (prefix, match.group(0).strip())
        match = _id_re.search(css_expr)
        if match is not None:
            return "%s%s[@id = '%s']" % (
                prefix, match.group(1) or '*', match.group(2))
        match = _class_re.search(css_expr)
        if match is not None:
            return "%s%s[contains(concat(' ', normalize-space(@class), ' '), ' %s ')]" % (
                prefix, match.group(1) or '*', match.group(2))
        css_expr = parse(css_expr)
    expr = css_expr.xpath()
    assert expr is not None, (
        "Got None for xpath expression from %s" % repr(css_expr))
    if prefix:
        expr.add_prefix(prefix)
    return str(expr)


class XPathExpr(object):
    """A mutable XPath expression under construction, rendered by
    ``str()`` as ``prefix + path + element + [condition]``."""

    def __init__(self, prefix=None, path=None, element='*', condition=None,
                 star_prefix=False):
        self.prefix = prefix
        self.path = path
        self.element = element
        self.condition = condition
        self.star_prefix = star_prefix

    def __str__(self):
        path = ''
        if self.prefix is not None:
            path += str(self.prefix)
        if self.path is not None:
            path += str(self.path)
        path += str(self.element)
        if self.condition:
            path += '[%s]' % self.condition
        return path

    def __repr__(self):
        return '%s[%s]' % (
            self.__class__.__name__, self)

    def add_condition(self, condition):
        """AND ``condition`` onto any existing condition."""
        if self.condition:
            self.condition = '%s and (%s)' % (self.condition, condition)
        else:
            self.condition = condition

    def add_path(self, part):
        """Move the current element onto the path and make ``part`` the
        new element."""
        if self.path is None:
            self.path = self.element
        else:
            self.path += self.element
        self.element = part

    def add_prefix(self, prefix):
        if self.prefix:
            self.prefix = prefix + self.prefix
        else:
            self.prefix = prefix

    def add_name_test(self):
        """Replace a concrete element name with ``*`` plus an equivalent
        ``name()`` condition."""
        if self.element == '*':
            # We weren't doing a test anyway
            return
        self.add_condition("name() = %s" % xpath_repr(self.element))
        self.element = '*'

    def add_star_prefix(self):
        """
        Adds a /* prefix if there is no prefix.  This is when you need
        to keep context's constrained to a single parent.
        """
        if self.path:
            self.path += '*/'
        else:
            self.path = '*/'
        self.star_prefix = True

    def join(self, combiner, other):
        """Render this expression, append ``combiner``, and continue with
        ``other``'s path, element and condition."""
        prefix = str(self)
        prefix += combiner
        path = (other.prefix or '') + (other.path or '')
        # We don't need a star prefix if we are joining to this other
        # prefix; so we'll get rid of it
        if other.star_prefix and path == '*/':
            path = ''
        self.prefix = prefix
        self.path = path
        self.element = other.element
        self.condition = other.condition


class XPathExprOr(XPathExpr):
    """
    Represents |'d expressions.  Note that unfortunately it isn't
    the union, it's the sum, so duplicate elements will appear.
    """

    def __init__(self, items, prefix=None):
        for item in items:
            assert item is not None
        self.items = items
        self.prefix = prefix

    def __str__(self):
        prefix = self.prefix or ''
        return ' | '.join([prefix + str(i) for i in self.items])


def xpath_repr(s):
    """Return ``s`` formatted as a quoted literal via ``repr(str(s))``."""
    # FIXME: I don't think this is right, but lacking any reasonable
    # specification on what XPath literals look like (which doesn't seem
    # to be in the XPath specification) it is hard to do 'right'
    if isinstance(s, Element):
        # This is probably a symbol that looks like an expression...
        s = s._format_element()
    return repr(str(s))


##############################
## Parsing functions

def parse(string):
    """Tokenize and parse a CSS selector string into syntax objects.

    A SelectorSyntaxError raised while parsing is re-raised with its
    args replaced by a message that also lists the tokens consumed so
    far and the tokens remaining.
    """
    stream = TokenStream(tokenize(string))
    stream.source = string
    try:
        return parse_selector_group(stream)
    except SelectorSyntaxError:
        # sys.exc_info() instead of "except X, e" keeps the syntax
        # valid across Python versions.
        e = sys.exc_info()[1]
        e.args = tuple(["%s at %s -> %s" % (
            e, stream.used, list(stream))])
        raise


def parse_selector_group(stream):
    """Parse comma-separated selectors; returns the single selector, or
    an ``Or`` when there is more than one."""
    result = []
    while 1:
        result.append(parse_selector(stream))
        if stream.peek() == ',':
            stream.next()
        else:
            break
    if len(result) == 1:
        return result[0]
    else:
        return Or(result)


def parse_selector(stream):
    """Parse simple selectors joined by combinators, stopping at ``,``
    or end of input."""
    result = parse_simple_selector(stream)
    while 1:
        peek = stream.peek()
        if peek == ',' or peek is None:
            return result
        elif peek in ('+', '>', '~'):
            # A combinator
            combinator = stream.next()
        else:
            # Anything else is treated as the descendant combinator.
            combinator = ' '
        next_selector = parse_simple_selector(stream)
        result = CombinedSelector(result, combinator, next_selector)
    return result


def parse_simple_selector(stream):
    """Parse an optional ``namespace|element`` followed by any number of
    ``#id``, ``.class``, ``[attrib]`` and ``:pseudo`` / ``:func(...)``
    parts.

    Raises SelectorSyntaxError when a symbol, ``]`` or ``)`` is expected
    but something else is found.
    """
    peek = stream.peek()
    if peek != '*' and not isinstance(peek, Symbol):
        element = namespace = '*'
    else:
        next = stream.next()
        if next != '*' and not isinstance(next, Symbol):
            raise SelectorSyntaxError(
                "Expected symbol, got %r" % next)
        if stream.peek() == '|':
            namespace = next
            stream.next()
            element = stream.next()
            # Validate the element token itself; the namespace token
            # was already validated above.
            if element != '*' and not isinstance(element, Symbol):
                raise SelectorSyntaxError(
                    "Expected symbol, got %r" % element)
        else:
            namespace = '*'
            element = next
    result = Element(namespace, element)
    has_hash = False
    while 1:
        peek = stream.peek()
        if peek == '#':
            if has_hash:
                # You can't have two hashes
                # (FIXME: is there some more general rule I'm missing?)
                break
            stream.next()
            result = Hash(result, stream.next())
            has_hash = True
            continue
        elif peek == '.':
            stream.next()
            result = Class(result, stream.next())
            continue
        elif peek == '[':
            stream.next()
            result = parse_attrib(result, stream)
            next = stream.next()
            if not next == ']':
                raise SelectorSyntaxError(
                    "] expected, got %r" % next)
            continue
        elif peek == ':' or peek == '::':
            type = stream.next()
            ident = stream.next()
            if not isinstance(ident, Symbol):
                raise SelectorSyntaxError(
                    "Expected symbol, got %r" % ident)
            if stream.peek() == '(':
                stream.next()
                peek = stream.peek()
                if isinstance(peek, String):
                    selector = stream.next()
                elif isinstance(peek, Symbol) and is_int(peek):
                    selector = int(stream.next())
                else:
                    # FIXME: parse_simple_selector, or selector, or...?
                    selector = parse_simple_selector(stream)
                next = stream.next()
                if not next == ')':
                    raise SelectorSyntaxError(
                        "Expected ), got %r and %r"
                        % (next, selector))
                result = Function(result, type, ident, selector)
            else:
                result = Pseudo(result, type, ident)
            continue
        else:
            if peek == ' ':
                stream.next()
            break
        # FIXME: not sure what "negation" is
    return result


def is_int(v):
    """Return True when ``int(v)`` succeeds, False on ValueError."""
    try:
        int(v)
    except ValueError:
        return False
    else:
        return True


def parse_attrib(selector, stream):
    """Parse the inside of ``[...]`` (the caller consumes the closing
    ``]``) and return an ``Attrib``.

    Raises SelectorSyntaxError for an unknown operator or a value that
    is neither a Symbol nor a String.
    """
    attrib = stream.next()
    if stream.peek() == '|':
        namespace = attrib
        stream.next()
        attrib = stream.next()
    else:
        namespace = '*'
    if stream.peek() == ']':
        return Attrib(selector, namespace, attrib, 'exists', None)
    op = stream.next()
    if not op in ('^=', '$=', '*=', '=', '~=', '|=', '!='):
        raise SelectorSyntaxError(
            "Operator expected, got %r" % op)
    value = stream.next()
    if not isinstance(value, (Symbol, String)):
        raise SelectorSyntaxError(
            "Expected string or symbol, got %r" % value)
    return Attrib(selector, namespace, attrib, op, value)


def parse_series(s):
    """
    Parses things like '1n+2', or 'an+b' generally, returning (a, b)
    """
    if isinstance(s, Element):
        s = s._format_element()
    if not s or s == '*':
        # Happens when there's nothing, which the CSS parser thinks of as *
        return (0, 0)
    if isinstance(s, int):
        # Happens when you just get a number
        return (0, s)
    if s == 'odd':
        return (2, 1)
    elif s == 'even':
        return (2, 0)
    elif s == 'n':
        return (1, 0)
    if 'n' not in s:
        # Just a b
        return (0, int(s))
    a, b = s.split('n', 1)
    if not a:
        a = 1
    elif a == '-' or a == '+':
        a = int(a+'1')
    else:
        a = int(a)
    if not b:
        b = 0
    elif b == '-' or b == '+':
        b = int(b+'1')
    else:
        b = int(b)
    return (a, b)


############################################################
## Tokenizing
############################################################

_whitespace_re = re.compile(r'\s+')

_comment_re = re.compile(r'/\*.*?\*/', re.S)

_count_re = re.compile(r'[+-]?\d*n(?:[+-]\d+)?')


def tokenize(s):
    """Generate Symbol / String / Token objects for selector string
    ``s``, after stripping ``/* ... */`` comments."""
    pos = 0
    s = _comment_re.sub('', s)
    while 1:
        match = _whitespace_re.match(s, pos=pos)
        if match:
            preceding_whitespace_pos = pos
            pos = match.end()
        else:
            preceding_whitespace_pos = 0
        if pos >= len(s):
            return
        match = _count_re.match(s, pos=pos)
        if match and match.group() != 'n':
            sym = s[pos:match.end()]
            yield Symbol(sym, pos)
            pos = match.end()
            continue
        c = s[pos]
        c2 = s[pos:pos+2]
        if c2 in ('~=', '|=', '^=', '$=', '*=', '::', '!='):
            yield Token(c2, pos)
            pos += 2
            continue
        if c in '>+~,.*=[]()|:#':
            if c in '.#' and preceding_whitespace_pos > 0:
                # Whitespace before '.' or '#' is emitted as an
                # explicit ' ' token.
                yield Token(' ', preceding_whitespace_pos)
            yield Token(c, pos)
            pos += 1
            continue
        if c == '"' or c == "'":
            # Quoted string
            old_pos = pos
            sym, pos = tokenize_escaped_string(s, pos)
            yield String(sym, old_pos)
            continue
        old_pos = pos
        sym, pos = tokenize_symbol(s, pos)
        yield Symbol(sym, old_pos)
        continue


def tokenize_escaped_string(s, pos):
    """Read the quoted string starting at ``s[pos]``.

    Returns ``(decoded_contents, position_after_closing_quote)``.
    Raises SelectorSyntaxError when no closing quote is found.
    """
    quote = s[pos]
    assert quote in ('"', "'")
    pos = pos+1
    start = pos
    while 1:
        next = s.find(quote, pos)
        if next == -1:
            raise SelectorSyntaxError(
                "Expected closing %s for string in: %r"
                % (quote, s[start:]))
        result = s[start:next]
        try:
            result = result.decode('unicode_escape')
        except UnicodeDecodeError:
            # Probably a hanging \
            pos = next+1
        else:
            return result, next+1


_illegal_symbol = re.compile(r'[^\w\\-]', re.UNICODE)


def tokenize_symbol(s, pos):
    """Read a symbol starting at ``s[pos]``.

    Returns ``(symbol, position_after_symbol)``.  Raises
    SelectorSyntaxError when the character at ``pos`` cannot start a
    symbol or the symbol contains a bad escape.
    """
    start = pos
    match = _illegal_symbol.search(s, pos=pos)
    if not match:
        # Goes to end of s
        return s[start:], len(s)
    if match.start() == pos:
        # Raise instead of assert: with asserts stripped (python -O) an
        # empty symbol would be returned without advancing, and
        # tokenize() would never terminate.
        raise SelectorSyntaxError(
            "Unexpected symbol: %r at %s" % (s[pos], pos))
    result = s[start:match.start()]
    pos = match.start()
    try:
        result = result.decode('unicode_escape')
    except UnicodeDecodeError:
        e = sys.exc_info()[1]
        raise SelectorSyntaxError(
            "Bad symbol %r: %s" % (result, e))
    return result, pos


class TokenStream(object):
    """Wraps a token iterator with one-token lookahead (``peek``) and a
    record of consumed tokens (``used``)."""

    def __init__(self, tokens, source=None):
        self.used = []
        self.tokens = iter(tokens)
        self.source = source
        self.peeked = None
        self._peeking = False

    def next(self):
        """Consume and return the next token, or None when exhausted."""
        if self._peeking:
            self._peeking = False
            self.used.append(self.peeked)
            return self.peeked
        else:
            try:
                next = self.tokens.next()
                self.used.append(next)
                return next
            except StopIteration:
                return None

    def __iter__(self):
        # Iterate by calling next() until it returns None.
        return iter(self.next, None)

    def peek(self):
        """Return the next token without consuming it, or None when
        exhausted."""
        if not self._peeking:
            try:
                self.peeked = self.tokens.next()
            except StopIteration:
                return None
            self._peeking = True
        return self.peeked