#
# ElementTree
# $Id: ElementPath.py 3375 2008-02-13 08:05:08Z fredrik $
#
# limited xpath support for element trees
#
# history:
# 2003-05-23 fl created
# 2003-05-28 fl added support for // etc
# 2003-08-27 fl fixed parsing of periods in element names
# 2007-09-10 fl new selection engine
# 2007-09-12 fl fixed parent selector
# 2007-09-13 fl added iterfind; changed findall to return a list
# 2007-11-30 fl added namespaces support
# 2009-10-30 fl added child element value filter
#
# Copyright (c) 2003-2009 by Fredrik Lundh. All rights reserved.
#
# fredrik@pythonware.com
# http://www.pythonware.com
#
# --------------------------------------------------------------------
# The ElementTree toolkit is
#
# Copyright (c) 1999-2009 by Fredrik Lundh
#
# By obtaining, using, and/or copying this software and/or its
# associated documentation, you agree that you have read, understood,
# and will comply with the following terms and conditions:
#
# Permission to use, copy, modify, and distribute this software and
# its associated documentation for any purpose and without fee is
# hereby granted, provided that the above copyright notice appears in
# all copies, and that both that copyright notice and this permission
# notice appear in supporting documentation, and that the name of
# Secret Labs AB or the author not be used in advertising or publicity
# pertaining to distribution of the software without specific, written
# prior permission.
#
# SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD
# TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANT-
# ABILITY AND FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR
# BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY
# DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
# WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
# ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
# OF THIS SOFTWARE.
# --------------------------------------------------------------------

# Licensed to PSF under a Contributor Agreement.
# See http://www.python.org/psf/license for licensing details.

##
# Implementation module for XPath support. There's usually no reason
# to import this module directly; the ElementTree does this for
# you, if needed.
##

import re

# Each match is a 2-tuple (operator, tag): exactly one of the two is
# non-empty, except for whitespace, which produces ('', '').
xpath_tokenizer_re = re.compile(
    r"("
    r"'[^']*'|\"[^\"]*\"|"
    r"::|"
    r"//?|"
    r"\.\.|"
    r"\(\)|"
    r"[/.*:\[\]\(\)@=])|"
    r"((?:\{[^}]+\})?[^/\[\]\(\)@=\s]+)|"
    r"\s+"
    )

# An (optionally negative) integer, as used in [N] and [last()-N].
_position_re = re.compile(r"-?\d+$")


def xpath_tokenizer(pattern, namespaces=None):
    """Yield (operator, tag) tokens for *pattern*.

    Tags written as ``prefix:name`` are expanded to ``{uri}name`` using
    *namespaces*.  If *namespaces* maps the empty string to a URI, that
    URI is applied to unprefixed element names; it is not applied to
    attribute names, to numeric positions, or to a name that is
    immediately followed by ``()`` (a function call such as ``last()``).

    Raises SyntaxError if a prefix is not present in *namespaces*.
    """
    default_namespace = namespaces.get('') if namespaces else None
    parsing_attribute = False
    tokens = xpath_tokenizer_re.findall(pattern)
    for pos, token in enumerate(tokens):
        ttype, tag = token
        if tag and tag[0] != "{":
            if ":" in tag:
                prefix, local = tag.split(":", 1)
                try:
                    if not namespaces:
                        raise KeyError
                    uri = namespaces[prefix]
                except KeyError:
                    raise SyntaxError(
                        "prefix %r not found in prefix map" % prefix
                        ) from None
                yield ttype, "{%s}%s" % (uri, local)
            elif (default_namespace
                  and not parsing_attribute
                  # positions ("1", "-1") are numbers, not element names
                  and not _position_re.match(tag)
                  # "name()" is a function call, not an element name
                  and tokens[pos + 1:pos + 2] != [("()", "")]):
                yield ttype, "{%s}%s" % (default_namespace, tag)
            else:
                yield token
            parsing_attribute = False
        else:
            yield token
            # the name that follows '@' is an attribute name
            parsing_attribute = ttype == '@'


def get_parent_map(context):
    """Return the child -> parent mapping for *context*.

    The mapping is built on first use by walking ``context.root`` and is
    then stored on ``context.parent_map`` for reuse.
    """
    parent_map = context.parent_map
    if parent_map is None:
        context.parent_map = parent_map = {}
        for parent in context.root.iter():
            for child in parent:
                parent_map[child] = parent
    return parent_map


def _is_wildcard_tag(tag):
    """Return true if *tag* starts with '{*}' or ends with '}*'."""
    return tag[:3] == '{*}' or tag[-2:] == '}*'


def _prepare_tag(tag):
    """Return a selector that filters elements by the wildcard *tag*.

    Handles '{*}*', '{}*', '{*}name' and '{ns}*'.  Raises RuntimeError
    for any other value.
    """
    _isinstance, _str = isinstance, str
    if tag == '{*}*':
        # Same as '*', but no comments or processing instructions.
        # It can be a surprise that '*' includes those, but there is no
        # justification for '{*}*' doing the same.
        def select(context, result):
            for elem in result:
                if _isinstance(elem.tag, _str):
                    yield elem
    elif tag == '{}*':
        # Any tag that is not in a namespace.
        def select(context, result):
            for elem in result:
                el_tag = elem.tag
                if _isinstance(el_tag, _str) and el_tag[0] != '{':
                    yield elem
    elif tag[:3] == '{*}':
        # The tag in any (or no) namespace.
        suffix = tag[2:]  # '}name'
        no_ns = slice(-len(suffix), None)
        tag = tag[3:]

        def select(context, result):
            for elem in result:
                el_tag = elem.tag
                if (el_tag == tag
                        or _isinstance(el_tag, _str)
                        and el_tag[no_ns] == suffix):
                    yield elem
    elif tag[-2:] == '}*':
        # Any tag in the given namespace.
        ns = tag[:-1]
        ns_only = slice(None, len(ns))

        def select(context, result):
            for elem in result:
                el_tag = elem.tag
                if _isinstance(el_tag, _str) and el_tag[ns_only] == ns:
                    yield elem
    else:
        raise RuntimeError(f"internal parser error, got {tag}")
    return select


def prepare_child(next, token):
    """Return a selector for the direct children named by *token*."""
    tag = token[1]
    if _is_wildcard_tag(tag):
        select_tag = _prepare_tag(tag)

        def select(context, result):
            def select_child(result):
                for elem in result:
                    yield from elem
            return select_tag(context, select_child(result))
    else:
        if tag[:2] == '{}':
            tag = tag[2:]  # '{}tag' == 'tag'

        def select(context, result):
            for elem in result:
                for e in elem:
                    if e.tag == tag:
                        yield e
    return select


def prepare_star(next, token):
    """Return a selector for all direct children ('*')."""
    def select(context, result):
        for elem in result:
            yield from elem
    return select


def prepare_self(next, token):
    """Return a selector that passes elements through unchanged ('.')."""
    def select(context, result):
        yield from result
    return select


def prepare_descendant(next, token):
    """Return a selector for descendants ('//tag' or '//*').

    Consumes the token following '//'.  Raises SyntaxError if there is
    no such token or if it is neither '*' nor a tag name.
    """
    try:
        token = next()
    except StopIteration:
        # Returning None here would put a non-callable into the selector
        # list and fail later with an unrelated TypeError.
        raise SyntaxError("invalid descendant") from None
    if token[0] == "*":
        tag = "*"
    elif not token[0]:
        tag = token[1]
    else:
        raise SyntaxError("invalid descendant")

    if _is_wildcard_tag(tag):
        select_tag = _prepare_tag(tag)

        def select(context, result):
            def select_child(result):
                for elem in result:
                    for e in elem.iter():
                        if e is not elem:
                            yield e
            return select_tag(context, select_child(result))
    else:
        if tag[:2] == '{}':
            tag = tag[2:]  # '{}tag' == 'tag'

        def select(context, result):
            for elem in result:
                for e in elem.iter(tag):
                    if e is not elem:
                        yield e
    return select


def prepare_parent(next, token):
    """Return a selector for parents ('..'); each parent is yielded once."""
    def select(context, result):
        # FIXME: raise error if .. is applied at toplevel?
        parent_map = get_parent_map(context)
        seen = set()
        for elem in result:
            if elem in parent_map:
                parent = parent_map[elem]
                if parent not in seen:
                    seen.add(parent)
                    yield parent
    return select


def prepare_predicate(next, token):
    """Return a selector for a '[...]' predicate.

    Consumes tokens up to and including the closing ']'.  Supported
    forms: [@attr], [@attr='value'], [tag], [.='value'], [tag='value'],
    [index], [last()] and [last()-index].

    Raises SyntaxError for an unterminated or unsupported predicate.
    """
    # FIXME: replace with real parser!!! refs:
    # http://effbot.org/zone/simple-iterator-parser.htm
    # http://javascript.crockford.com/tdop/tdop.html
    signature = []
    predicate = []
    while True:
        try:
            token = next()
        except StopIteration:
            # The closing ']' is missing.  Returning None here would put
            # a non-callable into the selector list.
            raise SyntaxError("invalid predicate") from None
        if token[0] == "]":
            break
        if token == ('', ''):
            # ignore whitespace
            continue
        if token[0] and token[0][:1] in "'\"":
            # quoted string: strip the quotes, normalise the marker
            token = "'", token[0][1:-1]
        signature.append(token[0] or "-")
        predicate.append(token[1])
    signature = "".join(signature)
    # use signature to determine predicate type
    if signature == "@-":
        # [@attribute] predicate
        key = predicate[1]

        def select(context, result):
            for elem in result:
                if elem.get(key) is not None:
                    yield elem
        return select
    if signature == "@-='":
        # [@attribute='value']
        key = predicate[1]
        value = predicate[-1]

        def select(context, result):
            for elem in result:
                if elem.get(key) == value:
                    yield elem
        return select
    if signature == "-" and not _position_re.match(predicate[0]):
        # [tag]
        tag = predicate[0]

        def select(context, result):
            for elem in result:
                if elem.find(tag) is not None:
                    yield elem
        return select
    if signature == ".='" or (
            signature == "-='" and not _position_re.match(predicate[0])):
        # [.='value'] or [tag='value']
        tag = predicate[0]
        value = predicate[-1]
        if tag:
            def select(context, result):
                for elem in result:
                    for e in elem.findall(tag):
                        if "".join(e.itertext()) == value:
                            yield elem
                            break
        else:
            def select(context, result):
                for elem in result:
                    if "".join(elem.itertext()) == value:
                        yield elem
        return select
    if signature == "-" or signature == "-()" or signature == "-()-":
        # [index] or [last()] or [last()-index]
        if signature == "-":
            # [index]
            index = int(predicate[0]) - 1
            if index < 0:
                raise SyntaxError("XPath position >= 1 expected")
        else:
            if predicate[0] != "last":
                raise SyntaxError("unsupported function")
            if signature == "-()-":
                try:
                    index = int(predicate[2]) - 1
                except ValueError:
                    raise SyntaxError("unsupported expression") from None
                if index > -2:
                    raise SyntaxError(
                        "XPath offset from last() must be negative")
            else:
                index = -1

        def select(context, result):
            parent_map = get_parent_map(context)
            for elem in result:
                try:
                    parent = parent_map[elem]
                    # FIXME: what if the selector is "*" ?
                    elems = parent.findall(elem.tag)
                    if elems[index] is elem:
                        yield elem
                except (IndexError, KeyError):
                    pass
        return select
    raise SyntaxError("invalid predicate")


# Maps the operator part of a token to the function that builds its selector.
ops = {
    "": prepare_child,
    "*": prepare_star,
    ".": prepare_self,
    "..": prepare_parent,
    "//": prepare_descendant,
    "[": prepare_predicate,
    }

# Compiled selectors, keyed by (path, *sorted namespace items).
_cache = {}


class _SelectorContext:
    """Per-query state shared by the selectors of one iterfind() call."""

    # child -> parent mapping, built lazily by get_parent_map()
    parent_map = None

    def __init__(self, root):
        self.root = root

# --------------------------------------------------------------------

##
# Generate all matching objects.

def iterfind(elem, path, namespaces=None):
    """Return an iterable over the elements below *elem* matching *path*.

    *namespaces* is an optional mapping from prefix to namespace URI.
    An empty *path* matches nothing.  Raises SyntaxError for an absolute
    or malformed path.
    """
    # compile selector pattern
    if path[-1:] == "/":
        path = path + "*"  # implicit all (FIXME: keep this?)

    cache_key = (path,)
    if namespaces:
        cache_key += tuple(sorted(namespaces.items()))

    try:
        selector = _cache[cache_key]
    except KeyError:
        if len(_cache) > 100:
            _cache.clear()
        if path[:1] == "/":
            raise SyntaxError("cannot use absolute path on element")
        next_token = iter(xpath_tokenizer(path, namespaces)).__next__
        try:
            token = next_token()
        except StopIteration:
            # No tokens at all: return an empty iterator rather than
            # None so that find(), findall() and findtext() keep working.
            return iter(())
        selector = []
        while True:
            try:
                selector.append(ops[token[0]](next_token, token))
            except StopIteration:
                raise SyntaxError("invalid path") from None
            try:
                token = next_token()
                if token[0] == "/":
                    token = next_token()
            except StopIteration:
                break
        _cache[cache_key] = selector
    # execute selector pattern
    result = [elem]
    context = _SelectorContext(elem)
    for select in selector:
        result = select(context, result)
    return result

##
# Find first matching object.

def find(elem, path, namespaces=None):
    """Return the first element matching *path*, or None."""
    return next(iterfind(elem, path, namespaces), None)

##
# Find all matching objects.

def findall(elem, path, namespaces=None):
    """Return a list of all elements matching *path*."""
    return list(iterfind(elem, path, namespaces))

##
# Find text for first matching object.

def findtext(elem, path, default=None, namespaces=None):
    """Return the text of the first element matching *path*.

    Returns "" if that element has no text, and *default* if nothing
    matches.
    """
    try:
        elem = next(iterfind(elem, path, namespaces))
        return elem.text or ""
    except StopIteration:
        return default