jpayne@68: #
jpayne@68: # ElementTree
jpayne@68: # $Id: ElementPath.py 3375 2008-02-13 08:05:08Z fredrik $
jpayne@68: #
jpayne@68: # limited xpath support for element trees
jpayne@68: #
jpayne@68: # history:
jpayne@68: # 2003-05-23 fl   created
jpayne@68: # 2003-05-28 fl   added support for // etc
jpayne@68: # 2003-08-27 fl   fixed parsing of periods in element names
jpayne@68: # 2007-09-10 fl   new selection engine
jpayne@68: # 2007-09-12 fl   fixed parent selector
jpayne@68: # 2007-09-13 fl   added iterfind; changed findall to return a list
jpayne@68: # 2007-11-30 fl   added namespaces support
jpayne@68: # 2009-10-30 fl   added child element value filter
jpayne@68: #
jpayne@68: # Copyright (c) 2003-2009 by Fredrik Lundh.  All rights reserved.
jpayne@68: #
jpayne@68: # fredrik@pythonware.com
jpayne@68: # http://www.pythonware.com
jpayne@68: #
jpayne@68: # --------------------------------------------------------------------
jpayne@68: # The ElementTree toolkit is
jpayne@68: #
jpayne@68: # Copyright (c) 1999-2009 by Fredrik Lundh
jpayne@68: #
jpayne@68: # By obtaining, using, and/or copying this software and/or its
jpayne@68: # associated documentation, you agree that you have read, understood,
jpayne@68: # and will comply with the following terms and conditions:
jpayne@68: #
jpayne@68: # Permission to use, copy, modify, and distribute this software and
jpayne@68: # its associated documentation for any purpose and without fee is
jpayne@68: # hereby granted, provided that the above copyright notice appears in
jpayne@68: # all copies, and that both that copyright notice and this permission
jpayne@68: # notice appear in supporting documentation, and that the name of
jpayne@68: # Secret Labs AB or the author not be used in advertising or publicity
jpayne@68: # pertaining to distribution of the software without specific, written
jpayne@68: # prior permission.
jpayne@68: #
jpayne@68: # SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD
jpayne@68: # TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANT-
jpayne@68: # ABILITY AND FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR
jpayne@68: # BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY
jpayne@68: # DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
jpayne@68: # WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
jpayne@68: # ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
jpayne@68: # OF THIS SOFTWARE.
jpayne@68: # --------------------------------------------------------------------
jpayne@68: 
jpayne@68: # Licensed to PSF under a Contributor Agreement.
jpayne@68: # See http://www.python.org/psf/license for licensing details.
jpayne@68: 
jpayne@68: ##
jpayne@68: # Implementation module for XPath support.  There's usually no reason
jpayne@68: # to import this module directly; the <b>ElementTree</b> does this for
jpayne@68: # you, if needed.
jpayne@68: ##
jpayne@68: 
jpayne@68: import re
jpayne@68: 
# Tokenizer for the supported XPath subset.  Every match produces a
# (ttype, tag) pair: group 1 captures quoted strings and operators, group 2
# captures names (optionally prefixed with a '{uri}' namespace), and a run of
# whitespace matches neither group and therefore comes out as ('', '').
xpath_tokenizer_re = re.compile(
    r"("
    r"'[^']*'|\"[^\"]*\"|"
    r"::|"
    r"//?|"
    r"\.\.|"
    r"\(\)|"
    r"[/.*:\[\]\(\)@=])|"
    r"((?:\{[^}]+\})?[^/\[\]\(\)@=\s]+)|"
    r"\s+"
    )

# Integer operands of positional predicates ("1", "-2"); these are never
# element names and must not be namespace-qualified.
_xpath_number_re = re.compile(r"-?\d+\Z")


def xpath_tokenizer(pattern, namespaces=None):
    """Yield (ttype, tag) tokens for *pattern*, resolving namespace prefixes.

    *namespaces* optionally maps prefixes to namespace URIs; the entry for
    the empty prefix '' is used as the default namespace.

    - 'prefix:name' is rewritten to '{uri}name'.
    - An unprefixed name is rewritten to '{default-uri}name' when a default
      namespace is given, except for attribute names (the token right after
      '@'), integer literals such as the '1' in '[1]', and function names
      (a name directly followed by '()', e.g. 'last()').
    - Every other token is passed through unchanged.

    Raises SyntaxError if a prefix is not present in *namespaces*.
    """
    default_namespace = namespaces.get('') if namespaces else None
    parsing_attribute = False
    tokens = xpath_tokenizer_re.findall(pattern)
    last_position = len(tokens) - 1
    for position, token in enumerate(tokens):
        ttype, tag = token
        if tag and tag[0] != "{":
            if ":" in tag:
                prefix, local_name = tag.split(":", 1)
                try:
                    if not namespaces:
                        raise KeyError
                    uri = namespaces[prefix]
                except KeyError:
                    raise SyntaxError("prefix %r not found in prefix map" % prefix) from None
                yield ttype, "{%s}%s" % (uri, local_name)
            elif default_namespace and not parsing_attribute:
                # Qualifying '1' or 'last' would turn the positional
                # predicates [1] / [last()] into bogus child-tag tests.
                is_number = _xpath_number_re.match(tag) is not None
                is_function = (position < last_position
                               and tokens[position + 1][0] == "()")
                if is_number or is_function:
                    yield token
                else:
                    yield ttype, "{%s}%s" % (default_namespace, tag)
            else:
                yield token
            parsing_attribute = False
        else:
            yield token
            # The name following '@' is an attribute name.
            parsing_attribute = ttype == '@'
jpayne@68: 
def get_parent_map(context):
    """Return the child -> parent mapping for the tree under context.root.

    The mapping is built the first time it is requested and stored on
    context.parent_map, so later calls with the same context reuse it.
    """
    cached = context.parent_map
    if cached is not None:
        return cached
    # Publish the (still empty) dict first, then fill it in place.
    context.parent_map = cached = {}
    cached.update(
        (child, parent)
        for parent in context.root.iter()
        for child in parent
    )
    return cached
jpayne@68: 
jpayne@68: 
def _is_wildcard_tag(tag):
    """Return True if *tag* has a wildcard namespace ('{*}name') or a
    wildcard local name ('{ns}*')."""
    return tag.startswith('{*}') or tag.endswith('}*')
jpayne@68: 
jpayne@68: 
def _prepare_tag(tag):
    """Build a selector that filters candidates by a wildcard tag pattern.

    Supported patterns are '{*}*', '{}*', '{*}name' and '{ns}*'.  The
    returned callable has the signature select(context, result) and lazily
    yields the elements of *result* whose tag matches.  Any other pattern
    raises RuntimeError.
    """
    if tag == '{*}*':
        # Same as '*', but no comments or processing instructions.
        # It can be a surprise that '*' includes those, but there is no
        # justification for '{*}*' doing the same.
        def accepts(candidate):
            return isinstance(candidate, str)
    elif tag == '{}*':
        # Any tag that is not in a namespace.
        def accepts(candidate):
            return isinstance(candidate, str) and candidate[0] != '{'
    elif tag.startswith('{*}'):
        # The tag in any (or no) namespace.
        local_name = tag[3:]
        closing = tag[2:]  # '}name'
        def accepts(candidate):
            if candidate == local_name:
                return True
            return isinstance(candidate, str) and candidate.endswith(closing)
    elif tag.endswith('}*'):
        # Any tag in the given namespace.
        namespace = tag[:-1]  # '{ns}'
        def accepts(candidate):
            return isinstance(candidate, str) and candidate.startswith(namespace)
    else:
        raise RuntimeError(f"internal parser error, got {tag}")

    def select(context, result):
        for elem in result:
            if accepts(elem.tag):
                yield elem
    return select
jpayne@68: 
jpayne@68: 
def prepare_child(next, token):
    """Build the selector for a plain 'tag' step.

    The returned select(context, result) yields the direct children of
    every element in *result* whose tag matches token[1]; wildcard tags
    are delegated to _prepare_tag().
    """
    tag = token[1]
    if not _is_wildcard_tag(tag):
        # '{}tag' == 'tag'
        wanted = tag[2:] if tag[:2] == '{}' else tag

        def select(context, result):
            for parent in result:
                for child in parent:
                    if child.tag == wanted:
                        yield child
        return select

    matches_tag = _prepare_tag(tag)

    def children_of(parents):
        for parent in parents:
            for child in parent:
                yield child

    def select(context, result):
        return matches_tag(context, children_of(result))
    return select
jpayne@68: 
def prepare_star(next, token):
    """Build the selector for a '*' step: every direct child of every
    element in the incoming result."""
    def select(context, result):
        for parent in result:
            for child in parent:
                yield child
    return select
jpayne@68: 
def prepare_self(next, token):
    """Build the selector for a '.' step: passes the incoming elements
    through unchanged."""
    def select(context, result):
        for elem in result:
            yield elem
    return select
jpayne@68: 
def prepare_descendant(next, token):
    """Build the selector for a '//' step.

    Consumes the following token via *next*; it must be '*' or a tag name.
    The returned select(context, result) yields every descendant (never
    the element itself) of each element in *result* that matches the tag.

    Raises SyntaxError if '//' is not followed by '*' or a tag name,
    including when the token stream ends right after '//'.
    """
    try:
        token = next()
    except StopIteration:
        # Returning None here would put a non-callable into the selector
        # list and only fail later, when the path is evaluated.
        raise SyntaxError("invalid descendant") from None
    if token[0] == "*":
        tag = "*"
    elif not token[0]:
        tag = token[1]
    else:
        raise SyntaxError("invalid descendant")

    if _is_wildcard_tag(tag):
        select_tag = _prepare_tag(tag)
        def select(context, result):
            def select_child(result):
                for elem in result:
                    for e in elem.iter():
                        if e is not elem:
                            yield e
            return select_tag(context, select_child(result))
    else:
        if tag[:2] == '{}':
            tag = tag[2:]  # '{}tag' == 'tag'
        def select(context, result):
            for elem in result:
                for e in elem.iter(tag):
                    if e is not elem:
                        yield e
    return select
jpayne@68: 
def prepare_parent(next, token):
    """Build the selector for a '..' step.

    The returned select(context, result) yields the parent of each element
    in *result*, each parent at most once; elements with no entry in the
    parent map are skipped.
    """
    def select(context, result):
        # FIXME: raise error if .. is applied at toplevel?
        parents = get_parent_map(context)
        already_yielded = set()
        for elem in result:
            try:
                parent = parents[elem]
            except KeyError:
                continue
            if parent in already_yielded:
                continue
            already_yielded.add(parent)
            yield parent
    return select
jpayne@68: 
def prepare_predicate(next, token):
    """Build the selector for a '[...]' predicate.

    Tokens are consumed via *next* up to the closing ']'.  They are reduced
    to a short "signature" string (operators as themselves, names as '-',
    quoted strings as "'") which selects one of the supported forms:

        [@attribute]            [@attribute='value']
        [tag]                   [tag='value']   [.='value']
        [index]                 [last()]        [last()-index]

    Returns select(context, result), which yields the elements of *result*
    that satisfy the predicate.

    Raises SyntaxError for an unterminated or unsupported predicate.
    """
    # FIXME: replace with real parser!!! refs:
    # http://effbot.org/zone/simple-iterator-parser.htm
    # http://javascript.crockford.com/tdop/tdop.html
    signature = []
    predicate = []
    while True:
        try:
            token = next()
        except StopIteration:
            # No closing ']'.  Returning None here would leave a
            # non-callable in the selector list and only fail at
            # evaluation time with an unrelated TypeError.
            raise SyntaxError("invalid predicate") from None
        if token[0] == "]":
            break
        if token == ('', ''):
            # ignore whitespace
            continue
        if token[0] and token[0][:1] in "'\"":
            # quoted string: normalise the type to "'" and strip the quotes
            token = "'", token[0][1:-1]
        signature.append(token[0] or "-")
        predicate.append(token[1])
    signature = "".join(signature)
    # use signature to determine predicate type
    if signature == "@-":
        # [@attribute] predicate
        key = predicate[1]
        def select(context, result):
            for elem in result:
                if elem.get(key) is not None:
                    yield elem
        return select
    if signature == "@-='":
        # [@attribute='value']
        key = predicate[1]
        value = predicate[-1]
        def select(context, result):
            for elem in result:
                if elem.get(key) == value:
                    yield elem
        return select
    if signature == "-" and not re.match(r"\-?\d+$", predicate[0]):
        # [tag]
        tag = predicate[0]
        def select(context, result):
            for elem in result:
                if elem.find(tag) is not None:
                    yield elem
        return select
    if signature == ".='" or (signature == "-='" and not re.match(r"\-?\d+$", predicate[0])):
        # [.='value'] or [tag='value']
        tag = predicate[0]  # empty for the '.' form
        value = predicate[-1]
        if tag:
            def select(context, result):
                for elem in result:
                    for e in elem.findall(tag):
                        if "".join(e.itertext()) == value:
                            yield elem
                            break
        else:
            def select(context, result):
                for elem in result:
                    if "".join(elem.itertext()) == value:
                        yield elem
        return select
    if signature == "-" or signature == "-()" or signature == "-()-":
        # [index] or [last()] or [last()-index]
        if signature == "-":
            # [index]: XPath positions are 1-based
            index = int(predicate[0]) - 1
            if index < 0:
                raise SyntaxError("XPath position >= 1 expected")
        else:
            if predicate[0] != "last":
                raise SyntaxError("unsupported function")
            if signature == "-()-":
                # the offset token carries its own sign, e.g. '-1'
                try:
                    index = int(predicate[2]) - 1
                except ValueError:
                    raise SyntaxError("unsupported expression") from None
                if index > -2:
                    raise SyntaxError("XPath offset from last() must be negative")
            else:
                index = -1
        def select(context, result):
            parent_map = get_parent_map(context)
            for elem in result:
                try:
                    parent = parent_map[elem]
                    # FIXME: what if the selector is "*" ?
                    elems = list(parent.findall(elem.tag))
                    if elems[index] is elem:
                        yield elem
                except (IndexError, KeyError):
                    pass
        return select
    raise SyntaxError("invalid predicate")
jpayne@68: 
# Dispatch table used by iterfind(): maps the type of the token that opens a
# path step to the factory building that step's selector.  The "" key is the
# type of plain tag-name tokens.
ops = {
    "": prepare_child,
    "*": prepare_star,
    ".": prepare_self,
    "..": prepare_parent,
    "//": prepare_descendant,
    "[": prepare_predicate,
    }

# Compiled selector lists, keyed by a tuple of the path followed by the sorted
# namespace items.  iterfind() empties it once it holds more than 100 entries.
_cache = {}
jpayne@68: 
class _SelectorContext:
    """State shared by all selectors while one path is being evaluated."""
    # Child -> parent mapping; stays None until get_parent_map() builds it.
    parent_map = None
    def __init__(self, root):
        # Element the search starts from; get_parent_map() walks it.
        self.root = root
jpayne@68: 
jpayne@68: # --------------------------------------------------------------------
jpayne@68: 
jpayne@68: ##
jpayne@68: # Generate all matching objects.
jpayne@68: 
def iterfind(elem, path, namespaces=None):
    """Return an iterator over the elements below *elem* that match *path*.

    *path* is compiled into a list of selector callables, one per path
    step, which is cached per (path, namespaces) combination.  The
    selectors are then chained, starting from [elem].

    *namespaces* optionally maps prefixes to namespace URIs.

    An empty *path* matches nothing and yields an empty iterator.
    Raises SyntaxError for absolute or malformed paths.
    """
    # compile selector pattern
    if path[-1:] == "/":
        path = path + "*" # implicit all (FIXME: keep this?)

    cache_key = (path,)
    if namespaces:
        cache_key += tuple(sorted(namespaces.items()))

    try:
        selector = _cache[cache_key]
    except KeyError:
        if len(_cache) > 100:
            _cache.clear()
        if path[:1] == "/":
            raise SyntaxError("cannot use absolute path on element")
        next_token = iter(xpath_tokenizer(path, namespaces)).__next__
        try:
            token = next_token()
        except StopIteration:
            # Empty path: hand back an empty iterator rather than None so
            # that find(), findall() and findtext() keep working.
            return iter(())
        selector = []
        while True:
            try:
                step = ops[token[0]](next_token, token)
            except StopIteration:
                raise SyntaxError("invalid path") from None
            if step is None:
                # A step factory ran out of tokens; never cache a
                # non-callable selector.
                raise SyntaxError("invalid path")
            selector.append(step)
            try:
                token = next_token()
                if token[0] == "/":
                    token = next_token()
            except StopIteration:
                break
        _cache[cache_key] = selector
    # execute selector pattern
    result = [elem]
    context = _SelectorContext(elem)
    for select in selector:
        result = select(context, result)
    return result
jpayne@68: 
jpayne@68: ##
jpayne@68: # Find first matching object.
jpayne@68: 
def find(elem, path, namespaces=None):
    """Return the first element matching *path* below *elem*, or None if
    nothing matches."""
    for match in iterfind(elem, path, namespaces):
        return match
    return None
jpayne@68: 
jpayne@68: ##
jpayne@68: # Find all matching objects.
jpayne@68: 
def findall(elem, path, namespaces=None):
    """Return a list of all elements matching *path* below *elem*, in the
    order iterfind() produces them."""
    matches = []
    matches.extend(iterfind(elem, path, namespaces))
    return matches
jpayne@68: 
jpayne@68: ##
jpayne@68: # Find text for first matching object.
jpayne@68: 
def findtext(elem, path, default=None, namespaces=None):
    """Return the text of the first element matching *path* below *elem*.

    A match whose text is empty or None gives ""; *default* is returned
    only when nothing matches at all.
    """
    for match in iterfind(elem, path, namespaces):
        return match.text or ""
    return default