# Source code for gavo.grammars.xmlgrammar
"""
A grammar for generic XML documents.
"""
#c Copyright 2008-2023, the GAVO project <gavo@ari.uni-heidelberg.de>
#c
#c This program is free software, covered by the GNU GPL. See the
#c COPYING file in the source distribution.
from lxml import etree
from gavo import base
from gavo.grammars.common import Grammar, RowIterator
def iterEventsCounting(inputFile, normalizeWhitespace):
    """yields (path, value) pairs for the elements and attributes of an
    XML document read through etree.iterparse.

    For every element, a pair (path, content) is yielded when the element
    is closed, followed by one pair (path/@name, value) per attribute.

    path is the slash-joined chain of element names from the root down.
    When a name recurs among the children of one parent, the first
    occurrence keeps the bare name, the second becomes name[0], the third
    name[1], and so on.  Namespace prefixes in Clark notation are cut off
    element names; attribute names are used as delivered by the parser.

    content is the element's own text followed by the tails of its
    children, joined together; an empty result is returned as None.  With
    normalizeWhitespace true, runs of whitespace collapse into single
    blanks and whitespace-only content becomes None.

    inputFile is handed to etree.iterparse unchanged; normalizeWhitespace
    is also passed on as its remove_blank_text argument.
    """
    pathSegments = []
    # one dict of child names seen so far per open element, plus one
    # for the document level
    siblingCounts = [{}]
    # one list of text fragments per open element, plus one for the
    # document level
    textFragments = [[]]

    parseEvents = etree.iterparse(
        inputFile,
        events=("start", "end"),
        remove_blank_text=normalizeWhitespace)

    for action, elem in parseEvents:
        tagName = elem.tag
        if tagName.startswith('{'):
            # {namespace}local: only keep the local part
            tagName = tagName[tagName.index('}')+1:]

        if action=="start":
            seenHere = siblingCounts[-1]
            if tagName in seenHere:
                pathSegments.append("%s[%d]"%(tagName, seenHere[tagName]))
                seenHere[tagName] += 1
            else:
                pathSegments.append(tagName)
                seenHere[tagName] = 0
            # open fresh bookkeeping for the children of this element
            siblingCounts.append({})
            textFragments.append([])

        elif action=="end":
            fragments = textFragments.pop()
            if elem.text is not None:
                # the element's own text precedes all child tails
                fragments.insert(0, elem.text)
            content = "".join(fragments) or None
            if content and normalizeWhitespace:
                content = " ".join(content.split()) or None

            elementPath = "/".join(pathSegments)
            yield elementPath, content
            for attName, attValue in list(elem.items()):
                yield elementPath+"/@"+attName, attValue

            if elem.tail is not None:
                # text after this element belongs to the parent's content
                textFragments[-1].append(elem.tail)

            pathSegments.pop()
            siblingCounts.pop()
class XMLRowIterator(RowIterator):
    """a row iterator for XMLGrammars.

    It produces a single rawdict per source, built from the pairs
    coming out of iterEventsCounting.
    """
    def _iterRows(self):
        """yields one dictionary mapping paths to values for the source.

        A sourceToken with a read attribute is parsed as is and left
        open; anything else is passed to open() in binary mode, and the
        resulting file is closed when this generator finishes.

        Syntax errors reported by lxml are re-raised as
        base.SourceParseError.
        """
        keepOpen = hasattr(self.sourceToken, "read")
        if keepOpen:
            inFile = self.sourceToken
        else:
            inFile = open(self.sourceToken, "rb")

        try:
            rawdict = dict(iterEventsCounting(
                inFile,
                self.grammar.normalizeWhitespace))
            yield rawdict
        except etree.LxmlSyntaxError as ex:
            raise base.SourceParseError(
                ex.msg,
                location=ex.position,
                source=ex.filename)
        finally:
            # only close what we have opened ourselves
            if not keepOpen:
                inFile.close()
class XMLGrammar(Grammar):
    """A grammar parsing from generic XML files.

    Use this grammar to parse from generic XML files.  For now, one rawdict
    per document is returned (later extensions might let you define elements
    that will yield rows).

    The keys are xpaths (e.g., root/element or root/element/@attr), the
    values the (joined) text nodes that are immediate children of the
    element.

    When elements are repeated within an element, [ct] is appended to the
    path element of the repetitions; the first occurrence remains as it
    is, and counting starts at zero with the second one (e.g.,
    root/element, then root/element[0], then root/element[1]).

    For now, this grammar ignores namespaces.

    Because most of the keys are not valid python identifiers, you cannot
    use the @key syntax when mapping this.  Use vars[key] instead (or
    <map key="dest" source="path"/>).

    Do not use this for VOTables; use VOTableGrammar instead.
    """
    name_ = "xmlGrammar"

    _nsw = base.BooleanAttribute("normalizeWhitespace",
        default=True,
        copyable=True,
        description="By default, the parser will return whitespace-only"
            " content as None and will turn internal whitespace to a single"
            " blank. Set this to False to preserve whitespace as present"
            " in the document.")

    rowIterator = XMLRowIterator