"""
A stan-like model for building namespaced XML trees.
The main reason for this module is that much of the VO's XML mess is based
on XML schema and thus has namespaced attributes. This single design
decision ruins the entire XML design. To retain some remnant of
sanity, I treat the prefixes themselves as namespaces and maintain
a single central registry from prefixes to namespaces in this module.
Then, the elements only use these prefixes, and this module makes sure
that during serialization the instance document's root element contains
the namespace mapping (and the schema locations) required.
"""
#c Copyright 2008-2023, the GAVO project <gavo@ari.uni-heidelberg.de>
#c
#c This program is free software, covered by the GNU GPL. See the
#c COPYING file in the source distribution.
import io
from xml.etree import ElementTree
from gavo.utils import autonode
from gavo.utils import excs
from gavo.utils import misctricks
from gavo.utils import texttricks
class Error(Exception):
    """The base class of the exceptions defined in this module."""
class ChildNotAllowed(Error):
    """is raised when a child is added to an element that restricts its
    children and does not admit that child.
    """
# DaCHS assumes UTF-8 essentially almost everywhere, so you probably cannot
# just change the encoding here.  Consider the pretense of configurability
# legacy.
encoding = "utf-8"
# the XML declaration for the encoding above, passed through
# texttricks.bytify.
XML_HEADER = texttricks.bytify(
    '<?xml version="1.0" encoding="%s"?>'%encoding)
class _Autoconstructor(autonode.AutoNodeType):
    """A metaclass used for Elements.

    On the one hand, it does autonode's constructor magic with _a_<attrname>
    attributes, on the other, it will instantiate itself when indexed
    -- that we want for convenient stan-like notation.
    """
    def __init__(cls, name, bases, dict):
        super().__init__(name, bases, dict)
        # Classes declaring a child sequence get a set of the admissible
        # children for quick membership tests; all others are normalised to
        # have _childSequence None.
        childSequence = getattr(cls, "_childSequence", None)
        if childSequence is None:
            cls._childSequence = None
        else:
            cls._allowedChildren = set(childSequence)

    def __getitem__(cls, items):
        # Indexing the class builds a fresh instance and adds items to it.
        instance = cls()
        return instance[items]
class Stub(object):
    """A sentinel class for embedding objects not yet existing into
    stanxml trees.

    These have a single opaque object and need to be dealt with by the
    user.  One example of how these can be used is the ColRefs in stc to
    utype conversion.

    Stubs are equal to each other if their handles are equal.
    """
    name_ = "stub"
    text_ = None

    def __init__(self, dest):
        self.dest = dest

    def __repr__(self):
        return "%s(%r)"%(type(self).__name__, self.dest)

    def __eq__(self, other):
        # Objects without a dest attribute are compared against the Stub
        # class itself, which serves as a "no handle" marker.
        otherDest = getattr(other, "dest", Stub)
        return self.dest==otherDest

    def __ne__(self, other):
        return not (self==other)

    def __hash__(self):
        return hash(self.dest)

    def isEmpty(self):
        """returns False: stubs never count as empty."""
        return False

    def shouldBeSkipped(self):
        """returns False: stubs are never skipped."""
        return False

    def getChildDict(self):
        """returns an empty dictionary, as stubs have no children."""
        return {}

    def iterAttNames(self):
        """yields nothing, as stubs have no attributes."""
        yield from ()

    def apply(self, func):
        """does nothing.

        Stubs don't have what Element.apply needs, so we don't even pretend.
        """
        return None
class Element(object, metaclass=_Autoconstructor):
    """An element for serialization into XML.

    This is loosely modelled after t.w.template stan.

    Don't add to the children attribute directly, use addChild or (more
    usually) __getitem__.

    Elements have attributes and children.  The attributes are defined,
    complete with defaults, in _a_<name> attributes as in AutoNodes.
    Attributes are checked.

    Children are not usually checked, but you can set a _childSequence
    attribute containing a list of (unqualified) element names.  These
    children will be emitted in the sequence given.

    When deriving from Elements, you may need attribute names that are not
    python identifiers (e.g., with dashes in them).  In that case, define
    an attribute _name_a_<att> and point it to any string you want as the
    attribute.

    When serializing these, empty elements (i.e. those having an empty text
    and having no non-empty children) are usually discarded.  If you need
    such an element (e.g., for attributes), set _mayBeEmpty to True.

    Since insane XSD mandates that local elements must not be qualified when
    elementFormDefault is unqualified, you need to set _local=True on
    such local elements to suppress the namespace prefix.  Attribute names
    are never qualified here.  If you need qualified attributes, you'll
    have to use attribute name translation.

    The content of the DOM may be anything recognized by addChild.
    In particular, you can give objects a serializeToXMLStan method returning
    strings or an Element to make them good DOM citizens.

    Elements cannot harbor mixed content (or rather, there is only
    one piece of text).
    """
    name_ = None
    _a_id = None
    _prefix = ""
    _additionalPrefixes = frozenset()
    _mayBeEmpty = False
    _local = False
    _stringifyContent = False

    # should probably do this in the elements needing it (quite a lot of them
    # do, however...)
    _name_a_xsi_type = "xsi:type"

    # for type dispatching in addChild.
    _generator_t = type((x for x in ()))

    # see _setupNode below for __init__

    def __getitem__(self, children):
        """adds children via addChild and returns self (stan notation)."""
        self.addChild(children)
        return self

    def __call__(self, **kw):
        """sets the attributes given as keyword arguments and returns self.

        Only attributes the element already has can be set; anything else
        raises an AttributeError.
        """
        if not kw:
            return self

        # XXX TODO: namespaced attributes?
        for k, v in kw.items():
            # Only allow setting attributes already present
            getattr(self, k)
            setattr(self, k, v)
        return self

    def __iter__(self):
        raise NotImplementedError("Element instances are not iterable.")

    def __bool__(self):
        # Elements are always true.  This used to return self.isEmpty(),
        # which made every element with content evaluate as false (and
        # empty ones as true).  Use isEmpty() to ask about content.
        return True

    def _setupNodeNext(self, cls):
        # Hands on to the next _setupNode in the MRO after cls, if there
        # is one.
        try:
            pc = super(cls, self)._setupNode
        except AttributeError:
            pass
        else:
            pc()

    def _setupNode(self):
        # Per-instance initialisation (see the comment on __init__ above).
        self._isEmptyCache = None
        self._children = []
        self.text_ = ""
        if self.name_ is None:
            self.name_ = self.__class__.__name__.split(".")[-1]
        self._setupNodeNext(Element)

    def _makeAttrDict(self):
        # Returns a dict mapping XML attribute names to stringified values
        # for all attributes that are not None.
        res = {}
        for name, attName in self.iterAttNames():
            if getattr(self, name, None) is not None:
                res[attName] = str(getattr(self, name))
        return res

    def _iterChildrenInSequence(self):
        # Yields the children grouped by element name in the order given
        # by _childSequence; children with names not in there are dropped.
        cDict = self.getChildDict()
        for cName in self._childSequence:
            if cName in cDict:
                for c in cDict[cName]:
                    yield c

    def bailIfBadChild(self, child):
        """raises ChildNotAllowed if this element restricts its children
        and neither child's name_ nor its type is among the allowed ones.
        """
        if (self._childSequence is not None
                and getattr(child, "name_", None) not in self._allowedChildren
                and type(child) not in self._allowedChildren):
            raise ChildNotAllowed("No %s children in %s"%(
                getattr(child, "name_", "text"), self.name_))

    def deepcopy(self):
        """returns a deep copy of self.

        Child Elements are copied recursively, other children (e.g., Stubs)
        are shared with the original.  The text content is carried over.
        """
        # NOTE(review): _makeAttrDict uses the translated XML attribute
        # names and stringified values; whether the constructor accepts
        # translated names depends on autonode -- confirm.
        copy = self.__class__(**self._makeAttrDict())
        # text_ is not part of _children, so it has to be copied explicitly;
        # it is assigned directly rather than through addChild to avoid
        # re-running the child checks on content already accepted.
        copy.text_ = self.text_
        copy._isEmptyCache = None
        for child in self.iterChildren():
            if isinstance(child, Element):
                copy.addChild(child.deepcopy())
            else:
                copy.addChild(child)
        return copy

    # t.w.template compatibility
    clone = deepcopy

    def addChild(self, child):
        """adds child to the list of children.

        Child may be an Element, a string, or a list or tuple of Elements and
        strings.  Finally, child may be None, in which case nothing will be
        added.

        Beyond that, objects with a serializeToXMLStan method, Stubs,
        generators, Element classes (which are instantiated), exceptions, and
        byte strings (decoded as utf-8) are accepted.  A string replaces the
        element's text.  Anything else raises an Error unless
        _stringifyContent is set.
        """
        self._isEmptyCache = None
        if child is None:
            pass
        elif hasattr(child, "serializeToXMLStan"):
            self.addChild(child.serializeToXMLStan())
        elif isinstance(child, str):
            self.bailIfBadChild(child)
            self.text_ = child
        elif isinstance(child, (Element, Stub)):
            self.bailIfBadChild(child)
            self._children.append(child)
        elif isinstance(child, (list, tuple, self._generator_t)):
            for c in child:
                self.addChild(c)
        elif isinstance(child, _Autoconstructor):
            self.addChild(child())
        elif self._stringifyContent:
            self.addChild(str(child))
        elif isinstance(child, Exception):
            self.addChild("EXCEPTION:\n%s: %s"%(
                child.__class__.__name__,
                str(child)))
        elif isinstance(child, bytes):
            # we assume our standard encoding of utf-8 and an actual string;
            # not much else would make sense within a stan tree
            self.addChild(child.decode("utf-8"))
        else:
            raise Error("%s element %s cannot be added to %s node"%(
                type(child), repr(child), self.name_))

    def isEmpty(self):
        """returns true if the current node has no non-empty children and no
        non-whitespace text content.

        The result is cached until the next addChild.
        """
        if self._isEmptyCache is None:
            self._isEmptyCache = True

            if self.text_.strip():
                self._isEmptyCache = False
            if self._isEmptyCache:
                for c in self._children:
                    if not c.shouldBeSkipped():
                        self._isEmptyCache = False
                        break

        return self._isEmptyCache

    def shouldBeSkipped(self):
        """returns true if the current node should be left out of the output.

        That is the case if it is empty and _mayBeEmpty is false.

        An empty element is one that has only empty children and no
        non-whitespace text content.
        """
        if self._mayBeEmpty:
            return False
        return self.isEmpty()

    def iterAttNames(self):
        """iterates over the defined attribute names of this node.

        Each element returned is a pair of the node attribute name and the
        xml name (which may be translated via _name_a_<att>).
        """
        for name, default in self._nodeAttrs:
            xmlName = getattr(self, "_name_a_"+name, name)
            yield name, xmlName

    def addAttribute(self, attName, attValue):
        """adds attName, attValue to this Element's attributes when
        instantiated.

        You cannot add _a_<attname> attributes to instances.  Thus, when
        in a pinch, use this.
        """
        attName = str(attName)
        if not hasattr(self, attName):
            # NOTE(review): if _nodeAttrs is a class-level list (it comes
            # from autonode), this append is visible to all instances of
            # the class -- confirm.
            self._nodeAttrs.append((attName, attValue))
        setattr(self, attName, attValue)

    def iterChildrenOfType(self, type):
        """iterates over all children having type.
        """
        for c in self._children:
            if isinstance(c, type):
                yield c

    def iterChildren(self):
        """returns an iterator over the children in the order they were
        added.
        """
        return iter(self._children)

    def getChildDict(self):
        """returns a dictionary mapping element names to lists of the
        children having that name.
        """
        cDict = {}
        for c in self._children:
            cDict.setdefault(c.name_, []).append(c)
        return cDict

    def iterChildrenWithName(self, elName):
        """iterates over children whose element name is elName.

        This always does a linear search through the children and hence
        may be slow.
        """
        for c in self._children:
            if c.name_==elName:
                yield c

    def _getChildIter(self):
        # Children in insertion order, or ordered by _childSequence if the
        # class defines one.
        if self._childSequence is None:
            return iter(self._children)
        else:
            return self._iterChildrenInSequence()

    def apply(self, func):
        """calls func(node, text, attrs, childIter).

        This is a building block for tree traversals; the expectation is that
        func does something like func(node, text, attrDict, childSequence).

        Nothing is called (and None is returned) for nodes that
        shouldBeSkipped.  Exceptions other than Error are reported through
        misctricks.sendUIEvent before being re-raised.
        """
        try:
            if self.shouldBeSkipped():
                return
            attrs = self._makeAttrDict()
            return func(self, self.text_,
                attrs, self._getChildIter())
        except Error:
            raise
        except Exception:
            misctricks.sendUIEvent("Info",
                "Internal failure while building XML; context is"
                " %s node with children %s"%(
                    self.name_,
                    texttricks.makeEllipsis(repr(self._children), 60)))
            raise

    def render(self,
            prefixForEmpty=None,
            includeSchemaLocation=True,
            xmlDecl=False,
            prolog=None):
        """returns this and its children serialized to bytes.

        The arguments are passed on to xmlwrite, which writes into an
        in-memory binary buffer.
        """
        f = io.BytesIO()
        xmlwrite(self,
            f,
            prefixForEmpty=prefixForEmpty,
            xmlDecl=xmlDecl,
            includeSchemaLocation=includeSchemaLocation,
            prolog=texttricks.bytify(prolog))
        return f.getvalue()
class NSRegistry(object):
    """A container for a registry of namespace prefixes to namespaces.

    This is used to have fixed namespace prefixes (IMHO the only way
    to have namespaced attribute values and retain sanity).  The
    class is never instantiated.  It is used through the module-level
    method registerPrefix.
    """
    # prefix -> namespace URI
    _registry = {}
    # namespace URI -> prefix
    _reverseRegistry = {}
    # prefix -> schema location (or None)
    _schemaLocations = {}

    @classmethod
    def registerPrefix(cls, prefix, ns, schemaLocation):
        """binds prefix to the namespace URI ns and notes schemaLocation
        for it.

        Re-registering a prefix with the same namespace is allowed (and
        updates the schema location).  Trying to bind a prefix to a
        different namespace raises a ValueError.
        """
        if prefix in cls._registry:
            if ns!=cls._registry[prefix]:
                # report the namespace the prefix is actually bound to,
                # not the one we were asked to register
                raise ValueError("Prefix %s is already allocated for namespace %s"%
                    (prefix, cls._registry[prefix]))
        cls._registry[prefix] = ns
        cls._reverseRegistry[ns] = prefix
        cls._schemaLocations[prefix] = schemaLocation

    @classmethod
    def getPrefixForNS(cls, ns):
        """returns the prefix registered for the namespace URI ns.

        An excs.NotFoundError is raised for unknown namespaces.
        """
        try:
            return cls._reverseRegistry[ns]
        except KeyError:
            raise excs.NotFoundError(ns, "XML namespace",
                "registry of XML namespaces.", hint="The registry is filled"
                " by modules as they are imported -- maybe you need to import"
                " the right module?")

    @classmethod
    def getNSForPrefix(cls, prefix):
        """returns the namespace URI registered for prefix.

        An excs.NotFoundError is raised for unknown prefixes.
        """
        try:
            return cls._registry[prefix]
        except KeyError:
            raise excs.NotFoundError(prefix, "XML namespace prefix",
                "registry of prefixes.", hint="The registry is filled"
                " by modules as they are imported -- maybe you need to import"
                " the right module?")

    @classmethod
    def _iterNSAttrs(cls, prefixes, prefixForEmpty, includeSchemaLocation):
        """iterates over pairs of (attrName, attrVal) for declaring
        prefixes.

        prefixes is an iterable of registered prefixes; it is not changed.
        If any schema locations are emitted, an xsi:schemaLocation attribute
        (and, if necessary, the declaration of the xsi prefix) comes last.
        """
        # null prefixes are ignored here; prefixForEmpty, if non-null, gives
        # the prefix the namespace would normally be bound to.
        # Work on a copy so the caller's collection is left alone.
        prefixes = set(prefixes)
        prefixes.discard("")

        schemaLocations = []
        for pref in sorted(prefixes):
            yield "xmlns:%s"%pref, cls._registry[pref]
            if includeSchemaLocation and cls._schemaLocations[pref]:
                schemaLocations.append("%s %s"%(
                    cls._registry[pref],
                    cls._schemaLocations[pref]))

        if prefixForEmpty:
            yield "xmlns", cls._registry[prefixForEmpty]

        if schemaLocations:
            if "xsi" not in prefixes:
                yield "xmlns:xsi", cls._registry["xsi"]
            yield "xsi:schemaLocation", " ".join(schemaLocations)

    @classmethod
    def addNamespaceDeclarationsETree(cls, root, prefixes, prefixForEmpty=None,
            includeSchemaLocation=True):
        """adds xmlns declarations for prefixes to the etree node root.

        With stanxml and the global-prefix scheme, xmlns declarations
        only come at the root element; thus, root should indeed be root
        rather than some random element.

        Deprecated, don't use ElementTree with stanxml any more.
        """
        for attName, attVal in cls._iterNSAttrs(prefixes, prefixForEmpty,
                includeSchemaLocation):
            root.attrib[attName] = attVal

    @classmethod
    def addNamespaceDeclarations(cls, root, prefixes, prefixForEmpty=None,
            includeSchemaLocation=True):
        """adds xmlns declarations for prefixes to the stanxml node root.

        With stanxml and the global-prefix scheme, xmlns declarations
        only come at the root element; thus, root should indeed be root
        rather than some random element.
        """
        for attName, attVal in cls._iterNSAttrs(prefixes, prefixForEmpty,
                includeSchemaLocation):
            root.addAttribute(attName, attVal)

    @classmethod
    def getPrefixInfo(cls, prefix):
        """returns a pair of namespace URI and schema location for prefix.

        Unknown prefixes raise a KeyError.
        """
        return (cls._registry[prefix], cls._schemaLocations[prefix])

    @classmethod
    def getSchemaForNS(cls, ns):
        """returns the schema location registered for the namespace URI ns.

        An excs.NotFoundError is raised for unknown namespaces.
        """
        try:
            return cls._schemaLocations[cls._reverseRegistry[ns]]
        except KeyError:
            raise excs.NotFoundError(ns, "XML namespace",
                "registry of XML namespaces.", hint="The registry is filled"
                " by modules as they are imported -- maybe you need to import"
                " the right module?")
# module-level shortcuts to the NSRegistry class methods of the same names
registerPrefix = NSRegistry.registerPrefix
getPrefixInfo = NSRegistry.getPrefixInfo
def schemaURL(xsdName):
    """returns the URL to the local mirror of the schema xsdName.

    This is used by the various xmlstan clients to make schemaLocations.
    """
    mirrorRoot = "http://vo.ari.uni-heidelberg.de/docs/schemata/"
    return mirrorRoot+xsdName
# The XML schema instance namespace; this module itself needs it for
# xsi:type, xsi:nil, and xsi:schemaLocation.  No schema location is
# registered for it.
registerPrefix(
    "xsi",
    "http://www.w3.org/2001/XMLSchema-instance",
    None)
# convenience for _additionalPrefixes of elements needing the xsi prefix
# (and no others) in their attributes.
xsiPrefix = frozenset(("xsi",))
class NillableMixin(object):
    """An Element mixin making the element XSD nillable.

    This element will automatically have an xsi:nil="true" attribute
    on empty elements (rather than leave them out entirely).

    This overrides apply, so the mixin must be before the base class in
    the inheritance list.
    """
    _mayBeEmpty = True

    def apply(self, func):
        """calls func as Element.apply does if there is text content;
        otherwise, func is called with empty text, no children, and an
        additional xsi:nil="true" attribute.
        """
        if self.text_:
            return Element.apply(self, func)

        # the attribute dictionary is only needed on the nil branch
        attrs = self._makeAttrDict()
        attrs["xsi:nil"] = "true"
        # make sure the xsi prefix gets declared when prefixes are collected
        self._additionalPrefixes = self._additionalPrefixes|{"xsi"}
        return func(self, "", attrs, ())

    def isEmpty(self):
        """returns False: nillable elements are always written."""
        return False
def escapePCDATA(val):
    """returns the string val with XML markup characters replaced by
    entity references, suitable for use as element content.

    The ampersand must be replaced first; otherwise, the ampersands of the
    entity references generated for < and > would be escaped again.
    """
    # NOTE(review): "&x00;" is not a well-formed XML character reference
    # (and NUL cannot be represented in XML 1.0 at all); it is kept as it
    # was in case clients depend on the marker.
    return (val
        ).replace("&", "&amp;"
        ).replace('<', '&lt;'
        ).replace('>', '&gt;'
        ).replace("\0", "&x00;")
def escapeAttrVal(val):
    """returns the string val as a double-quoted XML attribute value.

    On top of what escapePCDATA does, double quotes are replaced by an
    entity reference so they cannot terminate the attribute value.
    """
    return '"%s"'%(escapePCDATA(val).replace('"', '&quot;'))
def _makeVisitor(outputFile, prefixForEmpty):
    """returns a function writing nodes to outputFile.

    The function returned has the signature Element.apply expects.  Element
    names are written without a prefix if the node has no prefix, is marked
    _local, or has prefixForEmpty as its prefix.
    """
    def emit(markup):
        outputFile.write(texttricks.bytify(markup))

    def visit(node, text, attrs, childIter):
        # attributes are written sorted so the output is reproducible
        renderedAttrs = sorted(
            "%s=%s"%(attName, escapeAttrVal(attVal))
            for attName, attVal in attrs.items())
        attrRepr = " ".join(renderedAttrs)
        if attrRepr:
            attrRepr = " "+attrRepr

        # nodes may bring literal material to be put into their start tags
        if getattr(node, "_fixedTagMaterial", None):
            attrRepr = attrRepr+" "+node._fixedTagMaterial

        unqualified = (not node._prefix
            or node._local
            or node._prefix==prefixForEmpty)
        if unqualified:
            name = node.name_
        else:
            name = "%s:%s"%(node._prefix, node.name_)

        if node.isEmpty():
            # empty nodes are written as self-closing tags if they may be
            # empty, and not at all otherwise
            if node._mayBeEmpty:
                emit("<%s%s/>"%(name, attrRepr))
            return

        emit("<%s%s>"%(name, attrRepr))
        try:
            if text:
                emit(escapePCDATA(text))

            for child in childIter:
                if hasattr(child, "write"):
                    # children able to serialize themselves
                    child.write(outputFile)
                else:
                    child.apply(visit)
        except Exception as ex:
            # give nodes a chance to leave an error marker in the output
            if hasattr(node, "writeErrorElement"):
                node.writeErrorElement(outputFile, ex)
            raise
        finally:
            # the end tag is written even on errors
            emit("</%s>"%name)

    return visit
def xmlwrite(root, outputFile, prefixForEmpty=None, nsRegistry=NSRegistry,
        xmlDecl=True, includeSchemaLocation=True, prolog=None):
    """writes an xmlstan tree starting at root to outputFile.

    prefixForEmpty is a registered prefix; its namespace is declared as the
    default namespace, and elements with that prefix are written without a
    prefix.

    If xmlDecl is true, an XML declaration is written first.  prolog, if
    given, is written on a line of its own before the root element.

    outputFile must be opened in binary mode.
    """
    # since namespaces only enter here through prefixes, I just need to
    # figure out which ones are used.
    prefixesUsed = set()

    def collectPrefixes(node, text, attrs, childIter, seen=prefixesUsed):
        seen |= node._additionalPrefixes
        seen.add(node._prefix)
        for child in childIter:
            child.apply(collectPrefixes)

    root.apply(collectPrefixes)

    # An incredibly nasty hack for VOTable generation; we need a better
    # way to handle with the 1.1/1.2 namespaces: Root may declare it
    # handles all NS declarations itself.  Die, die, die.
    rootDeclaresNamespaces = (
        getattr(root, "_fixedTagMaterial", None) is not None)
    if not rootDeclaresNamespaces:
        nsRegistry.addNamespaceDeclarations(
            root,
            prefixesUsed,
            prefixForEmpty,
            includeSchemaLocation)

    if xmlDecl:
        outputFile.write(b"<?xml version='1.0' encoding='utf-8'?>\n")
    if prolog:
        outputFile.write(texttricks.bytify(prolog)+b"\n")

    visitor = _makeVisitor(outputFile, prefixForEmpty)
    root.apply(visitor)
def xmlrender(tree, prolog=None, prefixForEmpty=None):
    """returns tree in serialized form.

    tree can be any object with a render method, an ElementTree element,
    or some sort of string.  Byte strings are decoded as utf-8.

    The type of the result depends on tree: for objects with a render
    method, it is whatever render returns (bytes for stanxml Elements), for
    ElementTree elements it is bytes, and for strings it is a str.

    If prolog is given, it must be a string that will be prepended to the
    serialization of tree.  You can use this for xml declarations or
    stylesheet processing instructions.  It is converted to the type of
    the serialized tree as necessary.

    A ValueError is raised for trees this function cannot serialize.
    """
    if hasattr(tree, "render"):
        res = tree.render(prefixForEmpty=prefixForEmpty)
    elif hasattr(tree, "getchildren") or ElementTree.iselement(tree):
        # hopefully an xml.etree Element; getchildren is gone from
        # ElementTree since python 3.9, hence the iselement check.
        res = ElementTree.tostring(tree)
    elif isinstance(tree, bytes):
        res = tree.decode("utf-8")
    elif isinstance(tree, str):
        res = tree
    else:
        raise ValueError("Cannot render %s"%repr(tree))

    if prolog:
        if isinstance(res, str):
            # do not mix bytes into a str result
            if isinstance(prolog, bytes):
                prolog = prolog.decode("utf-8")
            res = prolog+res
        else:
            res = texttricks.bytify(prolog)+res
    return res