Package gavo :: Package votable :: Module votparse
[frames] | no frames]

Source Code for Module gavo.votable.votparse

  1  """ 
  2  Stream parsing of VOTables. 
  3   
  4  This module builds on a shallow wrapping of expat in utils.iterparse. 
  5  There is an "almost-tight" parsing loop in the parse function.  It 
  6  builds a stanxml tree (mainly through the _processNodeDefault function). 
  7  """ 
  8   
  9  #c Copyright 2008-2019, the GAVO project 
 10  #c 
 11  #c This program is free software, covered by the GNU GPL.  See the 
 12  #c COPYING file in the source distribution. 
 13   
 14   
 15  # To fiddle with the nodes as they are generated, define an 
 16  # _end_ELEMENTNAME method.  If you do this, you will have to do 
 17  # any adding of children to parents yourself (it happens in  
 18  # _processNodeDefault, which is called when no custom handler is 
 19  # present). 
 20  from cStringIO import StringIO 
 21   
 22  from gavo import utils 
 23  from gavo.utils import ElementTree 
 24  from gavo.votable import common 
 25  from gavo.votable import model 
 26  from gavo.votable import tableparser 
 27   
 28   
# Default for the watchset argument of parse and parseString: empty, i.e.,
# no element classes are watched unless the caller asks for them.
DEFAULT_WATCHSET = []

# We treat all VOTable versions as equal.  The end processors and element
# constructors are registered once per namespace listed here, in addition
# to a non-namespaced entry.
VOTABLE_NAMESPACES = [
        "http://www.ivoa.net/xml/VOTable/v1.0",
        "http://www.ivoa.net/xml/VOTable/v1.1",
        "http://www.ivoa.net/xml/VOTable/v1.2",
        "http://www.ivoa.net/xml/VOTable/v1.3",
]
 38   
 39   
class IGNORE(object):
    """a sentinel standing in for elements the parser does not know when
    robust parsing is requested.

    Instances should not end up in a DOM, but if they do, they are silent.

    The interface mimics the parts of stanxml Elements the parser touches:
    instances can be called with attributes, accept children via item
    access, and report themselves as empty and skippable.  They cannot
    autoconstruct, though.
    """
    def __init__(self):
        pass

    def __call__(self, **kwargs):
        # Attributes are accepted and thrown away.
        return self

    def __getitem__(self, item):
        # Children are thrown away; nothing is returned.
        return None

    def isEmpty(self):
        return True

    def shouldBeSkipped(self):
        return True

    def apply(self, func):
        # There is nothing to apply func to.
        return None
66 67
def _processNodeDefault(text, child, parent):
    """the default end processor: adds child to parent and returns child.

    text is the character content collected for child (or None).  Elements
    handled here must not have content other than whitespace; anything
    else trips an assertion.
    """
    hasContent = bool(text and text.strip())
    assert not hasContent, (
        "Content '%s' in must-empty VOTable element %s"%(text, repr(child)))
    # Item access on the parent is what adds the child to it.
    parent[child]
    return child
75 76
def _processNodeWithContent(text, child, parent):
    """the end processor for elements that may have text content.

    If text contains anything but whitespace, it is added to child (as
    is, i.e., unstripped).  Then child is added to parent and returned.
    """
    hasText = text and text.strip()
    if hasText:
        # Attention: mixed content not supported
        child[text]
    parent[child]
    return child
# End processors for elements that may carry text content.  The names
# matter: _computeEndProcessorsImpl collects every module global starting
# with "_end_" and registers it for the element named by the rest.
_end_DESCRIPTION = _processNodeWithContent
_end_INFO = _processNodeWithContent
_end_MODEL = _processNodeWithContent
_end_URL = _processNodeWithContent
_end_LITERAL = _processNodeWithContent
_end_NAME = _processNodeWithContent
_end_IDREF = _processNodeWithContent
# STREAMs and TABLEDATA should ordinarily be processed by the table
# iterator, so this really is only interesting for special applications:
_end_STREAM = _processNodeWithContent
_end_TD = _processNodeWithContent
def _end_VOTABLE(text, child, parent):
    """the end processor for VOTABLE elements.

    VOTABLEs have no useful parents, so child is not added to anything;
    text and parent are ignored and child is returned as is.
    """
    return child
103 104
def _computeEndProcessorsImpl():
    """returns a dictionary of tag names to end processors.

    Each processor defined as a module global named _end_XXXX gets one
    non-namespaced entry ("XXXX") and one entry of the form
    "<namespace>:XXXX" for each item of VOTABLE_NAMESPACES.
    """
    processors = {}
    prefix = "_end_"
    # items() exists on both Python 2 and Python 3 (iteritems does not);
    # the list() snapshot keeps the loop independent of the live module
    # namespace.
    for name, handler in list(globals().items()):
        if not name.startswith(prefix):
            continue
        elName = name[len(prefix):]
        processors[elName] = handler
        for ns in VOTABLE_NAMESPACES:
            processors["%s:%s"%(ns, elName)] = handler
    return processors
# Public accessor for the end processor table; parse calls this once per
# run.  NOTE(review): utils.CachedGetter presumably computes the table on
# first call and reuses it afterwards -- confirm in gavo.utils.
computeEndProcessors = utils.CachedGetter(_computeEndProcessorsImpl)
def _computeElementsImpl():
    """returns a dictionary of tag names to the stanxml elements building
    them.

    Every attribute of model.VOTable not starting with an underscore is
    entered once under its plain name and once per item of
    VOTABLE_NAMESPACES under the corresponding ElementTree.QName.
    """
    elementsByTag = {}
    publicNames = [name for name in dir(model.VOTable)
        if not name.startswith("_")]
    for name in publicNames:
        factory = getattr(model.VOTable, name)
        elementsByTag[name] = factory
        for ns in VOTABLE_NAMESPACES:
            elementsByTag[ElementTree.QName(ns, name)] = factory
    return elementsByTag
# Public accessor for the tag -> element table; parse calls this once per
# run.  NOTE(review): utils.CachedGetter presumably computes the table on
# first call and reuses it afterwards -- confirm in gavo.utils.
computeElements = utils.CachedGetter(_computeElementsImpl)
def _cleanAttributes(attrDict, element, raiseOnInvalid):
    """returns a sanitised version of attrDict for element.

    We force attribute keys to be native strings (since they're being used
    as keyword arguments), with dashes replaced by underscores, and we
    drop everything that's namespace related (keys containing a colon,
    and xmlns itself) -- it's not necessary for VOTables and people mess
    it up anyway.

    Also, we complain about or filter out attributes that element
    cannot deal with, i.e., those for which element has no attribute
    called _a_<key>: if raiseOnInvalid is true, a KeyError carrying the
    offending key is raised, otherwise the attribute is silently dropped.
    """
    cleaned = {}
    # items() rather than iteritems() so this runs on Python 2 and 3.
    for rawKey, value in attrDict.items():
        if ":" in rawKey or rawKey=="xmlns":
            continue
        key = str(rawKey.replace("-", "_"))
        if hasattr(element, "_a_"+key):
            cleaned[key] = value
        elif raiseOnInvalid:
            raise KeyError(key)
    return cleaned
162 163
def parse(inFile, watchset=DEFAULT_WATCHSET, raiseOnInvalid=True):
    """returns an iterator yielding items of interest.

    inFile is something that supports read(bytes).

    watchset is a sequence of element classes of VOTable; instances of
    these are yielded as soon as they are closed, with an idmap attribute
    (a dictionary mapping the IDs seen so far to their elements) attached.
    By default, watchset is empty.  You may want to see TABLE, or INFO or
    PARAM of certain protocols.

    Independently of watchset, a tableparser.Rows object is yielded
    whenever a (non-namespaced) DATA element is opened; it is constructed
    with the element enclosing DATA and the underlying parse iterator.

    With raiseOnInvalid, unknown tags and attributes raise the error
    produced by the iterator's getParseError.  Without it, unknown
    elements are replaced by IGNORE instances (which are never yielded)
    and unknown attributes are dropped.
    """
    # This parser has gotten a bit too fat.  Maybe move the whole thing
    # to a class?  All this isn't terribly critical to performance...
    watchset = set(watchset)
    idmap = {}
    processors = computeEndProcessors()
    elements = computeElements()
    elementStack = [None]  # None is VOTABLE's parent
    iterator = utils.iterparse(inFile, common.VOTableParseError)
    content = []

    for evType, tag, payload in iterator:
        if evType=="data":
            content.append(payload)

        elif evType=="start":
            # Element open: push new node on the stack...
            if tag not in elements:
                if raiseOnInvalid:
                    raise iterator.getParseError("Unknown tag: %s"%tag)
                element = IGNORE()
            else:
                element = elements[tag]()

            if payload:
                try:
                    payload = _cleanAttributes(payload, element, raiseOnInvalid)
                except KeyError as msg:
                    raise iterator.getParseError("Attribute %s invalid on %s"%(
                        str(msg), element.name_))
            elementStack.append(element(**payload))

            # ...prepare for new content,...
            content = []

            # ...add the node to the id map if it has an ID...
            elId = payload.get("ID")
            if elId is not None:
                idmap[elId] = elementStack[-1]

            # ...and pass control to special iterator if DATA is coming in.
            if tag=="DATA":
                yield tableparser.Rows(elementStack[-2], iterator)

        elif evType=="end":
            # Element close: process text content...
            if content:
                text = "".join(content)
                content = []
            else:
                text = None

            # ...see if we have any special processing to do for the node
            # type...
            nodeProc = processors.get(tag, _processNodeDefault)
            preChild = elementStack.pop()
            if isinstance(preChild, IGNORE):
                # Ignored elements are neither processed nor yielded; bailing
                # out here also makes sure child below always belongs to the
                # element just closed.
                continue

            # ...call handler with the current node and its future parent...
            child = nodeProc(text, preChild, elementStack[-1])

            # ...and let user do something with the element if she ordered it.
            if child is not None and child.__class__ in watchset:
                child.idmap = idmap
                yield child

        else:
            # An explicit raise (rather than assert False) survives python -O.
            raise AssertionError("Unexpected parser event: %r"%(evType,))
239 240
def readRaw(inFile):
    """returns the parsed document for inFile with table data filled in.

    This runs parse watching for TABLE and VOTABLE elements.  For each
    tableparser.Rows object coming by, all rows are pulled and stored in
    the rows attribute of its tableDefinition.  What is returned is the
    last item parse yielded -- expected to be the V.VOTABLE instance
    (NOTE(review): that relies on VOTABLE being closed last; confirm).
    """
    wanted = [model.VOTable.TABLE, model.VOTable.VOTABLE]
    for el in parse(inFile, wanted):
        if not isinstance(el, tableparser.Rows):
            continue
        el.tableDefinition.rows = list(el)
    return el
249 250
def parseString(string, watchset=DEFAULT_WATCHSET, raiseOnInvalid=True):
    """returns the iterator parse produces for a VOTable literal.

    string contains the VOTable literal; it is wrapped into a StringIO
    and handed to parse together with watchset and raiseOnInvalid, so
    the items yielded are exactly those parse would yield for a file
    with that content.
    """
    stream = StringIO(string)
    return parse(stream, watchset, raiseOnInvalid)
257