Package gavo :: Package votable :: Module tableparser
[frames | no frames]

Source Code for Module gavo.votable.tableparser

  1  """ 
  2  Parsing various forms of tabular data embedded in VOTables. 
  3   
  4  WARNING: This will fail if the parser exposes namespaces in its 
  5  events (utils.iterparse doesn't). 
  6  """ 
  7   
  8  #c Copyright 2008-2019, the GAVO project 
  9  #c 
 10  #c This program is free software, covered by the GNU GPL.  See the 
 11  #c COPYING file in the source distribution. 
 12   
 13   
 14  from gavo.votable import coding 
 15  from gavo.votable import common 
 16  from gavo.votable import dec_binary 
 17  from gavo.votable import dec_binary2 
 18  from gavo.votable import dec_tabledata 
 19   
 20   
class DataIterator(object):
    """The common ancestor of the classes that actually produce rows.

    Concrete subclasses must provide a decoderModule class attribute and
    a _getRawRow method; _getRawRow returns the next undecoded row or
    None when no rows are left.
    """

    def __init__(self, tableDefinition, nodeIterator):
        self.nodeIterator = nodeIterator
        # the row decoder is built once per table, from the module the
        # subclass names in decoderModule
        self._decodeRawRow = coding.buildDecoder(
            tableDefinition, self.decoderModule)

    def __iter__(self):
        """yields decoded rows until _getRawRow signals the end with None.
        """
        pendingRow = self._getRawRow()
        while pendingRow is not None:
            yield self._decodeRawRow(pendingRow)
            pendingRow = self._getRawRow()
38 39
class TableDataIterator(DataIterator):
    """The row source Rows uses for tables in TABLEDATA serialization.
    """
    decoderModule = dec_tabledata

    def _getRawRow(self):
        """returns the next row as a list of strings, or None once the
        closing TABLEDATA tag has been seen.

        Each item of the list is the concatenated character data of one TD
        element.  Any element other than TR (between rows) or TD (within a
        row) results in the node iterator's parse error being raised.
        """
        events = self.nodeIterator

        # Phase 1: skip forward to the opening TR.
        for evType, name, content in events:
            if evType=="start":
                if name!="TR":
                    raise events.getParseError(
                        "Unexpected element %s"%name)
                break
            if evType=="end" and name=="TABLEDATA":
                return None
            # Everything else is ignored; we're not validating, and sensible
            # stuff might yet follow (usually, it's whitespace data anyway).

        # Phase 2: collect the TD contents up to the closing TR.
        cells, fragments = [], []
        for evType, name, content in events:
            if evType=="data":  # TD content
                fragments.append(content)

            elif evType=="start":  # new TD
                fragments = []
                if name!="TD":
                    raise events.getParseError(
                        "Unexpected element %s"%name)

            elif evType=="end":  # either the cell or the row is complete
                if name=="TR":
                    break
                if name=="TD":
                    cells.append("".join(fragments))
                else:
                    assert False
                fragments = []

            else:
                assert False
        return cells
86 87
class _StreamData(object):
    """A stand-in for a file that decodes VOTable stream data on
    an as-needed basis.

    The object pulls (type, tag, payload) events from nodeIterator,
    collects the base64 text of the data events and hands out the decoded
    bytes through read.
    """
    minChunk = 20000  # min length of encoded data decoded at a time
    lastRes = None  # last thing read (convenient for error msgs)

    def __init__(self, nodeIterator):
        self.nodeIterator = nodeIterator
        self.curChunk = b""  # binary data already decoded
        self.leftover = ""  # undecoded base64 data
        self.fPos = 0  # index of next char to be returned
        self._eof = False  # True when we've seen the </STREAM> event

    def _setEOF(self):
        """cleans up at end of stream and sets eof flag.

        This consumes events up to and including the first non-data event
        following the end of the stream.

        This is called by _fillBuffer exclusively.
        """
        for evType, element, payload in self.nodeIterator:
            if evType!="data":
                break
        self._eof = True

    def _fillBuffer(self, nBytes):
        """obtains events from node iterator to fill curChunk.

        At least max(2*nBytes, minChunk) characters of encoded data are
        collected (less if the stream ends before that); collection only
        stops mid-stream once a line break has been seen, since
        _decodeBase64 needs one to split off the undecoded remainder.
        """
        if self._eof:
            return
        destBytes = max(nBytes*2, self.minChunk)
        curBytes, hadLf = 0, False
        encoded = [self.leftover]

        for evType, tag, payload in self.nodeIterator:
            if evType=="end":  # must be </STREAM>, or the parser would
                    # have complained
                self._setEOF()
                break
            assert evType=="data"
            encoded.append(payload)
            curBytes += len(payload)
            hadLf = hadLf or "\n" in payload or "\r" in payload
            if hadLf and curBytes>destBytes:
                break

        return self._decodeBase64("".join(encoded))

    def _decodeBase64(self, input):
        """decodes input and sets curChunk, leftover, and fPos accordingly.

        The method behaves slightly differently when the _eof attribute is
        true -- normally, it will leave anything after the last line feed
        alone, but at _eof, it will decode even that.

        It is an error to pass in anything that has no line break unless
        at _eof (a ValueError is raised in that case).
        """
        # binascii.a2b_base64 is what the "base64" string codec of python 2
        # ends up calling; using it directly also works where str has no
        # decode method.
        import binascii

        if not self._eof:  # put back anything after the last break mid-stream
            try:
                lastBreak = input.rindex("\n")+1
            except ValueError:
                lastBreak = input.rindex("\r")+1
            self.leftover = input[lastBreak:]
            input = input[:lastBreak]

        # drop what has been handed out already, append the new data
        self.curChunk = self.curChunk[self.fPos:]+binascii.a2b_base64(input)
        self.fPos = 0

    def read(self, nBytes):
        """returns a string containing the next nBytes of the input
        stream.

        The function raises an IOError if there's not enough data left.
        """
        # A single _fillBuffer call may decode less than requested (or
        # nothing at all) when the last line break it saw precedes a long
        # line; hence, keep filling until the request can be satisfied or
        # the stream has ended.
        while self.fPos+nBytes>len(self.curChunk) and not self._eof:
            self._fillBuffer(nBytes)
        if self.fPos+nBytes>len(self.curChunk):
            raise IOError("No data left")
        self.lastRes = self.curChunk[self.fPos:self.fPos+nBytes]
        self.fPos += nBytes
        return self.lastRes

    def atEnd(self):
        """returns True if the end of the stream has been seen and all
        decoded data has been handed out.
        """
        return self._eof and self.fPos==len(self.curChunk)
171 172
class BinaryIteratorBase(DataIterator):
    """The common base of the classes Rows uses to iterate over rows in
    BINARY(2) serialization.

    Since the VOTable binary serialization has no framing, the data stream
    coming from the parser is presented to the decoder as a file.
    """

    # __iter__ is overridden since we're not actually doing XML parsing
    # here; almost all of our work is done within the stream element.
    def __iter__(self):
        """yields decoded rows from a base64-encoded STREAM element.

        A VOTableError is raised if the first non-data event is anything
        but the start of a STREAM with encoding="base64".
        """
        # skip character data in front of the STREAM element
        for evType, name, attrs in self.nodeIterator:
            if evType!="data":
                break

        isBase64Stream = (evType=="start"
            and name=="STREAM"
            and attrs.get("encoding")=="base64")
        if not isBase64Stream:
            raise common.VOTableError("Can only read BINARY data from base64"
                " encoded streams")

        source = _StreamData(self.nodeIterator)
        while not source.atEnd():
            decoded = self._decodeRawRow(source)
            if decoded is None:  # the decoder had no row to deliver
                continue
            yield decoded
198 199
class BinaryIterator(BinaryIteratorBase):
    """The row iterator for BINARY serialization; rows are decoded with
    decoders built from dec_binary.
    """
    decoderModule = dec_binary
202 203
class Binary2Iterator(BinaryIteratorBase):
    """The row iterator for BINARY2 serialization; rows are decoded with
    decoders built from dec_binary2.
    """
    decoderModule = dec_binary2
206 207
def _makeTableIterator(elementName, tableDefinition, nodeIterator):
    """returns an iterator for the rows contained within node.

    elementName is the name of the serialization element (TABLEDATA,
    BINARY, or BINARY2); for anything else, a VOTableError is raised.
    """
    iteratorClasses = {
        'TABLEDATA': TableDataIterator,
        'BINARY': BinaryIterator,
        'BINARY2': Binary2Iterator,
    }

    if elementName not in iteratorClasses:
        raise common.VOTableError("Unknown table serialization: %s"%
            elementName, hint="We only support TABLEDATA, BINARY2,"
            " and BINARY coding")

    rowSource = iteratorClasses[elementName](tableDefinition, nodeIterator)
    return iter(rowSource)
222 223
class Rows(object):
    """a wrapper for data within a VOTable.

    Tabledatas are constructed with a model.VOTable.TABLE instance and
    the iterator maintained by parser.parse.  They yield individual
    table lines.

    In reality, __iter__ just dispatches to the various deserializers.
    """

    def __init__(self, tableDefinition, nodeIterator):
        self.tableDefinition = tableDefinition
        self.nodeIterator = nodeIterator

    def __iter__(self):
        """returns an iterator over the rows of the serialization element
        encountered next in nodeIterator.

        Data events and INFO elements in front of that element are skipped.
        If the events run out before any serialization element shows up,
        an empty iterator is returned.
        """
        for evType, tag, payload in self.nodeIterator:
            if evType=="data":  # ignore whitespace (or other stuff...)
                continue
            if tag=="INFO":
                continue  # XXX TODO: What do we do with those INFOs?
            return _makeTableIterator(tag,
                self.tableDefinition, self.nodeIterator)

        # __iter__ has to return an iterator; without this, running out of
        # events would return None and make iter() fail with a TypeError.
        return iter(())
245