# Source code for gavo.grammars.freeregrammar
"""
A grammar based on repeated application of REs
"""
#c Copyright 2008-2023, the GAVO project <gavo@ari.uni-heidelberg.de>
#c
#c This program is free software, covered by the GNU GPL. See the
#c COPYING file in the source distribution.
import re
from gavo import base
from gavo import utils
from gavo.grammars import common
from gavo.grammars import regrammar
# Matches (from the start of a string) when nothing but whitespace -- or
# nothing at all -- is left; used to tell harmless trailing whitespace from
# junk that was not consumed by any record.
_onlyWhitespaceLeft = re.compile(r"\s*$")
class RowIterator(common.FileRowIterator):
    """A row iterator cutting records out of the input with regular
    expressions.

    Records are located in the input using the grammar's rowProduction;
    each record is then matched against the grammar's parseRE, and the
    named groups of that match make up the row.

    This relies on inputFile, curLine, sourceToken and grammar being
    available on the instance (presumably provided by
    common.FileRowIterator -- confirm there).
    """
    # size argument passed to each inputFile.read() call
    chunkSize = 8192

    def _iterRecords(self):
        """Yield the text of each match of the grammar's rowProduction.

        If the grammar has ignoreJunk set, the pattern is searched for, so
        material between records is skipped.  Otherwise, each record has
        to start exactly where the previous one ended.

        self.curLine is advanced by the number of newlines consumed.

        Raises utils.SourceParseError if ignoreJunk is not set and anything
        but whitespace is left over once the input is exhausted.
        """
        curPos, buffer = 0, ""
        exhausted = False
        recPat = self.grammar.rowProduction
        if self.grammar.ignoreJunk:
            getNext = recPat.search
        else:
            getNext = recPat.match

        while True:
            mat = getNext(buffer, curPos)

            # A match touching the end of the buffer may be incomplete
            # (the record, or a lookahead like \Z or $, could be cut at
            # the chunk boundary), so as long as there might be more input,
            # fetch it and try again rather than trusting such a match.
            if mat is None or (not exhausted and mat.end() == len(buffer)):
                newStuff = ""
                if not exhausted:
                    newStuff = self.inputFile.read(self.chunkSize)
                if newStuff:
                    # drop what has been consumed, keep the unmatched rest
                    buffer = buffer[curPos:] + newStuff
                    curPos = 0
                    continue
                exhausted = True
                if mat is None:  # no match and no more input: done
                    break

            # With search (ignoreJunk), text between curPos and the match
            # is skipped; count its newlines so getLocator() stays correct
            # while the record is being processed.  With match, this
            # span is always empty.
            self.curLine += buffer.count("\n", curPos, mat.start())

            res = mat.group()
            yield res
            curPos = mat.end()
            self.curLine += res.count("\n")

        buffer = buffer[curPos:]
        if (not self.grammar.ignoreJunk
                and not _onlyWhitespaceLeft.match(buffer)):
            raise utils.SourceParseError("Junk at end of file",
                location=self.getLocator(),
                offending=buffer)

    def _iterRows(self):
        """Yield a dictionary of parseRE's named groups for each record.

        If the grammar has stripTokens set, whitespace is stripped from
        the values; groups that did not take part in the match stay None.

        Raises utils.SourceParseError if parseRE does not match a record.
        """
        for rawRec in self._iterRecords():
            # Only guard the match itself: a failed match returns None, and
            # None.groupdict raises the AttributeError handled here.  The
            # token stripping must stay outside so its problems are not
            # misreported as a non-matching parseRE.
            try:
                res = self.grammar.parseRE.match(rawRec).groupdict()
            except AttributeError:
                raise base.ui.logOldExc(
                    utils.SourceParseError(
                        "Malformed input, parseRE did not match.",
                        location=self.getLocator(),
                        offending=rawRec))

            if self.grammar.stripTokens:
                # optional groups that did not participate are None
                res = {k: v if v is None else v.strip()
                    for k, v in res.items()}
            yield res

    def getLocator(self):
        """Return a string made from the source token and the current
        line count, for use in error messages.
        """
        return "%s, line %d"%(self.sourceToken, self.curLine)
class FreeREGrammar(common.Grammar):
    # The docstring is raw because the example contains backslashes (\.)
    # that would otherwise be invalid string escapes.
    r"""A grammar allowing "free" regular expressions to parse a document.

    Basically, you give a rowProduction to match individual records in the
    document. All matches of rowProduction will then be matched with
    parseRE, which in turn must have named groups. The dictionary from
    named groups to their matches makes up the input row.

    For writing the parseRE, we recommend writing an element, using a
    CDATA construct, and taking advantage of python's "verbose" regular
    expressions. Here's an example::

        <parseRE><![CDATA[(?xsm)^name::(?P<name>.*)
            ^query::(?P<query>.*)
            ^description::(?P<description>.*)\.\.
        ]]></parseRE>
    """
    name_ = "freeREGrammar"

    # The default matches one non-empty line including its terminating
    # newline; a last line lacking a newline is not matched by it.
    _rowProduction = regrammar.REAttribute("rowProduction",
        default=re.compile(r"(?m)^.+$\n"), description="RE matching a complete"
        " record.")
    # No usable default: parseRE has to be given by the user.
    _parseRE = regrammar.REAttribute("parseRE", default=base.Undefined,
        description="RE containing named groups matching a record")
    _stripTokens = base.BooleanAttribute("stripTokens", default=False,
        description="Strip whitespace from result tokens?")
    _ignoreJunk = base.BooleanAttribute("ignoreJunk", default=False,
        description="Ignore everything outside of the row production")

    # the class producing the rows for this grammar
    rowIterator = RowIterator