Source code for gavo.grammars.freeregrammar

"""
A grammar based on repeated application of REs
"""

#c Copyright 2008-2023, the GAVO project <gavo@ari.uni-heidelberg.de>
#c
#c This program is free software, covered by the GNU GPL.  See the
#c COPYING file in the source distribution.


import re

from gavo import base
from gavo import utils
from gavo.grammars import common
from gavo.grammars import regrammar


# Matches strings that consist of nothing but (possibly zero) whitespace
# characters.  RowIterator._iterRecords uses it to decide whether whatever
# is left after the last record counts as junk.
_onlyWhitespaceLeft = re.compile(r"\s*$")

class RowIterator(common.FileRowIterator):
	"""A row iterator cutting the input into records with the grammar's
	rowProduction and turning each record into a dictionary using the
	grammar's parseRE.
	"""
	# size argument passed to each inputFile.read call
	chunkSize = 8192

	def _iterRecords(self):
		"""yields the text of each successive match of the grammar's
		rowProduction in the input file.

		The input is read in pieces of chunkSize and accumulated in a buffer
		until rowProduction matches.  With the grammar's ignoreJunk set,
		material between matches is skipped (the RE is searched); otherwise
		each record must start exactly where the previous one ended.

		self.curLine is advanced by the number of newlines in skipped
		material and in each record.

		Unless ignoreJunk is set, a utils.SourceParseError is raised if
		anything but whitespace is left once the input is exhausted.
		"""
		curPos, buffer = 0, ""
		recPat = self.grammar.rowProduction

		if self.grammar.ignoreJunk:
			getNext = recPat.search
		else:
			getNext = recPat.match

		# NOTE(review): a rowProduction that can match the empty string
		# never advances curPos, so this loop would not terminate.
		while True:
			mat = getNext(buffer, curPos)
			if not mat:  # no match, fetch new stuff.
				newStuff = self.inputFile.read(self.chunkSize)
				if not newStuff:  # file exhausted
					break
				# drop what has already been consumed, keep the unmatched rest
				buffer = buffer[curPos:]+newStuff
				curPos = 0
				continue

			# When searching, there may be junk between curPos and the start
			# of the match; its newlines must be counted, too, or the locator
			# will report wrong line numbers.  When matching, this span is
			# always empty.
			self.curLine += buffer.count("\n", curPos, mat.start())

			res = mat.group()
			yield res
			curPos = mat.end()
			self.curLine += res.count("\n")

		buffer = buffer[curPos:]
		if not self.grammar.ignoreJunk and not _onlyWhitespaceLeft.match(buffer):
			raise utils.SourceParseError("Junk at end of file",
				location=self.getLocator(),
				offending=buffer)

	def _iterRows(self):
		"""yields a dictionary mapping the names of parseRE's named groups
		to their matches for each record delivered by _iterRecords.

		If the grammar's stripTokens is set, whitespace is stripped from the
		values; groups that did not take part in the match remain None.

		A utils.SourceParseError is raised for records parseRE does not
		match.
		"""
		for rawRec in self._iterRecords():
			# Only the match itself is guarded: an AttributeError here means
			# match() returned None.  Errors from anything further down must
			# not be mistaken for a failed match.
			try:
				res = self.grammar.parseRE.match(rawRec).groupdict()
			except AttributeError:
				raise base.ui.logOldExc(
					utils.SourceParseError("Malformed input, parseRE did not match.",
						location=self.getLocator(),
						offending=rawRec))

			if self.grammar.stripTokens:
				# groups not participating in the match come back as None
				res = {k: v if v is None else v.strip()
					for k, v in res.items()}
			yield res

	def getLocator(self):
		"""returns a string giving the source token and the current line
		number for use in error messages.
		"""
		return "%s, line %d"%(self.sourceToken, self.curLine)


class FreeREGrammar(common.Grammar):
	r"""A grammar allowing "free" regular expressions to parse a document.

	Basically, you give a rowProduction to match individual records in the
	document.  All matches of rowProduction will then be matched with
	parseRE, which in turn must have named groups.  The dictionary from
	named groups to their matches makes up the input row.

	For writing the parseRE, we recommend writing an element, using a
	CDATA construct, and taking advantage of python's "verbose" regular
	expressions.  Here's an example::

		<parseRE><![CDATA[(?xsm)^name::(?P<name>.*)
			^query::(?P<query>.*)
			^description::(?P<description>.*)\.\.
		]]></parseRE>
	"""
	# The docstring above is a raw string because the example contains
	# backslashes that would otherwise be invalid escape sequences.
	name_ = "freeREGrammar"

	# RE used to cut the input into records; the default matches one
	# non-empty line including its trailing newline.
	_rowProduction = regrammar.REAttribute("rowProduction",
		default=re.compile(r"(?m)^.+$\n"), description="RE matching a complete"
		" record.")
	# RE applied to each record; it has no default (base.Undefined).
	_parseRE = regrammar.REAttribute("parseRE", default=base.Undefined,
		description="RE containing named groups matching a record")
	# When true, the row iterator strips whitespace from the group values.
	_stripTokens = base.BooleanAttribute("stripTokens", default=False,
		description="Strip whitespace from result tokens?")
	# When true, material between (and after) records is skipped rather
	# than reported as an error.
	_ignoreJunk = base.BooleanAttribute("ignoreJunk", default=False,
		description="Ignore everything outside of the row production")
	rowIterator = RowIterator
Source code for gavo.grammars.freeregrammar

gavo

Navigation

Related Topics