Source code for gavo.grammars.hdf5grammar
"""
A grammar producing rows from a table within an HDF5 file.
"""
#c Copyright 2008-2023, the GAVO project <gavo@ari.uni-heidelberg.de>
#c
#c This program is free software, covered by the GNU GPL. See the
#c COPYING file in the source distribution.
import h5py
from gavo.grammars import common
from gavo import base


class AstropyHDF5TableIterator(common.RowIterator):
	"""A row iterator generating rawdicts from Astropy-serialised HDF5
	tables.

	The table is assumed to contain record arrays; NULL values are properly
	handled through associated .mask columns.
	"""
	def _makeRowBuilder(self, sourceDS):
		"""returns a function that builds rawdicts from an h5py dataset
		sourceDS.

		It is aware of the astropy convention of adding boolean .mask
		columns and uses these to produce Nones.
		"""
		names, nullMap = [], {}
		for index, name in enumerate(sourceDS.dtype.names):
			if name.endswith(".mask"):
				nullMap[index] = name[:-5]
				names.append(None)
			else:
				names.append(name)

		if nullMap:
			def makeRow(row):
				res = dict(zip(names, row))
				del res[None]
				for index, name in nullMap.items():
					if row[index]:
						res[name] = None
				return res
			return makeRow
		else:
			return lambda row: dict(zip(names, row))

	def _iterRows(self):
		hdf = h5py.File(self.sourceToken, "r")
		try:
			sourceDS = hdf[self.grammar.dataset]
		except KeyError:
			raise base.ReportableError(f"Dataset {self.grammar.dataset} not"
				f" found in {self.sourceToken}. The following datasets"
				" are visible in the root: "+(", ".join(hdf.keys())))

		buildRow = self._makeRowBuilder(sourceDS)
		for row in sourceDS:
			yield buildRow(row)
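

# What follows is a minimal illustrative sketch, not part of the module's
# API: the column names and values are made up.  It shows what the row
# builder returned by _makeRowBuilder does with an astropy-style ".mask"
# column and can be called as a quick sanity check.
def _sketchAstropyMaskHandling():
	import numpy
	rec = numpy.array(
		[(10.5, 0.25, True), (11.0, -0.5, False)],
		dtype=[("ra", "f8"), ("dec", "f8"), ("dec.mask", "?")])
	buildRow = AstropyHDF5TableIterator._makeRowBuilder(None, rec)
	first, second = buildRow(rec[0]), buildRow(rec[1])
	# dec is masked in the first record and hence comes out as None
	assert first["dec"] is None and first["ra"]==10.5
	assert second["dec"]==-0.5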


class VaexHDF5TableIterator(common.RowIterator):
	"""A row iterator generating rawdicts from Vaex-serialised HDF5
	tables.

	Here, the columns come in separate arrays, much like FITS tables.
	"""
	_chunkSize = 10000

	def _iterRows(self):
		hdf = h5py.File(self.sourceToken, "r")
		try:
			sourceDS = hdf[self.grammar.dataset]
		except KeyError:
			raise base.ReportableError(f"Dataset {self.grammar.dataset} not"
				f" found in {self.sourceToken}. Note that we want the"
				" parent of the columns group here. The following datasets"
				" are visible in the root: "+(", ".join(hdf.keys())))

		cols = sourceDS["columns"]
		names = cols.keys()
		arrs = [c["data"] for c in cols.values()]
		for offset in range(0, arrs[0].shape[0], self._chunkSize):
			curArrs = [arr[offset:offset+self._chunkSize]
				for arr in arrs]
			for index in range(len(curArrs[0])):
				yield dict(zip(names, [arr[index] for arr in curArrs]))
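

# Again a minimal illustrative sketch rather than part of the module: all
# names are invented.  It spells out the group layout VaexHDF5TableIterator
# expects -- one group per column below a "columns" group, each holding its
# values in a "data" array -- by writing a toy file and reading it back
# through the iterator, bypassing the grammar machinery.
def _sketchVaexLayout(path):
	with h5py.File(path, "w") as hdf:
		hdf.create_dataset("table/columns/ra/data", data=[10.5, 11.0])
		hdf.create_dataset("table/columns/dec/data", data=[0.25, -0.5])

	class FakeGrammar:
		dataset = "table"

	it = object.__new__(VaexHDF5TableIterator)
	it.sourceToken, it.grammar = path, FakeGrammar
	# yields {'dec': 0.25, 'ra': 10.5}, then {'dec': -0.5, 'ra': 11.0}
	return list(it._iterRows())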


class HDF5Grammar(common.Grammar):
	"""a grammar for parsing single tables from HDF5 files.

	These result in typed records, i.e., values normally come in
	the types they are supposed to have.  The keys in the rows are
	the column names as given in the HDF5 file.

	Regrettably, there are about as many conventions for serialising tables
	in HDF5 as there are programs writing HDF5.  This grammar supports
	a few styles; ask to have more included.

	Styles currently implemented:

	:astropy:
		The table comes as a record array.  The grammar is aware of the
		astropy convention of adding boolean mask columns named
		name+".mask" and will turn masked values into Nones.
	:vaex:
		The table comes as a group with the columns as individual arrays
		in the group members' data datasets.  Put the parent of the columns
		group into the dataset attribute here.

	This class is not intended for ingesting large HDF5 files, as it will
	only process a few thousand rows per second on usual hardware.  Use
	`Element directgrammar`_ for large files.
	"""
	name_ = "hdf5Grammar"

	_dataset = base.UnicodeAttribute("dataset",
		default=base.Undefined,
		description="The name of the HDF5 dataset/group containing the table."
			" At this point, only datasets that are children of root are"
			" supported.",
		copyable=True)

	_style = base.EnumeratedUnicodeAttribute("style",
		default="astropy",
		validValues=["astropy", "vaex"],
		description="Style of the table serialisation.",
		copyable=True)

	rowIterator = AstropyHDF5TableIterator

	def onElementComplete(self):
		if self.style=="vaex":
			self.rowIterator = VaexHDF5TableIterator
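

# For orientation only: in a resource descriptor, this grammar would be
# referenced roughly like the following hypothetical snippet (file pattern,
# dataset and table names are made up):
#
#   <data id="import">
#     <sources pattern="data/*.hdf5"/>
#     <hdf5Grammar dataset="observations" style="vaex"/>
#     <make table="main"/>
#   </data>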


if __name__=="__main__":
	f = h5py.File("/home/msdemlei/tmp/hdparse/hdex.hdf5", "r")
	table = f["testdata"]
	print(table.shape)
	print(table.dtype)
	for row in table:
		print(row)