"""
Support code for linetap.
This, in particular, contains bindings to the InChI library.
Use getILib() to obtain the bindings, and see the _InChIBindings
docstrings on what you can all. It probably helps to be aware of
https://www.inchi-trust.org/wp/download/104/InChI_API_Reference.pdf
although the binding is substantially different.
This binding probably leaks memory left and right; I have not given
this a great deal of thought so far, as I don't expect InChI manipulations
will be common in long-running code. If that's overly optimistic, then
we'll need a round of leak hunting.
"""
#c Copyright 2008-2023, the GAVO project <gavo@ari.uni-heidelberg.de>
#c
#c This program is free software, covered by the GNU GPL. See the
#c COPYING file in the source distribution.
import contextlib
import ctypes
import functools
import re
import warnings
from ctypes.util import find_library
from gavo import base
from gavo import utils
[docs]class InChIError(utils.Error):
"""is raised when a libinchi function indicates an error has
occurred.
This is usually rather cryptic but ought to return a numeric
code that can be decoded using the libinchi API document.
"""
_EL_DATA_RE = re.compile(r'\{\s*"([A-Za-z]+)",' # Element name
r"\s*(\d+)," # average mass
r"\s*(\d+)," # norm mass
r"\s*([\d.]+)," # avg mass
r"\s*(METAL2?|0)\s*," # type
r"\s*(\d+)," # electronegativity
r"\s*(\d)," # "No H" (no clue:-)
r"\s*([0-9,{}]+)"
r"\}")
def _parseInchiElData(line):
"""returns a dictionary of parsed items for a line in data/element_data.txt.
These are C structure literals, but I blindly kill things with REs.
"""
mat = _EL_DATA_RE.match(line)
return {
"name": mat.group(1),
"avg weight": int(mat.group(2)),
"weight": int(mat.group(3)),
"exact weight": float(mat.group(4)),
"type": mat.group(5),
"el. neg": int(mat.group(6)),
# TODO: parse out valences, perhaps do something with "No H"
}
[docs]@functools.lru_cache(None)
def getElementData():
"""returns a dictionary from element names to a dictionary of
element properties.
"""
elementData = {}
with base.openDistFile("data/element_data.txt") as f:
for lineNumber, line in utils.iterSimpleText(f):
el = _parseInchiElData(line)
elementData[el["name"]] = el
return elementData
# The following values should really be parsed from /usr/include/inchi_api.h
# Maximum bonds per atom
MAXVAL = 20
ATOM_EL_LEN = 6
class _inchi_Atom(ctypes.Structure):
_fields_ = [
("x", ctypes.c_double),
("y", ctypes.c_double),
("z", ctypes.c_double),
("neighbor", ctypes.c_short*MAXVAL),
("bond_type", ctypes.c_byte*MAXVAL),
("bond_stereo", ctypes.c_char*MAXVAL),
("elname", ctypes.c_char*ATOM_EL_LEN),
("num_bonds", ctypes.c_short),
("num_iso_H", ctypes.c_byte*4),
("isotopic_mass", ctypes.c_short),
("radical", ctypes.c_char),
("charge", ctypes.c_byte)]
def connect(self, otherAtomIndex, bond_type):
"""Adds a connection to otherAtomIndex.
User code will probably rather use _inchi_Input's connect method,
as it usually cannot know otherAtomIndex.
"""
if self.num_bonds==MAXVAL:
raise InChIError("Too many connections")
self.neighbor[self.num_bonds] = otherAtomIndex
self.bond_type[self.num_bonds] = bond_type
self.num_bonds += 1
def getElementInfo(self):
res = getElementData()[utils.debytify(self.elname)]
if self.isotopic_mass:
res = res.copy()
if self.isotopic_mass>1000:
res["weight"] += self.isotopic_mass-10000
else:
res["weight"] = self.isotopic_mass
return res
class _inchi_Stereo0D(ctypes.Structure):
_fields_ = [
("neighbor", ctypes.c_short*4),
("central_atom", ctypes.c_short),
("type", ctypes.c_char),
("parity", ctypes.c_char),
]
class _MoleculeMixin:
"""a mixin for structs having atoms.
"""
def iterAtoms(self):
for atomIndex in range(self.num_atoms):
yield self.atom[atomIndex]
def getTotalWeight(self):
"""returns the total molecular weight of the substance.
This takes isotopy into account.
"""
totalWeight = 0
for atom in self.iterAtoms():
totalWeight += atom.getElementInfo()["weight"]
for hIso in range(1, 4):
totalWeight += atom.num_iso_H[hIso]*hIso
# no idea what the implicit H-s should really count as.
totalWeight += atom.num_iso_H[0]
return totalWeight
def getTotalCharge(self):
"""returns the total charge of all compounds.
"""
return sum(atom.charge
for atom in self.iterAtoms())
class _inchi_Input(ctypes.Structure, _MoleculeMixin):
_fields_ = [
("atom", ctypes.POINTER(_inchi_Atom)),
("stereo0D", ctypes.POINTER(_inchi_Stereo0D)),
("szOptions", ctypes.c_char_p),
("num_atoms", ctypes.c_int),
("num_stereo0D", ctypes.c_int),]
def connect(self, atomIndex1, atomIndex2, bond_type=1):
"""add a bond between the atom at index1 and index2.
"""
if (not 0<=atomIndex1<self.num_atoms
or not 0<=atomIndex2<self.num_atoms):
raise InChIError("Atom indexes must be < {} here".format(
self.num_atoms))
self.atom[atomIndex1].connect(atomIndex2, bond_type)
self.atom[atomIndex2].connect(atomIndex1, bond_type)
class _inchi_OutputStruct(ctypes.Structure, _MoleculeMixin):
_fields_ = [
("atom", ctypes.POINTER(_inchi_Atom)),
("stereo0D", ctypes.POINTER(_inchi_Stereo0D)),
("num_atoms", ctypes.c_short),
("num_stereo0D", ctypes.c_short),
("szMessage", ctypes.c_char_p),
("szLog", ctypes.c_char_p),
("WarningFlags", ctypes.c_long*4)]
class _inchi_Output(ctypes.Structure):
_fields_ = [
("szInChI", ctypes.c_char_p),
("szAuxInfo", ctypes.c_char_p),
("szMessage", ctypes.c_char_p),
("szLog", ctypes.c_char_p)]
class _inchi_InputINCHI(ctypes.Structure):
_fields_ = [
("szInChI", ctypes.c_char_p),
("szOptions", ctypes.c_char_p),
]
class _InChIBinding:
"""A class encapsulating our binding to libinchi.
This is intended to be a singleton; users ought to obtain it through
the getInChI function below.
This will raise a ReportableError if the inchi C library cannot be found.
"""
_libname = "inchi"
def __init__(self):
self.libPath = find_library(self._libname)
if self.libPath is None:
raise base.ReportableError("No libinchi found.",
hint="On Debian systems, install libinchi1 to get it.")
self.lib = ctypes.CDLL(self.libPath)
for wrappedName, argtypes in [
("CheckINCHI", [ctypes.c_char_p, ctypes.c_int]),
("GetStdINCHIKeyFromStdINCHI",
[ctypes.c_char_p, ctypes.c_char_p]),
("GetINCHIKeyFromINCHI",
[ctypes.c_char_p, ctypes.c_int, ctypes.c_int, ctypes.c_char_p,
ctypes.c_char_p, ctypes.c_char_p]),
("GetINCHI",
[ctypes.POINTER(_inchi_Input), ctypes.POINTER(_inchi_Output)]),
("GetStdINCHI",
[ctypes.POINTER(_inchi_Input), ctypes.POINTER(_inchi_Output)]),
("GetINCHIfromINCHI",
[ctypes.POINTER(_inchi_InputINCHI), ctypes.POINTER(_inchi_Output)]),
("GetStructFromINCHI",
[ctypes.POINTER(_inchi_InputINCHI),
ctypes.POINTER(_inchi_OutputStruct)]),
("FreeStdINCHI", [ctypes.POINTER(_inchi_Output)]),
("FreeStructFromINCHI", [ctypes.POINTER(_inchi_OutputStruct)]),
]:
setattr(self, "_"+wrappedName, getattr(self.lib, wrappedName))
getattr(self, "_"+wrappedName).argtypes = argtypes
def checkInChI(self, inchi, strict=1):
"""returns 0 if inchi is ok, something else (cf. API docs) otherwise.
"""
return self._CheckINCHI(utils.bytify(inchi), ctypes.c_int(strict))
def getInChIKey(self, inChI):
"""returns an InChIKey for a standard InChI.
"""
buf = ctypes.create_string_buffer(28)
retval = self._GetINCHIKeyFromINCHI(
utils.bytify(inChI), 0, 0, buf, None, None)
if retval!=0:
raise InChIError(f"Could not generate InChIKey: {retval}")
return utils.debytify(buf.value)
def getInput(self, atoms):
"""returns an _inchi_Input for a list of inchi_Atoms.
This is what you need to call to generate an InChI(Key). Essentially,
you create atoms using getAtom, put them into a list, pass them
to getInput and then call connect(index1, index2) to make the
molecule connections.
"""
atomsArray = (_inchi_Atom*len(atoms))(*atoms)
return _inchi_Input(
atom=atomsArray,
stereo0D=None,
szOptions=b"-DoNotAddH",
num_atoms=len(atoms),
num_stereo0D=0)
def getAtom(self, elname, isotopic_mass=0, charge=0):
"""returns a new atom.
Only pass isotopic_mass if you actually care about isotopism.
"""
neighbor = (ctypes.c_short*MAXVAL)(*([0]*MAXVAL))
bond_type = (ctypes.c_byte*MAXVAL)(*([0]*MAXVAL))
return _inchi_Atom(
x=0, y=0, z=0,
neighbor=neighbor,
bond_type=bond_type, # (i.e., single, double... binding)
bond_stereo=b"", # (let's see when we want it)
elname=utils.bytify(elname),
num_bonds=0,
num_iso_H=(ctypes.c_byte*4)(0, 0, 0, 0),
isotopic_mass=isotopic_mass,
radical=0,
charge=ctypes.c_byte(charge))
def _handleAndFree(self, retval, output, input=None):
try:
self._interpretICReturnCode(retval, output, input)
return utils.debytify(output.szInChI)
finally:
self._FreeStdINCHI(output)
_errcodes = {
0: "Success",
2: "No InChi created",
3: "Severe error (out of memory?)",
4: "Unknown error",
5: "Previous call has not returned yet",
-1: "No structural data provided"}
def _interpretICReturnCode(self, retval, output, input=None):
if retval==1:
if input is not None:
warnings.warn("Source for the following warning: "+input)
warnings.warn("InChI warning: "+utils.debytify(output.szMessage))
elif retval!=0:
raise InChIError(utils.debytify(output.szMessage
or self._errcodes.get(retval, "(Unclear problem)")),
hint=utils.debytify(output.szLog))
def getInChI(self, inchiInput, rawInput=None):
"""returns the InChI of the molecule described through inchiInput
as a string.
"""
output = _inchi_Output()
res = self._GetStdINCHI(inchiInput, output)
return self._handleAndFree(res, output, rawInput)
def normalizeInChI(self, inChI):
"""returns a string inChI in normalised form.
"""
output = _inchi_Output()
res = self._GetINCHIfromINCHI(
_inchi_InputINCHI(utils.bytify(inChI)), output)
return self._handleAndFree(res, output)
_dcodeAtom = re.compile(r"(?P<iso>\d+)?(?P<at>[A-Z][a-z]?)(?P<ch>[-+]+)?$")
_dcodeConn = re.compile(r"(\d+)([-=#])(\d+)$")
_dcodeMult = {'-': 1, '=': 2, '#': 3}
def _freeStructAfterUse(self, struct):
"""returns a context manager returning struct and freeing it once
the controlled block is through.
"""
def _():
try:
yield struct
finally:
self._FreeStructFromINCHI(struct)
return contextlib.contextmanager(_)()
def parseInChI(self, inchi):
"""returns a context manager for a parsed inchi from a string inchi.
The result is probably not fun to work with, but it has
getTotalWeight and getTotalCharge methods that are handy
for filling LineTAP tables with incoming inchis.
This returns a context manager so the stuff allocated by
libinchi gets freed again. So, you'll use this like::
with ilib.parseInChI("InChI=1S...") as molecule:
(do something)
"""
struct = _inchi_OutputStruct()
input = _inchi_InputINCHI(utils.bytify(inchi))
retval = self._GetStructFromINCHI(input, struct)
self._interpretICReturnCode(retval, struct, inchi)
return self._freeStructAfterUse(struct)
def getInChIFromCode(self, code):
"""returns a pair on (input, InChI) from a rather simple heuristic code.
The input part may be useful if you want to manipulate it further;
it's an _inchi_Input with iterAtoms and getTotalWeight methods.
This consists of whitespace-separated atoms or connections in arbitrary
order. An atom is [isotope]Atom[charge], where isotope is a number
and charge a number of + or -. A connection is index(-=#)index, where
index enumerates the atom, 1-based and -,=,# is a single, double, or
triple bind.
Yeah, I know it sucks there's *another* molecule input format, but,
really, the chemistry people drove me crazy.
Examples for this are found in tests/slaptest.py
"""
atoms, connections = [], []
for tok in code.split():
mat = self._dcodeConn.match(tok)
if mat:
connections.append((self._dcodeMult[mat.group(2)],
int(mat.group(1)), int(mat.group(3))))
continue
mat = self._dcodeAtom.match(tok)
if mat:
pars = mat.groupdict("")
charge = 0
if pars["ch"]:
charge = len(pars["ch"])*{"+": 1, "-": -1}[pars["ch"][0]]
atoms.append(self.getAtom(
mat.group("at"),
int(mat.group("iso") or "0"),
charge))
continue
raise ValueError(f"Invalid molecule code token: '{tok}'")
mol = self.getInput(atoms)
for bond_type, index1, index2 in connections:
mol.connect(index1-1, index2-1, bond_type)
return mol, self.getInChI(mol, code)
[docs]@functools.lru_cache(None)
def getILib():
"""returns a shallow binding to the InChI library.
This will always return the same object if called multiple times.
It will raise a ReportableError if the library is not available.
"""
return _InChIBinding()