Source code for gavo.adql.grammar

"""
A parser for ADQL.

The grammar follows the official BNF grammar quite closely, except where
pyparsing makes a different approach desirable; the names should mostly
match except for the obvious underscore to camel case map.

The grammar given in the spec has some nasty rules when you're parsing
without backtracking and by recursive descent (which is what pyparsing
does).  I need some reformulations.  The more interesting of those
include:

TableReference
--------------

Trouble is that table_reference is left-recursive in the following rules::

  <table_reference> ::=
	 <table_name> [ <correlation_specification> ]
   | <derived_table> <correlation_specification>
   | <joined_table>

  <joined_table> ::=
	  <qualified_join>
	| <left_paren> <joined_table> <right_paren>

  <qualified_join> ::=
	  <table_reference> [ NATURAL ] [ <join_type> ] JOIN
	  <table_reference> [ <join_specification> ]

We fix this by adding rules::

	<sub_join> ::= '(' <joinedTable> ')'
  <join_opener> ::=
	 <table_name> [ <correlation_specification> ]
   | <derived_table> <correlation_specification>
	 | <sub_join>

and then writing::

  <qualified_join> ::=
	  <join_opener> [ NATURAL ] [ <join_type> ] JOIN
	  <table_reference> [ <join_specification> ]



statement
---------

I can't have StringEnd appended to querySpecification since it's used
in subqueries, but I need to have it to keep pyparsing from just matching
parts of the input.  Thus, the top-level production is for "statement".


trig_function, math_function, system_defined_function
-----------------------------------------------------

I think it's a bit funny to have the arity of functions in the syntax, but
there you go.  Anyway, I don't want to have the function names in separate
symbols since they are expensive but go for a Regex (trig1ArgFunctionName).
The only exception is ATAN since it has a different arity from the rest of the
lot.

Similarly, for math_function I group symbols by arity.

The system defined functions are also regrouped to keep the number of
symbols reasonable.

column_reference and below
--------------------------

Here the lack of backtracking hurts badly, since once, say, schema name
is matched with a dot that's it, even if the dot should really have separated
schema and table.

Hence, we don't assign semantic labels in the grammar but leave that to
whatever interprets the tokens.

The important rules here are::

	<column_name> ::= <identifier>
	<correlation_name> ::= <identifier>
	<catalog_name> ::= <identifier>
	<unqualified_schema name> ::= <identifier>
	<schema_name> ::= [ <catalog_name> <period> ] <unqualified_schema name>
	<table_name> ::= [ <schema_name> <period> ] <identifier>
	<qualifier> ::= <table_name> | <correlation_name>
	<column_reference> ::= [ <qualifier> <period> ] <column_name>

By substitution, one has::

	<schema_name> ::= [ <identifier> <period> ] <identifier>

hence::

	<table_name> ::= [[ <identifier> <period> ] <identifier> <period> ]
		<identifier>

hence::

	<qualifier> ::= [[ <identifier> <period> ] <identifier> <period> ]
		<identifier>

(which matches both table_name and correlation_name) and thus::

	<column_reference> ::= [[[ <identifier> <period> ] <identifier> <period> ]
		<identifier> <period> ] <identifier>

We need the table_name, qualifier, and column_reference productions.


generalLiterals in unsignedLiterals
-----------------------------------

One point I'm deviating from the published grammar is that I disallow
generalLiterals in unsignedLiterals.  Allowing them would let pyparsing
match a string literal as a numericValueLiteral, which messes up
string expressions.  I'm not sure why generalLiterals are allowed
in there anyway.  If this bites at some point, we'll face a major rewrite
of the grammar (or we need to dump pyparsing).

To make the whole thing work, I added the generalLiteral to the
characterPrimary production.
"""

#c Copyright 2008-2023, the GAVO project <gavo@ari.uni-heidelberg.de>
#c
#c This program is free software, covered by the GNU GPL.  See the
#c COPYING file in the source distribution.


import functools

from gavo.utils.parsetricks import (
	Word, Literal, Optional, alphas, CaselessKeyword,
	ZeroOrMore, OneOrMore, StringEnd,
	Forward,
	MatchFirstWithSaneError,   # temporary hack, see parsetricks
	CaselessLiteral, ParseException, Regex, sglQuotedString, alphanums,
	White,
	ParseExpression,
	Suppress, pyparsingWhitechars)
from gavo.utils.parsetricks import ParseSyntaxException #noflake: exported name

from gavo import utils
from gavo import stc
from gavo.adql import bindinggrammar

__docformat__ = "restructuredtext en"

import sys

# all SQL and ADQL reserved words are expected in uppercase by this and
# other modules.
# Words reserved by ADQL on top of plain SQL (function names, TOP, OFFSET
# and friends).
ADQL_RESERVED_WORDS = {
	"ABS", "ACOS", "AREA", "ASIN", "ATAN", "ATAN2",
	"BITWISE_AND", "BITWISE_NOT", "BITWISE_OR", "BITWISE_XOR", "BOX",
	"CEILING", "CENTROID", "CIRCLE", "CONTAINS", "COORD1", "COORD2",
	"COORDSYS", "COS",
	"DEGREES", "DISTANCE",
	"EXP",
	"FLOOR",
	"INTERSECTS",
	"LATITUDE", "LOG", "LOG10",
	"MOC", "MOD",
	"OFFSET",
	"PI", "POINT", "POLYGON", "POWER",
	"RADIANS", "RAND", "REGION", "ROUND",
	"SIN", "SQRT", "SQUARE", "STDDEV",
	"TAN", "TOP", "TRUNCATE",
}

# Words reserved by SQL proper (plus TABLESAMPLE).
SQL_RESERVED_WORDS = {
	"ABSOLUTE", "ACTION", "ADD", "ALL", "ALLOCATE", "ALTER", "AND", "ANY",
	"ARE", "AS", "ASC", "ASSERTION", "AT", "AUTHORIZATION", "AVG",
	"BEGIN", "BETWEEN", "BIT", "BIT_LENGTH", "BOTH", "BY",
	"CASCADE", "CASCADED", "CASE", "CAST", "CATALOG", "CHAR", "CHARACTER",
	"CHAR_LENGTH", "CHARACTER_LENGTH", "CHECK", "CLOSE", "COALESCE",
	"COLLATE", "COLLATION", "COLUMN", "COMMIT", "CONNECT", "CONNECTION",
	"CONSTRAINT", "CONSTRAINTS", "CONTINUE", "CONVERT", "CORRESPONDING",
	"COUNT", "CREATE", "CROSS", "CURRENT", "CURRENT_DATE", "CURRENT_TIME",
	"CURRENT_TIMESTAMP", "CURRENT_USER", "CURSOR",
	"DATE", "DAY", "DEALLOCATE", "DECIMAL", "DECLARE", "DEFAULT",
	"DEFERRABLE", "DEFERRED", "DELETE", "DESC", "DESCRIBE", "DESCRIPTOR",
	"DIAGNOSTICS", "DISCONNECT", "DISTINCT", "DOMAIN", "DOUBLE", "DROP",
	"ELSE", "END", "END-EXEC", "ESCAPE", "EXCEPT", "EXCEPTION", "EXEC",
	"EXECUTE", "EXISTS", "EXTERNAL", "EXTRACT",
	"FALSE", "FETCH", "FIRST", "FLOAT", "FOR", "FOREIGN", "FOUND", "FROM",
	"FULL",
	"GET", "GLOBAL", "GO", "GOTO", "GRANT", "GROUP",
	"HAVING", "HOUR",
	"IDENTITY", "IMMEDIATE", "IN", "INDICATOR", "INITIALLY", "INNER",
	"INPUT", "INSENSITIVE", "INSERT", "INT", "INTEGER", "INTERSECT",
	"INTERVAL", "INTO", "IS", "ISOLATION",
	"JOIN",
	"KEY",
	"LANGUAGE", "LAST", "LEADING", "LEFT", "LEVEL", "LIKE", "LOCAL", "LOWER",
	"MATCH", "MAX", "MIN", "MINUTE", "MODULE", "MONTH",
	"NAMES", "NATIONAL", "NATURAL", "NCHAR", "NEXT", "NO", "NOT", "NULL",
	"NULLIF", "NUMERIC",
	"OCTET_LENGTH", "OF", "ON", "ONLY", "OPEN", "OPTION", "OR", "ORDER",
	"OUTER", "OUTPUT", "OVERLAPS",
	"PAD", "PARTIAL", "POSITION", "PRECISION", "PREPARE", "PRESERVE",
	"PRIMARY", "PRIOR", "PRIVILEGES", "PROCEDURE", "PUBLIC",
	"READ", "REAL", "REFERENCES", "RELATIVE", "RESTRICT", "REVOKE", "RIGHT",
	"ROLLBACK", "ROWS",
	"SCHEMA", "SCROLL", "SECOND", "SECTION", "SELECT", "SESSION",
	"SESSION_USER", "SET", "SIZE", "SMALLINT", "SOME", "SPACE", "SQL",
	"SQLCODE", "SQLERROR", "SQLSTATE", "SUBSTRING", "SUM", "SYSTEM_USER",
	"TABLE", "TABLESAMPLE", "TEMPORARY", "THEN", "TIME", "TIMESTAMP",
	"TIMEZONE_HOUR", "TIMEZONE_MINUTE", "TO", "TRAILING", "TRANSACTION",
	"TRANSLATE", "TRANSLATION", "TRIM", "TRUE",
	"UNION", "UNIQUE", "UNKNOWN", "UPDATE", "UPPER", "USAGE", "USER", "USING",
	"VALUE", "VALUES", "VARCHAR", "VARYING", "VIEW",
	"WHEN", "WHENEVER", "WHERE", "WITH", "WORK", "WRITE",
	"YEAR",
	"ZONE",
}

# Everything a regular (unquoted) identifier must not be.
ALL_RESERVED_WORDS = ADQL_RESERVED_WORDS | SQL_RESERVED_WORDS


# A regular expression for prefixes of user defined functions.  It is
# spliced into the pattern for userDefinedFunctionName in
# getADQLGrammarCopy, where it is followed by an underscore and the
# rest of the function name.
userFunctionPrefix = "(gavo|ivo)"


def _makeQuotedName(s, p, t):
	"""returns a utils.QuotedName for the first token in t.

	This is the parse action for delimitedIdentifier: it strips the
	enclosing quote characters and turns doubled double quotes into
	single ones.

	It is no longer necessary when we can rely on a working pyparsing
	QuotedString.
	"""
	withQuotes = str(t[0])
	content = withQuotes[1:-1]
	return utils.QuotedName(content.replace('""', '"'))


def Args(pyparseSymbol):
	"""returns pyparseSymbol with the results name "args" attached.

	listAllMatches is switched on, so every match is collected in an
	args list on the parent node rather than only the last one.
	"""
	namedSymbol = pyparseSymbol.setResultsName("args", listAllMatches=True)
	return namedSymbol
class RegularIdentifier(Word):
	"""A Word matching C-style identifiers that are not reserved words.

	Filtering reserved words in a parse action doesn't always work
	properly for all versions of pyparsing, thus this special class.

	reservedWords are assumed to be in upper case; the comparison is
	done on the upper-cased match, so matching is case-insensitive.
	"""
	def __init__(self, reservedWords):
		self.reservedWords = reservedWords
		initialChars = alphas+"_"
		followingChars = alphanums+"_"
		super(RegularIdentifier, self).__init__(initialChars, followingChars)
		# oh wow.  in pyparsing 2.4, Word sets __class__, which we
		# can't have.  Let's undo the damage.
		self.__class__ = RegularIdentifier

	def parseImpl(self, instring, loc, doActions=True):
		"""matches a word as Word does, but rejects reserved words.

		Raises a ParseException at loc if the upper-cased match is in
		self.reservedWords.
		"""
		endLoc, word = super(RegularIdentifier, self).parseImpl(
			instring, loc, doActions)
		if word.upper() not in self.reservedWords:
			return endLoc, word
		raise ParseException(instring, loc, "Reserved word not allowed here")
class LongestMatch(ParseExpression):
	"""pyparsing's Or, except that ParseFatalExceptions are still propagated.

	All alternatives are tried without parse actions; the one consuming
	most input (the first one on ties) is then parsed again for real.
	"""
	def __init__(self, exprs, savelist=False):
		super().__init__(exprs, savelist)
		# we may match the empty string as soon as one alternative may
		self.mayReturnEmpty = any(e.mayReturnEmpty for e in self.exprs)

	def parseImpl(self, instring, loc, doActions=True):
		"""returns the parse result of the longest-matching alternative.

		If no alternative matches, the ParseException that got furthest
		into instring is re-raised.  Only ParseException and IndexError
		are handled here; anything else propagates to the caller.
		"""
		furthestError, furthestErrorLoc = None, -1
		bestExpr, bestEnd = None, -1

		for candidate in self.exprs:
			try:
				# dry run: we only want to know how far this would get
				endLoc = candidate._parse(instring, loc, doActions=False)[0]
			except ParseException as failure:
				if failure.loc>furthestErrorLoc:
					furthestError = failure
					furthestErrorLoc = failure.loc
			except IndexError:
				# running off the input counts as a failure at its end
				if len(instring)>furthestErrorLoc:
					furthestError = ParseException(
						instring, len(instring), candidate.errmsg, self)
					furthestErrorLoc = len(instring)
			else:
				if endLoc>bestEnd:
					bestEnd = endLoc
					bestExpr = candidate

		if bestEnd<0:
			if furthestError is None:
				raise ParseException(instring, loc,
					"no defined alternatives to match", self)
			raise furthestError

		return bestExpr._parse(instring, loc, doActions)

	def __str__( self ):
		if hasattr(self,"name"):
			return self.name

		if self.strRepr is None:
			alternatives = " ^ ".join(map(str, self.exprs))
			self.strRepr = "{" + alternatives + "}"
		return self.strRepr

	def checkRecursion( self, parseElementList ):
		"""runs checkRecursion on all alternatives, with self added to the
		list of elements already seen.
		"""
		seenWithSelf = parseElementList[:] + [ self ]
		for alternative in self.exprs:
			alternative.checkRecursion( seenWithSelf )
def getADQLGrammarCopy(nodes):
	"""returns a pair symbols, selectSymbol for a grammar parsing ADQL.

	The first item is what the AutoBindingGrammar built from nodes returns
	from getSymbols_(), the second is the statement production, i.e.,
	querySpecification followed by optional whitespace and the end of
	the input, with SQL comments ignored.

	nodes is handed through to bindinggrammar.AutoBindingGrammar
	unchanged.

	You should only use this if you actually require a fresh copy
	of the ADQL grammar.  Otherwise, use getADQLGrammar or a wrapper
	function defined by a client module.
	"""
	g = bindinggrammar.AutoBindingGrammar(nodes)

	with pyparsingWhitechars("\n\t\r "):
		# Fundamental syntactic units and literals
		g.sqlComment = Regex("--[^\n\r]*")
		g.whitespace = Word(" \t\n")   # need that as a hack now and then to keep
			# numbers and identifiers from sticking
		g.unsignedInteger = Regex(r"\d+")
		g.unsignedInteger.setName("unsigned integer")
		exactNumericRE = r"\d+(\.(\d+)?)?|\.\d+"
		g.exactNumericLiteral = Regex(exactNumericRE)
		g.approximateNumericLiteral = Regex(r"(?i)(%s)E[+-]?\d+"%exactNumericRE)
		# hex literals are handed on as decimal strings
		g.unsignedHexadecimal = Regex("0[xX][0-9A-Fa-f]+"
			).addParseAction(lambda s,p,t: str(int(t[0], 16))
			)
		g.unsignedHexadecimal.setName("numeric literal")
		g.unsignedNumericLiteral = (
			g.unsignedHexadecimal
			| g.approximateNumericLiteral
			| g.exactNumericLiteral)
		g.unsignedNumericLiteral.setName("numericLiteral")
		g.characterStringLiteral = sglQuotedString + ZeroOrMore( sglQuotedString )
		g.generalLiteral = g.characterStringLiteral.copy()
		g.unsignedLiteral = g.unsignedNumericLiteral
		# !!! DEVIATION: we're not accepting | generalLiteral here.
		g.sign = Literal("+") | "-"
		g.signedInteger = (Optional( g.sign ) + g.unsignedInteger)
		g.signedInteger.setName("signed integer")
		g.nullLiteral = CaselessKeyword("NULL")
		g.nullLiteral.setName("NULL")

		g.regularIdentifier = RegularIdentifier(ALL_RESERVED_WORDS)
		g.regularIdentifier.setName("identifier")
		# There's a bug with QuotedString in some versions of pyparsing.
		# So, don't use this:
		# delimitedIdentifier = QuotedString(quoteChar='"', escQuote='"',
		#   unquoteResults=True).addParseAction(
		#     lambda s,p,t: utils.QuotedName(str(t)))
		# but rather
		g.delimitedIdentifier = Regex('("[^"]*")+').addParseAction(
			_makeQuotedName)
		g.delimitedIdentifier.setName("delimited identifier")
		g.identifier = g.regularIdentifier | g.delimitedIdentifier

		# Operators
		g.compOp = Regex("=|!=|<=|>=|<>|<|>")
		g.compOp.setName("comparison operator")
		g.multOperator = Literal("*") | Literal("/")
		g.addOperator = Literal("+") | Literal("-")
		g.notKeyword = CaselessKeyword("NOT")

		# Column names and such
		g.columnName = g.identifier.copy()
		g.columnName.setName("column name")
		g.correlationName = g.identifier.copy()
		# up to three dot-separated identifiers; no semantic labels are
		# assigned here (see the module docstring)
		g.qualifier = (g.identifier
			+ Optional( "." + g.identifier )
			+ Optional( "." + g.identifier ))
		g.tableName = g.qualifier("tableName")
		g.columnReferenceByUCD = (
			CaselessKeyword("UCDCOL") + '(' + g.characterStringLiteral + ')')
		g.columnReference = (
			g.columnReferenceByUCD
			| g.identifier
				+ Optional( "." + g.identifier )
				+ Optional( "." + g.identifier )
				+ Optional( "." + g.identifier ))
		g.columnReference.setName("column reference")
		g.asClause = Optional(CaselessKeyword("AS")) + g.columnName("alias")

		g.valueExpression = Forward()

		# set functions
		g.setFunctionType = Regex("(?i)AVG|MAX|MIN|SUM|COUNT|STDDEV")
		# NOTE(review): g.setQuantifier is assigned again further down
		# ("parts of select clauses"); generalSetFunction is built from
		# this first definition.
		g.setQuantifier = Regex("(?i)DISTINCT|ALL")
		g.generalSetFunction = (g.setFunctionType("fName")
			+ '(' + Optional(g.setQuantifier) + Args(g.valueExpression) + ')')
		g.countAll = (CaselessLiteral("COUNT")("fName")
			+ '(' + Args(Literal('*')) + ')')
		g.setFunctionSpecification = (g.countAll | g.generalSetFunction)

		# cast
		g.castOperand = (
			g.valueExpression
			| g.nullLiteral)
		g.castOperand.setName("value to cast")
		g.length = g.unsignedInteger | '*'
		g.sqlTypeLength = Literal("(") - g.length - Literal(")")
		g.sqlAtomType = (
			((CaselessKeyword("CHAR") | CaselessKeyword("VARCHAR"))
				+ Optional(g.sqlTypeLength))
			| (CaselessKeyword("NATIONAL") - CaselessKeyword("CHAR")
				+ Optional(g.sqlTypeLength))
			| CaselessKeyword("INTEGER")
			| CaselessKeyword("BIGINT")
			| CaselessKeyword("SMALLINT")
			| CaselessKeyword("REAL")
			| CaselessKeyword("DOUBLE") + CaselessKeyword("PRECISION")
			| CaselessKeyword("TIMESTAMP")
			| CaselessKeyword("POINT")
			| CaselessKeyword("CIRCLE")
			| CaselessKeyword("POLYGON"))
		g.sqlArraySpec = (Literal("[")
			+ Optional(g.unsignedInteger)
			+ Literal("]"))
		# the cast target is collapsed into one upper-cased,
		# blank-separated string
		g.castTarget = (g.sqlAtomType + Optional(g.sqlArraySpec)).addParseAction(
			lambda s,p,t: " ".join(t).upper())
		g.castSpecification = (
			CaselessLiteral("CAST") - "("
			+ g.castOperand.setResultsName("value")
			- CaselessLiteral("AS")
			+ g.castTarget.setResultsName("newType")
			- ')')

		# value expressions
		g.arrayReference = (
			g.columnReference
			+ Literal('[')
			+ g.valueExpression
			+ Literal(']'))

		g.valueExpressionPrimary = (
			CaselessKeyword("NULL")
			| g.unsignedLiteral
			| g.arrayReference
			| g.columnReference
			| g.setFunctionSpecification
			# this should allow whole character value expressions by the
			# spec.  If we ever actually want that, we'd have to see how
			# to avoid infinite recursion.
			| g.generalLiteral
			| '(' + g.valueExpression + ')'
			| g.castSpecification)
		g.valueExpressionPrimary.setName("value expression")

		# string literal stuff
		g.characterPrimary = Forward()
		g.characterPrimary.setName("String expression")
		g.characterFactor = g.characterPrimary
		g.characterValueExpression = (g.characterFactor
			+ ZeroOrMore("||" + g.characterFactor))
		g.stringValueExpression = g.characterValueExpression

		# numeric expressions/terms
		g.numericValueExpression = Forward()
		g.numericValueExpression.setName("Numeric expression")
		g.numericValueFunction = Forward()
		g.numericValueFunction.setName("Numeric value function")
		g.numericExpressionPrimary = (
			g.unsignedLiteral
			| g.columnReference
			| g.setFunctionSpecification
			| '(' + g.valueExpression + ')')
		g.numericPrimary = (
			g.numericValueFunction
			| g.valueExpressionPrimary)
		g.numericPrimary.setName("numeric expression")
		g.factor = Optional(g.sign) + g.numericPrimary
		g.term = (g.factor + ZeroOrMore(g.multOperator + g.factor))
		g.numericValueExpression << (
			g.term + ZeroOrMore(g.addOperator + g.term) )
		g.nvArgs = Args(g.numericValueExpression)

		# geometry types and expressions
		g.userDefinedFunction = Forward()
		g.userDefinedFunction.setName("User defined function")
		g.geometryValueExpression = Forward()
		g.geometryValueExpression.setName("Geometry expression")
		# a quoted coordinate system name; the parse action returns the
		# upper-cased name without the quotes
		g.tapCoordLiteral = Regex("(?i)'(?P<sys>%s)'"%"|".join(stc.TAP_SYSTEMS)
			).addParseAction(lambda s,p,t: t["sys"].upper()
			)
		g.tapCoordLiteral.setName("coordinate system literal (ICRS, GALACTIC,...)")
		# a NULL coordinate system is turned into the string "UNKNOWN"
		g.coordSys = (g.tapCoordLiteral
			| g.nullLiteral.copy().addParseAction(lambda s,p,t: "UNKNOWN")
			).setResultsName("coordSys")
		g.coordSys.setName("coordinate system literal (ICRS, GALACTIC,...)")
		g.coordinates = (g.nvArgs + ',' + g.nvArgs)
		g.box = (CaselessKeyword("BOX")("fName")
			- '(' + Optional(g.coordSys + ',')
			+ g.coordinates + ','
			+ g.coordinates + ')')
		g.point = (CaselessKeyword("POINT")("fName")
			- '(' + Optional(g.coordSys - ',')
			+ g.coordinates + ')')

		g.coordValue = (
			g.userDefinedFunction
			| g.point
			| g.columnReference)

		g.circleSplitCooArgs = (g.coordinates + ',' - g.nvArgs)
		g.circlePointCooArgs = (
			Args(g.coordValue) + ',' - g.nvArgs)
		g.circle = (CaselessKeyword("CIRCLE")("fName")
			- '(' + Optional(g.coordSys + ',')
			+ (g.circleSplitCooArgs | g.circlePointCooArgs)
			+ ')')

		g.polygonSplitCooArgs = (g.coordinates + ',' + g.coordinates
			+ OneOrMore( ',' + g.coordinates ))
		g.polygonPointCooArgs = (Args(g.coordValue)
			- ',' - Args(g.coordValue)
			- OneOrMore(',' - Args(g.coordValue)))
		g.polygon = (CaselessKeyword("POLYGON")("fName")
			- '(' + Optional(g.coordSys + ',')
			+ Args(g.polygonSplitCooArgs | g.polygonPointCooArgs)
			+ ')')

		# python precedence: - binds tighter than ^, so the alternatives
		# are a single string expression or a numeric expression followed
		# by a comma and a geometry.
		g.mocArgs = (
			Args(g.stringValueExpression)
			^ Args( g.numericValueExpression) - ',' - Args(g.geometryValueExpression))
		g.moc = (CaselessKeyword("MOC")("fName")
			- '(' + g.mocArgs + ')')

		g.region = (CaselessKeyword("REGION")("fName")
			+ '(' + Args(g.stringValueExpression) + ')')
		g.geometryExpression = (g.box
			| g.point
			| g.circle
			| g.polygon
			| g.region
			| g.moc)
		g.geometryValue = g.columnReference.copy()
		g.centroid = (CaselessKeyword("CENTROID")("fName")
			+ '(' + Args(g.geometryValueExpression) + ')')
		g.geometryValueExpression << (g.geometryExpression
			| g.userDefinedFunction
			| g.geometryValue
			| g.centroid)
		g.geometryValueExpression.setName("geometry value expression")

		# geometry functions
		g.distanceSplitCooArgs = (g.coordinates + ',' + g.coordinates)
		g.distanceSplitCooArgs.setName("Numeric coordinates")
		g.distancePointCooArgs = (Args(g.coordValue) + ',' + Args(g.coordValue))
		g.distancePointCooArgs.setName("Geometry argument")
		g.distanceFunction = (CaselessKeyword("DISTANCE")("fName")
			- '(' + (g.distanceSplitCooArgs | g.distancePointCooArgs) + ')')
		g.pointFunction = (Regex("(?i)COORD[12]|COORDSYS")("fName")
			+ '(' + Args(g.coordValue) + ')')
		g.area = (CaselessKeyword("AREA")("fName")
			+ '(' + Args(g.geometryValueExpression) + ')')
		g.nonPredicateGeometryFunction = (
			g.distanceFunction
			| g.pointFunction
			| g.area )
		g.predicateGeoFunctionName = Regex("(?i)CONTAINS|INTERSECTS")
		g.predicateGeometryFunction = (g.predicateGeoFunctionName("fName")
			+ '(' + Args(g.geometryValueExpression)
			+ ',' + Args(g.geometryValueExpression) + ')')
		g.numericGeometryFunction = (g.predicateGeometryFunction
			| g.nonPredicateGeometryFunction)

		# numeric, system, user defined functions
		g.trig1ArgFunctionName = Regex("(?i)ACOS|ASIN|ATAN|COS|COT|SIN|TAN")
		# trig1ArgFunction is what causes a parse failure on common numeric
		# value expressions.  We take the liberty of misnaming it for better
		# error messages in most cases.
		g.trig1ArgFunctionName.setName("numeric expression")
		g.trigFunction = (
			g.trig1ArgFunctionName("fName") + '('
				+ g.nvArgs + ')'
			| CaselessKeyword("ATAN2")("fName") + '('
				+ g.nvArgs + ',' + g.nvArgs + ')')
		g.math0ArgFunctionName = Regex("(?i)PI")
		g.optIntFunctionName = Regex("(?i)RAND")
		# LOG10 comes before LOG so the longer name is tried first
		g.math1ArgFunctionName = Regex("(?i)ABS|CEILING|DEGREES|EXP|FLOOR|LOG10|"
			"LOG|RADIANS|SQUARE|SQRT|BITWISE_NOT")
		g.optPrecArgFunctionName = Regex("(?i)ROUND|TRUNCATE")
		g.math2ArgFunctionName = Regex("(?i)POWER|MOD|BITWISE_AND|BITWISE_OR"
			"|BITWISE_XOR")
		g.mathFunction = (
			g.math0ArgFunctionName("fName") + '(' + ')'
			| g.optIntFunctionName("fName") + '('
				+ Optional( Args(g.unsignedInteger) ) + ')'
			| g.math1ArgFunctionName("fName") + '('
				+ g.nvArgs + ')'
			| g.optPrecArgFunctionName("fName") + '('
				+ g.nvArgs + Optional( ',' + Args(g.signedInteger) ) + ')'
			| g.math2ArgFunctionName("fName") + '('
				+ g.nvArgs + ',' + g.nvArgs + ')')

		g.inUnitFunction = (
			CaselessKeyword("IN_UNIT")
			- '(' - g.numericValueExpression
			- ',' - g.characterStringLiteral - ')')

		g.scalarArray1FunctionName = Regex(
			"(?i)arr_(min|max|avg|stddev|sum|count)")
		g.scalarArray1Function = (
			g.scalarArray1FunctionName("fName")
			- '(' - g.nvArgs - ')')
		g.scalarArray2FunctionName = Regex("(?i)arr_dot")
		g.scalarArray2Function = (
			g.scalarArray2FunctionName("fName")
			- '(' - g.nvArgs - ',' - g.nvArgs - ')')
		g.scalarArrayFunction = (
			g.scalarArray2Function
			| g.scalarArray1Function)
		g.arrayMapFunction = (
			CaselessKeyword("arr_map")("fName")
			- '(' - Args(g.numericValueExpression)
			- ',' - g.nvArgs - ')')

		g.userDefinedFunctionParam = g.valueExpression
		g.userDefinedFunctionName = Regex("(?i)"+userFunctionPrefix+"_[A-Za-z_]+")
		g.userDefinedFunctionName.setName("Name of locally defined function")
		g.userDefinedFunction << (g.userDefinedFunctionName("fName")
			+ '('
			+ Optional(Args(g.userDefinedFunctionParam)
				+ ZeroOrMore("," + Args(g.userDefinedFunctionParam)))
			+ ')')
		g.numericValueFunction << (g.trigFunction
			| g.mathFunction
			| g.inUnitFunction
			| g.scalarArrayFunction
			| g.arrayMapFunction
			| g.userDefinedFunction
			| g.numericGeometryFunction)

		g.foldFunction = (
			(CaselessKeyword("UPPER") | CaselessKeyword("LOWER"))("fName")
			- '(' + Args(g.characterValueExpression) + ')' )
		g.stringValueFunction = g.foldFunction

		# generalLiteral is in here because it is not accepted in
		# unsignedLiteral (see the module docstring)
		g.characterPrimary << (
			g.stringValueFunction
			| g.generalLiteral
			| g.userDefinedFunction
			| g.valueExpressionPrimary)

		g.timestampFunction = (
			CaselessKeyword('TIMESTAMP')("fName")
			- '(' + Args(g.stringValueExpression) - ')')
		g.dateValueExpression = (g.timestampFunction)

		g.coalesceExpression = (
			CaselessKeyword('COALESCE')
			- '(' + Args(g.valueExpression)
			+ ZeroOrMore(',' + Args(g.valueExpression))
			- ')')

		g.searchCondition = Forward()
		g.searchCondition.setName("Condition")
		g.caseResult = g.valueExpression
		g.searchedWhenClause = (
			CaselessKeyword('WHEN') + g.searchCondition
			+ CaselessKeyword('THEN') + g.caseResult)
		g.simpleWhenClause = (
			CaselessKeyword('WHEN') + g.valueExpression
			+ CaselessKeyword('THEN') + g.caseResult)
		g.caseElseClause = CaselessKeyword('ELSE') + g.caseResult
		g.searchedCase = (
			CaselessKeyword('CASE')
			+ OneOrMore(g.searchedWhenClause)
			+ Optional(g.caseElseClause)
			- CaselessKeyword('END'))
		g.simpleCase = (
			CaselessKeyword('CASE')
			+ g.valueExpression
			+ OneOrMore(g.simpleWhenClause)
			+ Optional(g.caseElseClause)
			- CaselessKeyword('END'))
		g.caseExpression = (
			g.coalesceExpression
			| g.searchedCase
			| g.simpleCase)

		# toplevel value expression
		g.valueExpression << (
			LongestMatch([
				g.caseExpression,
				g.numericValueExpression,
				g.stringValueExpression,
				g.dateValueExpression,
				g.geometryValueExpression]))
		g.valueExpression.setName("value expression")
		g.derivedColumn = g.valueExpression("expr") + Optional(g.asClause)

		# parts of select clauses
		g.setQuantifier = (CaselessKeyword( "DISTINCT" )
			| CaselessKeyword( "ALL" ))("setQuantifier")
		g.setLimit = CaselessKeyword( "TOP" ) - g.unsignedInteger("setLimit")
		g.offsetSpec = CaselessKeyword( "OFFSET" )- g.unsignedInteger("offset")
		g.qualifiedStar = g.qualifier + "." + "*"
		g.selectSublist = (g.qualifiedStar
			| g.derivedColumn
			).setResultsName("fieldSel", listAllMatches=True)
		g.selectSublist.setName("select list item")
		g.selectList = (Literal("*")("starSel")
			| g.selectSublist + ZeroOrMore("," - g.selectSublist) )
		g.selectList.setName("select list")

		# boolean terms
		g.subquery = Forward()
		g.subquery.setName("Subquery")
		g.comparisonPredicate = g.valueExpression + g.compOp + g.valueExpression
		g.betweenPredicate = (g.valueExpression + Optional(g.notKeyword)
			+ CaselessKeyword("BETWEEN") - g.valueExpression
			+ CaselessKeyword("AND") - g.valueExpression)
		g.inValueList = g.valueExpression + ZeroOrMore(',' + g.valueExpression)
		g.inPredicateValue = g.subquery | ("(" + g.inValueList + ")")
		g.inPredicate = (g.valueExpression + Optional(g.notKeyword)
			+ CaselessKeyword("IN") + g.inPredicateValue)
		g.existsPredicate = CaselessKeyword("EXISTS") - g.subquery
		g.likePredicate = (
			g.characterValueExpression
			+ Optional(g.notKeyword)
			+ (CaselessKeyword("LIKE") | CaselessKeyword("ILIKE"))
			+ g.characterValueExpression)
		g.nullPredicate = (g.columnReference + CaselessKeyword("IS")
			+ Optional(g.notKeyword) - g.nullLiteral)
		g.predicate = (g.comparisonPredicate
			| g.betweenPredicate
			| g.inPredicate
			| g.likePredicate
			| g.nullPredicate
			| g.existsPredicate)
		g.booleanPrimaryOpener = Literal('(')  # for error messages
		g.booleanPrimaryOpener.setName("boolean expression")
		g.booleanPrimary = (
			g.booleanPrimaryOpener + g.searchCondition + ')'
			| g.predicate)
		g.booleanPrimary.setName("boolean expression")
		g.booleanFactor = Optional(g.notKeyword) + g.booleanPrimary
		g.booleanTerm = (g.booleanFactor
			+ ZeroOrMore(CaselessKeyword("AND") - g.booleanFactor))

		# WHERE clauses and such
		g.searchCondition << (g.booleanTerm
			+ ZeroOrMore(CaselessKeyword("OR") - g.booleanTerm) )
		g.searchCondition.setName("search condition")
		g.whereClause = (CaselessKeyword("WHERE")
			- g.searchCondition)("whereClause")

		# Referencing tables
		g.queryExpression = Forward()
		g.queryExpression.setName("Query expression")
		g.correlationSpecification = (
			Optional(CaselessKeyword("AS") | g.whitespace)
			+ g.correlationName("alias"))
		g.subqueryOpener = Literal('(')  # for error reporting
		g.subqueryOpener.setName("subquery")
		g.subquery << (g.subqueryOpener + g.queryExpression + ')')
		g.derivedTable = g.subquery.copy() + g.correlationSpecification
		g.tablesample = (CaselessKeyword("TABLESAMPLE")
			- Literal('(')
			- g.unsignedNumericLiteral
			- Literal(')'))
		g.possiblyAliasedTable = (
			g.tableName
			+ Optional(g.correlationSpecification)
			+ Optional(g.tablesample)("tablesample"))
		g.setGeneratingFunction = (
			CaselessKeyword("generate_series")
			- Literal('(')
			- g.nvArgs
			- ','
			- g.nvArgs
			- Literal(')')
			+ Optional(g.correlationSpecification))
		g.joinedTable = Forward()
		# subJoin and joinOperand break the left recursion of
		# table_reference (see the module docstring)
		g.subJoin = '(' + g.joinedTable + ')'
		g.joinOperand = (
			g.setGeneratingFunction
			| g.possiblyAliasedTable
			| g.derivedTable
			| g.subJoin).setName("Join Operand")
		g.tableReference = (
			g.joinedTable
			| g.possiblyAliasedTable
			| g.derivedTable)
		g.tableReference.setName("table reference")

		# JOINs
		g.columnNameList = (
			g.columnName + ZeroOrMore( "," + g.columnName) )
		g.columnNameList.setName("column name list")
		g.namedColumnsJoin = (
			CaselessKeyword("USING")
			+ '(' + g.columnNameList("columnNames") + ')')
		g.joinCondition = CaselessKeyword("ON") - g.searchCondition
		g.joinSpecification = g.joinCondition | g.namedColumnsJoin
		g.outerJoinType = (CaselessKeyword("LEFT")
			| CaselessKeyword("RIGHT")
			| CaselessKeyword("FULL"))
		g.joinType = (CaselessKeyword("INNER")
			| (g.outerJoinType + Optional(CaselessKeyword("OUTER")))
			| CaselessKeyword("CROSS"))  # local extension
		g.joinOperator = (Optional(CaselessKeyword("NATURAL"))
			+ Optional(g.joinType)
			+ CaselessKeyword("JOIN"))
		g.joinedTable << (g.joinOperand
			+ ZeroOrMore(g.joinOperator
				+ g.joinOperand
				+ Optional(g.joinSpecification))).setName("Joined Table")

		# Detritus in table expressions
		g.groupByClause = (CaselessKeyword("GROUP") + CaselessKeyword("BY")
			+ g.valueExpression
			+ ZeroOrMore(',' + g.valueExpression))("groupby")
		g.havingClause = (CaselessKeyword("HAVING")
			+ g.searchCondition)("having")
		g.orderingSpecification = (Regex("(?i)ASC|DESC"))
		g.sortKey = g.valueExpression
		g.sortSpecification = g.sortKey + Optional(g.orderingSpecification)
		g.orderByClause = (CaselessKeyword("ORDER")
			+ CaselessKeyword("BY") + g.sortSpecification
			+ ZeroOrMore( ',' + g.sortSpecification ))("orderBy")

		# FROM fragments and such
		g.fromClause = (Suppress(CaselessKeyword("FROM").setName("FROM"))
			+ g.tableReference
			+ ZeroOrMore(Suppress(',') - g.tableReference))("fromClause")
		g.tableExpression = (g.fromClause
			+ Optional(g.whereClause)
			+ Optional(g.groupByClause)
			+ Optional(g.havingClause)
			+ Optional(g.orderByClause))

		# Common Table Expressions
		g.selectExpression = Forward().setName("Select expression")
		g.querySetExpression = Forward()
		g.querySetTerm = Forward()
		g.withQuery = (g.regularIdentifier("queryName")
			+ CaselessKeyword("AS")
			+ '(' + g.selectExpression + ')')
		g.withSpecification = (CaselessLiteral("WITH")
			+ g.withQuery
			+ ZeroOrMore(',' + g.withQuery) )

		# toplevel select clause and set operators
		g.selectQuery = (CaselessKeyword("SELECT").setName("SELECT")
			+ Optional(g.setQuantifier)
			+ Optional(g.setLimit)
			+ g.selectList
			+ g.tableExpression)
		g.optionalAll = Optional(CaselessKeyword("ALL"))
		g.intersectOperator = (
			CaselessKeyword("INTERSECT")
			+ g.optionalAll)
		g.additiveSetOperator = ((
				CaselessKeyword("UNION")
				| CaselessKeyword("EXCEPT") )
			+ g.optionalAll)
		# I had to rewrite a few of the following ones to avoid pyparse
		# endless recursions.
		g.querySetPrimary = MatchFirstWithSaneError([
			g.selectQuery,
			'(' + g.selectExpression + ')'])
		g.querySetTerm << (
			g.querySetPrimary
			+ ZeroOrMore(g.intersectOperator + g.querySetExpression))
		g.querySetExpression << (
			g.querySetTerm
			+ ZeroOrMore(g.additiveSetOperator + g.querySetTerm))
		g.selectExpression << (
			g.querySetExpression
			+ Optional(g.offsetSpec))
		g.queryExpression << (g.selectExpression | g.joinedTable).setName(
			"Query Expression")

		g.querySpecification = Optional(g.withSpecification) + g.selectExpression
		# StringEnd cannot be part of querySpecification proper (see the
		# module docstring), hence this separate toplevel production.
		g.statement = g.querySpecification + Optional(White()) + StringEnd()
		g.statement.ignore(g.sqlComment)
		return g.getSymbols_(), g.statement
# NOTE(review): nothing in the code visible here reads or writes this;
# getADQLGrammar is memoized through functools.lru_cache instead.
# Presumably kept for backwards compatibility -- confirm before removing.
_grammarCache = None
def enableDebug(syms, debugNames=None): # pragma: no cover
	"""switches on pyparsing debugging for the symbols named in debugNames.

	syms maps symbol names to parser elements.  If debugNames is empty or
	not given, all of syms is used.  Symbols already having their debug
	flag set are left alone; the others get setDebug(True) and are named
	after their key.

	Debugging for sqlComment, if present, is switched off in any case.
	"""
	for symName in (debugNames or syms):
		symbol = syms[symName]
		if symbol.debug:
			continue
		symbol.setDebug(True)
		symbol.setName(symName)

	if "sqlComment" in syms:
		syms["sqlComment"].setDebug(False)
def enableTree(syms): # pragma: no cover
	"""makes all symbols in syms label their matches with their names.

	Each symbol not yet having its debug flag set gets debugging
	switched on, is named after its key in syms, and receives a parse
	action returning [name, tokens].
	"""
	def makeLabeller(label):
		# a factory, so each action closes over its own label
		def labelTokens(s, pos, toks):
			return [label, toks]
		return labelTokens

	for symName in syms:
		symbol = syms[symName]
		if symbol.debug:
			continue
		symbol.setDebug(True)
		symbol.setName(symName)
		symbol.addParseAction(makeLabeller(symName))
@functools.lru_cache(maxsize=1)
def getADQLGrammar():
	"""returns a pair of (symbols, root) for an ADQL grammar.

	The pair is built once by getADQLGrammarCopy (with an empty node
	list) and then cached, so all callers share the same objects.

	This probably is mainly useful for testing.  At least you should not
	set names or parseActions on whatever you are returned unless you
	are testing.
	"""
	symbolsAndRoot = getADQLGrammarCopy([])
	return symbolsAndRoot
# Ad-hoc debugging aid: parse a sample query with tree labelling enabled
# and pretty-print the resulting token tree to stderr.
if __name__=="__main__": # pragma: no cover
	# a parse action for manual debugging; not attached to anything here
	def printCs(s, pos, toks):
		print("---------------Tokens:", toks)

	import pprint
	syms, grammar = getADQLGrammar()
	# NOTE: this mutates the cached grammar returned by getADQLGrammar
	enableTree(syms)
	res = syms["statement"].parseString(
		""" select * from (select * from z) as q """
		, parseAll=True)
	pprint.pprint(res.asList(), stream=sys.stderr)