/* * Copyright (c) 2015-2016 Tada AB and other contributors, as listed below. * * All rights reserved. This program and the accompanying materials * are made available under the terms of the The BSD 3-Clause License * which accompanies this distribution, and is available at * http://opensource.org/licenses/BSD-3-Clause * * Contributors: * Chapman Flack */ package org.postgresql.pljava.sqlgen; import java.util.regex.Matcher; import java.util.regex.Pattern; import javax.annotation.processing.Messager; import javax.tools.Diagnostic.Kind; /** * A few useful SQL lexical definitions supplied as {@link Pattern} objects. * * The idea is not to go overboard and reimplement an SQL lexer, but to * capture in one place the rules for those bits of SQL snippets that are * likely to be human-supplied in annotations and need to be checked for * correctness when emitted into deployment descriptors. Identifiers, for a * start. * * Supplied in the API module so they are available to {@code javac} to * compile and generate DDR when the rest of PL/Java is not necessarily * present. Of course backend code such as {@code SQLDeploymentDescriptor} * can also refer to these. */ public abstract class Lexicals { /** Allowed as the first character of a regular identifier by ISO. */ public static final Pattern ISO_REGULAR_IDENTIFIER_START = Pattern.compile( "[\\p{Lu}\\p{Ll}\\p{Lt}\\p{Lm}\\p{Lo}\\p{Nl}]" ); /** Allowed as any non-first character of a regular identifier by ISO. */ public static final Pattern ISO_REGULAR_IDENTIFIER_PART = Pattern.compile(String.format( "[\\xb7\\p{Mn}\\p{Mc}\\p{Nd}\\p{Pc}\\p{Cf}%1$s]", ISO_REGULAR_IDENTIFIER_START.pattern() )); /** A complete regular identifier as allowed by ISO. */ public static final Pattern ISO_REGULAR_IDENTIFIER = Pattern.compile(String.format( "%1$s%2$s{0,127}+", ISO_REGULAR_IDENTIFIER_START.pattern(), ISO_REGULAR_IDENTIFIER_PART.pattern() )); /** A complete ISO regular identifier in a single capturing group. */ public static final Pattern ISO_REGULAR_IDENTIFIER_CAPTURING = Pattern.compile(String.format( "(%1$s)", ISO_REGULAR_IDENTIFIER.pattern() )); /** A complete delimited identifier as allowed by ISO. As it happens, this * is also the form PostgreSQL uses for elements of a LIST_QUOTE-typed GUC. */ public static final Pattern ISO_DELIMITED_IDENTIFIER = Pattern.compile( "\"(?:[^\"]|\"\"){1,128}+\"" ); /** An ISO delimited identifier with a single capturing group that captures * the content (which still needs to have "" replaced with " throughout). * The capturing group is named {@code xd}. */ public static final Pattern ISO_DELIMITED_IDENTIFIER_CAPTURING = Pattern.compile(String.format( "\"(?<xd>(?:[^\"]|\"\"){1,128}+)\"" )); /** The escape-specifier part of a Unicode delimited identifier or string. * The escape character itself is in the capturing group named {@code uec}. * The group can be absent, in which case \ should be used as the uec. */ public static final Pattern ISO_UNICODE_ESCAPE_SPECIFIER = Pattern.compile( "(?:\\p{IsWhite_Space}*+[Uu][Ee][Ss][Cc][Aa][Pp][Ee]"+ "\\p{IsWhite_Space}*+'(?<uec>[^0-9A-Fa-f+'\"\\p{IsWhite_Space}])')?+" ); /** A Unicode delimited identifier. The body is in capturing group * {@code xui} and the escape character in group {@code uec}. The body * still needs to have "" replaced with ", and {@code Unicode escape value}s * decoded and replaced, and then it has to be verified to be no longer * than 128 codepoints. */ public static final Pattern ISO_UNICODE_IDENTIFIER = Pattern.compile(String.format( "[Uu]&\"(?<xui>(?:[^\"]|\"\")++)\"%1$s", ISO_UNICODE_ESCAPE_SPECIFIER.pattern() )); /** A compilable pattern to match a {@code Unicode escape value}. * A match should have one of three named capturing groups. If {@code cev}, * substitute the {@code uec} itself. If {@code u4d} or {@code u6d}, * substitute the codepoint represented by the hex digits. A match with none * of those capturing groups indicates an ill-formed string. *<p> * Maka a Pattern from this by supplying the right {@code uec}, so: * {@code Pattern.compile(String.format(ISO_UNICODE_REPLACER, * Pattern.quote(uec)));} */ public static final String ISO_UNICODE_REPLACER = "%1$s(?:(?<cev>%1$s)|(?<u4d>[0-9A-Fa-f]{4})|\\+(?<u6d>[0-9A-Fa-f]{6}))"; /** Allowed as the first character of a regular identifier by PostgreSQL * (PG 7.4 -). */ public static final Pattern PG_REGULAR_IDENTIFIER_START = Pattern.compile( "[A-Za-z\\P{ASCII}_]" // hasn't seen a change since PG 7.4 ); /** Allowed as any non-first character of a regular identifier by PostgreSQL * (PG 7.4 -). */ public static final Pattern PG_REGULAR_IDENTIFIER_PART = Pattern.compile(String.format( "[0-9$%1$s]", PG_REGULAR_IDENTIFIER_START.pattern() )); /** A complete regular identifier as allowed by PostgreSQL (PG 7.4 -). */ public static final Pattern PG_REGULAR_IDENTIFIER = Pattern.compile(String.format( "%1$s%2$s*+", PG_REGULAR_IDENTIFIER_START.pattern(), PG_REGULAR_IDENTIFIER_PART.pattern() )); /** A complete PostgreSQL regular identifier in a single capturing group. */ public static final Pattern PG_REGULAR_IDENTIFIER_CAPTURING = Pattern.compile(String.format( "(%1$s)", PG_REGULAR_IDENTIFIER.pattern() )); /** A regular identifier that satisfies both ISO and PostgreSQL rules. */ public static final Pattern ISO_AND_PG_REGULAR_IDENTIFIER = Pattern.compile(String.format( "(?:(?=%1$s)%2$s)(?:(?=%3$s)%4$s)*+", ISO_REGULAR_IDENTIFIER_START.pattern(), PG_REGULAR_IDENTIFIER_START.pattern(), ISO_REGULAR_IDENTIFIER_PART.pattern(), PG_REGULAR_IDENTIFIER_PART.pattern() )); /** A regular identifier that satisfies both ISO and PostgreSQL rules, * in a single capturing group named {@code i}. */ public static final Pattern ISO_AND_PG_REGULAR_IDENTIFIER_CAPTURING = Pattern.compile( String.format( "(?<i>%1$s)", ISO_AND_PG_REGULAR_IDENTIFIER.pattern()) ); /** Pattern that matches any identifier valid by both ISO and PG rules, * with the presence of named capturing groups indicating which kind it is: * {@code i} for a regular identifier, {@code xd} for a delimited identifier * (still needing "" replaced with "), or {@code xui} (with or without an * explicit {@code uec} for a Unicode identifier (still needing "" to " and * decoding of {@code Unicode escape value}s). */ public static final Pattern ISO_AND_PG_IDENTIFIER_CAPTURING = Pattern.compile(String.format( "%1$s|(?:%2$s)|(?:%3$s)", ISO_AND_PG_REGULAR_IDENTIFIER_CAPTURING.pattern(), ISO_DELIMITED_IDENTIFIER_CAPTURING.pattern(), ISO_UNICODE_IDENTIFIER.pattern() )); /** An identifier by ISO SQL, PostgreSQL, <em>and</em> Java (not SQL at all) * rules. (Not called {@code REGULAR} because Java allows no other form of * identifier.) This restrictive form is the safest for identifiers being * generated into a deployment descriptor file that an old version of * PL/Java might load, because through 1.4.3 PL/Java used the Java * identifier rules to recognize identifiers in deployment descriptors. */ public static final Pattern ISO_PG_JAVA_IDENTIFIER = Pattern.compile(String.format( "(?:(?=%1$s)(?=\\p{%5$sStart})%2$s)(?:(?=%3$s)(?=\\p{%5$sPart})%4$s)*+", ISO_REGULAR_IDENTIFIER_START.pattern(), PG_REGULAR_IDENTIFIER_START.pattern(), ISO_REGULAR_IDENTIFIER_PART.pattern(), PG_REGULAR_IDENTIFIER_PART.pattern(), "javaJavaIdentifier" )); /** * Return an Identifier, given a {@code Matcher} that has matched an * ISO_AND_PG_IDENTIFIER_CAPTURING. Will determine from the matching named * groups which type of identifier it was, process the matched sequence * appropriately, and return it. * @param m A {@code Matcher} known to have matched an identifier. * @return Identifier made from the recovered string. */ public static Identifier identifierFrom(Matcher m) { String s = m.group("i"); if ( null != s ) return Identifier.from(s, false); s = m.group("xd"); if ( null != s ) return Identifier.from(s.replace("\"\"", "\""), true); s = m.group("xui"); if ( null == s ) return null; // XXX? s = s.replace("\"\"", "\""); String uec = m.group("uec"); if ( null == uec ) uec = "\\"; int uecp = uec.codePointAt(0); Matcher replacer = Pattern.compile( String.format(ISO_UNICODE_REPLACER, Pattern.quote(uec))) .matcher(s); StringBuffer sb = new StringBuffer(); while ( replacer.find() ) { replacer.appendReplacement(sb, ""); int cp; String uev = replacer.group("u4d"); if ( null == uev ) uev = replacer.group("u6d"); if ( null != uev ) cp = Integer.parseInt(uev, 16); else cp = uecp; // XXX check validity sb.appendCodePoint(cp); } return Identifier.from(replacer.appendTail(sb).toString(), true); } /** * Class representing a SQL identifier. These have wild and wooly behavior * depending on whether they were represented in the source in quoted form * or not. Quoted ones are case-sensitive, * and {@link #equals(Object) equals} will only recognize exact matches. * Non-quoted ones match case-insensitively; just to make this interesting, * ISO SQL has one set of case-folding rules, while PostgreSQL has another. * Also, a non-quoted identifier can match a quoted one, if the quoted one's * exact spelling matches the non-quoted one's case-folded form. *<p> * For even more fun, the PostgreSQL rules depend on the server encoding. * For any multibyte encoding, <em>only</em> the 26 ASCII uppercase letters * are folded to lower, leaving all other characters alone. In single-byte * encodings, more letters can be touched. But this code has to run in a * javac annotation processor without knowledge of any particular database's * server encoding. The recommended encoding, UTF-8, is multibyte, so the * PostgreSQL rule will be taken to be: only the 26 ASCII letters, always. */ public static class Identifier { protected final String m_nonFolded; /** * Whether this Identifier case-folds. * @return true if this Identifier was non-quoted in the source, * false if it was quoted. */ public boolean folds() { return false; } /** * This Identifier's original spelling. * @return The spelling as seen in the source, with no case folding. */ public String nonFolded() { return m_nonFolded; } /** * This Identifier as PostgreSQL would case-fold it (or the same as * nonFolded if this was quoted and does not fold). * @return The spelling with ASCII letters (only) folded to lowercase, * if this Identifier folds. */ public String pgFolded() { return m_nonFolded; } /** * This Identifier as ISO SQL would case-fold it (or the same as * nonFolded if this was quoted and does not fold). * @return The spelling with lowercase and titlecase letters folded to * (possibly length-changing) uppercase equivalents, * if this Identifier folds. */ public String isoFolded() { return m_nonFolded; } /** * Create an Identifier given its original, non-folded spelling, * and whether it represents a quoted identifier. * @param s The exact, internal, non-folded spelling of the identifier * (unwrapped from any quoting in its external form). * @param quoted Pass {@code true} if this was parsed from any quoted * external form, false if non-quoted. * @return A corresponding Identifier * @throws IllegalArgumentException if {@code quoted} is {@code false} * but {@code s} cannot be a non-quoted identifier, or {@code s} is * empty or longer than the ISO SQL maximum 128 codepoints. */ public static Identifier from(String s, boolean quoted) { boolean foldable = ISO_AND_PG_REGULAR_IDENTIFIER.matcher(s).matches(); if ( ! quoted ) { if ( ! foldable ) throw new IllegalArgumentException(String.format( "impossible for \"%1$s\" to be a non-quoted identifier", s)); return new Folding(s); } if ( foldable ) return new Foldable(s); return new Identifier(s); } @Override public String toString() { return m_nonFolded; } /** * For a quoted identifier that could not match any non-quoted one, * the hash code of its non-folded spelling is good enough. In other * cases, the code must be derived more carefully. */ @Override public int hashCode() { return m_nonFolded.hashCode(); } @Override public boolean equals(Object other) { return equals(other, null); } /** * For use in an annotation processor, a version of {@code equals} that * can take a {@link Messager} and use it to emit warnings. It will * emit a warning whenever it compares two Identifiers that are equal * by one or the other of PostgreSQL's or ISO SQL's rules but not both. * @param other Object to compare to * @param msgr a Messager to use for warnings; if {@code null}, no * warnings will be generated. * @return true if two quoted Identifiers match exactly, or two * non-quoted ones match in either the PostgreSQL or ISO SQL folded * form, or a quoted one exactly matches either folded form of a * non-quoted one. */ public boolean equals(Object other, Messager msgr) { if ( ! (other instanceof Identifier) ) return false; Identifier oi = (Identifier)other; if ( oi.folds() ) return oi.equals(this); return m_nonFolded.equals(oi.nonFolded()); } protected Identifier(String nonFolded) { m_nonFolded = nonFolded; int cpc = nonFolded.codePointCount(0, nonFolded.length()); if ( 0 == cpc || cpc > 128 ) throw new IllegalArgumentException(String.format( "identifier empty or longer than 128 codepoints: \"%s\"", nonFolded)); } /** * Class representing an Identifier that was quoted, therefore does * not case-fold, but satisfies {@code ISO_AND_PG_REGULAR_IDENTIFIER} * and so could conceivably be matched by a non-quoted identifier. */ static class Foldable extends Identifier { private final int m_hashCode; protected Foldable(String nonFolded) { this(nonFolded, isoFold(nonFolded)); } protected Foldable(String nonFolded, String isoFolded) { super(nonFolded); m_hashCode = isoFolded.hashCode(); } /** * For any identifier that case-folds, or even could be matched by * another identifier that case-folds, the hash code is tricky. * Hash codes are required to be equal for any instances that are * equal (but not required to be different for instances that are * unequal). In this case, the hash codes need to be equal whenever * the PostgreSQL <em>or</em> ISO SQL folded forms match. *<p> * This hash code will be derived from the ISO-folded spelling of * the identifier. As long as the PostgreSQL rules only affect the * 26 ASCII letters, all of which are also folded (albeit in the * other direction) by the ISO rules, hash codes will also match for * identifiers equal under PostgreSQL rules. */ @Override public int hashCode() { return m_hashCode; } /** * The characters that ISO SQL rules will fold: anything that is * lowercase or titlecase. */ private static final Pattern s_isoFolded = Pattern.compile("[\\p{javaLowerCase}\\p{javaTitleCase}]"); /** * Case-fold a string by the ISO SQL rules, where any lowercase or * titlecase character gets replaced by its uppercase form (the * generalized, possibly length-changing one, requiring * {@link String#toUpperCase} and not * {@link Character#toUpperCase}. * @param s The non-folded value. * @return The folded value. */ protected static String isoFold(String s) { Matcher m = s_isoFolded.matcher(s); StringBuffer sb = new StringBuffer(); while ( m.find() ) m.appendReplacement(sb, m.group().toUpperCase()); return m.appendTail(sb).toString(); } } /** * Class representing an Identifier that was not quoted, and therefore * has case-folded forms. */ static class Folding extends Foldable { private final String m_pgFolded; private final String m_isoFolded; protected Folding(String nonFolded) { this(nonFolded, isoFold(nonFolded)); } protected Folding(String nonFolded, String isoFolded) { super(nonFolded, isoFolded); m_pgFolded = pgFold(nonFolded); m_isoFolded = isoFolded; } @Override public String pgFolded() { return m_pgFolded; } @Override public String isoFolded() { return m_isoFolded; } @Override public boolean folds() { return true; } @Override public boolean equals(Object other, Messager msgr) { if ( ! (other instanceof Identifier) ) return false; Identifier oi = (Identifier)other; boolean eqPG = m_pgFolded.equals(oi.pgFolded()); boolean eqISO = m_isoFolded.equals(oi.isoFolded()); if ( eqPG != eqISO && oi.folds() && null != msgr ) { msgr.printMessage(Kind.WARNING, String.format( "identifiers \"%1$s\" and \"%2$s\" are equal by ISO " + "or PostgreSQL case-insensitivity rules but not both", m_nonFolded, oi.nonFolded())); } return eqPG || eqISO; } /** * The characters that PostgreSQL rules will fold: only the 26 * uppercase ASCII letters. */ private static final Pattern s_pgFolded = Pattern.compile("[A-Z]"); /** * Case-fold a string by the PostgreSQL rules (assuming a * multibyte server encoding, where only the 26 uppercase ASCII * letters fold to lowercase). * @param s The non-folded value. * @return The folded value. */ private String pgFold(String s) { Matcher m = s_pgFolded.matcher(s); StringBuffer sb = new StringBuffer(); while ( m.find() ) m.appendReplacement(sb, m.group().toLowerCase()); return m.appendTail(sb).toString(); } } } }