package net.enilink.komma.parser;

import java.util.ArrayList;
import java.util.LinkedList;
import java.util.List;

import net.enilink.komma.parser.sparql.tree.BNode;
import net.enilink.komma.parser.sparql.tree.BooleanLiteral;
import net.enilink.komma.parser.sparql.tree.DoubleLiteral;
import net.enilink.komma.parser.sparql.tree.GenericLiteral;
import net.enilink.komma.parser.sparql.tree.GraphNode;
import net.enilink.komma.parser.sparql.tree.IntegerLiteral;
import net.enilink.komma.parser.sparql.tree.IriRef;
import net.enilink.komma.parser.sparql.tree.QName;

import org.parboiled.BaseParser;
import org.parboiled.Rule;
import org.parboiled.annotations.SuppressNode;

/**
 * Base class for parboiled parsers of RDF-related syntaxes. It provides the
 * shared low-level rules (literals, IRIs, prefixed names, blank nodes,
 * whitespace and comments) together with helpers for collecting lists from
 * the parser's value stack.
 * <p>
 * Important: bare <code>char</code> and <code>String</code> literals used as
 * rule arguments are converted through {@link #fromCharLiteral(char)} and
 * {@link #fromStringLiteral(String)}, which append {@link #WS()}. Where
 * trailing whitespace must <em>not</em> be consumed, the rules below use
 * <code>ch(...)</code> or <code>string(...)</code> explicitly.
 */
public abstract class BaseRdfParser extends BaseParser<Object> {
    /**
     * Marker object that delimits the start of a list on the value stack; it
     * is compared by identity in {@link #popList(int, Class, int)}.
     * NOTE(review): this field is public and non-final; it is never
     * reassigned in this class and should be treated as a constant.
     */
    public static Object LIST_BEGIN = new Object();

    /**
     * Creates a plain literal (no datatype) with an optional language tag.
     *
     * @param label
     *            the lexical form; it is trimmed
     * @param language
     *            the language tag or <code>null</code>; trimmed if present
     * @return the new literal
     */
    public GenericLiteral createLiteral(String label, String language) {
        return new GenericLiteral(label.trim(), null, language != null ? language.trim() : null);
    }

    /**
     * Creates a typed literal without a language tag.
     *
     * @param label
     *            the lexical form; it is trimmed
     * @param datatype
     *            the node denoting the datatype
     * @return the new literal
     */
    public GenericLiteral createTypedLiteral(String label, GraphNode datatype) {
        return new GenericLiteral(label.trim(), datatype, null);
    }

    /**
     * Matches a string literal optionally followed by a language tag or a
     * <code>^^</code> datatype and pushes the resulting
     * {@link GenericLiteral}.
     */
    public Rule RdfLiteral() {
        return sequence(
                StringLiteral(),
                // placeholder for "neither language nor datatype"
                push(null),
                // if a tag or datatype was pushed, remove the placeholder
                // that now sits directly below it
                optional(firstOf(LANGTAG(), sequence(string("^^"), IriRef())), drop(1)),
                // stack is now [label, (null | language | datatype)]
                push(peek() instanceof GraphNode
                        ? createTypedLiteral((String) pop(1), (GraphNode) pop())
                        : createLiteral((String) pop(1), (String) pop())),
                WS());
    }

    /** Matches an unsigned, positive or negative numeric literal. */
    public Rule NumericLiteral() {
        return firstOf(NumericLiteralUnsigned(), NumericLiteralPositive(), NumericLiteralNegative());
    }

    /** Matches a numeric literal without sign; the most specific form is tried first. */
    public Rule NumericLiteralUnsigned() {
        return firstOf(DOUBLE(), DECIMAL(), INTEGER());
    }

    /** Matches a numeric literal with a leading <code>+</code>. */
    public Rule NumericLiteralPositive() {
        return firstOf(DOUBLE_POSITIVE(), DECIMAL_POSITIVE(), INTEGER_POSITIVE());
    }

    /** Matches a numeric literal with a leading <code>-</code>. */
    public Rule NumericLiteralNegative() {
        return firstOf(DOUBLE_NEGATIVE(), DECIMAL_NEGATIVE(), INTEGER_NEGATIVE());
    }

    /**
     * Matches <code>TRUE</code> or <code>FALSE</code> and pushes the
     * corresponding {@link BooleanLiteral}.
     */
    public Rule BooleanLiteral() {
        return sequence(firstOf("TRUE", "FALSE"),
                // The string literals above also consume trailing whitespace
                // (see fromStringLiteral), so the match has to be trimmed
                // before it is compared - otherwise "TRUE " would be
                // interpreted as false.
                push(new BooleanLiteral("true".equalsIgnoreCase(match().trim()))));
    }

    /**
     * Matches any of the four quoted string forms and leaves the unescaped
     * content as a {@link String} on the value stack. The long (triple
     * quoted) forms are tried before their short counterparts.
     */
    public Rule StringLiteral() {
        return firstOf(STRING_LITERAL_LONG1(), STRING_LITERAL1(), STRING_LITERAL_LONG2(), STRING_LITERAL2());
    }

    /** Matches either an IRI in angle brackets or a prefixed name. */
    public Rule IriRef() {
        return firstOf(IRI_REF(), PrefixedName());
    }

    /**
     * Trims the given prefix and removes every colon from it.
     *
     * @param prefix
     *            the prefix text, must not be <code>null</code>
     * @return the prefix without surrounding whitespace and colons
     */
    public String stripColon(String prefix) {
        return prefix.trim().replaceAll(":", "");
    }

    /**
     * Null-safe variant of {@link String#trim()}.
     *
     * @param string
     *            the text or <code>null</code>
     * @return the trimmed text or <code>null</code>
     */
    public String trim(String string) {
        return string != null ? string.trim() : null;
    }

    /**
     * Matches a prefixed name with a local part or a bare namespace prefix
     * (in which case the local part is the empty string) and pushes a
     * {@link QName}.
     */
    public Rule PrefixedName() {
        return firstOf(PNAME_LN(),
                sequence(PNAME_NS(), WS(),
                        push(new QName((String) pop(), ""))));
    }

    /**
     * Matches a labelled blank node or <code>[]</code> and pushes a
     * {@link BNode}; the anonymous form results in a <code>null</code>
     * label.
     */
    public Rule BlankNode() {
        return sequence(
                firstOf(BLANK_NODE_LABEL(), sequence('[', ']', push(null))),
                push(new BNode((String) pop())));
    }

    /** Matches any amount of whitespace and comments (possibly none). */
    @SuppressNode
    public Rule WS() {
        return zeroOrMore(firstOf(COMMENT(), WS_NO_COMMENT()));
    }

    /** Matches a single whitespace character (space, tab, form feed or line end). */
    public Rule WS_NO_COMMENT() {
        return firstOf(ch(' '), ch('\t'), ch('\f'), EOL());
    }

    /**
     * Matches an optional prefix followed by a colon and pushes the prefix
     * text (empty if there is no prefix). No trailing whitespace is
     * consumed.
     */
    public Rule PNAME_NS() {
        return sequence(optional(PN_PREFIX()), push(match()), ch(':'));
    }

    /**
     * Matches a namespace prefix directly followed by a local name and
     * pushes the corresponding {@link QName}.
     */
    public Rule PNAME_LN() {
        return sequence(PNAME_NS(), PN_LOCAL(),
                // stack: [prefix, local]
                push(new QName((String) pop(1), ((String) pop()).trim())));
    }

    /**
     * Matches an IRI enclosed in angle brackets and pushes an
     * {@link IriRef} with the trimmed content.
     */
    public Rule IRI_REF() {
        return sequence(
                LESS_NO_COMMENT(),
                zeroOrMore(testNot(firstOf(IRI_REF_CHARS_WO_SPACE(), ch(' '))), ANY),
                push(new IriRef(match().trim())),
                '>');
    }

    /**
     * Rule that allows spaces in IRIs which are normally disallowed.
     */
    public Rule IRI_REF_WSPACE() {
        return sequence(
                LESS_NO_COMMENT(),
                zeroOrMore(testNot(IRI_REF_CHARS_WO_SPACE()), ANY),
                push(new IriRef(match().trim())),
                '>');
    }

    /**
     * Matches a single character that is <em>not</em> allowed inside an
     * IRI, with the exception of the space character which is handled by
     * the callers.
     */
    public Rule IRI_REF_CHARS_WO_SPACE() {
        // The control characters up to U+001F are excluded; together with
        // the space (U+0020) tested separately in IRI_REF this covers the
        // range #x00-#x20 that the SPARQL IRIREF production forbids.
        return firstOf(LESS_NO_COMMENT(), ch('>'), ch('"'), ch('{'), ch('}'), ch('|'), ch('^'), ch('\\'),
                ch('`'), charRange('\u0000', '\u001F'));
    }

    /**
     * Matches a blank node label of the form <code>_:name</code>; the name
     * is pushed by {@link #PN_LOCAL()}.
     */
    public Rule BLANK_NODE_LABEL() {
        return sequence(string("_:"), PN_LOCAL(), WS());
    }

    /**
     * Matches a language tag such as <code>@en</code> or
     * <code>@en-US</code> and pushes the tag text without the leading
     * <code>@</code>.
     */
    public Rule LANGTAG() {
        return sequence(
                ch('@'),
                sequence(oneOrMore(PN_CHARS_BASE()),
                        // Sub tags consist of letters OR digits. The
                        // multi-argument form of oneOrMore builds a
                        // sequence (letter followed by digit), hence the
                        // explicit firstOf.
                        zeroOrMore('-', oneOrMore(firstOf(PN_CHARS_BASE(), DIGIT())))),
                push(match()));
    }

    /**
     * Matches an unsigned integer and pushes an {@link IntegerLiteral}.
     * The value is parsed with {@link Integer#parseInt(String)}, i.e. it
     * has to fit into an <code>int</code>.
     */
    public Rule INTEGER() {
        return sequence(oneOrMore(DIGIT()),
                push(new IntegerLiteral(Integer.parseInt(match().trim()))),
                WS());
    }

    /** Matches an unsigned decimal number and pushes a {@link DoubleLiteral}. */
    public Rule DECIMAL() {
        return sequence(
                firstOf(sequence(oneOrMore(DIGIT()), '.', zeroOrMore(DIGIT())),
                        sequence('.', oneOrMore(DIGIT()))),
                push(new DoubleLiteral(Double.parseDouble(match().trim()))),
                WS());
    }

    /**
     * Matches an unsigned floating point number with exponent and pushes a
     * {@link DoubleLiteral}.
     */
    public Rule DOUBLE() {
        return sequence(
                firstOf(sequence(oneOrMore(DIGIT()), '.', zeroOrMore(DIGIT()), EXPONENT()),
                        sequence('.', oneOrMore(DIGIT()), EXPONENT()),
                        sequence(oneOrMore(DIGIT()), EXPONENT())),
                push(new DoubleLiteral(Double.parseDouble(match().trim()))),
                WS());
    }

    /** Matches <code>+</code> followed by an {@link #INTEGER()}. */
    public Rule INTEGER_POSITIVE() {
        return sequence('+', INTEGER());
    }

    /** Matches <code>+</code> followed by a {@link #DECIMAL()}. */
    public Rule DECIMAL_POSITIVE() {
        return sequence('+', DECIMAL());
    }

    /** Matches <code>+</code> followed by a {@link #DOUBLE()}. */
    public Rule DOUBLE_POSITIVE() {
        return sequence('+', DOUBLE());
    }

    /**
     * Matches <code>-</code> followed by an {@link #INTEGER()} and replaces
     * the pushed literal by its negation.
     */
    public Rule INTEGER_NEGATIVE() {
        return sequence('-', INTEGER(),
                push(new IntegerLiteral(-((IntegerLiteral) pop()).getValue())));
    }

    /**
     * Matches <code>-</code> followed by a {@link #DECIMAL()} and replaces
     * the pushed literal by its negation.
     */
    public Rule DECIMAL_NEGATIVE() {
        return sequence('-', DECIMAL(),
                push(new DoubleLiteral(-((DoubleLiteral) pop()).getValue())));
    }

    /**
     * Matches <code>-</code> followed by a {@link #DOUBLE()} and replaces
     * the pushed literal by its negation.
     */
    public Rule DOUBLE_NEGATIVE() {
        return sequence('-', DOUBLE(),
                push(new DoubleLiteral(-((DoubleLiteral) pop()).getValue())));
    }

    /** Matches an exponent part such as <code>e10</code> or <code>E-3</code>. */
    public Rule EXPONENT() {
        return sequence(ignoreCase('e'), optional(firstOf('+', '-')), oneOrMore(DIGIT()));
    }

    /**
     * Matches a single-quoted string on one line and pushes its unescaped
     * content. A {@link StringBuilder} on the value stack collects the
     * characters while matching and is replaced by the final string.
     */
    public Rule STRING_LITERAL1() {
        return sequence(
                ch('\''),
                push(new StringBuilder()),
                zeroOrMore(firstOf(
                        sequence(testNot(firstOf('\'', '\\', '\n', '\r')), ANY, appendToSb(matchedChar())),
                        ECHAR(), UCHAR())),
                push(pop().toString()),
                '\'', WS());
    }

    /**
     * Matches a double-quoted string on one line and pushes its unescaped
     * content.
     */
    public Rule STRING_LITERAL2() {
        return sequence(
                ch('"'),
                push(new StringBuilder()),
                zeroOrMore(firstOf(
                        sequence(testNot(firstOf('"', '\\', '\n', '\r')), ANY, appendToSb(matchedChar())),
                        ECHAR(), UCHAR())),
                push(pop().toString()),
                '"', WS());
    }

    /**
     * Matches a string enclosed in three single quotes, which may span
     * lines and contain up to two consecutive quotes, and pushes its
     * unescaped content.
     */
    public Rule STRING_LITERAL_LONG1() {
        return sequence(
                string("'''"),
                push(new StringBuilder()),
                zeroOrMore(
                        testNot("'''"),
                        // one or two embedded quotes are part of the content
                        optional(firstOf("''", '\''), appendToSb(match())),
                        firstOf(sequence(testNot(firstOf('\'', '\\')), ANY, appendToSb(matchedChar())),
                                ECHAR(), UCHAR())),
                push(pop().toString()),
                "'''", WS());
    }

    /**
     * Matches a string enclosed in three double quotes, which may span
     * lines and contain up to two consecutive quotes, and pushes its
     * unescaped content.
     */
    public Rule STRING_LITERAL_LONG2() {
        return sequence(
                // string(...) instead of a bare literal: a bare literal
                // would also swallow whitespace and comments directly after
                // the opening quotes, i.e. the beginning of the content.
                string("\"\"\""),
                push(new StringBuilder()),
                zeroOrMore(
                        testNot("\"\"\""),
                        // one or two embedded quotes are part of the content
                        optional(firstOf("\"\"", '\"'), appendToSb(match())),
                        firstOf(sequence(testNot(firstOf('\"', '\\')), ANY, appendToSb(matchedChar())),
                                ECHAR(), UCHAR())),
                push(pop().toString()),
                "\"\"\"", WS());
    }

    /**
     * Appends the text to the {@link StringBuilder} on top of the value
     * stack.
     *
     * @param s
     *            the text to append
     * @return always <code>true</code> so that the call can be used as a
     *         parser action
     */
    public boolean appendToSb(String s) {
        ((StringBuilder) peek()).append(s);
        return true;
    }

    /**
     * Appends the character to the {@link StringBuilder} on top of the
     * value stack.
     *
     * @param c
     *            the character to append
     * @return always <code>true</code> so that the call can be used as a
     *         parser action
     */
    public boolean appendToSb(char c) {
        ((StringBuilder) peek()).append(c);
        return true;
    }

    /**
     * Maps the character following a backslash to the character it stands
     * for.
     *
     * @param c
     *            the escape code, e.g. <code>n</code>
     * @return the control character for <code>t</code>, <code>b</code>,
     *         <code>n</code>, <code>r</code> and <code>f</code>; any other
     *         character is returned unchanged
     */
    public char unescape(char c) {
        switch (c) {
        case 't':
            return '\t';
        case 'b':
            return '\b';
        case 'n':
            return '\n';
        case 'r':
            return '\r';
        case 'f':
            return '\f';
        default:
            return c;
        }
    }

    /**
     * Unescapes the character <code>c</code> and appends it to a string builder
     * on the value stack.
     */
    public Rule Ech(char c) {
        return sequence(ch(c), appendToSb(unescape(c)));
    }

    /**
     * Matches a backslash escape for one of the characters t, b, n, r, f,
     * double quote, single quote or backslash and appends the unescaped
     * character to the string builder on the value stack.
     */
    public Rule ECHAR() {
        return sequence(
                '\\',
                firstOf(Ech('t'), Ech('b'), Ech('n'), Ech('r'), Ech('f'), Ech('"'), Ech('\''), Ech('\\')));
    }

    /**
     * Matches a Unicode escape with four (lower case u) or eight (upper
     * case U) hexadecimal digits and appends the denoted code point to the
     * string builder on the value stack.
     */
    public Rule UCHAR() {
        return firstOf(
                sequence(
                        "\\u",
                        sequence(HEX(), HEX(), HEX(), HEX()),
                        appendToSb(new String(Character.toChars(Integer.parseInt(match(), 16))))),
                sequence(
                        "\\U",
                        sequence(HEX(), HEX(), HEX(), HEX(), HEX(), HEX(), HEX(), HEX()),
                        appendToSb(new String(Character.toChars(Integer.parseInt(match(), 16))))));
    }

    /** Matches a base name character or an underscore. */
    public Rule PN_CHARS_U() {
        return firstOf(PN_CHARS_BASE(), ch('_'));
    }

    /** Matches a character that is allowed inside prefixes and local names. */
    public Rule PN_CHARS() {
        return firstOf(PN_CHARS_U(), '-', DIGIT(), ch('\u00B7'), charRange('\u0300', '\u036F'),
                charRange('\u203F', '\u2040'));
    }

    /**
     * Matches a namespace prefix: a base character followed by name
     * characters, where dots are only accepted in front of another name
     * character.
     */
    public Rule PN_PREFIX() {
        return sequence(PN_CHARS_BASE(), zeroOrMore(firstOf(PN_CHARS(), sequence('.', PN_CHARS()))));
    }

    /** Matches a character (or escape) allowed after the first character of a local name. */
    public Rule PN_CHARS_SUFFIX() {
        return firstOf(PN_CHARS(), ch(':'), PLX());
    }

    /**
     * Matches the local part of a prefixed name, pushes its text and
     * consumes trailing whitespace.
     */
    public Rule PN_LOCAL() {
        return sequence(
                sequence(
                        firstOf(PN_CHARS_U(), ch(':'), DIGIT(), PLX()),
                        zeroOrMore(firstOf(PN_CHARS_SUFFIX(), sequence(ch('.'), PN_CHARS_SUFFIX())))),
                push(match()),
                WS());
    }

    /** Matches a percent-encoded character or an escaped local name character. */
    public Rule PLX() {
        return firstOf(PERCENT(), PN_LOCAL_ESC());
    }

    /** Matches a percent sign followed by two hexadecimal digits. */
    public Rule PERCENT() {
        return sequence(ch('%'), HEX(), HEX());
    }

    /** Matches a single hexadecimal digit in either case. */
    public Rule HEX() {
        return firstOf(DIGIT(), charRange('A', 'F'), charRange('a', 'f'));
    }

    /** Matches a backslash followed by one of the punctuation characters that may be escaped in local names. */
    public Rule PN_LOCAL_ESC() {
        return sequence(
                '\\',
                firstOf('_', '~', '.', '-', '!', '$', '&', "'", '(', ')', '*', '+', ',', ';', '=', '/', '?', '#',
                        '@', '%'));
    }

    /** Matches a single letter-like character that may start a name. */
    public Rule PN_CHARS_BASE() {
        return firstOf(
                charRange('A', 'Z'),
                charRange('a', 'z'),
                charRange('\u00C0', '\u00D6'),
                charRange('\u00D8', '\u00F6'),
                charRange('\u00F8', '\u02FF'),
                charRange('\u0370', '\u037D'),
                charRange('\u037F', '\u1FFF'),
                charRange('\u200C', '\u200D'),
                charRange('\u2070', '\u218F'),
                charRange('\u2C00', '\u2FEF'),
                charRange('\u3001', '\uD7FF'),
                charRange('\uF900', '\uFDCF'),
                charRange('\uFDF0', '\uFFFD'));
    }

    /** Matches a single decimal digit. */
    public Rule DIGIT() {
        return charRange('0', '9');
    }

    /**
     * Matches a comment that starts with <code>#</code> and extends up to
     * and including the next line end character.
     */
    public Rule COMMENT() {
        return sequence(ch('#'), zeroOrMore(testNot(EOL()), ANY), EOL());
    }

    /** Matches a single line feed or carriage return character. */
    public Rule EOL() {
        return firstOf(ch('\n'), ch('\r'));
    }

    /**
     * Matches <code>&lt;</code> followed by optional whitespace but, in
     * contrast to a bare <code>'&lt;'</code> literal, no comments - a
     * <code>#</code> is a regular character inside an IRI.
     */
    public Rule LESS_NO_COMMENT() {
        return sequence(ch('<'), zeroOrMore(WS_NO_COMMENT()));
    }

    /**
     * Makes every bare character literal in a rule expression also consume
     * trailing whitespace and comments.
     */
    @Override
    protected Rule fromCharLiteral(char c) {
        return sequence(ch(c), WS());
    }

    /**
     * Makes every bare string literal in a rule expression also consume
     * trailing whitespace and comments.
     */
    @Override
    protected Rule fromStringLiteral(String string) {
        return sequence(string(string), WS());
    }

    /**
     * Pops the elements of a list from the value stack. Elements are
     * removed at stack position <code>start</code> until the
     * {@link #LIST_BEGIN} marker is reached (the marker itself is removed,
     * too); afterwards <code>additional</code> further elements are removed
     * at the same position and placed in front.
     *
     * @param start
     *            the stack position (0 is the top) to pop from
     * @param elementType
     *            the element type; only used for type inference, elements
     *            are not checked against it
     * @param additional
     *            the number of elements below the marker that also belong
     *            to the list
     * @return the elements in the order in which they were pushed
     */
    @SuppressWarnings("unchecked")
    public <T> List<T> popList(int start, Class<T> elementType, int additional) {
        // LinkedList is used on purpose: the stack may contain null values
        // which an ArrayDeque would reject.
        LinkedList<T> list = new LinkedList<T>();
        Object element;
        while ((element = pop(start)) != LIST_BEGIN) {
            list.addFirst((T) element);
        }
        while (additional-- > 0) {
            list.addFirst((T) pop(start));
        }
        return new ArrayList<T>(list);
    }

    /**
     * Pops a list at stack position <code>start</code> without additional
     * elements.
     *
     * @see #popList(int, Class, int)
     */
    public <T> List<T> popList(int start, Class<T> elementType) {
        // Delegate to the three-argument overload; calling
        // popList(start, elementType) here would recurse endlessly.
        return popList(start, elementType, 0);
    }

    /**
     * Pops a list from the top of the stack including
     * <code>additional</code> elements below the marker.
     *
     * @see #popList(int, Class, int)
     */
    public <T> List<T> popList(Class<T> elementType, int additional) {
        return popList(0, elementType, additional);
    }

    /**
     * Pops a list from the top of the stack without additional elements.
     *
     * @see #popList(int, Class, int)
     */
    public <T> List<T> popList(Class<T> elementType) {
        return popList(elementType, 0);
    }
}