/* * Copyright Aduna (http://www.aduna-software.com/) (c) 1997-2007. * * Licensed under the Aduna BSD-style license. */ package org.openrdf.rio.turtle; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.io.LineNumberReader; import java.io.PushbackReader; import java.io.Reader; import java.io.UnsupportedEncodingException; import info.aduna.text.ASCIIUtil; import org.openrdf.model.BNode; import org.openrdf.model.Literal; import org.openrdf.model.Resource; import org.openrdf.model.Statement; import org.openrdf.model.URI; import org.openrdf.model.Value; import org.openrdf.model.ValueFactory; import org.openrdf.model.impl.ValueFactoryImpl; import org.openrdf.model.vocabulary.RDF; import org.openrdf.model.vocabulary.XMLSchema; import org.openrdf.rio.RDFFormat; import org.openrdf.rio.RDFHandlerException; import org.openrdf.rio.RDFParseException; import org.openrdf.rio.helpers.RDFParserBase; /** * RDF parser for <a href="http://www.dajobe.org/2004/01/turtle/">Turtle</a> * files. This parser is not thread-safe, therefore its public methods are * synchronized. * <p> * This implementation is based on the 2006/01/02 version of the Turtle * specification, with slight deviations: * <ul> * <li>Normalization of integer, floating point and boolean values is dependent * on the specified datatype handling. According to the specification, integers * and booleans should be normalized, but floats don't.</li> * <li>Comments can be used anywhere in the document, and extend to the end of * the line. The Turtle grammar doesn't allow comments to be used inside triple * constructs that extend over multiple lines, but the author's own parser * deviates from this too.</li> * </ul> * * @author Arjohn Kampman */ public class TurtleParser extends RDFParserBase { /*-----------* * Variables * *-----------*/ private LineNumberReader lineReader; private PushbackReader reader; private Resource subject; private URI predicate; private Value object; /*--------------* * Constructors * *--------------*/ /** * Creates a new TurtleParser that will use a {@link ValueFactoryImpl} to * create RDF model objects. */ public TurtleParser() { super(); } /** * Creates a new TurtleParser that will use the supplied ValueFactory to * create RDF model objects. * * @param valueFactory * A ValueFactory. */ public TurtleParser(ValueFactory valueFactory) { super(valueFactory); } /*---------* * Methods * *---------*/ public RDFFormat getRDFFormat() { return RDFFormat.TURTLE; } /** * Implementation of the <tt>parse(InputStream, String)</tt> method defined * in the RDFParser interface. * * @param in * The InputStream from which to read the data, must not be * <tt>null</tt>. The InputStream is supposed to contain UTF-8 * encoded Unicode characters, as per the Turtle specification. * @param baseURI * The URI associated with the data in the InputStream, must not be * <tt>null</tt>. * @throws IOException * If an I/O error occurred while data was read from the InputStream. * @throws RDFParseException * If the parser has found an unrecoverable parse error. * @throws RDFHandlerException * If the configured statement handler encountered an unrecoverable * error. * @throws IllegalArgumentException * If the supplied input stream or base URI is <tt>null</tt>. */ public synchronized void parse(InputStream in, String baseURI) throws IOException, RDFParseException, RDFHandlerException { if (in == null) { throw new IllegalArgumentException("Input stream must not be 'null'"); } // Note: baseURI will be checked in parse(Reader, String) try { parse(new InputStreamReader(in, "UTF-8"), baseURI); } catch (UnsupportedEncodingException e) { // Every platform should support the UTF-8 encoding... throw new RuntimeException(e); } } /** * Implementation of the <tt>parse(Reader, String)</tt> method defined in * the RDFParser interface. * * @param reader * The Reader from which to read the data, must not be <tt>null</tt>. * @param baseURI * The URI associated with the data in the Reader, must not be * <tt>null</tt>. * @throws IOException * If an I/O error occurred while data was read from the InputStream. * @throws RDFParseException * If the parser has found an unrecoverable parse error. * @throws RDFHandlerException * If the configured statement handler encountered an unrecoverable * error. * @throws IllegalArgumentException * If the supplied reader or base URI is <tt>null</tt>. */ public synchronized void parse(Reader reader, String baseURI) throws IOException, RDFParseException, RDFHandlerException { if (reader == null) { throw new IllegalArgumentException("Reader must not be 'null'"); } if (baseURI == null) { throw new IllegalArgumentException("base URI must not be 'null'"); } rdfHandler.startRDF(); lineReader = new LineNumberReader(reader); // Start counting lines at 1: lineReader.setLineNumber(1); // Allow at most 2 characters to be pushed back: this.reader = new PushbackReader(lineReader, 2); // Store normalized base URI setBaseURI(baseURI); reportLocation(); try { int c = skipWSC(); while (c != -1) { parseStatement(); c = skipWSC(); } } finally { clear(); } rdfHandler.endRDF(); } protected void parseStatement() throws IOException, RDFParseException, RDFHandlerException { int c = peek(); if (c == '@') { parseDirective(); skipWSC(); verifyCharacter(read(), "."); } else { parseTriples(); skipWSC(); verifyCharacter(read(), "."); } } protected void parseDirective() throws IOException, RDFParseException, RDFHandlerException { // Verify that the first characters form the string "prefix" verifyCharacter(read(), "@"); StringBuilder sb = new StringBuilder(8); int c = read(); while (c != -1 && !TurtleUtil.isWhitespace(c)) { sb.append((char)c); c = read(); } String directive = sb.toString(); if (directive.equals("prefix")) { parsePrefixID(); } else if (directive.equals("base")) { parseBase(); } else if (directive.length() == 0) { reportFatalError("Directive name is missing, expected @prefix or @base"); } else { reportFatalError("Unknown directive \"@" + directive + "\""); } } protected void parsePrefixID() throws IOException, RDFParseException, RDFHandlerException { skipWSC(); // Read prefix ID (e.g. "rdf:" or ":") StringBuilder prefixID = new StringBuilder(8); while (true) { int c = read(); if (c == ':') { unread(c); break; } else if (TurtleUtil.isWhitespace(c)) { break; } else if (c == -1) { throwEOFException(); } prefixID.append((char)c); } skipWSC(); verifyCharacter(read(), ":"); skipWSC(); // Read the namespace URI URI namespace = parseURI(); // Store and report this namespace mapping String prefixStr = prefixID.toString(); String namespaceStr = namespace.toString(); setNamespace(prefixStr, namespaceStr); rdfHandler.handleNamespace(prefixStr, namespaceStr); } protected void parseBase() throws IOException, RDFParseException, RDFHandlerException { skipWSC(); URI baseURI = parseURI(); setBaseURI(baseURI.toString()); } protected void parseTriples() throws IOException, RDFParseException, RDFHandlerException { parseSubject(); skipWSC(); parsePredicateObjectList(); subject = null; predicate = null; object = null; } protected void parsePredicateObjectList() throws IOException, RDFParseException, RDFHandlerException { predicate = parsePredicate(); skipWSC(); parseObjectList(); while (skipWSC() == ';') { read(); int c = skipWSC(); if (c == '.' || // end of triple c == ']') // end of predicateObjectList inside blank node { break; } predicate = parsePredicate(); skipWSC(); parseObjectList(); } } protected void parseObjectList() throws IOException, RDFParseException, RDFHandlerException { parseObject(); while (skipWSC() == ',') { read(); skipWSC(); parseObject(); } } protected void parseSubject() throws IOException, RDFParseException, RDFHandlerException { int c = peek(); if (c == '(') { subject = parseCollection(); } else if (c == '[') { subject = parseImplicitBlank(); } else { Value value = parseValue(); if (value instanceof Resource) { subject = (Resource)value; } else { reportFatalError("Illegal subject value: " + value); } } } protected URI parsePredicate() throws IOException, RDFParseException { // Check if the short-cut 'a' is used int c1 = read(); if (c1 == 'a') { int c2 = read(); if (TurtleUtil.isWhitespace(c2)) { // Short-cut is used, return the rdf:type URI return RDF.TYPE; } // Short-cut is not used, unread all characters unread(c2); } unread(c1); // Predicate is a normal resource Value predicate = parseValue(); if (predicate instanceof URI) { return (URI)predicate; } else { reportFatalError("Illegal predicate value: " + predicate); return null; } } protected void parseObject() throws IOException, RDFParseException, RDFHandlerException { int c = peek(); if (c == '(') { object = parseCollection(); } else if (c == '[') { object = parseImplicitBlank(); } else { object = parseValue(); } reportStatement(subject, predicate, object); } /** * Parses a collection, e.g. <tt>( item1 item2 item3 )</tt>. */ protected Resource parseCollection() throws IOException, RDFParseException, RDFHandlerException { verifyCharacter(read(), "("); int c = skipWSC(); if (c == ')') { // Empty list read(); return RDF.NIL; } else { BNode listRoot = createBNode(); // Remember current subject and predicate Resource oldSubject = subject; URI oldPredicate = predicate; // generated bNode becomes subject, predicate becomes rdf:first subject = listRoot; predicate = RDF.FIRST; parseObject(); BNode bNode = listRoot; while (skipWSC() != ')') { // Create another list node and link it to the previous BNode newNode = createBNode(); reportStatement(bNode, RDF.REST, newNode); // New node becomes the current subject = bNode = newNode; parseObject(); } // Skip ')' read(); // Close the list reportStatement(bNode, RDF.REST, RDF.NIL); // Restore previous subject and predicate subject = oldSubject; predicate = oldPredicate; return listRoot; } } /** * Parses an implicit blank node. This method parses the token <tt>[]</tt> * and predicateObjectLists that are surrounded by square brackets. */ protected Resource parseImplicitBlank() throws IOException, RDFParseException, RDFHandlerException { verifyCharacter(read(), "["); BNode bNode = createBNode(); int c = read(); if (c != ']') { unread(c); // Remember current subject and predicate Resource oldSubject = subject; URI oldPredicate = predicate; // generated bNode becomes subject subject = bNode; // Enter recursion with nested predicate-object list skipWSC(); parsePredicateObjectList(); skipWSC(); // Read closing bracket verifyCharacter(read(), "]"); // Restore previous subject and predicate subject = oldSubject; predicate = oldPredicate; } return bNode; } /** * Parses an RDF value. This method parses uriref, qname, node ID, quoted * literal, integer, double and boolean. */ protected Value parseValue() throws IOException, RDFParseException { int c = peek(); if (c == '<') { // uriref, e.g. <foo://bar> return parseURI(); } else if (c == ':' || TurtleUtil.isPrefixStartChar(c)) { // qname or boolean return parseQNameOrBoolean(); } else if (c == '_') { // node ID, e.g. _:n1 return parseNodeID(); } else if (c == '"') { // quoted literal, e.g. "foo" or """foo""" return parseQuotedLiteral(); } else if (ASCIIUtil.isNumber(c) || c == '.' || c == '+' || c == '-') { // integer or double, e.g. 123 or 1.2e3 return parseNumber(); } else if (c == -1) { throwEOFException(); return null; } else { reportFatalError("Expected an RDF value here, found '" + (char)c + "'"); return null; } } /** * Parses a quoted string, optionally followed by a language tag or datatype. */ protected Literal parseQuotedLiteral() throws IOException, RDFParseException { String label = parseQuotedString(); // Check for presence of a language tag or datatype int c = peek(); if (c == '@') { read(); // Read language StringBuilder lang = new StringBuilder(8); c = read(); if (c == -1) { throwEOFException(); } if (!TurtleUtil.isLanguageStartChar(c)) { reportError("Expected a letter, found '" + (char)c + "'"); } lang.append((char)c); c = read(); while (TurtleUtil.isLanguageChar(c)) { lang.append((char)c); c = read(); } unread(c); return createLiteral(label, lang.toString(), null); } else if (c == '^') { read(); // next character should be another '^' verifyCharacter(read(), "^"); // Read datatype Value datatype = parseValue(); if (datatype instanceof URI) { return createLiteral(label, null, (URI)datatype); } else { reportFatalError("Illegal datatype value: " + datatype); return null; } } else { return createLiteral(label, null, null); } } /** * Parses a quoted string, which is either a "normal string" or a """long * string""". */ protected String parseQuotedString() throws IOException, RDFParseException { String result = null; // First character should be '"' verifyCharacter(read(), "\""); // Check for long-string, which starts and ends with three double quotes int c2 = read(); int c3 = read(); if (c2 == '"' && c3 == '"') { // Long string result = parseLongString(); } else { // Normal string unread(c3); unread(c2); result = parseString(); } // Unescape any escape sequences try { result = TurtleUtil.decodeString(result); } catch (IllegalArgumentException e) { reportError(e.getMessage()); } return result; } /** * Parses a "normal string". This method assumes that the first double quote * has already been parsed. */ protected String parseString() throws IOException, RDFParseException { StringBuilder sb = new StringBuilder(32); while (true) { int c = read(); if (c == '"') { break; } else if (c == -1) { throwEOFException(); } sb.append((char)c); if (c == '\\') { // This escapes the next character, which might be a '"' c = read(); if (c == -1) { throwEOFException(); } sb.append((char)c); } } return sb.toString(); } /** * Parses a """long string""". This method assumes that the first three * double quotes have already been parsed. */ protected String parseLongString() throws IOException, RDFParseException { StringBuilder sb = new StringBuilder(1024); int doubleQuoteCount = 0; int c; while (doubleQuoteCount < 3) { c = read(); if (c == -1) { throwEOFException(); } else if (c == '"') { doubleQuoteCount++; } else { doubleQuoteCount = 0; } sb.append((char)c); if (c == '\\') { // This escapes the next character, which might be a '"' c = read(); if (c == -1) { throwEOFException(); } sb.append((char)c); } } return sb.substring(0, sb.length() - 3); } protected Literal parseNumber() throws IOException, RDFParseException { StringBuilder value = new StringBuilder(8); URI datatype = XMLSchema.INTEGER; int c = read(); // read optional sign character if (c == '+' || c == '-') { value.append((char)c); c = read(); } while (ASCIIUtil.isNumber(c)) { value.append((char)c); c = read(); } if (c == '.' || c == 'e' || c == 'E') { // We're parsing a decimal or a double datatype = XMLSchema.DECIMAL; // read optional fractional digits if (c == '.') { value.append((char)c); c = read(); while (ASCIIUtil.isNumber(c)) { value.append((char)c); c = read(); } if (value.length() == 1) { // We've only parsed a '.' reportFatalError("Object for statement missing"); } } else { if (value.length() == 0) { // We've only parsed an 'e' or 'E' reportFatalError("Object for statement missing"); } } // read optional exponent if (c == 'e' || c == 'E') { datatype = XMLSchema.DOUBLE; value.append((char)c); c = read(); if (c == '+' || c == '-') { value.append((char)c); c = read(); } if (!ASCIIUtil.isNumber(c)) { reportError("Exponent value missing"); } value.append((char)c); c = read(); while (ASCIIUtil.isNumber(c)) { value.append((char)c); c = read(); } } } // Unread last character, it isn't part of the number unread(c); // String label = value.toString(); // if (datatype.equals(XMLSchema.INTEGER)) { // try { // label = XMLDatatypeUtil.normalizeInteger(label); // } // catch (IllegalArgumentException e) { // // Note: this should never happen because of the parse constraints // reportError("Illegal integer value: " + label); // } // } // return createLiteral(label, null, datatype); // Return result as a typed literal return createLiteral(value.toString(), null, datatype); } protected URI parseURI() throws IOException, RDFParseException { StringBuilder uriBuf = new StringBuilder(100); // First character should be '<' int c = read(); verifyCharacter(c, "<"); // Read up to the next '>' character while (true) { c = read(); if (c == '>') { break; } else if (c == -1) { throwEOFException(); } uriBuf.append((char)c); if (c == '\\') { // This escapes the next character, which might be a '>' c = read(); if (c == -1) { throwEOFException(); } uriBuf.append((char)c); } } String uri = uriBuf.toString(); // Unescape any escape sequences try { uri = TurtleUtil.decodeString(uri); } catch (IllegalArgumentException e) { reportError(e.getMessage()); } return super.resolveURI(uri); } /** * Parses qnames and boolean values, which have equivalent starting * characters. */ protected Value parseQNameOrBoolean() throws IOException, RDFParseException { // First character should be a ':' or a letter int c = read(); if (c == -1) { throwEOFException(); } if (c != ':' && !TurtleUtil.isPrefixStartChar(c)) { reportError("Expected a ':' or a letter, found '" + (char)c + "'"); } String namespace = null; if (c == ':') { // qname using default namespace namespace = getNamespace(""); if (namespace == null) { reportError("Default namespace used but not defined"); } } else { // c is the first letter of the prefix StringBuilder prefix = new StringBuilder(8); prefix.append((char)c); c = read(); while (TurtleUtil.isPrefixChar(c)) { prefix.append((char)c); c = read(); } if (c != ':') { // prefix may actually be a boolean value String value = prefix.toString(); if (value.equals("true") || value.equals("false")) { return createLiteral(value, null, XMLSchema.BOOLEAN); } } verifyCharacter(c, ":"); namespace = getNamespace(prefix.toString()); if (namespace == null) { reportError("Namespace prefix '" + prefix.toString() + "' used but not defined"); } } // c == ':', read optional local name StringBuilder localName = new StringBuilder(16); c = read(); if (TurtleUtil.isNameStartChar(c)) { localName.append((char)c); c = read(); while (TurtleUtil.isNameChar(c)) { localName.append((char)c); c = read(); } } // Unread last character unread(c); // Note: namespace has already been resolved return createURI(namespace + localName.toString()); } /** * Parses a blank node ID, e.g. <tt>_:node1</tt>. */ protected BNode parseNodeID() throws IOException, RDFParseException { // Node ID should start with "_:" verifyCharacter(read(), "_"); verifyCharacter(read(), ":"); // Read the node ID int c = read(); if (c == -1) { throwEOFException(); } else if (!TurtleUtil.isNameStartChar(c)) { reportError("Expected a letter, found '" + (char)c + "'"); } StringBuilder name = new StringBuilder(32); name.append((char)c); // Read all following letter and numbers, they are part of the name c = read(); while (TurtleUtil.isNameChar(c)) { name.append((char)c); c = read(); } unread(c); return createBNode(name.toString()); } protected void reportStatement(Resource subj, URI pred, Value obj) throws RDFParseException, RDFHandlerException { Statement st = createStatement(subj, pred, obj); rdfHandler.handleStatement(st); } /** * Verifies that the supplied character <tt>c</tt> is one of the expected * characters specified in <tt>expected</tt>. This method will throw a * <tt>ParseException</tt> if this is not the case. */ protected void verifyCharacter(int c, String expected) throws RDFParseException { if (c == -1) { throwEOFException(); } else if (expected.indexOf((char)c) == -1) { StringBuilder msg = new StringBuilder(32); msg.append("Expected "); for (int i = 0; i < expected.length(); i++) { if (i > 0) { msg.append(" or "); } msg.append('\''); msg.append(expected.charAt(i)); msg.append('\''); } msg.append(", found '"); msg.append((char)c); msg.append("'"); reportError(msg.toString()); } } /** * Consumes any white space characters (space, tab, line feed, newline) and * comments (#-style) from <tt>reader</tt>. After this method has been * called, the first character that is returned by <tt>reader</tt> is * either a non-ignorable character, or EOF. For convenience, this character * is also returned by this method. * * @return The next character that will be returned by <tt>reader</tt>. */ protected int skipWSC() throws IOException { int c = read(); while (TurtleUtil.isWhitespace(c) || c == '#') { if (c == '#') { skipLine(); } c = read(); } unread(c); return c; } /** * Consumes characters from reader until the first EOL has been read. */ protected void skipLine() throws IOException { int c = read(); while (c != -1 && c != 0xD && c != 0xA) { c = read(); } // c is equal to -1, \r or \n. // In case c is equal to \r, we should also read a following \n. if (c == 0xD) { c = read(); if (c != 0xA) { unread(c); } } reportLocation(); } protected int read() throws IOException { return reader.read(); } protected void unread(int c) throws IOException { if (c != -1) { reader.unread(c); } } protected int peek() throws IOException { int result = read(); unread(result); return result; } protected void reportLocation() { reportLocation(lineReader.getLineNumber(), -1); } /** * Overrides {@link RDFParserBase#reportWarning(String)}, adding line number * information to the error. */ @Override protected void reportWarning(String msg) { reportWarning(msg, lineReader.getLineNumber(), -1); } /** * Overrides {@link RDFParserBase#reportError(String)}, adding line number * information to the error. */ @Override protected void reportError(String msg) throws RDFParseException { reportError(msg, lineReader.getLineNumber(), -1); } /** * Overrides {@link RDFParserBase#reportFatalError(String)}, adding line * number information to the error. */ @Override protected void reportFatalError(String msg) throws RDFParseException { reportFatalError(msg, lineReader.getLineNumber(), -1); } /** * Overrides {@link RDFParserBase#reportFatalError(Exception)}, adding line * number information to the error. */ @Override protected void reportFatalError(Exception e) throws RDFParseException { reportFatalError(e, lineReader.getLineNumber(), -1); } protected void throwEOFException() throws RDFParseException { throw new RDFParseException("Unexpected end of file"); } }