/*
* Copyright 2010 Paula Gearon
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.mulgara.content.rdfa;
// Java 2 standard packages
import java.io.IOException;
import java.io.InputStream;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.HashMap;
import java.util.Map;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.TimeUnit;
import javax.activation.MimeType;
// Third party packages
import org.apache.log4j.Logger; // Apache Log4J
import org.jrdf.graph.BlankNode; // JRDF
import org.jrdf.graph.Literal;
import org.jrdf.graph.Node;
import org.jrdf.graph.ObjectNode;
import org.jrdf.graph.PredicateNode;
import org.jrdf.graph.SubjectNode;
import org.jrdf.graph.Triple;
import org.xml.sax.InputSource;
import org.xml.sax.XMLReader;
import net.rootdev.javardfa.ParserFactory.Format;
import net.rootdev.javardfa.StatementSink;
import net.rootdev.javardfa.ParserFactory;
// Locally written packages
import org.mulgara.content.Content;
import org.mulgara.content.NotModifiedException;
import org.mulgara.parser.MulgaraParserException;
import org.mulgara.query.TuplesException;
import org.mulgara.query.rdf.BlankNodeImpl;
import org.mulgara.query.rdf.LiteralImpl;
import org.mulgara.query.rdf.MimeTypes;
import org.mulgara.query.rdf.TripleImpl;
import org.mulgara.query.rdf.URIReferenceImpl;
import org.mulgara.resolver.spi.ResolverSession;
import org.mulgara.util.IntFile;
import org.mulgara.util.TempDir;
/**
*
* @created 2010-08-09
* @author Paula Gearon
*/
class StatementParser implements Runnable, StatementSink {
/** Logger. */
private static final Logger logger = Logger.getLogger(StatementParser.class.getName());
/** Text prefix for blank nodes. */
@SuppressWarnings("unused")
private static final String BLANK_PREFIX = "_:";
/** The prefix that rdfa-java uses */
private static final String RJ_PREFIX = "_:node";
/** The period of time to wait in ms for the parser to provide some data. */
private static final int TIMEOUT = 30000;
/**
* Maximum size for the {@link #triples} buffer. Any larger and the parser will
* block and drain.
*/
private static final int BUFFER_SIZE = 1000;
/** Mapping between parsed blank node IDs and local node numbers. */
private IntFile blankNodeIdMap;
/** Mapping between blank node IDs and blank-node instances that haven't been stored. */
private Map<Long,BlankNodeImpl> blankNodeInstMap = new HashMap<Long,BlankNodeImpl>();
/** The resolverSession to create new internal identifiers for blank nodes. */
private ResolverSession resolverSession;
/** The data to be parsed and its metadata */
private final Content content;
/** The stream containing the data to be parsed. */
private InputStream inputStream;
/** The parser for the input stream. */
private XMLReader reader;
/** The base of the document. */
private URI base;
/** Resolves relative URIs and IRIs to absolute URIs/IRIs */
private BasedResolver parseResolver;
/** The queue of triples generated by the parser. */
private LinkedBlockingQueue<Triple> triples = new LinkedBlockingQueue<Triple>(BUFFER_SIZE);
/** A marker to indicate that the end of the data has been reached. */
static final Triple TERMINATOR = new TripleImpl(null, null, null);
/** The number of parsed statements */
private long statementCount = 0;
/** Indicates that parsing is complete */
private volatile boolean finished = false;
/** Used to asynchronously indicate an exception. */
private Throwable exception = null;
/** Thread used for parsing. This is the producer thread. */
private Thread parserThread;
/**
* Sets up the sink to start receiving triples.
* @param content Contains the data for parsing and its metadata.
* @param resolverSession Access to the database for inserting data.
*/
StatementParser(Content content, ResolverSession resolverSession) throws NotModifiedException, TuplesException {
if (content == null) throw new IllegalArgumentException("Null \"content\" parameter");
if (resolverSession == null) throw new IllegalArgumentException("Null \"resolverSession\" parameter");
this.content = content;
this.resolverSession = resolverSession;
try {
this.blankNodeIdMap = IntFile.open(TempDir.createTempFile("rdfaidmap", null), true);
this.inputStream = content.newInputStream();
this.base = content.getURI();
parseResolver = new BasedResolver(content.getURIString());
reader = ParserFactory.createReaderForFormat(this, getType(content), parseResolver);
} catch (Exception e) {
throw new TuplesException("Unable to obtain input stream from " + content.getURI(), e);
}
}
/**
* @return the number of statements parsed so far
*/
long getStatementCount() {
return statementCount;
}
/**
* Do the parsing. This is the entry point of the parsing thread.
*/
public void run() {
parserThread = Thread.currentThread();
Throwable t = null;
try {
reader.parse(new InputSource(inputStream));
if (logger.isDebugEnabled()) logger.debug("Parsed RDFa on " + content.getURI());
return;
} catch (Throwable th) {
logger.error("Error parsing RDFa", th);
t = th;
} finally {
try {
triples.put(TERMINATOR);
} catch (InterruptedException e) {
logger.error("Error ending RDFa parse", e);
t = e;
}
if (t != null) exception = t;
finished = true;
}
if (logger.isDebugEnabled()) logger.debug("Exception while parsing RDFa", exception);
}
public void start() {
if (logger.isDebugEnabled()) logger.debug("Started RDFa document");
}
public void end() {
if (logger.isDebugEnabled()) logger.debug("End RDFa document");
finished = true;
}
public void addPrefix(String prefix, String uri) {
if (logger.isDebugEnabled()) logger.debug("@prefix " + prefix + ": <" + uri + "> .");
}
public void setBase(String base) {
try {
if (base != null) parseResolver.setBase(base);
} catch (IllegalArgumentException e) {
logger.warn("Invalid base in RDFa file: " + base);
}
}
/**
* Adds an triple with a Literal as the object.
* @param subject string form of the subject.
* @param predicate string form of the predicate.
* @param lex The lexical form of the literal in the object.
* @param lang The language code of the literal in the object. May be <code>null</code>.
* @param datatype The datatype of the literal in the object. May be <code>null</code>.
*/
public void addLiteral(String subject, String predicate, String lex, String lang, String datatype) {
try {
enqueue((SubjectNode)toNode(subject), (PredicateNode)toNode(predicate), toLiteral(lex, lang, datatype));
} catch (MulgaraParserException e) {
logger.error("Unable to parse. " + e.getMessage());
return;
}
}
/**
* Adds an triple with a URI or blank node as the object.
* @param subject string form of the subject.
* @param predicate string form of the predicate.
* @param object string form of the object.
*/
public void addObject(String subject, String predicate, String object) {
try {
enqueue((SubjectNode)toNode(subject), (PredicateNode)toNode(predicate), (ObjectNode)toNode(object));
} catch (MulgaraParserException e) {
logger.error("Unable to parse. " + e.getMessage());
return;
}
}
/**
* Add a parsed triple to the queue.
* @param subjectNode The subject of the triple.
* @param predicateNode The predicate of the triple.
* @param objectNode The object of the triple.
*/
void enqueue(SubjectNode subjectNode, PredicateNode predicateNode, ObjectNode objectNode) {
if (logger.isDebugEnabled()) {
logger.debug("Parsed " + subjectNode + " " + predicateNode + " " + objectNode + " from " + content.getURI());
}
try {
triples.put(new TripleImpl(subjectNode, predicateNode, objectNode));
} catch (InterruptedException e) {
throw new RuntimeException("Unable to record parsed triple", e);
}
statementCount++;
}
/**
* Convert and validate an AST object into a node.
*
* @param text The text of the node that was parsed.
* @return a {@link Node} formed from the text
* @throws MulgaraParserException An unhandled element was encountered.
*/
private Node toNode(String text) throws MulgaraParserException {
if (text == null) return new URIReferenceImpl(base);
if (text.startsWith(RJ_PREFIX)) return getBlankNode(text);
return toUri(text);
}
/**
* Creates a URIReference out of a string.
* @param text The string to convert.
* @return A new URIReference containing the URI from the string.
* @throws MulgaraParserException The text was not a valid URI.
*/
private Node toUri(String text) throws MulgaraParserException {
try {
return new URIReferenceImpl(new URI(text));
} catch (URISyntaxException e) {
throw new MulgaraParserException("Invalid URI: " + text, e);
}
}
/**
* Create a blank node from a URI with a blank node form.
*
* @param n The node to convert to an anonymous node.
* @return An anonymous node that the node maps to.
*/
private BlankNode getBlankNode(String n) throws MulgaraParserException {
long anonId;
try {
anonId = Long.parseLong(n.substring(RJ_PREFIX.length()));
} catch (NumberFormatException nfe) {
throw new MulgaraParserException("Invalid blank node: " + n);
}
if (anonId < 0) throw new MulgaraParserException("Inexpected blank node format: " + n);
synchronized (this) {
// look up the id in the blank node map
long internalId = blankNodeIdMap.getLong(anonId);
// check if the node was found
BlankNodeImpl blankNode;
if (internalId == 0) {
blankNode = blankNodeInstMap.get(anonId);
if (blankNode == null) {
blankNode = new BlankNodeImpl();
blankNodeInstMap.put(anonId, blankNode);
}
} else {
// Found the ID, so need to recreate the anonymous resource for it
blankNode = new BlankNodeImpl(internalId);
}
return blankNode;
}
}
/**
* Creates a literal out of three components.
* @param text The lexical value of the literal.
* @param lang The language code of the literal, or <code>null</code> if not an
* untyped literal with a language code.
* @param datatype The URI of the datatype of the literal, or <code>null</code>
* if an untyped literal.
* @return A new literal.
*/
Literal toLiteral(String text, String lang, String datatype) throws MulgaraParserException {
if (datatype != null) {
assert lang == null;
try {
return new LiteralImpl(text, new URI(datatype));
} catch (URISyntaxException e) {
throw new MulgaraParserException("Invalid datatype on literal: " + text + "^^" + datatype, e);
}
}
if (lang != null) return new LiteralImpl(text, lang);
return new LiteralImpl(text);
}
/**
* If an exception occurred in the parser, throws a TuplesException that
* wraps the exception.
*/
private void checkForException() throws TuplesException {
if (exception != null) {
throw new TuplesException("Exception while reading " + content.getURIString(), exception);
}
}
/**
* Returns a new triple from the queue or null if there are no more triples.
* @return The oldest triple in the queue.
*/
Triple getTriple() throws TuplesException {
checkForException();
allocateBlankNodes();
try {
Triple result = triples.poll(TIMEOUT, TimeUnit.MILLISECONDS);
if (result == null) throw new TuplesException("Timeout waiting for data from parser");
return result;
} catch (InterruptedException e) {
throw new TuplesException("Unable to retrieve data from the parser", e);
}
}
/**
* Allocate the ids for the new blank nodes.
*/
private void allocateBlankNodes() {
try {
for (Map.Entry<Long, BlankNodeImpl> entry : blankNodeInstMap.entrySet()) {
resolverSession.localize(entry.getValue()); // This sets and returns the node ID
blankNodeIdMap.putLong(entry.getKey(), entry.getValue().getNodeId());
}
blankNodeInstMap.clear();
} catch (Exception le) {
throw new RuntimeException("Unable to create blank node", le);
}
}
/**
* Stops the thread.
*/
void terminate() {
finished = true;
if (parserThread != null && parserThread.isAlive()) parserThread.interrupt();
triples.clear();
try {
triples.put(TERMINATOR);
} catch (InterruptedException e) {
exception = e;
}
}
/**
* Tests if the parse is complete.
* @return <code>true</code> if parsing is over.
*/
boolean isFinished() {
return finished;
}
/**
* Determine the type of parsing to be done, based on the content.
* @param c The Content to be parsed.
* @return Either <code>Format.XHTML</code> or <code>Format.HTML</code>.
* @throws NotModifiedException
*/
private Format getType(Content c) throws NotModifiedException {
MimeType t = c.getContentType();
if (t != null) {
if (MimeTypes.APPLICATION_XHTML.match(t)) return Format.XHTML;
if (MimeTypes.TEXT_HTML.match(t)) return Format.HTML;
}
String loc = c.getURIString();
if (loc != null) {
if (loc.endsWith(RdfaContentHandler.XHTML_EXT)) return Format.XHTML;
if (loc.endsWith(RdfaContentHandler.HTML_EXT)) return Format.HTML;
}
logger.warn("Guessing HTML for unknown MIME type: " + t);
return Format.HTML;
}
}