/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package riotcmd;
import java.io.IOException ;
import java.io.InputStream ;
import java.io.OutputStream ;
import java.util.zip.GZIPOutputStream ;
import arq.cmdline.ModLangOutput ;
import arq.cmdline.ModLangParse ;
import arq.cmdline.ModContext ;
import arq.cmdline.ModTime ;
import jena.cmd.ArgDecl ;
import jena.cmd.CmdException;
import jena.cmd.CmdGeneral ;
import org.apache.jena.Jena ;
import org.apache.jena.atlas.io.IO ;
import org.apache.jena.atlas.lib.InternalErrorException ;
import org.apache.jena.atlas.lib.Pair ;
import org.apache.jena.atlas.web.ContentType ;
import org.apache.jena.atlas.web.TypedInputStream ;
import org.apache.jena.query.ARQ ;
import org.apache.jena.riot.* ;
import org.apache.jena.riot.lang.LabelToNode ;
import org.apache.jena.riot.lang.StreamRDFCounting ;
import org.apache.jena.riot.out.NodeToLabel ;
import org.apache.jena.riot.process.inf.InfFactory ;
import org.apache.jena.riot.process.inf.InferenceSetupRDFS ;
import org.apache.jena.riot.system.* ;
import org.apache.jena.riot.tokens.Tokenizer ;
import org.apache.jena.riot.tokens.TokenizerFactory ;
import org.apache.jena.sparql.core.DatasetGraph ;
import org.apache.jena.sparql.core.DatasetGraphFactory ;
import org.apache.jena.system.JenaSystem ;
/**
 * Common framework for running RIOT parsers from the command line.
 * <p>
 * Each input (the positional arguments, or standard input when there are none)
 * is parsed and sent to one shared output sink. The sink is either a streaming
 * writer, or, when no streaming format applies, an in-memory dataset that is
 * written once all inputs have been parsed. Subclasses decide the syntax of
 * each input by implementing {@link #selectLang}.
 */
public abstract class CmdLangParse extends CmdGeneral
{
    static { JenaSystem.init(); }

    protected ModTime modTime                   = new ModTime() ;
    protected ModLangParse modLangParse         = new ModLangParse() ;
    protected ModLangOutput modLangOutput       = new ModLangOutput() ;
    /** RDFS inference setup; stays null unless an RDFS vocabulary was supplied. */
    protected InferenceSetupRDFS setup          = null ;
    protected ModContext modContext             = new ModContext() ;
    // NOTE(review): this declaration is never registered with add(...) in this class,
    // and exec() consults modLangParse.strictMode() rather than cmdStrictMode - confirm
    // whether subclasses rely on either of these.
    protected ArgDecl strictDecl                = new ArgDecl(ArgDecl.NoValue, "strict") ;
    protected boolean cmdStrictMode             = false ;

    /** Labels used when reporting timing: the name of the items counted and of the rate. */
    interface LangHandler {
        String getItemsName() ;
        String getRateName() ;
    }

    /** Labels for inputs in a quads language. */
    static LangHandler langHandlerQuads = new LangHandler() {
        @Override
        public String getItemsName()        { return "quads" ; }
        @Override
        public String getRateName()         { return "QPS" ; }
    } ;

    /** Labels for inputs in a triples language. */
    static LangHandler langHandlerTriples = new LangHandler() {
        @Override
        public String getItemsName()        { return "triples" ; }
        @Override
        public String getRateName()         { return "TPS" ; }
    } ;

    /** Labels for a mixture of triples and quads inputs. */
    static LangHandler langHandlerAny = new LangHandler() {
        @Override
        public String getItemsName()        { return "tuples" ; }
        @Override
        public String getRateName()         { return "TPS" ; }
    } ;

    /** Labels for the "Total" line; null until the first input reaches {@link #parseRIOT}. */
    protected LangHandler langHandlerOverall = null ;

    /**
     * Register the command line modules and the classes reported by the version module.
     *
     * @param argv the command line arguments, passed to the superclass
     */
    protected CmdLangParse(String[] argv)
    {
        super(argv) ;

        addModule(modContext) ;
        addModule(modTime) ;
        addModule(modLangOutput) ;
        addModule(modLangParse) ;

        super.modVersion.addClass(Jena.class) ;
        // Force - sometimes initialization does not cause these
        // to be initialized early enough for reflection.
        String x1 = ARQ.VERSION ;
        String x2 = ARQ.BUILD_DATE ;
        super.modVersion.addClass(RIOT.class) ;
    }

    /** @return the one-line usage summary for this command */
    @Override
    protected String getSummary() {
        return getCommandName()+" [--time] [--check|--noCheck] [--sink] [--base=IRI] [--out=FORMAT] [--compress] file ..." ;
    }

    /** Sum of the per-input parse times, in the units returned by {@code modTime.endTimer()}. */
    protected long totalMillis = 0 ;
    /** Sum of the triples and quads counted over all inputs. */
    protected long totalTuples = 0 ;

    /** Where serialized output goes; wrapped in a GZIP stream when compression is requested. */
    OutputStream output = System.out ;
    /** The sink that receives the parsed data; set up in {@link #exec$()}. */
    StreamRDF outputStream = null ;

    /** Record whether the {@code strict} argument was present on the command line. */
    @Override
    protected void processModulesAndArgs() {
        cmdStrictMode = super.contains(strictDecl) ;
    }

    /** Action to run once after all inputs have been parsed. */
    protected interface PostParseHandler { void postParse(); }

    /**
     * Run the command, turning on RIOT strict mode for the duration of the run when
     * it was requested, and always restoring the previous setting afterwards.
     */
    @Override
    protected void exec() {
        boolean oldStrictValue = SysRIOT.isStrictMode() ;
        if ( modLangParse.strictMode() )
            SysRIOT.setStrictMode(true) ;
        try { exec$() ; }
        finally { SysRIOT.setStrictMode(oldStrictValue) ; }
    }

    /**
     * The body of the command: set up inference and the output, parse every input,
     * run the post-parse step if there is one, and print the overall timing line
     * when there was more than one positional argument and timing is enabled.
     * The output is closed (or flushed, if it is still {@code System.out}) on exit.
     */
    protected void exec$() {

        if ( modLangParse.getRDFSVocab() != null )
            setup = new InferenceSetupRDFS(modLangParse.getRDFSVocab()) ;

        if ( modLangOutput.compressedOutput() ) {
            try { output = new GZIPOutputStream(output, true) ; }
            catch (IOException e) { IO.exception(e);}
        }

        // Prefer a streaming sink; otherwise accumulate and write after parsing.
        PostParseHandler postParse = null ;
        outputStream = createStreamSink() ;
        if ( outputStream == null ) {
            Pair<StreamRDF, PostParseHandler> p = createAccumulateSink() ;
            outputStream = p.getLeft() ;
            postParse = p.getRight();
        }

        try {
            if ( super.getPositional().isEmpty() )
                parseFile("-");
            else {
                // Only announce file names when there is more than one.
                boolean multipleFiles = super.getPositional().size() > 1;
                for ( String fn : super.getPositional() ) {
                    if ( multipleFiles && !super.isQuiet() )
                        SysRIOT.getLogger().info("File: " + fn);
                    parseFile(fn);
                }
            }
            if ( postParse != null )
                postParse.postParse();
            if ( super.getPositional().size() > 1 && modTime.timingEnabled() )
                output("Total", totalTuples, totalMillis, langHandlerOverall) ;
        } finally {
            if ( output != System.out )
                IO.close(output) ;
            else
                IO.flush(output);
            System.err.flush() ;
        }
    }

    /**
     * Parse one input named on the command line.
     * <p>
     * The name {@code "-"} means standard input, parsed with the default base
     * {@code http://base/} and reported under the name {@code stdin}. Any other name
     * is opened with {@code RDFDataMgr.open}; if that fails, a message is printed to
     * standard error and the input is skipped.
     *
     * @param filename the input name, or {@code "-"} for standard input
     */
    public void parseFile(String filename) {
        if ( filename.equals("-") ) {
            TypedInputStream stdin = new TypedInputStream(System.in) ;
            parseFile("http://base/", "stdin", stdin) ;
            return ;
        }

        TypedInputStream in ;
        try {
            in = RDFDataMgr.open(filename) ;
        } catch (Exception ex) {
            System.err.println("Can't open '"+filename+"' "+ex.getMessage()) ;
            return ;
        }
        try {
            parseFile(null, filename, in) ;
        } finally {
            // Close here as well: parsing can fail before parseRIOT reaches its own
            // cleanup, which would otherwise leave the stream open.
            IO.close(in) ;
        }
    }

    /**
     * Parse an already opened input, using the base IRI from the command line when
     * one was given and {@code defaultBaseURI} otherwise.
     *
     * @param defaultBaseURI base IRI to fall back on; may be null
     * @param filename       name of the input, used for language selection and reporting
     * @param in             the input stream to parse
     */
    public void parseFile(String defaultBaseURI, String filename, TypedInputStream in) {
        String baseURI = modLangParse.getBaseIRI() ;
        if ( baseURI == null )
            baseURI = defaultBaseURI ;
        parseRIOT(baseURI, filename, in) ;
    }

    /**
     * Decide the syntax of an input.
     *
     * @param filename    name of the input
     * @param contentType media type of the input stream; may be null
     * @param dftLang     language offered as the default
     * @return the language to parse the input with
     */
    protected abstract Lang selectLang(String filename, ContentType contentType, Lang dftLang ) ;

    /**
     * Parse one input into the shared output sink, counting what is produced, and
     * add the count and elapsed time to the running totals. A per-input timing line
     * is printed when timing is enabled. The input stream is always closed.
     *
     * @param baseURI  base IRI, or null to have one chosen from {@code filename}
     * @param filename name of the input, used for language selection and reporting
     * @param in       the input stream to parse
     * @throws CmdException if the selected language is neither a triples nor a quads language
     */
    protected void parseRIOT(String baseURI, String filename, TypedInputStream in) {
        ContentType ct = in.getMediaType() ;

        baseURI = SysRIOT.chooseBaseIRI(baseURI, filename) ;

        // Checking is on unless explicitly switched off.
        boolean checking = true ;
        if ( modLangParse.explicitChecking() )  checking = true ;
        if ( modLangParse.explicitNoChecking() ) checking = false ;

        // Warn and try to carry on by default; only use the standard handler
        // when checking and asked to stop on a bad term.
        ErrorHandler errHandler = ( checking && modLangParse.stopOnBadTerm() )
            ? ErrorHandlerFactory.errorHandlerStd
            : ErrorHandlerFactory.errorHandlerWarn ;

        if ( modLangParse.skipOnBadTerm() ) {
            // TODO skipOnBadterm
        }

        Lang lang = selectLang(filename, ct, RDFLanguages.NQUADS) ;
        LangHandler handler ;
        if ( RDFLanguages.isQuads(lang) )
            handler = langHandlerQuads ;
        else if ( RDFLanguages.isTriples(lang) )
            handler = langHandlerTriples ;
        else
            throw new CmdException("Undefined language: "+lang) ;

        // If multiple files, choose the overall labels:
        // the first handler seen, or "any" once two inputs disagree.
        if ( langHandlerOverall == null )
            langHandlerOverall = handler ;
        else if ( langHandlerOverall != handler )
            langHandlerOverall = langHandlerAny ;

        // Make a flag.
        // Input and output subflags.
        // If input is "label", then output using NodeToLabel.createBNodeByLabelRaw() ;
        // else use NodeToLabel.createBNodeByLabel() ;
        // Also, as URI.
        final boolean labelsAsGiven = false ;

        // NOTE(review): 'labels' is assigned but not used below - placeholder for the flag above.
        NodeToLabel labels = SyntaxLabels.createNodeToLabel() ;
        if ( labelsAsGiven )
            labels = NodeToLabel.createBNodeByLabelEncoded() ;

        // Optionally put inference in front of the output, then count everything sent to it.
        StreamRDF dest = outputStream ;
        if ( setup != null )
            dest = InfFactory.inf(dest, setup) ;
        StreamRDFCounting sink = StreamRDFLib.count(dest) ;

        ReaderRIOT reader = RDFDataMgr.createReader(lang) ;
        try {
            // N-Triples and N-Quads get a different profile from the other languages when checking.
            if ( checking ) {
                if ( lang == RDFLanguages.NTRIPLES || lang == RDFLanguages.NQUADS )
                    reader.setParserProfile(RiotLib.profile(baseURI, false, true, errHandler)) ;
                else
                    reader.setParserProfile(RiotLib.profile(baseURI, true, true, errHandler)) ;
            } else
                reader.setParserProfile(RiotLib.profile(baseURI, false, false, errHandler)) ;

            if ( labelsAsGiven ) {
                FactoryRDF f = RiotLib.factoryRDF(LabelToNode.createUseLabelAsGiven()) ;
                reader.getParserProfile().setFactoryRDF(f);
            }
            modTime.startTimer() ;
            sink.start() ;
            reader.read(in, baseURI, ct, sink, null) ;
            sink.finish() ;
        } catch (RiotException ex) {
            // Should have handled the exception and logged a message by now.
            // When stopping on a bad term, give up on this input without reporting;
            // otherwise fall through and report what was parsed.
            if ( modLangParse.stopOnBadTerm() )
                return ;
        } finally {
            // Not close the output - we may write again to the underlying output stream in another call to parse a file.
            IO.close(in) ;
        }
        long elapsed = modTime.endTimer() ;
        long count = sink.countTriples()+sink.countQuads() ;

        if ( modTime.timingEnabled() )
            output(filename, count, elapsed, handler) ;

        totalMillis += elapsed ;
        totalTuples += count ;
    }

    /**
     * Create a streaming output sink if possible.
     *
     * @return a null sink when output is being discarded; otherwise a streaming
     *         writer onto {@code output}, or null when there is no streaming output
     *         format, in which case the caller falls back to {@link #createAccumulateSink()}
     */
    protected StreamRDF createStreamSink() {
        if ( modLangParse.toBitBucket() )
            return StreamRDFLib.sinkNull() ;

        RDFFormat fmt = modLangOutput.getOutputStreamFormat() ;
        if ( fmt == null )
            return null ;
        return StreamRDFWriter.getWriterStream(output, fmt) ;
    }

    /**
     * Create an accumulating output stream for later pretty printing.
     *
     * @return a sink that collects everything into an in-memory dataset, paired with
     *         the handler that writes the collected data to {@code output} after parsing
     */
    protected Pair<StreamRDF, PostParseHandler> createAccumulateSink() {
        final DatasetGraph dsg = DatasetGraphFactory.create() ;
        StreamRDF sink = StreamRDFLib.dataset(dsg) ;
        final RDFFormat fmt = modLangOutput.getOutputFormatted() ;
        PostParseHandler handler = new PostParseHandler() {
            @Override
            public void postParse() {
                // Try as dataset, then as graph.
                WriterDatasetRIOTFactory w = RDFWriterRegistry.getWriterDatasetFactory(fmt) ;
                if ( w != null ) {
                    RDFDataMgr.write(output, dsg, fmt) ;
                    return ;
                }
                WriterGraphRIOTFactory wg = RDFWriterRegistry.getWriterGraphFactory(fmt) ;
                if ( wg != null ) {
                    // Write to 'output', not System.out, so that compressed output
                    // goes through the GZIP stream like the dataset case above.
                    RDFDataMgr.write(output, dsg.getDefaultGraph(), fmt) ;
                    return ;
                }
                throw new InternalErrorException("failed to find the writer: "+fmt) ;
            }
        } ;
        return Pair.create(sink, handler) ;
    }

    /**
     * Make a UTF-8 tokenizer over an input stream.
     *
     * @param in the bytes to tokenize
     * @return a tokenizer reading {@code in}
     */
    protected Tokenizer makeTokenizer(InputStream in) {
        return TokenizerFactory.makeTokenizerUTF8(in) ;
    }

    /**
     * Print one timing line to standard error: the label, the elapsed seconds, the
     * item count and the rate per second (reported as 0 when no time has elapsed).
     *
     * @param label         what the line describes (an input name, or "Total")
     * @param numberTriples number of items parsed
     * @param timeMillis    elapsed time in milliseconds
     * @param handler       supplies the item and rate names; when null (no input was
     *                      parsed) the generic "tuples" labels are used
     */
    protected void output(String label, long numberTriples, long timeMillis, LangHandler handler) {
        // No input may have reached the parser, leaving the overall handler unset.
        LangHandler names = ( handler != null ) ? handler : langHandlerAny ;
        double timeSec = timeMillis/1000.0 ;
        double rate = ( timeSec == 0 ) ? 0.0 : numberTriples/timeSec ;

        // Flush data output first so the report does not interleave with it.
        System.out.flush() ;
        System.err.printf("%s : %,5.2f sec %,d %s %,.2f %s\n",
                          label,
                          timeSec, numberTriples,
                          names.getItemsName(),
                          rate,
                          names.getRateName()) ;
    }

    /**
     * Print a label-only line to standard error.
     *
     * @param label the text to print
     */
    protected void output(String label) {
        System.err.printf("%s : \n", label) ;
    }
}