/**
* Copyright 2011 Applied Research in Patacriticism and the University of Virginia
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
**/
package org.nines;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.StringReader;
import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CodingErrorAction;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.Locale;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang.StringEscapeUtils;
import org.apache.log4j.Logger;
import org.openrdf.rio.ParseErrorListener;
import org.openrdf.rio.RDFHandlerException;
import org.openrdf.rio.RDFParseException;
import org.openrdf.rio.rdfxml.RDFXMLParser;
/**
 * Reads an RDF/XML file, feeds it through a {@link NinesStatementHandler} and
 * returns the documents the handler collected, after rewriting each document's
 * {@code tag} values into an archive-specific tag field.
 *
 * <p>Problems found while reading or parsing are added to the supplied
 * {@link ErrorReport} rather than thrown.
 *
 * <p>The "largest text size" of the most recent parse is kept in a static field
 * that is written without synchronization, so concurrent calls to
 * {@link #parse} will overwrite each other's value.
 */
public class RdfDocumentParser {

    /**
     * Marker searched for in the XML-unescaped file content. Text that still
     * contains it after unescaping is reported as a potentially invalid escape
     * sequence.
     *
     * <p>Must not be empty: {@code String.indexOf} always finds an empty needle,
     * so the scan in {@link #validateContent} would never terminate.
     */
    private static final String SUSPECT_ESCAPE_PREFIX = "&#";

    /** Number of characters of context reported on each side of a suspect position. */
    private static final int SNIPPET_RADIUS = 25;

    private static long largestTextSize = 0;

    public final static Logger log = Logger.getLogger(RdfDocumentParser.class.getName());

    /**
     * @return the value the statement handler reported via
     *         {@code getLargestTextSize()} at the end of the most recent
     *         {@link #parse} call; 0 before any call or while one is starting
     */
    public static long getLargestTextSize() {
        return largestTextSize;
    }

    /**
     * Parses {@code file} as RDF/XML and returns the documents collected by the
     * statement handler.
     *
     * <p>For every returned document that has a {@code tag} field, the tags are
     * lower-cased, spaces are replaced by dashes, the {@code tag} field is
     * removed, and the normalized tags are stored under
     * {@code <archive>_tag}; {@code username} is set to the archive name.
     *
     * @param file          RDF/XML file to read (decoded as UTF-8)
     * @param errorReport   receives every validation, parse and handler error
     * @param linkCollector passed through to the statement handler
     * @param config        passed through to the statement handler; its
     *                      {@code isPagesArchive()} value selects which documents
     *                      the handler returns
     * @return the map produced by {@code statementHandler.getDocuments(...)},
     *         with tag fields rewritten as described above
     * @throws IOException declared for interface compatibility; I/O failures
     *                     raised while reading or parsing are caught and
     *                     recorded in {@code errorReport}
     */
    public static HashMap<String, HashMap<String, ArrayList<String>>> parse(final File file, ErrorReport errorReport,
            LinkCollector linkCollector, RDFIndexerConfig config) throws IOException {

        largestTextSize = 0;

        RDFXMLParser parser = new RDFXMLParser();
        NinesStatementHandler statementHandler = new NinesStatementHandler(errorReport, linkCollector, config);
        statementHandler.setFile(file);
        parser.setRDFHandler(statementHandler);
        parser.setParseErrorListener(new ParseListener(file, errorReport));
        parser.setVerifyData(true);
        // keep going after an error so that every problem in the file is reported
        parser.setStopAtFirstError(false);

        // parse file
        try {
            String content = validateContent(file, errorReport);
            parser.parse(new StringReader(content), "http://foo/" + file.getName());
        } catch (RDFParseException e) {
            errorReport.addError(new IndexerError(file.getName(), "", "Parse Error on Line " + e.getLineNumber() + ": "
                    + e.getMessage()));
        } catch (RDFHandlerException e) {
            errorReport.addError(new IndexerError(file.getName(), "", "StatementHandler Exception: " + e.getMessage()));
        } catch (Exception e) {
            errorReport.addError(new IndexerError(file.getName(), "", "RDF Parser Error: " + e.getMessage()));
            // unexpected failure: keep the stack trace, but route it through the class logger
            log.error("RDF Parser Error in " + file.getName(), e);
        }

        // retrieve parsed data
        HashMap<String, HashMap<String, ArrayList<String>>> docHash =
                statementHandler.getDocuments(config.isPagesArchive());

        normalizeTags(docHash.values(), file, errorReport);

        largestTextSize = statementHandler.getLargestTextSize();
        return docHash;
    }

    /**
     * Moves each document's {@code tag} values to {@code <archive>_tag} after
     * normalizing them (lower case, spaces replaced by dashes) and sets
     * {@code username} to the archive name.
     *
     * <p>A document that has tags but no {@code archive} value cannot be given an
     * archive-specific tag field; this is recorded in {@code errorReport} and
     * the document is left without tags instead of aborting the whole parse.
     */
    private static void normalizeTags(Collection<HashMap<String, ArrayList<String>>> documents, File file,
            ErrorReport errorReport) {
        for (HashMap<String, ArrayList<String>> document : documents) {
            ArrayList<String> tags = document.remove("tag");
            if (tags == null) {
                continue;
            }

            // normalize tags, replace spaces with dashes, lowercase
            for (int i = 0; i < tags.size(); i++) {
                // Locale.ROOT keeps the result independent of the JVM's default locale
                String tag = tags.get(i).toLowerCase(Locale.ROOT);
                tags.set(i, tag.replace(' ', '-'));
            }

            ArrayList<String> archives = document.get("archive");
            if (archives == null || archives.isEmpty()) {
                errorReport.addError(new IndexerError(file.getName(), "",
                        "Document has tags but no archive; tags were dropped"));
                continue;
            }

            // username is archive name
            String archive = archives.get(0);
            ArrayList<String> nameList = new ArrayList<String>();
            nameList.add(archive);
            document.put("username", nameList);
            document.put(archive + "_tag", tags);
        }
    }

    /**
     * Reads {@code file} as UTF-8 text and reports suspicious escape sequences.
     *
     * <p>Malformed or unmappable bytes are replaced rather than rejected. The
     * content is then XML-unescaped and every remaining occurrence of
     * {@link #SUSPECT_ESCAPE_PREFIX} is added to {@code errorReport} together
     * with its position and a short surrounding snippet. Positions refer to the
     * unescaped text, not to the raw file.
     *
     * @return the decoded (not unescaped) file content, or an empty string if the
     *         file could not be read; the read failure is recorded in
     *         {@code errorReport}
     */
    private static String validateContent(File file, ErrorReport errorReport) {
        InputStreamReader is = null;
        try {
            // forName avoids building the map of every installed charset on each call
            CharsetDecoder decoder = Charset.forName("UTF-8").newDecoder();
            decoder.onMalformedInput(CodingErrorAction.REPLACE);
            decoder.onUnmappableCharacter(CodingErrorAction.REPLACE);
            is = new InputStreamReader(new FileInputStream(file), decoder);
            String content = IOUtils.toString(is);

            // look for unescaped sequences and flag them as trouble
            String unescaped = StringEscapeUtils.unescapeXml(content);
            int pos = unescaped.indexOf(SUSPECT_ESCAPE_PREFIX);
            while (pos > -1) {
                String snip = unescaped.substring(Math.max(0, pos - SNIPPET_RADIUS),
                        Math.min(unescaped.length(), pos + SNIPPET_RADIUS));
                IndexerError e = new IndexerError(file.getName(), "", "Potentially Invalid Escape sequence.\n   Position: [" +
                        pos + "]\n   Snippet: [" +
                        snip + "]");
                errorReport.addError(e);
                // resume just past this match; the needle is non-empty, so the scan always ends
                pos = unescaped.indexOf(SUSPECT_ESCAPE_PREFIX, pos + SUSPECT_ESCAPE_PREFIX.length());
            }
            return content;

        } catch (IOException e) {
            errorReport.addError(new IndexerError(file.getName(), "", "Error validating content: " + e.getMessage()));
        } finally {
            IOUtils.closeQuietly(is);
        }
        return "";
    }

    /**
     * Forwards warnings, errors and fatal errors raised by the RDF parser to the
     * {@link ErrorReport}, tagged with the name of the file being parsed.
     */
    private static final class ParseListener implements ParseErrorListener {

        private final ErrorReport errorReport;
        private final File file;

        ParseListener(File file, ErrorReport errorReport) {
            this.errorReport = errorReport;
            this.file = file;
        }

        public void warning(String msg, int lineNo, int colNo) {
            report("Parse warning", msg, lineNo, colNo);
        }

        public void error(String msg, int lineNo, int colNo) {
            report("Parse error", msg, lineNo, colNo);
        }

        public void fatalError(String msg, int lineNo, int colNo) {
            report("FATAL PARSE ERROR", msg, lineNo, colNo);
        }

        /** Adds one entry of the form {@code <label> at line L, col C : <msg>}. */
        private void report(String label, String msg, int lineNo, int colNo) {
            this.errorReport.addError(new IndexerError(file.getName(), "",
                    label + " at line " + lineNo + ", col " + colNo + " : " + msg));
        }
    }
}