/**
* Copyright 2011 Applied Research in Patacriticism and the University of Virginia
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
**/
package org.nines;
import java.io.File;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.net.URL;
import java.net.URLEncoder;
import java.text.SimpleDateFormat;
import java.util.*;
import org.apache.log4j.Logger;
import org.apache.log4j.xml.DOMConfigurator;
import org.nines.RDFIndexerConfig.Mode;
import com.google.gson.Gson;
import com.google.gson.JsonArray;
import com.google.gson.JsonElement;
import com.google.gson.JsonObject;
/**
 * Runs the task selected by {@code config.mode}: full-text spidering, raw or full text
 * cleanup, indexing of RDF files into SOLR, or resolving isPartOf/hasPart references
 * of documents that are already in SOLR.
 *
 * Documents are accumulated in a JSON array and handed to an {@link AsyncPoster} once the
 * serialized payload reaches {@code config.maxUploadSize} characters.
 */
public class RDFIndexer {

    // special field names
    private static final String IS_PART_OF = "isPartOf";
    private static final String HAS_PART = "hasPart";

    /** A commit is requested after every this-many posts. */
    private static final int POSTS_PER_COMMIT = 5;

    /**
     * Number of times the reference resolver tolerates receiving exactly the same page of
     * documents again before it gives up (protects against an endless loop when updates
     * never become visible in SOLR).
     */
    private static final int MAX_STALLED_REFERENCE_PAGES = 3;

    /** Shared converter; it carries no per-document state. */
    private static final Gson GSON = new Gson();

    private int numFiles = 0;
    private int numObjects = 0;
    private int numReferences = 0;
    private long largestTextSize = 0;

    private final RDFIndexerConfig config;
    private final Logger log;
    private Queue<File> dataFileQueue;

    // These stay null when the constructor aborts because the error report could not be opened.
    private ErrorReport errorReport;
    private LinkCollector linkCollector;
    private SolrClient solrClient;
    private AsyncPoster asyncPoster;

    /** Documents waiting to be posted to SOLR. */
    private JsonArray jsonPayload = new JsonArray();

    /**
     * Running total of the serialized length of every pending document plus one separator
     * character each. Adding 1 gives the length of {@code jsonPayload.toString()} for a
     * non-empty payload without having to re-serialize the whole array on every check.
     */
    private long payloadChars = 0;

    private int postCount = 0;

    /** Day stamp (yyyy-MM-dd) written into date_created / date_updated of every document. */
    private final String timeStamp = new SimpleDateFormat("yyyy-MM-dd").format(new Date());

    /**
     * Configures logging, opens the error report and creates the SOLR helpers.
     *
     * If the error report cannot be opened the failure is logged and construction stops
     * early; {@link #execute()} then refuses to run.
     *
     * @param config settings for this indexer run
     */
    public RDFIndexer(RDFIndexerConfig config) {
        this.config = config;
        String logFileRoot = this.config.getLogfileBaseName("");

        // setup logger
        String indexLog = this.config.getLogfileBaseName("progress") + "_progress.log";
        System.setProperty("index.log.file", indexLog);
        URL url = ClassLoader.getSystemResource("log4j-index.xml");
        DOMConfigurator.configure(url);
        this.log = Logger.getLogger(RDFIndexer.class.getName());

        // keep report file in the same folder as the log file.
        String logName;
        if (this.config.mode.equals(Mode.INDEX) || this.config.mode.equals(Mode.TEST)) {
            logName = logFileRoot + "_error.log";
        } else {
            // fixed locale so the file name does not depend on the platform default locale
            logName = logFileRoot + "_" + this.config.mode.toString().toLowerCase(Locale.ENGLISH) + "_error.log";
        }

        try {
            this.errorReport = new ErrorReport(new File(logName));
        } catch (IOException e1) {
            this.log.error("Unable to open error report log for writing, aborting indexer.", e1);
            return;
        }

        this.linkCollector = new LinkCollector(this.config.getLogfileBaseName("links"));
        this.solrClient = new SolrClient(this.config.solrBaseURL);
        this.asyncPoster = new AsyncPoster(1);
    }

    /**
     * Execute the configured indexing task, then shut down the poster and close the
     * error report and link collector (also when the task throws).
     */
    public void execute() {
        if (this.errorReport == null) {
            // the constructor aborted before the collaborators were created
            this.log.error("Indexer was not initialized (no error report); nothing to execute.");
            return;
        }

        try {
            // There is only something else to do if a MODE was configured
            if (!config.mode.equals(Mode.NONE)) {
                runConfiguredMode();
            }
        } finally {
            this.asyncPoster.shutdown();
            this.errorReport.close();
            this.linkCollector.close();
        }
    }

    /** Validates the core, optionally purges it, then dispatches on the configured mode. */
    private void runConfiguredMode() {
        // first, ensure that core is valid and exists; a failure is recorded but does not
        // stop the run
        try {
            this.solrClient.validateCore(config.coreName());
        } catch (IOException e) {
            this.errorReport.addError(new IndexerError("Validate core", "", e.getMessage()));
        }

        // if a purge was requested, it must be done FIRST
        if (config.deleteAll) {
            purgeArchive(config.coreName());
        }

        // execute based on mode setting
        if (config.mode.equals(Mode.SPIDER)) {
            this.log.info("Full Text Spider Mode");
            doSpidering();
        } else if (config.mode.equals(Mode.CLEAN_RAW)) {
            this.log.info("Raw Text Cleanup Mode");
            doRawTextCleanup();
        } else if (config.mode.equals(Mode.CLEAN_FULL)) {
            this.log.info("Full Text Cleanup Mode");
            doFullTextCleanup();
        } else if (config.mode.equals(Mode.INDEX)) {
            this.log.info("Index Mode");
            doIndexing();
        } else if (config.mode.equals(Mode.RESOLVE)) {
            this.log.info("Resolve Mode");
            doResolving();
        } else {
            this.log.info("*** TEST MODE: Not committing changes to SOLR");
            doIndexing();
        }
    }

    /**
     * Logs {@code summary} followed by the time elapsed since {@code start}, in minutes
     * when it took a minute or longer and in seconds otherwise.
     */
    private void logElapsed(String summary, Date start) {
        double durationSec = (System.currentTimeMillis() - start.getTime()) / 1000.0;
        if (durationSec >= 60) {
            this.log.info(String.format("%s in %3.2f minutes.", summary, (durationSec / 60.0)));
        } else {
            this.log.info(String.format("%s in %3.2f seconds.", summary, durationSec));
        }
    }

    /**
     * Resets the work queue and fills it with every file found below
     * {@code <sourceDir>/<safe archive name>}.
     *
     * @return the number of files queued
     */
    private int queueArchiveFiles() {
        this.dataFileQueue = new LinkedList<File>();
        String archivePath = config.sourceDir.toString() + "/" + RDFIndexerConfig.safeArchive(config.archiveName);
        recursivelyQueueFiles(new File(archivePath), false);
        return this.dataFileQueue.size();
    }

    /** Runs the {@link FullTextCleaner} over every file of the archive and logs statistics. */
    private void doFullTextCleanup() {
        Date start = new Date();
        this.log.info("Started full text cleanup at " + start);

        int totalFiles = queueArchiveFiles();
        FullTextCleaner cleaner = new FullTextCleaner(config.archiveName, this.errorReport,
            config.customCleanClass);

        File txtFile;
        while ((txtFile = this.dataFileQueue.poll()) != null) {
            cleaner.clean(txtFile);
            this.errorReport.flush();
        }

        String stats = "Cleaned " + totalFiles + " files (Original Size: " + cleaner.getOriginalLength()
            + ", Cleaned Size: " + cleaner.getCleanedLength() + ", Total Files Cleaned: "
            + cleaner.getTotalFilesChanged() + ")";
        logElapsed(stats, start);
    }

    /** Runs the {@link RawTextCleaner} over every file of the archive and logs statistics. */
    private void doRawTextCleanup() {
        Date start = new Date();
        log.info("Started raw text cleanup at " + start);

        int totalFiles = queueArchiveFiles();
        RawTextCleaner cleaner = new RawTextCleaner(config, this.errorReport);

        File rawFile;
        while ((rawFile = this.dataFileQueue.poll()) != null) {
            cleaner.clean(rawFile);
            this.errorReport.flush();
        }

        String stats = "Cleaned " + totalFiles + " files (Original Size: " + cleaner.getOriginalLength()
            + ", Cleaned Size: " + cleaner.getCleanedLength() + ", Total Files Cleaned: "
            + cleaner.getTotalFilesChanged() + ")";
        logElapsed(stats, start);
    }

    /**
     * Find the full path to the corrected text root based on the path to the original
     * rdf sources: everything from the first "/rdf/" onwards is replaced by
     * "/correctedtext/<safe archive name>/".
     *
     * When the source path has no "/rdf/" segment (and does not end in "/rdf") the
     * corrected text folder is looked up below the source path itself instead of failing.
     *
     * @return path of the corrected text folder, with a trailing slash
     */
    private String findCorrectedTextRoot() {
        final String marker = "/rdf";
        String path = config.sourceDir.toString();

        int pos = path.indexOf(marker + "/");
        if (pos < 0 && path.endsWith(marker)) {
            // File strips trailing separators, so the source dir may end right at ".../rdf"
            pos = path.length() - marker.length();
        }

        String base;
        if (pos >= 0) {
            base = path.substring(0, pos);
        } else {
            log.warn("Source path " + path + " has no /rdf/ segment; looking for corrected text below it");
            base = path;
        }
        return base + "/correctedtext/" + RDFIndexerConfig.safeArchive(config.archiveName) + "/";
    }

    /** Indexes the source directory and logs file/object counts and the largest text size. */
    private void doIndexing() {
        Date start = new Date();
        log.info("Started indexing at " + start);
        System.out.println("Indexing " + config.sourceDir);
        indexDirectory(config.sourceDir);
        System.out.println("Indexing DONE");

        // report indexing stats
        logElapsed("Indexed " + numFiles + " files (" + numObjects + " objects)", start);
        this.log.info("Largest text field size: " + this.largestTextSize);
    }

    /** Resolves isPartOf/hasPart references of already indexed documents and logs the count. */
    private void doResolving() {
        Date start = new Date();
        log.info("Started resolving at " + start);
        System.out.println("Started resolving at " + start);
        updateReferenceFields();
        System.out.println("Resolving DONE");

        // report resolving stats
        logElapsed("Resolved/updated " + numReferences + " references", start);
    }

    /** Spiders full text for the source directory and logs the number of files handled. */
    private void doSpidering() {
        Date start = new Date();
        log.info("Started full-text spider at " + start);
        System.out.println("Full-text spider of " + config.sourceDir);
        spiderDirectory(config.sourceDir);
        System.out.println("DONE");

        // report spidering stats
        logElapsed("Spidered " + numFiles + " files", start);
    }

    /**
     * Posts a delete-everything request (followed by a commit) to the given core.
     * A failed POST is recorded in the error report.
     *
     * @param coreName core to empty
     */
    private void purgeArchive(final String coreName) {
        log.info("Deleting all data from: " + coreName);
        try {
            this.solrClient.postJSON("{\"delete\": { \"query\": \"*:*\"}, \"commit\": {}}", coreName);
        } catch (IOException e) {
            errorReport.addError(new IndexerError("", "", "Unable to POST DELETE message to SOLR. "
                + e.getLocalizedMessage()));
        }
    }

    /**
     * Adds {@code dir} (when it is not a directory) or every file below it to the work
     * queue. Entries whose name ends in ".svn" or ".git" are skipped. Directories are
     * descended into but never queued themselves.
     *
     * @param dir     file or directory to queue
     * @param rdfMode when true only files ending in ".rdf" or ".xml" are queued
     */
    private void recursivelyQueueFiles(final File dir, final boolean rdfMode) {
        if (!dir.isDirectory()) {
            // a file was passed in, not a folder
            this.log.info("loading file: " + dir.getPath());
            this.dataFileQueue.add(dir);
            return;
        }

        log.info("loading directory: " + dir.getPath());
        File[] fileList = dir.listFiles();
        if (fileList == null) {
            // listFiles() yields null when the directory cannot be read
            log.warn("Unable to list contents of directory: " + dir.getPath());
            return;
        }

        for (File entry : fileList) {
            String name = entry.getName();
            if (name.endsWith(".svn") || name.endsWith(".git")) {
                log.info("Skipping source control directory");
                continue;
            }
            if (entry.isDirectory()) {
                recursivelyQueueFiles(entry, rdfMode);
                continue;
            }
            if (!rdfMode || name.endsWith(".rdf") || name.endsWith(".xml")) {
                this.dataFileQueue.add(entry);
            }
        }
    }

    /**
     * Run through all rdf files in the directory and harvest full text
     * from remote sites. Stops early when the thread is interrupted.
     *
     * @param rdfDir directory (or single file) holding the RDF sources
     */
    private void spiderDirectory(final File rdfDir) {
        this.dataFileQueue = new LinkedList<File>();
        recursivelyQueueFiles(rdfDir, true);
        this.numFiles = this.dataFileQueue.size();
        log.info("=> Spider text for " + rdfDir + " total files: " + this.numFiles);

        RdfTextSpider spider = new RdfTextSpider(config, this.errorReport);
        File rdfFile;
        while ((rdfFile = this.dataFileQueue.poll()) != null) {
            this.log.info("Spider text from file " + rdfFile.toString());
            spider.spider(rdfFile);
            this.errorReport.flush();

            // short pause between files
            try {
                Thread.sleep(10);
            } catch (InterruptedException e) {
                // keep the interrupt visible to callers and stop harvesting
                Thread.currentThread().interrupt();
                this.log.warn("Interrupted while spidering; " + this.dataFileQueue.size()
                    + " files were not processed");
                break;
            }
        }
    }

    /**
     * Caches the corrected text files of this archive in {@code config.correctedTextMap},
     * keyed by document URI.
     *
     * The file names are URIs with ugly characters replaced: '/' is stored as _S_ and ':'
     * as _C_, followed by a ".txt" suffix. This undoes the encoding.
     */
    private void loadCorrectedTexts() {
        final String suffix = ".txt";
        config.correctedTextDir = new File(findCorrectedTextRoot());
        if (!config.correctedTextDir.exists()) {
            return;
        }

        File[] entries = config.correctedTextDir.listFiles();
        if (entries == null) {
            log.warn("Unable to list corrected texts in " + config.correctedTextDir.getPath());
            return;
        }

        for (File entry : entries) {
            String fileName = entry.getName();
            if (fileName.endsWith(suffix)) {
                // strip only the trailing extension, then decode with literal (non-regex) replaces
                String encodedUri = fileName.substring(0, fileName.length() - suffix.length());
                String docUri = encodedUri.replace("_C_", ":").replace("_S_", "/");
                config.correctedTextMap.put(docUri, fileName);
            }
        }
    }

    /**
     * Run through all RDF files in the directory and write them
     * to a solr archive. Outside of test mode the remaining data is flushed and committed,
     * and references are resolved when any object was indexed and this is not a pages archive.
     *
     * @param rdfDir directory (or single file) holding the RDF sources
     */
    private void indexDirectory(File rdfDir) {
        // see if corrected texts exist and remember them
        loadCorrectedTexts();

        this.dataFileQueue = new LinkedList<File>();
        recursivelyQueueFiles(rdfDir, true);
        this.numFiles = this.dataFileQueue.size();
        log.info("=> Indexing " + rdfDir + " total files: " + this.numFiles);

        File rdfFile;
        while ((rdfFile = this.dataFileQueue.poll()) != null) {
            indexFile(rdfFile);
        }

        if (config.isTestMode()) {
            return;
        }

        // flush any remaining data
        flush();

        // commit the changes and wait for all the workers to complete
        this.asyncPoster.asyncCommit(this.solrClient, config.coreName());
        this.asyncPoster.waitForPending();

        // if we actually processed any documents, process any isPartOf or hasPart references
        if (this.numObjects != 0 && !this.config.isPagesArchive()) {
            updateReferenceFields();
        }
    }

    /**
     * Parses one RDF file, validates every object found in it (recording problems in the
     * error report) and, outside of test mode, queues the objects for posting to SOLR.
     *
     * @param file RDF file to index
     */
    private void indexFile(File file) {
        // Parse a file into a hashmap.
        // Key is object URI, Value is a set of key-value pairs
        // that describe the object
        HashMap<String, HashMap<String, ArrayList<String>>> objects;
        try {
            objects = RdfDocumentParser.parse(file, this.errorReport, this.linkCollector, config);
        } catch (IOException e) {
            this.errorReport.addError(new IndexerError(file.getName(), "", e.getMessage()));
            return;
        }

        // Log an error for no objects and bail if size is zero
        if (objects == null || objects.size() == 0) {
            errorReport.addError(new IndexerError(file.getName(), "", "No objects in this file."));
            errorReport.flush();
            return;
        }

        // save the largest text field size
        this.largestTextSize = Math.max(this.largestTextSize, RdfDocumentParser.getLargestTextSize());

        final boolean testMode = config.isTestMode();
        for (Map.Entry<String, HashMap<String, ArrayList<String>>> entry : objects.entrySet()) {
            String uri = entry.getKey();
            HashMap<String, ArrayList<String>> object = entry.getValue();

            // Validate archive
            ArrayList<String> archives = object.get("archive");
            if (archives != null && !archives.isEmpty()) {
                String objArchive = archives.get(0);
                if (!objArchive.equals(config.archiveName)) {
                    this.errorReport.addError(new IndexerError(file.getName(), uri, "The wrong archive was found. "
                        + objArchive + " should be " + config.archiveName));
                }
            } else {
                this.errorReport.addError(new IndexerError(file.getName(), uri,
                    "Unable to determine archive for this object."));
            }

            // validate all other parts of object and generate error report
            try {
                ArrayList<String> messages = ValidationUtility.validateObject(this.config.isPagesArchive(), object);
                for (String message : messages) {
                    errorReport.addError(new IndexerError(file.getName(), uri, message));
                }
            } catch (Exception valEx) {
                String problem = "ERROR Validating file:" + file.getName() + " URI: " + uri;
                System.err.println(problem);
                this.log.error(problem, valEx);
                errorReport.addError(new IndexerError(file.getName(), uri, valEx.getMessage()));
            }

            // turn this object into a JSON document; the conversion is exercised in test
            // mode too, but the document is only queued when it will actually be posted,
            // otherwise the payload would grow for the whole run without ever being flushed
            JsonElement jsonDoc = docToJson(uri, object);
            if (!testMode) {
                addToPayload(jsonDoc);
                flushIfEnough();
            }
        }

        this.numObjects += objects.size();
        this.errorReport.flush();
    }

    /**
     * Update the references for any isPartOf or hasPart fields.
     *
     * Repeatedly fetches the first page of documents whose isPartOf or hasPart value
     * starts with "http", resolves them, then posts and commits the result, until no such
     * documents remain. Gives up when the very same page keeps coming back, which means
     * the updates are not becoming visible in SOLR.
     */
    private void updateReferenceFields() {
        int size = config.pageSize;
        String fl = config.getFieldList();
        String coreName = config.coreName();

        List<String> orList = new ArrayList<String>();
        orList.add(IS_PART_OF + "=http*");
        orList.add(HAS_PART + "=http*");

        Set<String> previousPage = Collections.<String>emptySet();
        int stalledPages = 0;

        while (true) {
            List<JsonObject> results = this.solrClient.getResultsPage(coreName, config.archiveName, 0, size, fl, null, orList);
            if (results.isEmpty()) {
                log.info("No more references to resolve");
                break;
            }
            log.info("Got " + results.size() + " references to resolve");

            // Resolved documents no longer match the query, so a healthy run never sees
            // the identical page twice.
            Set<String> currentPage = new HashSet<String>();
            for (JsonObject json : results) {
                currentPage.add(json.get("uri").getAsString());
            }
            if (currentPage.equals(previousPage)) {
                stalledPages++;
                if (stalledPages >= MAX_STALLED_REFERENCE_PAGES) {
                    String problem = "Reference resolution is not making progress; giving up with "
                        + results.size() + " documents still unresolved on the current page";
                    log.error(problem);
                    errorReport.addError(new IndexerError("", "", problem));
                    break;
                }
            } else {
                stalledPages = 0;
            }
            previousPage = currentPage;

            for (JsonObject json : results) {
                log.info("Resolving references for " + json.get("uri").getAsString());
                updateDocumentReferences(json);
                this.numReferences++;
            }

            // flush any data and wait for completion...
            flush();

            // commit the changes and wait for all the workers to complete
            this.asyncPoster.asyncCommit(this.solrClient, coreName);
            this.asyncPoster.waitForPending();
        }
    }

    /**
     * Resolve the isPartOf and hasPart references for the specified document and queue
     * the document for posting when either field was present.
     *
     * @param json document as returned by SOLR; it is modified in place
     */
    private void updateDocumentReferences(final JsonObject json) {
        String uri = json.get("uri").getAsString();
        try {
            // both fields are always processed
            boolean partOfUpdated = resolveReferenceField(json, IS_PART_OF, uri);
            boolean hasPartUpdated = resolveReferenceField(json, HAS_PART, uri);

            if (partOfUpdated || hasPartUpdated) {
                addToPayload(json);
                flushIfEnough();
            }
        } catch (UnsupportedEncodingException ex) {
            // UTF-8 is always available, so this should never happen
            log.error("Unable to encode reference query for document " + uri, ex);
        }
    }

    /**
     * Replaces the URIs held in one reference field by the documents they point to.
     *
     * Every URI is looked up in SOLR; found documents are trimmed with
     * {@link #removeExcessFields(JsonObject)}, unresolvable ones are noted in the error
     * report. The field is then removed and, when at least one reference was resolved,
     * re-added as the JSON text of the resolved documents.
     *
     * @param json      document to update in place
     * @param fieldName reference field to resolve
     * @param uri       URI of {@code json}, used in error messages
     * @return true when the document had the field (and was therefore modified)
     * @throws UnsupportedEncodingException if UTF-8 encoding of the lookup query fails
     */
    private boolean resolveReferenceField(JsonObject json, String fieldName, String uri)
        throws UnsupportedEncodingException {
        if (!json.has(fieldName)) {
            return false;
        }

        String fl = config.getFieldList();
        String coreName = config.coreName();
        JsonArray refs = json.getAsJsonArray(fieldName);
        JsonArray resolved = new JsonArray();

        for (int ix = 0; ix < refs.size(); ix++) {
            String ref = refs.get(ix).getAsString();
            List<String> andList = new ArrayList<String>();
            andList.add("uri=" + URLEncoder.encode("\"" + ref + "\"", "UTF-8"));

            List<JsonObject> results = this.solrClient.getResultsPage(coreName, config.archiveName, 0, 1, fl, andList, null);
            if (!results.isEmpty()) {
                resolved.add(removeExcessFields(results.get(0)));
            } else {
                // reference to a non-existent object, note in the error log
                errorReport.addError(new IndexerError("", uri, "Cannot resolve " + fieldName + " reference ("
                    + ref + ") for document " + uri));
            }
        }

        // remove the field; it is only put back when some reference was resolved
        json.remove(fieldName);
        if (resolved.size() != 0) {
            json.addProperty(fieldName, resolved.toString());
        }
        return true;
    }

    /**
     * Remove the fields we do not want for reference documents.
     *
     * @param json document to trim in place
     * @return the same (now trimmed) object
     */
    private JsonObject removeExcessFields(JsonObject json) {
        final String[] unwanted = {
            IS_PART_OF, HAS_PART, "text", "_version_", "year_sort_desc", "federation", "year",
            "decade", "year_sort", "year_sort_asc", "title_sort", "author_sort", "date_created",
            "date_updated", "century", "half_century", "quarter_century"
        };
        for (String field : unwanted) {
            json.remove(field);
        }
        return json;
    }

    /**
     * Converts the parsed fields of one object into a JSON document and stamps it with
     * the creation/update date of this run.
     *
     * @param documentName URI of the object (currently unused)
     * @param fields       field name to values
     * @return the JSON document
     */
    private JsonElement docToJson(String documentName, HashMap<String, ArrayList<String>> fields) {
        JsonObject obj = GSON.toJsonTree(fields).getAsJsonObject();
        obj.addProperty("date_created", this.timeStamp);
        obj.addProperty("date_updated", this.timeStamp);
        return obj;
    }

    /** Queues a document for posting and keeps the running payload size up to date. */
    private void addToPayload(JsonElement doc) {
        this.jsonPayload.add(doc);
        // one extra character per document: the separating comma or, for the first
        // document, the opening bracket
        this.payloadChars += doc.toString().length() + 1;
    }

    /** Posts the pending documents once their serialized size reaches the upload limit. */
    private void flushIfEnough() {
        // +1 for the closing bracket of the serialized array
        if (this.payloadChars + 1 >= config.maxUploadSize) {
            flushPending();
        }
    }

    /** Posts the pending documents, if there are any. */
    private void flush() {
        if (this.jsonPayload.size() > 0) {
            flushPending();
        }
    }

    /** Flush pending data to SOLR and request a commit after every few posts. */
    private void flushPending() {
        this.asyncPoster.asyncPost(this.solrClient, config.coreName(), this.jsonPayload.toString());
        this.jsonPayload = new JsonArray();
        this.payloadChars = 0;
        this.postCount++;
        if (postCount % POSTS_PER_COMMIT == 0) {
            this.asyncPoster.asyncCommit(this.solrClient, config.coreName());
        }
    }
}