/** * Copyright 2011 Applied Research in Patacriticism and the University of Virginia * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. **/ package org.nines; import java.io.File; import java.io.IOException; import java.io.UnsupportedEncodingException; import java.net.URL; import java.net.URLEncoder; import java.text.SimpleDateFormat; import java.util.*; import org.apache.log4j.Logger; import org.apache.log4j.xml.DOMConfigurator; import org.nines.RDFIndexerConfig.Mode; import com.google.gson.Gson; import com.google.gson.JsonArray; import com.google.gson.JsonElement; import com.google.gson.JsonObject; public class RDFIndexer { private int numFiles = 0; private int numObjects = 0; private int numReferences = 0; private long largestTextSize = 0; private RDFIndexerConfig config; private Queue<File> dataFileQueue; private ErrorReport errorReport; private LinkCollector linkCollector; private Logger log; private AsyncPoster asyncPoster; private JsonArray jsonPayload = new JsonArray(); private int postCount = 0; private SolrClient solrClient; private Date ts = new Date(); private SimpleDateFormat ts2 = new SimpleDateFormat("yyyy-MM-dd"); private String timeStamp = new String(ts2.format(ts)); // special field names private final String isPartOf = "isPartOf"; private final String hasPart = "hasPart"; /** * * @param config * @param config */ public RDFIndexer(RDFIndexerConfig config) { this.config = config; String logFileRoot = this.config.getLogfileBaseName(""); // setup logger String indexLog = this.config.getLogfileBaseName("progress") + "_progress.log"; System.setProperty("index.log.file", indexLog); URL url = ClassLoader.getSystemResource("log4j-index.xml"); DOMConfigurator.configure(url); this.log = Logger.getLogger(RDFIndexer.class.getName()); // keep report file in the same folder as the log file. String logName; if (this.config.mode.equals(Mode.INDEX) || this.config.mode.equals(Mode.TEST)) { logName = logFileRoot + "_error.log"; } else { logName = logFileRoot + "_" + this.config.mode.toString().toLowerCase() + "_error.log"; } File reportFile = new File(logName); try { this.errorReport = new ErrorReport(reportFile); } catch (IOException e1) { this.log.error("Unable to open error report log for writing, aborting indexer."); return; } this.linkCollector = new LinkCollector(this.config.getLogfileBaseName("links")); this.solrClient = new SolrClient(this.config.solrBaseURL); this.asyncPoster = new AsyncPoster( 1 ); } /** * Execute the configured indexing task */ public void execute() { // There is only something else to do if a MODE was configured if (config.mode.equals(Mode.NONE) == false) { // first, ensure that core is valid and exists try { this.solrClient.validateCore( config.coreName( ) ); } catch (IOException e) { this.errorReport.addError(new IndexerError("Validate core", "", e.getMessage())); } // if a purge was requested, it must be done FIRST if (config.deleteAll) { purgeArchive( config.coreName() ); } // execute based on mode setting if (config.mode.equals(Mode.SPIDER)) { this.log.info("Full Text Spider Mode"); doSpidering(); } else if (config.mode.equals(Mode.CLEAN_RAW)) { this.log.info("Raw Text Cleanup Mode"); doRawTextCleanup(); } else if (config.mode.equals(Mode.CLEAN_FULL)) { this.log.info("Full Text Cleanup Mode"); doFullTextCleanup(); } else if (config.mode.equals(Mode.INDEX)) { this.log.info("Index Mode"); doIndexing(); } else if (config.mode.equals(Mode.RESOLVE)) { this.log.info("Resolve Mode"); doResolving(); } else { this.log.info("*** TEST MODE: Not committing changes to SOLR"); doIndexing(); } } this.asyncPoster.shutdown( ); this.errorReport.close( ); this.linkCollector.close( ); } private void doFullTextCleanup() { Date start = new Date(); this.log.info("Started raw text cleanup at " + start); this.dataFileQueue = new LinkedList<File>(); String fullPath = config.sourceDir.toString() + "/" + RDFIndexerConfig.safeArchive( config.archiveName ); recursivelyQueueFiles(new File(fullPath), false); int totalFiles = this.dataFileQueue.size(); FullTextCleaner cleaner = new FullTextCleaner(config.archiveName, this.errorReport, config.customCleanClass); while (this.dataFileQueue.size() > 0) { File txtFile = this.dataFileQueue.remove(); cleaner.clean(txtFile); this.errorReport.flush(); } String stats = "Cleaned " + totalFiles + " files (Original Size: " + cleaner.getOriginalLength() + ", Cleaned Size: " + cleaner.getCleanedLength() + ", Total Files Cleaned: " + cleaner.getTotalFilesChanged() + ")"; Date end = new Date(); double durationSec = (end.getTime() - start.getTime()) / 1000.0; if (durationSec >= 60) { this.log.info(String.format("%s in %3.2f minutes.", stats, (durationSec / 60.0))); } else { this.log.info(String.format("%s in %3.2f seconds.", stats, durationSec)); } } private void doRawTextCleanup() { Date start = new Date(); log.info("Started raw text cleanup at " + start); this.dataFileQueue = new LinkedList<File>(); String rawPath = config.sourceDir.toString() + "/" + RDFIndexerConfig.safeArchive( config.archiveName ); recursivelyQueueFiles(new File(rawPath), false); int totalFiles = this.dataFileQueue.size(); RawTextCleaner cleaner = new RawTextCleaner(config, this.errorReport); while (this.dataFileQueue.size() > 0) { File rawFile = this.dataFileQueue.remove(); cleaner.clean(rawFile); this.errorReport.flush(); } String stats = "Cleaned " + totalFiles + " files (Original Size: " + cleaner.getOriginalLength() + ", Cleaned Size: " + cleaner.getCleanedLength() + ", Total Files Cleaned: " + cleaner.getTotalFilesChanged() + ")"; Date end = new Date(); double durationSec = (end.getTime() - start.getTime()) / 1000.0; if (durationSec >= 60) { this.log.info(String.format("%s in %3.2f minutes.", stats, (durationSec / 60.0))); } else { this.log.info(String.format("%s in %3.2f seconds.", stats, durationSec)); } } /** * find the full path to the corrected text root baseed on * the path to the original rdf sources * @return */ private String findCorrectedTextRoot() { String path = config.sourceDir.toString(); int pos = path.indexOf("/rdf/"); path = path.substring(0, pos) + "/correctedtext/"; path += RDFIndexerConfig.safeArchive( config.archiveName ) + "/"; return path; } private void doIndexing() { Date start = new Date(); log.info("Started indexing at " + start); System.out.println("Indexing " + config.sourceDir); indexDirectory( config.sourceDir ); System.out.println("Indexing DONE"); // report indexing stats Date end = new Date(); double durationSec = (end.getTime() - start.getTime()) / 1000.0; if (durationSec >= 60) { this.log.info(String.format( "Indexed " + numFiles + " files (" + numObjects + " objects) in %3.2f minutes.", (durationSec / 60.0))); } else { this.log.info(String.format( "Indexed " + numFiles + " files (" + numObjects + " objects) in %3.2f seconds.", durationSec)); } this.log.info("Largest text field size: " + this.largestTextSize); } private void doResolving() { Date start = new Date(); log.info("Started resolving at " + start); System.out.println( "Started resolving at " + start ); updateReferenceFields(); System.out.println("Resolving DONE"); // report indexing stats Date end = new Date(); double durationSec = (end.getTime() - start.getTime()) / 1000.0; if (durationSec >= 60) { this.log.info(String.format( "Resolved/updated " + numReferences + " references in %3.2f minutes.", (durationSec / 60.0))); } else { this.log.info(String.format( "Resolved/updated " + numReferences + " references in %3.2f seconds.", durationSec)); } } private void doSpidering() { Date start = new Date(); log.info("Started full-text spider at " + start); System.out.println("Full-text spider of " + config.sourceDir); spiderDirectory( config.sourceDir); System.out.println("DONE"); // report indexing stats Date end = new Date(); double durationSec = (end.getTime() - start.getTime()) / 1000.0; if (durationSec >= 60) { this.log.info(String.format("Spidered " + numFiles + " files in %3.2f minutes.", (durationSec / 60.0))); } else { this.log.info( String.format( "Spidered " + numFiles + " files in %3.2f seconds.", durationSec ) ); } } private void purgeArchive(final String coreName) { log.info("Deleting all data from: " + coreName); try { this.solrClient.postJSON("{\"delete\": { \"query\": \"*:*\"}, \"commit\": {}}", coreName); } catch (IOException e) { errorReport.addError(new IndexerError("", "", "Unable to POST DELETE message to SOLR. " + e.getLocalizedMessage())); } } private void recursivelyQueueFiles(final File dir, final boolean rdfMode) { if (dir.isDirectory()) { log.info("loading directory: " + dir.getPath()); File fileList[] = dir.listFiles(); for (File entry : fileList) { if ( entry.getName().endsWith(".svn") || entry.getName().endsWith(".git")) { log.info("Skipping source control directory"); continue; } if (entry.isDirectory() ) { recursivelyQueueFiles(entry, rdfMode); } if (rdfMode) { if (entry.getName().endsWith(".rdf") || entry.getName().endsWith(".xml")) { this.dataFileQueue.add(entry); } } else { this.dataFileQueue.add(entry); } } } else { // a file was passed in, not a folder this.log.info("loading file: " + dir.getPath()); this.dataFileQueue.add(dir); } } /** * Run through all rdf files in the directory and harvest full text * from remote sites. * * @param rdfDir */ private void spiderDirectory(final File rdfDir) { this.dataFileQueue = new LinkedList<File>(); recursivelyQueueFiles(rdfDir, true); this.numFiles = this.dataFileQueue.size(); log.info("=> Spider text for " + rdfDir + " total files: " + this.numFiles); RdfTextSpider spider = new RdfTextSpider( config, this.errorReport); while (this.dataFileQueue.size() > 0) { File rdfFile = this.dataFileQueue.remove(); this.log.info("Spider text from file " + rdfFile.toString()); spider.spider(rdfFile); try { Thread.sleep(10); } catch (InterruptedException e) { } this.errorReport.flush(); } } /** * run through all RDF files in the directory and write them * to a solr archive. * * @param rdfDir */ private void indexDirectory(File rdfDir) { // see if corrected texts exist. config.correctedTextDir = new File( findCorrectedTextRoot() ); if ( config.correctedTextDir .exists() ) { // it does; grab a list of filenames that have corrected text and cache them. // The file names are URIs with ugly characters replaces. Rules... // '/' is replaced by _S_ and ':' by _C_ // Undo this and save a list of corrected doc URIs for (File entry : config.correctedTextDir .listFiles()) { if ( entry.getName().endsWith(".txt")) { config.correctedTextMap.put( entry.getName().replaceAll("_C_", ":").replaceAll("_S_", "\\/").replaceAll(".txt",""), entry.getName() ); } } } this.dataFileQueue = new LinkedList<File>(); recursivelyQueueFiles(rdfDir, true); this.numFiles = this.dataFileQueue.size(); log.info( "=> Indexing " + rdfDir + " total files: " + this.numFiles ); while (this.dataFileQueue.size() > 0) { File rdfFile = this.dataFileQueue.remove(); indexFile(rdfFile); } if( config.isTestMode( ) == false ) { // flush any remaining data flush( ); // commit the changes and wait for all the workers to complete this.asyncPoster.asyncCommit( this.solrClient, config.coreName( ) ); this.asyncPoster.waitForPending( ); // if we actually processed any documents, process any isPartOf or hasPart references if( this.numObjects != 0 && this.config.isPagesArchive() == false ) { updateReferenceFields( ); } } } private void indexFile(File file) { HashMap<String, HashMap<String, ArrayList<String>>> objects; // Parse a file into a hashmap. // Key is object URI, Value is a set of key-value pairs // that describe the object try { objects = RdfDocumentParser.parse(file, this.errorReport, this.linkCollector, config); } catch (IOException e) { this.errorReport.addError(new IndexerError(file.getName(), "", e.getMessage())); return; } // Log an error for no objects and bail if size is zero if (objects == null || objects.size() == 0) { errorReport.addError(new IndexerError(file.getName(), "", "No objects in this file.")); errorReport.flush(); return; } // save the largest text field size this.largestTextSize = Math.max(this.largestTextSize, RdfDocumentParser.getLargestTextSize()); for (Map.Entry<String, HashMap<String, ArrayList<String>>> entry : objects.entrySet()) { String uri = entry.getKey(); HashMap<String, ArrayList<String>> object = entry.getValue(); // Validate archive and push objects into new archive map ArrayList<String> objectArray = object.get("archive"); if (objectArray != null) { String objArchive = objectArray.get(0); if (!objArchive.equals( config.archiveName)) { this.errorReport.addError(new IndexerError(file.getName(), uri, "The wrong archive was found. " + objArchive + " should be " + config.archiveName)); } } else { this.errorReport.addError(new IndexerError(file.getName(), uri, "Unable to determine archive for this object.")); } // validate all other parts of object and generate error report try { ArrayList<String> messages = ValidationUtility.validateObject(this.config.isPagesArchive(), object); for (String message : messages) { IndexerError e = new IndexerError(file.getName(), uri, message); errorReport.addError(e); } } catch (Exception valEx) { System.err.println("ERROR Validating file:" + file.getName() + " URI: " + uri); valEx.printStackTrace(); IndexerError e = new IndexerError(file.getName(), uri, valEx.getMessage()); errorReport.addError(e); } // turn this object into an XML solr docm then xml string. Add this to the curr payload JsonElement jsonDoc = docToJson(uri, object); this.jsonPayload.add(jsonDoc); if( config.isTestMode( ) == false ) { flushIfEnough( ); } } this.numObjects += objects.size(); this.errorReport.flush(); } // // update the references for any isPartOf or hasPart fields // private void updateReferenceFields( ) { int size = config.pageSize; String fl = config.getFieldList( ); String coreName = config.coreName( ); List<String> orList = new ArrayList<String>( ); orList.add( isPartOf + "=http*" ); orList.add( hasPart + "=http*" ); while( true ) { List<JsonObject> results = this.solrClient.getResultsPage( coreName, config.archiveName, 0, size, fl, null, orList ); if( results.isEmpty( ) == true ) { log.info( "No more references to resolve" ); break; } log.info( "Got " + results.size() + " references to resolve" ); for( JsonObject json : results ) { log.info( "Resolving references for " + json.get( "uri" ).getAsString( ) ); updateDocumentReferences( json ); this.numReferences++; } // flush any data and wait for completion... flush( ); // commit the changes and wait for all the workers to complete this.asyncPoster.asyncCommit( this.solrClient, config.coreName() ); this.asyncPoster.waitForPending( ); } } // // resolve the isPartOf or hasPart references for the specified document // private void updateDocumentReferences( final JsonObject json ) { String fl = config.getFieldList( ); String coreName = config.coreName(); String uri = json.get( "uri" ).getAsString( ); boolean updated = false; try { if( json.has( isPartOf ) == true ) { JsonArray refs = json.getAsJsonArray( isPartOf ); //log.info( "isPartOf: " + refs.toString( ) ); JsonArray objs = new JsonArray( ); for( int ix = 0; ix < refs.size(); ix++ ) { List<String> andList = new ArrayList<String>(); andList.add( "uri=" + URLEncoder.encode( "\"" + refs.get( ix ).getAsString( ) + "\"", "UTF-8" ) ); List<JsonObject> results = this.solrClient.getResultsPage( coreName, config.archiveName, 0, 1, fl, andList, null ); if( results.isEmpty( ) == false ) { objs.add( removeExcessFields( results.get( 0 ) ) ); } else { // reference to a non-existent object, note in the error log IndexerError e = new IndexerError( "", uri, "Cannot resolve isPartOf reference (" + refs.get( ix ).getAsString( ) + ") for document " + uri ); errorReport.addError( e ); } } // remove the field; we may replace it with resolved data json.remove( isPartOf ); updated = true; // did we resolve any of the references if( objs.size( ) != 0 ) { //log.info( "UPDATING isPartOf: " + objs.toString( ) ); json.addProperty( isPartOf, objs.toString( ) ); } } if( json.has( hasPart ) == true ) { JsonArray refs = json.getAsJsonArray( hasPart ); //log.info( "hasPart: " + refs.toString( ) ); JsonArray objs = new JsonArray( ); for( int ix = 0; ix < refs.size(); ix++ ) { List<String> andList = new ArrayList<String>(); andList.add( "uri=" + URLEncoder.encode( "\"" + refs.get( ix ).getAsString( ) + "\"", "UTF-8" ) ); List<JsonObject> results = this.solrClient.getResultsPage( coreName, config.archiveName, 0, 1, fl, andList, null ); if( results.isEmpty( ) == false ) { objs.add( removeExcessFields( results.get( 0 ) ) ); } else { // reference to a non-existent object, note in the error log IndexerError e = new IndexerError( "", uri, "Cannot resolve hasPart reference (" + refs.get( ix ).getAsString( ) + ") for document " + uri ); errorReport.addError( e ); } } // remove the field; we may replace it with resolved data json.remove( hasPart ); updated = true; if( objs.size( ) != 0 ) { //log.info( "UPDATING hasPart: " + objs.toString( ) ); json.addProperty( hasPart, objs.toString( ) ); } } if( updated == true ) { this.jsonPayload.add( json ); flushIfEnough( ); } } catch( UnsupportedEncodingException ex ) { // should never happen } } // // remove the fields we do not want for reference documents // private JsonObject removeExcessFields( JsonObject json ) { json.remove( isPartOf ); json.remove( hasPart ); json.remove( "text" ); json.remove( "_version_" ); json.remove( "year_sort_desc" ); json.remove( "federation" ); json.remove( "year" ); json.remove( "decade" ); json.remove( "year_sort" ); json.remove( "year_sort_asc" ); json.remove( "title_sort" ); json.remove( "author_sort" ); json.remove( "date_created" ); json.remove( "date_updated" ); json.remove( "century" ); json.remove( "half_century" ); json.remove( "quarter_century" ); return( json ); } private JsonElement docToJson(String documentName, HashMap<String, ArrayList<String>> fields) { Gson gson = new Gson(); JsonObject obj = gson.toJsonTree(fields).getAsJsonObject(); obj.addProperty("date_created", this.timeStamp); obj.addProperty("date_updated", this.timeStamp); return obj; } private void flushIfEnough( ) { if ( this.jsonPayload.toString().length( ) >= config.maxUploadSize ) flushPending( ); } private void flush( ) { if ( this.jsonPayload.size( ) > 0 ) flushPending( ); } // flush pending data to SOLR private void flushPending( ) { this.asyncPoster.asyncPost( this.solrClient, config.coreName( ), this.jsonPayload.toString( ) ); this.jsonPayload = new JsonArray( ); this.postCount++; if( postCount % 5 == 0 ) { this.asyncPoster.asyncCommit( this.solrClient, config.coreName( ) ); } } }