RDFIndexer.java example

/** 
 *  Copyright 2011 Applied Research in Patacriticism and the University of Virginia
 * 
 *  Licensed under the Apache License, Version 2.0 (the "License");
 *  you may not use this file except in compliance with the License.
 *  You may obtain a copy of the License at
 * 
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 **/
package org.nines;

import java.io.File;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.net.URL;
import java.net.URLEncoder;
import java.text.SimpleDateFormat;
import java.util.*;

import org.apache.log4j.Logger;
import org.apache.log4j.xml.DOMConfigurator;
import org.nines.RDFIndexerConfig.Mode;

import com.google.gson.Gson;
import com.google.gson.JsonArray;
import com.google.gson.JsonElement;
import com.google.gson.JsonObject;

public class RDFIndexer {

    // run statistics, written to the log when a task finishes
    private int numFiles = 0;          // size of the file queue built by the spider/index tasks
    private int numObjects = 0;        // objects parsed out of the indexed files
    private int numReferences = 0;     // documents passed through reference resolution
    private long largestTextSize = 0;  // largest value reported by RdfDocumentParser.getLargestTextSize()

    private RDFIndexerConfig config;
    private Queue<File> dataFileQueue; // work list; rebuilt at the start of each task
    private ErrorReport errorReport;
    private LinkCollector linkCollector;
    private Logger log;
    private AsyncPoster asyncPoster;

    // documents waiting to be posted to SOLR, and the number of posts made so far
    // (flushPending() requests a commit on every 5th post)
    private JsonArray jsonPayload = new JsonArray();
    private int postCount = 0;
    private SolrClient solrClient;

    // date stamp (yyyy-MM-dd) taken once at construction time; docToJson() writes it
    // into the date_created and date_updated fields of every document
    private Date ts = new Date();
    private SimpleDateFormat ts2 = new SimpleDateFormat("yyyy-MM-dd");
    private String timeStamp = ts2.format(ts);

    // special field names
    private final String isPartOf = "isPartOf";
    private final String hasPart = "hasPart";

    /**
     * Create an indexer for the supplied configuration. Sets up the progress
     * logger, opens the error report file and creates the link collector,
     * SOLR client and asynchronous poster.
     *
     * If the error report file cannot be opened, the failure is logged and the
     * constructor returns early; the link collector, SOLR client and poster are
     * then left unset.
     *
     * @param config settings for this run (mode, archive, log file names, SOLR URL)
     */
    public RDFIndexer(RDFIndexerConfig config) {

        this.config = config;
        String logFileRoot = this.config.getLogfileBaseName("");

        // setup logger; the log4j configuration reads the file name from this system property
        String indexLog = this.config.getLogfileBaseName("progress") + "_progress.log";
        System.setProperty("index.log.file", indexLog);
        URL url = ClassLoader.getSystemResource("log4j-index.xml");
        DOMConfigurator.configure(url);
        this.log = Logger.getLogger(RDFIndexer.class.getName());

        // keep report file in the same folder as the log file.
        // INDEX and TEST share the plain name; every other mode gets its own suffix.
        String logName;
        if (this.config.mode.equals(Mode.INDEX) || this.config.mode.equals(Mode.TEST)) {
            logName = logFileRoot + "_error.log";
        } else {
            logName = logFileRoot + "_" + this.config.mode.toString().toLowerCase() + "_error.log";
        }
        File reportFile = new File(logName);
        try {
            this.errorReport = new ErrorReport(reportFile);
        } catch (IOException e1) {
            // pass the exception along so the underlying cause ends up in the log
            this.log.error("Unable to open error report log for writing, aborting indexer.", e1);
            return;
        }

        this.linkCollector = new LinkCollector(this.config.getLogfileBaseName("links"));
        this.solrClient = new SolrClient(this.config.solrBaseURL);
        this.asyncPoster = new AsyncPoster( 1 );
    }

    /**
     * Execute the configured indexing task.
     *
     * Validates the SOLR core, optionally purges it, then runs the task that
     * matches the configured mode. The poster, error report and link collector
     * are always shut down afterwards, even if the task throws.
     */
    public void execute() {

        // the constructor returns early when the error report cannot be opened;
        // in that case there is nothing usable to run or to close
        if (this.errorReport == null) {
            this.log.error("Indexer was not fully initialized; nothing to execute.");
            return;
        }

        try {
            // There is only something else to do if a MODE was configured
            if (!config.mode.equals(Mode.NONE)) {

                // first, ensure that core is valid and exists
                try {
                    this.solrClient.validateCore( config.coreName( ) );
                } catch (IOException e) {
                    this.errorReport.addError(new IndexerError("Validate core", "", e.getMessage()));
                }

                // if a purge was requested, it must be done FIRST
                if (config.deleteAll) {
                    purgeArchive( config.coreName() );
                }

                // execute based on mode setting
                if (config.mode.equals(Mode.SPIDER)) {
                    this.log.info("Full Text Spider Mode");
                    doSpidering();
                } else if (config.mode.equals(Mode.CLEAN_RAW)) {
                    this.log.info("Raw Text Cleanup Mode");
                    doRawTextCleanup();
                } else if (config.mode.equals(Mode.CLEAN_FULL)) {
                    this.log.info("Full Text Cleanup Mode");
                    doFullTextCleanup();
                } else if (config.mode.equals(Mode.INDEX)) {
                    this.log.info("Index Mode");
                    doIndexing();
                } else if (config.mode.equals(Mode.RESOLVE)) {
                    this.log.info("Resolve Mode");
                    doResolving();
                } else {
                    // any remaining mode is treated as a test run
                    this.log.info("*** TEST MODE: Not committing changes to SOLR");
                    doIndexing();
                }
            }
        } finally {
            // release resources whether or not the task completed normally
            this.asyncPoster.shutdown( );
            this.errorReport.close( );
            this.linkCollector.close( );
        }
    }

    /**
     * Run the full text cleaner over every file found under the archive's
     * folder in the configured source directory, then log a summary of
     * sizes, files changed and elapsed time.
     */
    private void doFullTextCleanup() {
        Date start = new Date();
        // this is the full text task; the message previously said "raw text"
        this.log.info("Started full text cleanup at " + start);

        this.dataFileQueue = new LinkedList<File>();
        String fullPath = config.sourceDir.toString() + "/" + RDFIndexerConfig.safeArchive( config.archiveName );
        recursivelyQueueFiles(new File(fullPath), false);
        int totalFiles = this.dataFileQueue.size();

        FullTextCleaner cleaner = new FullTextCleaner(config.archiveName, this.errorReport,
            config.customCleanClass);
        while (this.dataFileQueue.size() > 0) {
            File txtFile = this.dataFileQueue.remove();
            cleaner.clean(txtFile);
            // write out any errors after each file
            this.errorReport.flush();
        }

        String stats = "Cleaned " + totalFiles + " files (Original Size: " + cleaner.getOriginalLength()
            + ", Cleaned Size: " + cleaner.getCleanedLength() + ", Total Files Cleaned: "
            + cleaner.getTotalFilesChanged() + ")";

        // report in minutes once the run takes a minute or longer
        Date end = new Date();
        double durationSec = (end.getTime() - start.getTime()) / 1000.0;
        if (durationSec >= 60) {
            this.log.info(String.format("%s in %3.2f minutes.", stats, (durationSec / 60.0)));
        } else {
            this.log.info(String.format("%s in %3.2f seconds.", stats, durationSec));
        }
    }

    /**
     * Run the raw text cleaner over every file found under the archive's
     * folder in the configured source directory, then log a summary of
     * sizes, files changed and elapsed time.
     */
    private void doRawTextCleanup() {
        final Date began = new Date();
        log.info("Started raw text cleanup at " + began);

        // collect every file below <sourceDir>/<safe archive name>
        final File archiveRoot = new File(
            config.sourceDir.toString() + "/" + RDFIndexerConfig.safeArchive( config.archiveName ));
        this.dataFileQueue = new LinkedList<File>();
        recursivelyQueueFiles(archiveRoot, false);
        final int queued = this.dataFileQueue.size();

        // clean the files one at a time, writing out errors as we go
        final RawTextCleaner rawCleaner = new RawTextCleaner(config, this.errorReport);
        while (!this.dataFileQueue.isEmpty()) {
            rawCleaner.clean(this.dataFileQueue.remove());
            this.errorReport.flush();
        }

        final String summary = "Cleaned " + queued + " files (Original Size: " + rawCleaner.getOriginalLength()
            + ", Cleaned Size: " + rawCleaner.getCleanedLength() + ", Total Files Cleaned: "
            + rawCleaner.getTotalFilesChanged() + ")";

        // report in minutes once the run takes a minute or longer
        final double elapsedSec = (new Date().getTime() - began.getTime()) / 1000.0;
        final String message = (elapsedSec >= 60)
            ? String.format("%s in %3.2f minutes.", summary, (elapsedSec / 60.0))
            : String.format("%s in %3.2f seconds.", summary, elapsedSec);
        this.log.info(message);
    }
    
    /**
     * Find the full path to the corrected text root based on
     * the path to the original rdf sources.
     *
     * Everything from the first "/rdf/" segment of the source directory onwards
     * is replaced by "/correctedtext/" followed by the safe archive name. A
     * source directory that ends in "/rdf" is handled too. If the path has no
     * "rdf" segment at all, the source directory itself is used as the base
     * instead of failing with an index error.
     *
     * @return corrected text directory path, always ending in "/"
     */
    private String findCorrectedTextRoot() {
        // use '/' throughout so the segment search also works where the platform separator differs
        String path = config.sourceDir.toString().replace(File.separatorChar, '/');

        // a trailing slash lets a path that ends in ".../rdf" match as well
        String probe = path.endsWith("/") ? path : path + "/";
        int pos = probe.indexOf("/rdf/");

        String base;
        if (pos >= 0) {
            base = probe.substring(0, pos);
        } else {
            log.warn("Source path " + path + " has no rdf folder; looking for corrected text below it");
            base = probe.substring(0, probe.length() - 1);
        }

        return base + "/correctedtext/" + RDFIndexerConfig.safeArchive( config.archiveName ) + "/";
    }

    /**
     * Index the configured source directory, echoing progress to the console,
     * then log the file/object counts, the elapsed time and the largest text
     * field size seen.
     */
    private void doIndexing() {
        final Date began = new Date();
        log.info("Started indexing at " + began);

        System.out.println("Indexing " + config.sourceDir);
        indexDirectory( config.sourceDir );
        System.out.println("Indexing DONE");

        // report indexing stats; switch to minutes once the run takes a minute or longer
        final double elapsedSec = (new Date().getTime() - began.getTime()) / 1000.0;
        final String summary = (elapsedSec >= 60)
            ? String.format(
                "Indexed " + numFiles + " files (" + numObjects + " objects) in %3.2f minutes.", (elapsedSec / 60.0))
            : String.format(
                "Indexed " + numFiles + " files (" + numObjects + " objects) in %3.2f seconds.", elapsedSec);
        this.log.info(summary);
        this.log.info("Largest text field size: " + this.largestTextSize);
    }

    /**
     * Resolve the isPartOf/hasPart references, echoing progress to the
     * console, then log how many were handled and how long it took.
     */
    private void doResolving() {
        final Date began = new Date();
        final String startedMsg = "Started resolving at " + began;
        log.info(startedMsg);
        System.out.println( startedMsg );

        updateReferenceFields();
        System.out.println("Resolving DONE");

        // report stats; switch to minutes once the run takes a minute or longer
        final double elapsedSec = (new Date().getTime() - began.getTime()) / 1000.0;
        final String summary = (elapsedSec >= 60)
            ? String.format(
                    "Resolved/updated " + numReferences + " references in %3.2f minutes.", (elapsedSec / 60.0))
            : String.format(
                    "Resolved/updated " + numReferences + " references in %3.2f seconds.", elapsedSec);
        this.log.info(summary);
    }

    /**
     * Spider full text for the configured source directory, echoing progress
     * to the console, then log the number of files and the elapsed time.
     */
    private void doSpidering() {
        final Date began = new Date();
        log.info("Started full-text spider at " + began);

        System.out.println("Full-text spider of " + config.sourceDir);
        spiderDirectory( config.sourceDir);
        System.out.println("DONE");

        // report stats; switch to minutes once the run takes a minute or longer
        final double elapsedSec = (new Date().getTime() - began.getTime()) / 1000.0;
        final String summary = (elapsedSec >= 60)
            ? String.format("Spidered " + numFiles + " files in %3.2f minutes.", (elapsedSec / 60.0))
            : String.format( "Spidered " + numFiles + " files in %3.2f seconds.", elapsedSec );
        this.log.info(summary);
    }

    /**
     * Post a delete-everything command, followed by a commit, to the given core.
     * A failed POST is recorded in the error report rather than thrown.
     *
     * @param coreName name of the SOLR core to empty
     */
    private void purgeArchive(final String coreName) {
        // delete every document (*:*) and commit in a single request
        final String deleteAllCommand = "{\"delete\": { \"query\": \"*:*\"}, \"commit\": {}}";

        log.info("Deleting all data from: " + coreName);
        try {
            this.solrClient.postJSON(deleteAllCommand, coreName);
        } catch (IOException postFailure) {
            final String reason = "Unable to POST DELETE message to SOLR. "
                + postFailure.getLocalizedMessage();
            errorReport.addError(new IndexerError("", "", reason));
        }
    }

    /**
     * Add the files found at or below the given location to the work queue.
     *
     * Entries whose name ends in ".svn" or ".git" are skipped. Sub-directories
     * are descended into but never queued themselves.
     *
     * @param dir     directory to walk; a plain file is queued as-is
     * @param rdfMode when true only names ending in ".rdf" or ".xml" are queued,
     *                otherwise every file is queued
     */
    private void recursivelyQueueFiles(final File dir, final boolean rdfMode) {
        if (!dir.isDirectory()) {
            // a file was passed in, not a folder
            this.log.info("loading file: " + dir.getPath());
            this.dataFileQueue.add(dir);
            return;
        }

        log.info("loading directory: " + dir.getPath());

        // listFiles() returns null when the directory cannot be read
        final File[] fileList = dir.listFiles();
        if (fileList == null) {
            log.warn("Unable to list directory, skipping: " + dir.getPath());
            return;
        }

        for (File entry : fileList) {
            final String entryName = entry.getName();
            if (entryName.endsWith(".svn") || entryName.endsWith(".git")) {
                log.info("Skipping source control directory");
                continue;
            }

            if (entry.isDirectory()) {
                // queue the contents only; the directory itself is not a data file
                recursivelyQueueFiles(entry, rdfMode);
                continue;
            }

            if (!rdfMode || entryName.endsWith(".rdf") || entryName.endsWith(".xml")) {
                this.dataFileQueue.add(entry);
            }
        }
    }

    /**
     * Run through all rdf files in the directory and harvest full text
     * from remote sites.
     *
     * Pauses 10 ms between files. If the thread is interrupted during the
     * pause, the interrupt status is restored and the remaining files are
     * left unprocessed.
     *
     * @param rdfDir directory (or single file) holding the RDF to spider
     */
    private void spiderDirectory(final File rdfDir) {
        this.dataFileQueue = new LinkedList<File>();
        recursivelyQueueFiles(rdfDir, true);
        this.numFiles = this.dataFileQueue.size();
        log.info("=> Spider text for " + rdfDir + " total files: " + this.numFiles);
        RdfTextSpider spider = new RdfTextSpider( config, this.errorReport);
        while (this.dataFileQueue.size() > 0) {
            File rdfFile = this.dataFileQueue.remove();
            this.log.info("Spider text from file " + rdfFile.toString());
            spider.spider(rdfFile);
            // write out any errors before pausing
            this.errorReport.flush();
            try {
                Thread.sleep(10);
            } catch (InterruptedException e) {
                // keep the interrupt visible to callers and stop instead of ignoring it
                Thread.currentThread().interrupt();
                this.log.warn("Interrupted while spidering; " + this.dataFileQueue.size()
                    + " files left unprocessed");
                break;
            }
        }
    }

    /**
     * Run through all RDF files in the directory and write them
     * to a solr archive.
     *
     * Before indexing, the names of any corrected text files are cached in the
     * configuration. Outside of test mode the pending documents are flushed and
     * committed, and isPartOf/hasPart references are resolved when at least one
     * object was processed and the archive is not a pages archive.
     *
     * @param rdfDir directory (or single file) holding the RDF to index
     */
    private void indexDirectory(File rdfDir) {
        // see if corrected texts exist.
        config.correctedTextDir = new File(  findCorrectedTextRoot() );
        if ( config.correctedTextDir.exists() ) {
            // it does; grab a list of filenames that have corrected text and cache them.
            // The file names are URIs with ugly characters replaced. Rules...
            // '/' is replaced by _S_ and ':' by _C_
            // Undo this and save a list of corrected doc URIs
            final String suffix = ".txt";
            // listFiles() returns null when the path is not a readable directory
            final File[] correctedFiles = config.correctedTextDir.listFiles();
            if ( correctedFiles != null ) {
                for (File entry : correctedFiles) {
                    final String fileName = entry.getName();
                    if ( fileName.endsWith(suffix) ) {
                        // strip only the trailing extension and substitute literally;
                        // a regex ".txt" would also eat "<any char>txt" inside the name
                        final String docUri = fileName
                            .substring(0, fileName.length() - suffix.length())
                            .replace("_C_", ":")
                            .replace("_S_", "/");
                        config.correctedTextMap.put( docUri, fileName );
                    }
                }
            }
        }

        this.dataFileQueue = new LinkedList<File>();
        recursivelyQueueFiles(rdfDir, true);
        this.numFiles = this.dataFileQueue.size();
        log.info( "=> Indexing " + rdfDir + " total files: " + this.numFiles );

        while (this.dataFileQueue.size() > 0) {
           File rdfFile = this.dataFileQueue.remove();
           indexFile(rdfFile);
        }

        if( config.isTestMode( ) == false ) {

            // flush any remaining data
            flush( );

            // commit the changes and wait for all the workers to complete
            this.asyncPoster.asyncCommit( this.solrClient, config.coreName( ) );
            this.asyncPoster.waitForPending( );

           // if we actually processed any documents, process any isPartOf or hasPart references
           if( this.numObjects != 0 && this.config.isPagesArchive() == false ) {
               updateReferenceFields( );
           }
        }
    }

    /**
     * Parse one RDF file, validate every object in it and, outside of test
     * mode, queue the objects for posting to SOLR. Problems are recorded in
     * the error report; nothing is thrown for a bad file.
     *
     * @param file RDF file to index
     */
    private void indexFile(File file) {

        HashMap<String, HashMap<String, ArrayList<String>>> objects;

        // Parse a file into a hashmap.
        // Key is object URI, Value is a set of key-value pairs
        // that describe the object
        try {
            objects = RdfDocumentParser.parse(file, this.errorReport, this.linkCollector, config);
        } catch (IOException e) {
            this.errorReport.addError(new IndexerError(file.getName(), "", e.getMessage()));
            return;
        }

        // Log an error for no objects and bail if size is zero
        if (objects == null || objects.size() == 0) {
            errorReport.addError(new IndexerError(file.getName(), "", "No objects in this file."));
            errorReport.flush();
            return;
        }

        // save the largest text field size
        this.largestTextSize = Math.max(this.largestTextSize, RdfDocumentParser.getLargestTextSize());

        for (Map.Entry<String, HashMap<String, ArrayList<String>>> entry : objects.entrySet()) {

            String uri = entry.getKey();
            HashMap<String, ArrayList<String>> object = entry.getValue();

            // Validate archive; an empty value list is treated the same as a missing one
            ArrayList<String> objectArray = object.get("archive");
            if (objectArray != null && !objectArray.isEmpty()) {
                String objArchive = objectArray.get(0);
                if (!objArchive.equals( config.archiveName)) {
                    this.errorReport.addError(new IndexerError(file.getName(), uri, "The wrong archive was found. "
                        + objArchive + " should be " + config.archiveName));
                }
            } else {
                this.errorReport.addError(new IndexerError(file.getName(), uri,
                    "Unable to determine archive for this object."));
            }

            // validate all other parts of object and generate error report
            try {
                ArrayList<String> messages = ValidationUtility.validateObject(this.config.isPagesArchive(), object);
                for (String message : messages) {
                    IndexerError e = new IndexerError(file.getName(), uri, message);
                    errorReport.addError(e);
                }
            } catch (Exception valEx) {
                System.err.println("ERROR Validating file:" + file.getName() + " URI: " + uri);
                valEx.printStackTrace();
                IndexerError e = new IndexerError(file.getName(), uri, valEx.getMessage());
                errorReport.addError(e);
            }

            // turn this object into a JSON solr document
            JsonElement jsonDoc = docToJson(uri, object);

            // In test mode nothing is ever posted, so do not retain the document;
            // otherwise the payload would grow for the whole run without being flushed.
            if( config.isTestMode( ) == false ) {
                this.jsonPayload.add(jsonDoc);
                flushIfEnough( );
            }
        }

        this.numObjects += objects.size();
        this.errorReport.flush();
    }

    //
    // Resolve isPartOf / hasPart references page by page: fetch a page of
    // matching documents, update each one, flush and commit, then ask again
    // until a query comes back empty.
    //
    // NOTE(review): every query starts at offset 0, so the loop only ends if
    // updated documents stop matching the filter - confirm that a failed post
    // cannot leave it spinning on the same page.
    //
    private void updateReferenceFields( ) {

        final int pageLength = config.pageSize;
        final String fieldList = config.getFieldList( );
        final String core = config.coreName( );

        // filters handed to the query as its OR list
        final List<String> anyOf = new ArrayList<String>(  );
        anyOf.add( isPartOf + "=http*" );
        anyOf.add( hasPart + "=http*" );

        List<JsonObject> batch = this.solrClient.getResultsPage( core, config.archiveName, 0, pageLength, fieldList, null, anyOf );
        while( !batch.isEmpty( ) ) {

            log.info( "Got " + batch.size() + " references to resolve" );
            for( JsonObject doc : batch ) {
                log.info( "Resolving references for " + doc.get( "uri" ).getAsString( ) );
                updateDocumentReferences( doc );
                this.numReferences++;
            }

            // flush any data and wait for completion...
            flush( );

            // commit the changes and wait for all the workers to complete
            this.asyncPoster.asyncCommit( this.solrClient, config.coreName() );
            this.asyncPoster.waitForPending( );

            // next page, again from the start of the result set
            batch = this.solrClient.getResultsPage( core, config.archiveName, 0, pageLength, fieldList, null, anyOf );
        }

        log.info( "No more references to resolve" );
    }

    //
    // resolve the isPartOf or hasPart references for the specified document.
    // The document is queued for posting when at least one of the two fields
    // was present; failures are recorded in the error report.
    //
    private void updateDocumentReferences( final JsonObject json ) {

        final String fl = config.getFieldList( );
        final String coreName = config.coreName();
        final String uri = json.get( "uri" ).getAsString( );

        try {
            boolean updated = false;

            if( json.has( isPartOf ) ) {
                resolveReferenceField( json, isPartOf, uri, coreName, fl );
                updated = true;
            }

            if( json.has( hasPart ) ) {
                resolveReferenceField( json, hasPart, uri, coreName, fl );
                updated = true;
            }

            if( updated ) {
                this.jsonPayload.add( json );
                flushIfEnough( );
            }
        } catch( UnsupportedEncodingException ex ) {
            // not expected for UTF-8; record it rather than silently dropping the document
            errorReport.addError( new IndexerError( "", uri,
                "Unable to encode reference query for document " + uri + ": " + ex.getMessage( ) ) );
        }
    }

    //
    // replace the raw references held in one field of the document with the
    // trimmed documents they point to. Unresolvable references are noted in
    // the error report; if none resolve, the field is simply removed.
    //
    private void resolveReferenceField( final JsonObject json, final String fieldName, final String uri,
                                        final String coreName, final String fl )
            throws UnsupportedEncodingException {

        final JsonArray refs = json.getAsJsonArray( fieldName );
        final JsonArray objs = new JsonArray( );

        for( int ix = 0; ix < refs.size(); ix++ ) {
            final String ref = refs.get( ix ).getAsString( );

            // look the referenced document up by its quoted, URL-encoded uri
            final List<String> andList = new ArrayList<String>();
            andList.add( "uri=" + URLEncoder.encode( "\"" + ref + "\"", "UTF-8" ) );
            final List<JsonObject> results = this.solrClient.getResultsPage( coreName, config.archiveName, 0, 1, fl, andList, null );

            if( results.isEmpty( ) == false ) {
                objs.add( removeExcessFields( results.get( 0 ) ) );
            } else {
                // reference to a non-existent object, note in the error log
                errorReport.addError( new IndexerError( "", uri, "Cannot resolve " + fieldName + " reference (" + ref +
                                                        ") for document " + uri ) );
            }
        }

        // remove the field; we may replace it with resolved data
        json.remove( fieldName );

        // did we resolve any of the references
        if( objs.size( ) != 0 ) {
            json.addProperty( fieldName, objs.toString( ) );
        }
    }

    //
    // strip the fields that are not wanted on a document embedded as a
    // reference; the same object is modified in place and returned
    //
    private JsonObject removeExcessFields( JsonObject json ) {
        final String[] unwanted = {
            isPartOf,
            hasPart,
            "text",
            "_version_",
            "year_sort_desc",
            "federation",
            "year",
            "decade",
            "year_sort",
            "year_sort_asc",
            "title_sort",
            "author_sort",
            "date_created",
            "date_updated",
            "century",
            "half_century",
            "quarter_century"
        };

        for( String fieldName : unwanted ) {
            json.remove( fieldName );
        }

        return json;
    }

    /**
     * Convert an object's field map to a JSON document and stamp it with the
     * indexer's date in both date_created and date_updated.
     *
     * @param documentName URI of the object (not used by the conversion)
     * @param fields       field name to values map describing the object
     * @return the JSON document
     */
    private JsonElement docToJson(String documentName, HashMap<String, ArrayList<String>> fields) {
        final JsonObject solrDoc = new Gson().toJsonTree(fields).getAsJsonObject();
        for (String stampField : new String[] { "date_created", "date_updated" }) {
            solrDoc.addProperty(stampField, this.timeStamp);
        }
        return solrDoc;
    }

    // post the pending documents once their serialized length reaches the configured upload size
    private void flushIfEnough( ) {
        final int pendingChars = this.jsonPayload.toString( ).length( );
        if ( pendingChars < config.maxUploadSize ) {
            return;
        }
        flushPending( );
    }

    // post whatever is pending, if anything
    private void flush( ) {
        if ( this.jsonPayload.size( ) == 0 ) {
            return;
        }
        flushPending( );
    }

    // hand the pending documents to the async poster and start a fresh payload;
    // a commit is requested on every 5th post
    private void flushPending( ) {
        this.asyncPoster.asyncPost( this.solrClient, config.coreName( ), this.jsonPayload.toString( ) );
        this.jsonPayload = new JsonArray( );

        this.postCount += 1;
        final boolean commitDue = ( this.postCount % 5 == 0 );
        if( commitDue ) {
            this.asyncPoster.asyncCommit( this.solrClient, config.coreName( ) );
        }
    }
}