//Dstl (c) Crown Copyright 2017
package uk.gov.dstl.baleen.jobs.interactions;
import java.io.IOException;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.fit.descriptor.ConfigurationParameter;
import org.apache.uima.fit.descriptor.ExternalResource;
import com.mongodb.Mongo;
import uk.gov.dstl.baleen.jobs.interactions.io.CsvInteractionReader;
import uk.gov.dstl.baleen.jobs.interactions.io.CsvInteractionWriter;
import uk.gov.dstl.baleen.jobs.interactions.io.MongoInteractionWriter;
import uk.gov.dstl.baleen.resources.SharedMongoResource;
import uk.gov.dstl.baleen.uima.BaleenTask;
import uk.gov.dstl.baleen.uima.JobSettings;
/**
* Upload interaction data from CSV to Mongo.
* <p>
* The CSV will be in {@link CsvInteractionWriter} format and the Mongo output will be in
* {@link MongoInteractionWriter} format.
* <p>
* The upload job will typically look like:
*
* <pre>
* mongo:
* db: baleen
* host: localhost
*
* job:
* tasks:
* - class: interactions.UploadInteractionsToMongo
* input: output/interactions-enhanced.csv
* </pre>
*
* The CSV file could be generated by a previous step in the process (i.e. through the
* {@link IdentifyInteractions} task), or it may have been generated or manually created beforehand.
* The former provides a fast route to getting started, though typically the CSV will need some
* editing to tune performance.
* <p>
* Typically the extraction pipeline will then use the MongoStemming gazetteer:
*
* <pre>
* - class: gazetteer.MongoStemming
* collection: interactions
* type: Interaction
* </pre>
*
* A second, optional, annotator can be used to filter relations to only the correct UIMA types.
* This should be put after relationship extraction and will read from the database.
*
* <pre>
* - class: cleaners.RelationTypeFilter
* </pre>
*
* @baleen.javadoc
*
*/
public class UploadInteractionsToMongo extends BaleenTask {
	/**
	 * Connection to Mongo
	 *
	 * @baleen.resource uk.gov.dstl.baleen.resources.SharedMongoResource
	 */
	public static final String KEY_MONGO = "mongo";
	@ExternalResource(key = KEY_MONGO)
	private SharedMongoResource mongo;
	/**
	 * Clear existing Mongo collections before uploading the new data.
	 *
	 * @baleen.config clear true
	 */
	public static final String KEY_CLEAR = "clear";
	@ConfigurationParameter(name = KEY_CLEAR, defaultValue = "true")
	private Boolean clearCollection;
	/**
	 * The name of the Mongo collection to output the (source, target, type) relation constraints to.
	 *
	 * @baleen.config relationTypesCollection relationTypes
	 */
	public static final String KEY_RELATIONSHIP_COLLECTION = "relationTypesCollection";
	@ConfigurationParameter(name = KEY_RELATIONSHIP_COLLECTION, defaultValue = "relationTypes")
	private String relationTypesCollection;
	/**
	 * The name of the Mongo collection to output the interaction words to (as a gazetteer)
	 *
	 * @baleen.config interactionCollection interactions
	 */
	public static final String KEY_INTERACTION_COLLECTION = "interactionCollection";
	@ConfigurationParameter(name = KEY_INTERACTION_COLLECTION, defaultValue = "interactions")
	private String interactionCollection;
	/**
	 * The CSV file to load (written by {@link CsvInteractionWriter})
	 *
	 * @baleen.config input interactions.csv
	 */
	public static final String KEY_CSV_FILENAME = "input";
	@ConfigurationParameter(name = KEY_CSV_FILENAME, defaultValue = "interactions.csv")
	private String inputFilename;

	/**
	 * Reads interactions from the configured CSV file and writes them to the configured Mongo
	 * collections, optionally clearing the collections first.
	 *
	 * @param settings the job settings (unused by this task)
	 * @throws AnalysisEngineProcessException if the CSV cannot be read or Mongo cannot be written to
	 */
	@Override
	protected void execute(JobSettings settings) throws AnalysisEngineProcessException {
		// try-with-resources ensures the Mongo writer is closed even on failure
		try (MongoInteractionWriter writer = new MongoInteractionWriter(mongo.getDB(), relationTypesCollection,
				interactionCollection)) {
			if (clearCollection) {
				getMonitor().info("Clearing previous interaction collection");
				writer.clear();
			}
			final CsvInteractionReader reader = new CsvInteractionReader(inputFilename);
			// Stream each (interaction, alternatives) row from the CSV straight into Mongo
			reader.read(writer::write);
		} catch (final IOException e) {
			// Preserve the cause so the pipeline reports the underlying I/O failure
			throw new AnalysisEngineProcessException(e);
		}
		getMonitor().info("Finished uploading interactions to Mongo");
	}
}