//Dstl (c) Crown Copyright 2017
package uk.gov.dstl.baleen.jobs;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.nio.charset.StandardCharsets;
import java.time.LocalDateTime;
import java.time.format.DateTimeFormatter;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.fit.descriptor.ConfigurationParameter;
import org.apache.uima.fit.descriptor.ExternalResource;
import org.bson.Document;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.mongodb.client.MongoCollection;
import uk.gov.dstl.baleen.resources.SharedMongoResource;
import uk.gov.dstl.baleen.uima.BaleenTask;
import uk.gov.dstl.baleen.uima.JobSettings;
/**
* A task which outputs statistics on the Mongo database.
*
* Statistics are saved to a CSV file (specified through the configuration parameter 'file').
*
* Typically this task is used with a FixedRate scheduler, for example to record statistics hourly:
*
* <pre>
* mongo:
*   host: localhost
*   port: 27017
*
* job:
*   schedule:
*     class: FixedRate
*     period: 3600
*   tasks:
*   - MongoStats
* </pre>
*
* The format of the csv file has columns: timestamp, num documents, num entities, num relations.
*
* @baleen.javadoc
*/
public class MongoStats extends BaleenTask {
    private static final Logger LOGGER = LoggerFactory.getLogger(MongoStats.class);

    /**
     * Connection to Mongo
     *
     * @baleen.resource uk.gov.dstl.baleen.resources.SharedMongoResource
     */
    public static final String KEY_MONGO = "mongo";
    @ExternalResource(key = KEY_MONGO)
    private SharedMongoResource mongoResource;

    /**
     * The collection that entities are output to
     *
     * @baleen.config entities
     */
    public static final String PARAM_ENTITIES_COLLECTION = "entities";
    @ConfigurationParameter(name = PARAM_ENTITIES_COLLECTION, defaultValue = "entities")
    private String entitiesCollectionName;

    /**
     * The collection that relationships are output to
     *
     * @baleen.config relations
     */
    public static final String PARAM_RELATIONS_COLLECTION = "relations";
    @ConfigurationParameter(name = PARAM_RELATIONS_COLLECTION, defaultValue = "relations")
    private String relationsCollectionName;

    /**
     * The collection that documents are output to
     *
     * @baleen.config documents
     */
    public static final String PARAM_DOCUMENTS_COLLECTION = "documents";
    @ConfigurationParameter(name = PARAM_DOCUMENTS_COLLECTION, defaultValue = "documents")
    private String documentsCollectionName;

    /**
     * The CSV file that statistics are appended to
     *
     * @baleen.config mongo_stats.csv
     */
    public static final String PARAM_FILE = "file";
    @ConfigurationParameter(name = PARAM_FILE, defaultValue = "mongo_stats.csv")
    private String filename;

    private static final DateTimeFormatter FORMATTER = DateTimeFormatter.ISO_DATE_TIME;

    /** Column names written as the first line of a new (or empty) statistics file. */
    private static final String HEADER = "timestamp,documents,entities,relations";

    /** Line terminator used for every line written, so header and rows always agree. */
    private static final String LINE_SEPARATOR = System.lineSeparator();

    /**
     * Appends one row of statistics (timestamp plus the document, entity and relation counts)
     * to the configured CSV file, writing a header line first if the file is missing or empty.
     *
     * Failure to write the file is logged as a warning and otherwise ignored, so that a
     * scheduled job keeps running.
     *
     * @param settings
     *            the job settings (not used by this task)
     * @throws AnalysisEngineProcessException
     *             declared by the overridden signature; not thrown directly by this implementation
     */
    @Override
    protected void execute(JobSettings settings) throws AnalysisEngineProcessException {
        // Gather everything before touching the file, so that a failure while querying Mongo
        // cannot leave behind an empty or header-only file.
        String timestamp = FORMATTER.format(LocalDateTime.now());
        long documents = countCollection(documentsCollectionName);
        long entities = countCollection(entitiesCollectionName);
        long relations = countCollection(relationsCollectionName);

        StringBuilder output = new StringBuilder();

        File file = new File(filename);
        boolean newFile = !file.exists() || file.length() == 0;
        if (newFile) {
            // We have a new file, so write a header line
            output.append(HEADER).append(LINE_SEPARATOR);
        }
        output.append(formatRow(timestamp, documents, entities, relations));

        // Open in append mode so that earlier rows are preserved between runs
        try (Writer writer = new OutputStreamWriter(new FileOutputStream(file, true), StandardCharsets.UTF_8)) {
            writer.write(output.toString());
        } catch (IOException e) {
            LOGGER.warn("Unable to write stats to file {} ", filename, e);
        }
    }

    /** Returns the number of items currently in the named Mongo collection. */
    private long countCollection(String collectionName) {
        return mongoResource.getDB().getCollection(collectionName).count();
    }

    /**
     * Builds a single CSV row, including its line terminator.
     *
     * Numbers are rendered with {@link Long#toString(long)} rather than a locale-sensitive
     * formatter, so the digits are always ASCII regardless of the default locale.
     */
    private static String formatRow(String timestamp, long documents, long entities, long relations) {
        return String.join(",",
                timestamp,
                Long.toString(documents),
                Long.toString(entities),
                Long.toString(relations)) + LINE_SEPARATOR;
    }
}