//Dstl (c) Crown Copyright 2017
package uk.gov.dstl.baleen.collectionreaders;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.Charset;
import java.util.LinkedList;
import java.util.List;
import org.apache.commons.io.IOUtils;
import org.apache.uima.UimaContext;
import org.apache.uima.collection.CollectionException;
import org.apache.uima.fit.descriptor.ConfigurationParameter;
import org.apache.uima.fit.descriptor.ExternalResource;
import org.apache.uima.jcas.JCas;
import org.apache.uima.resource.ResourceInitializationException;
import org.bson.Document;
import org.bson.types.ObjectId;
import com.mongodb.client.MongoCollection;
import uk.gov.dstl.baleen.core.utils.BaleenDefaults;
import uk.gov.dstl.baleen.exceptions.InvalidParameterException;
import uk.gov.dstl.baleen.resources.SharedMongoResource;
import uk.gov.dstl.baleen.types.metadata.Metadata;
import uk.gov.dstl.baleen.uima.BaleenCollectionReader;
import uk.gov.dstl.baleen.uima.IContentExtractor;
/**
* This collection reader will process an entire Mongo collection, and then watch for new documents.
*
* <p>The ObjectId (_id) is used to sort documents and to identify new documents.
* This may miss documents if there is a high write rate, or if documents are being inserted by multiple clients.
* For more information, see http://docs.mongodb.org/manual/reference/object-id/</p>
*
*
* @baleen.javadoc
*/
public class MongoReader extends BaleenCollectionReader {
/**
* Connection to Mongo
*
* @baleen.resource uk.gov.dstl.baleen.resources.SharedMongoResource
*/
public static final String KEY_MONGO = "mongo";
@ExternalResource(key = KEY_MONGO)
SharedMongoResource mongo;
/**
* The Mongo collection to read data from
*
* @baleen.config input
*/
public static final String PARAM_COLLECTION = "collection";
@ConfigurationParameter(name = PARAM_COLLECTION, defaultValue = "input")
private String collection;
/**
* The field containing the Mongo ID (ObjectId)
*
* @baleen.config _id
*/
public static final String PARAM_ID_FIELD = "idField";
@ConfigurationParameter(name = PARAM_ID_FIELD, defaultValue = "_id")
private String idField;
/**
* The field containing the document content
*
* @baleen.config content
*/
public static final String PARAM_CONTENT_FIELD = "contentField";
@ConfigurationParameter(name = PARAM_CONTENT_FIELD, defaultValue = "content")
private String contentField;
/**
* The content extractor to use to extract content from files
*
* @baleen.config Value of BaleenDefaults.DEFAULT_CONTENT_EXTRACTOR
*/
public static final String PARAM_CONTENT_EXTRACTOR = "contentExtractor";
@ConfigurationParameter(name = PARAM_CONTENT_EXTRACTOR, defaultValue=BaleenDefaults.DEFAULT_CONTENT_EXTRACTOR)
private String contentExtractor;
/**
* Should the source document be deleted from Mongo after reading
*
* @baleen.config false
*/
public static final String PARAM_DELETE_SOURCE = "deleteSource";
@ConfigurationParameter(name = PARAM_DELETE_SOURCE, defaultValue = "false")
private boolean deleteSource = false;
private MongoCollection<Document> coll;
List<ObjectId> queue = new LinkedList<>();
ObjectId lastId = null;
private IContentExtractor extractor;
@Override
protected void doInitialize(UimaContext context) throws ResourceInitializationException {
try{
extractor = getContentExtractor(contentExtractor);
}catch(InvalidParameterException ipe){
throw new ResourceInitializationException(ipe);
}
extractor.initialize(context, getConfigParameters(context));
coll = mongo.getDB().getCollection(collection);
getNewIds();
}
@Override
protected void doGetNext(JCas jCas) throws IOException, CollectionException {
ObjectId id = queue.remove(0);
Document docIdField = new Document(idField, id);
Document document = coll.find(docIdField).first();
if(document == null){
getMonitor().error("No document returned from Mongo");
throw new CollectionException();
}
String content = (String) document.get(contentField);
InputStream is = IOUtils.toInputStream(content, Charset.defaultCharset());
extractor.processStream(is, mongo.getMongoURI() + "." + collection + "#" + id, jCas);
for(String key : document.keySet()){
if(contentField.equals(key) || idField.equals(key)){
continue;
}else{
Object obj = document.get(key);
processMongoMetadataField(jCas, key, obj);
}
}
if(deleteSource){
coll.deleteOne(docIdField);
}
}
private void processMongoMetadataField(JCas jCas, String key, Object obj){
if(obj instanceof List){
List<?> list = (List<?>) obj;
for(Object o : list){
addMetadata(jCas, key, o.toString());
}
}else{
addMetadata(jCas, key, obj.toString());
}
}
private void addMetadata(JCas jCas, String key, String value){
Metadata md = new Metadata(jCas);
md.setKey(key);
md.setValue(value);
getSupport().add(md);
}
@Override
protected void doClose() throws IOException {
coll = null;
if(extractor != null) {
extractor.destroy();
extractor = null;
}
}
@Override
public boolean doHasNext() throws IOException, CollectionException {
getNewIds();
return !queue.isEmpty();
}
private void getNewIds(){
Document query;
if(lastId != null){
query = new Document(idField, new Document("$gt", lastId));
}else{
query = new Document();
}
Document docIdField = new Document(idField, 1);
for(Document doc : coll.find(query).projection(docIdField).sort(docIdField)){
ObjectId id = (ObjectId) doc.get(idField);
queue.add(id);
lastId = id;
}
}
}