//Dstl (c) Crown Copyright 2017
package uk.gov.dstl.baleen.annotators.misc;
import java.io.File;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.fit.descriptor.ConfigurationParameter;
import org.apache.uima.jcas.JCas;
import org.apache.uima.jcas.tcas.DocumentAnnotation;
import com.google.common.collect.ImmutableSet;
import uk.gov.dstl.baleen.core.pipelines.orderers.AnalysisEngineAction;
import uk.gov.dstl.baleen.types.metadata.Metadata;
import uk.gov.dstl.baleen.uima.BaleenAnnotator;
/**
 * Adds the document source (or just the file name) to the document metadata.
 *
 * <p>The source is taken from the source URI of the document annotation and stored in a new
 * {@link Metadata} annotation under the configured key. When {@code nameOnly} is enabled,
 * only the last path segment is kept and its trailing file extension is removed.
 *
 * <p>If the document annotation has no source URI, no metadata is added.
 *
 * @baleen.javadoc
 */
public class AddSourceToMetadata extends BaleenAnnotator {

    /**
     * The metadata key to use
     *
     * @baleen.config source
     */
    public static final String PARAM_METADATA_KEY = "key";
    @ConfigurationParameter(name = PARAM_METADATA_KEY, defaultValue = "source")
    private String key;

    /**
     * Only use the file name (without the file extension)
     *
     * @baleen.config false
     */
    public static final String PARAM_NAME_ONLY = "nameOnly";
    @ConfigurationParameter(name = PARAM_NAME_ONLY, defaultValue = "false")
    private Boolean nameOnly;

    /**
     * Creates a {@link Metadata} annotation holding the document source (or its bare file name)
     * and adds it to the JCas index.
     *
     * @param jCas the document being processed
     * @throws AnalysisEngineProcessException declared by the overridden method; not thrown directly here
     */
    @Override
    protected void doProcess(JCas jCas) throws AnalysisEngineProcessException {
        DocumentAnnotation da = getDocumentAnnotation(jCas);
        String source = da.getSourceUri();
        if (source == null) {
            // Without a source there is nothing meaningful to record; previously this either
            // threw a NullPointerException (nameOnly) or indexed a Metadata with a null value.
            return;
        }

        Metadata md = new Metadata(jCas);
        md.setKey(key);

        // Boolean.TRUE.equals avoids an unboxing NullPointerException if the parameter was never injected
        if (Boolean.TRUE.equals(nameOnly)) {
            md.setValue(stripExtension(new File(source).getName()));
        } else {
            md.setValue(source);
        }

        addToJCasIndex(md);
    }

    /**
     * Removes the trailing extension (the text from the last '.') from a file name.
     *
     * <p>A name with no '.' is returned unchanged. A name whose only '.' is the first
     * character (e.g. {@code .bashrc}) is also returned unchanged, so that dot-files do not
     * collapse to an empty string.
     *
     * @param fileName the file name, without any directory part
     * @return the file name without its extension
     */
    private static String stripExtension(String fileName) {
        int lastDot = fileName.lastIndexOf('.');
        return lastDot > 0 ? fileName.substring(0, lastDot) : fileName;
    }

    /**
     * Declares that this annotator takes {@link DocumentAnnotation} as input and produces
     * {@link Metadata} as output.
     *
     * @return the action describing this annotator's input and output types
     */
    @Override
    public AnalysisEngineAction getAction() {
        return new AnalysisEngineAction(ImmutableSet.of(DocumentAnnotation.class), ImmutableSet.of(Metadata.class));
    }
}