DocumentTypeByFilename.java example

Explorer
baleen-master
//Dstl (c) Crown Copyright 2017
package uk.gov.dstl.baleen.annotators.misc;

import java.io.File;
import java.util.Collections;
import java.util.Locale;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;

import org.apache.uima.UimaContext;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.fit.descriptor.ConfigurationParameter;
import org.apache.uima.jcas.JCas;
import org.apache.uima.jcas.tcas.DocumentAnnotation;
import org.apache.uima.resource.ResourceInitializationException;

import com.google.common.collect.ImmutableSet;

import uk.gov.dstl.baleen.core.pipelines.orderers.AnalysisEngineAction;
import uk.gov.dstl.baleen.uima.BaleenAnnotator;

/**
 * Use part of the filename as the document type,
 * using a regular expression to extract the correct part.
 * 
 * The filename is taken from the last path segment of the document's source URI
 * and must match the configured pattern in full. If there is no source URI, the
 * filename does not match, or the selected group did not take part in the match,
 * the configured default type is used instead.
 * 
 * Any leading or trailing whitespace from the type name is trimmed
 * before the optional prefix is prepended.
 * 
 * @baleen.javadoc
 */
public class DocumentTypeByFilename extends BaleenAnnotator {

	/**
	 * The pattern to match filenames against.
	 * By default, sets the document type to the file extension.
	 * 
	 * @baleen.config .*\\.([a-z0-9]{2,4})
	 */
	public static final String PARAM_PATTERN = "pattern";
	@ConfigurationParameter(name = PARAM_PATTERN, defaultValue=".*\\.([a-z0-9]{2,4})")
	private String pattern;
	
	/** Compiled form of {@link #pattern}, built once in {@link #doInitialize(UimaContext)}. */
	private Pattern typePattern;
	
	/**
	 * The regex group to use as the type
	 * 
	 * @baleen.config 1
	 */
	public static final String PARAM_GROUP = "group";
	@ConfigurationParameter(name = PARAM_GROUP, defaultValue="1")
	private Integer group;
	
	/**
	 * Is the regular expression case sensitive?
	 * 
	 * @baleen.config false
	 */
	public static final String PARAM_CASE_SENSITIVE = "caseSensitive";
	@ConfigurationParameter(name = PARAM_CASE_SENSITIVE, defaultValue="false")
	private boolean caseSensitive = false;
	
	/**
	 * The default value to use if the filename doesn't match the regex
	 * 
	 * @baleen.config
	 */
	public static final String PARAM_DEFAULT = "default";
	@ConfigurationParameter(name = PARAM_DEFAULT, defaultValue="")
	private String defaultType;
	
	/**
	 * Should the extracted type be lower-cased?
	 * This will provide some level of normalisation across types
	 * 
	 * @baleen.config true
	 */
	public static final String PARAM_LOWER_CASE = "lowerCase";
	@ConfigurationParameter(name = PARAM_LOWER_CASE, defaultValue="true")
	private boolean lowerCase = true;
	
	/**
	 * An optional prefix to add to the type
	 * 
	 * @baleen.config
	 */
	public static final String PARAM_PREFIX = "prefix";
	@ConfigurationParameter(name = PARAM_PREFIX, defaultValue="")
	private String prefix;
	
	/**
	 * Compiles the configured pattern and checks that the configured group exists in it.
	 * 
	 * @param context the UIMA context (not used directly here)
	 * @throws ResourceInitializationException if the pattern is not a valid regular
	 *         expression, or the group index does not refer to a group of the pattern
	 */
	@Override
	public void doInitialize(UimaContext context) throws ResourceInitializationException {
		int flags = caseSensitive ? 0 : Pattern.CASE_INSENSITIVE;
		try {
			typePattern = Pattern.compile(pattern, flags);
		} catch (PatternSyntaxException e) {
			// Report bad configuration through the declared exception type
			throw new ResourceInitializationException(e);
		}
		
		// Fail fast on a bad group index rather than throwing for every matching document
		int availableGroups = typePattern.matcher("").groupCount();
		if (group == null || group < 0 || group > availableGroups) {
			throw new ResourceInitializationException(new IllegalArgumentException(
					"Parameter '" + PARAM_GROUP + "' is " + group + " but pattern '" + pattern
					+ "' only has groups 0 to " + availableGroups));
		}
	}
	
	/**
	 * Sets the document type on the document annotation from the source filename.
	 * 
	 * @param aJCas the document being processed
	 * @throws AnalysisEngineProcessException declared by the overridden method
	 */
	@Override
	public void doProcess(JCas aJCas) throws AnalysisEngineProcessException {
		DocumentAnnotation da = getDocumentAnnotation(aJCas);
		
		String type = extractType(da.getSourceUri());
		if (type == null) {
			// No usable default configured; treat as an empty type
			type = "";
		}
		
		if (lowerCase) {
			// Locale.ROOT keeps the result independent of the platform locale (e.g. Turkish dotless i)
			type = type.toLowerCase(Locale.ROOT);
		}
		
		String typePrefix = prefix == null ? "" : prefix;
		da.setDocType(typePrefix + type.trim());
	}
	
	/**
	 * Extracts the raw (not yet trimmed or lower-cased) type from a source URI.
	 * 
	 * @param sourceUri the document's source URI, may be null
	 * @return the text captured by the configured group, or the default type if there is
	 *         no source URI, the filename does not match, or the group captured nothing
	 */
	private String extractType(String sourceUri) {
		if (sourceUri == null) {
			return defaultType;
		}
		
		Matcher m = typePattern.matcher(new File(sourceUri).getName());
		if (!m.matches()) {
			return defaultType;
		}
		
		// group() returns null when an optional group did not participate in the match
		String captured = m.group(group);
		return captured != null ? captured : defaultType;
	}
	
	/**
	 * Describes this annotator to the pipeline orderer: the first argument is an empty
	 * set and the second contains only {@link DocumentAnnotation}, the type this
	 * annotator modifies.
	 */
	@Override
	public AnalysisEngineAction getAction() {
		return new AnalysisEngineAction(Collections.emptySet(), ImmutableSet.of(DocumentAnnotation.class));
	}
}