//Dstl (c) Crown Copyright 2017
package uk.gov.dstl.baleen.annotators.misc;

import java.io.File;
import java.util.Collections;
import java.util.Locale;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;

import org.apache.uima.UimaContext;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.fit.descriptor.ConfigurationParameter;
import org.apache.uima.jcas.JCas;
import org.apache.uima.jcas.tcas.DocumentAnnotation;
import org.apache.uima.resource.ResourceInitializationException;

import com.google.common.collect.ImmutableSet;

import uk.gov.dstl.baleen.core.pipelines.orderers.AnalysisEngineAction;
import uk.gov.dstl.baleen.uima.BaleenAnnotator;

/**
 * Use part of the filename as the document type,
 * using a regular expression to extract the correct part.
 *
 * Any leading or trailing whitespace from the type name is trimmed.
 * If the filename does not match the pattern, the document has no source URI,
 * or the selected group did not take part in the match, the configured default
 * type is used instead.
 *
 * @baleen.javadoc
 */
public class DocumentTypeByFilename extends BaleenAnnotator {

	/**
	 * The pattern to match filenames against.
	 * By default, sets the document type to the file extension.
	 *
	 * @baleen.config .*\\.([a-z0-9]{2,4})
	 */
	public static final String PARAM_PATTERN = "pattern";
	@ConfigurationParameter(name = PARAM_PATTERN, defaultValue=".*\\.([a-z0-9]{2,4})")
	private String pattern;

	/** Compiled form of {@link #pattern}, built once in {@link #doInitialize(UimaContext)}. */
	private Pattern typePattern;

	/**
	 * The regex group to use as the type
	 *
	 * @baleen.config 1
	 */
	public static final String PARAM_GROUP = "group";
	@ConfigurationParameter(name = PARAM_GROUP, defaultValue="1")
	private Integer group;

	/**
	 * Is the regular expression case sensitive?
	 *
	 * @baleen.config false
	 */
	public static final String PARAM_CASE_SENSITIVE = "caseSensitive";
	@ConfigurationParameter(name = PARAM_CASE_SENSITIVE, defaultValue="false")
	private boolean caseSensitive = false;

	/**
	 * The default value to use if the filename doesn't match the regex
	 *
	 * @baleen.config
	 */
	public static final String PARAM_DEFAULT = "default";
	@ConfigurationParameter(name = PARAM_DEFAULT, defaultValue="")
	private String defaultType;

	/**
	 * Should the extracted type be lower-cased?
	 * This will provide some level of normalisation across types
	 *
	 * @baleen.config true
	 */
	public static final String PARAM_LOWER_CASE = "lowerCase";
	@ConfigurationParameter(name = PARAM_LOWER_CASE, defaultValue="true")
	private boolean lowerCase = true;

	/**
	 * An optional prefix to add to the type
	 *
	 * @baleen.config
	 */
	public static final String PARAM_PREFIX = "prefix";
	@ConfigurationParameter(name = PARAM_PREFIX, defaultValue="")
	private String prefix;

	/**
	 * Compiles the configured pattern (case-insensitively unless {@code caseSensitive} is set)
	 * and checks that the configured group index exists in it.
	 *
	 * @param context the UIMA context (not used here)
	 * @throws ResourceInitializationException if the pattern is not a valid regular expression,
	 *         or the group index is missing, negative, or larger than the number of capturing
	 *         groups in the pattern
	 */
	@Override
	public void doInitialize(UimaContext context) throws ResourceInitializationException {
		int flags = caseSensitive ? 0 : Pattern.CASE_INSENSITIVE;
		try {
			typePattern = Pattern.compile(pattern, flags);
		} catch (PatternSyntaxException e) {
			// Surface a bad regex as a configuration failure rather than an unchecked exception
			throw new ResourceInitializationException(e);
		}

		// Fail fast on a bad group index; otherwise Matcher.group would throw
		// IndexOutOfBoundsException for every matching document at process time.
		int availableGroups = typePattern.matcher("").groupCount();
		if (group == null || group < 0 || group > availableGroups) {
			throw new ResourceInitializationException(new IllegalArgumentException(
					"Parameter '" + PARAM_GROUP + "' is " + group + " but pattern '" + pattern
							+ "' only has groups 0 to " + availableGroups));
		}
	}

	/**
	 * Sets the document type on the document annotation to the prefix followed by the
	 * (optionally lower-cased, trimmed) type extracted from the source file name.
	 *
	 * @param aJCas the document being processed
	 * @throws AnalysisEngineProcessException declared by the overridden method; not thrown directly here
	 */
	@Override
	public void doProcess(JCas aJCas) throws AnalysisEngineProcessException {
		DocumentAnnotation da = getDocumentAnnotation(aJCas);

		String type = extractType(da.getSourceUri());
		if (lowerCase) {
			// Locale.ROOT keeps the normalisation independent of the JVM's default locale
			type = type.toLowerCase(Locale.ROOT);
		}

		da.setDocType(nullToEmpty(prefix) + type.trim());
	}

	/**
	 * Builds the action descriptor for this annotator from an empty set and
	 * {@link DocumentAnnotation} (presumably the inputs and outputs respectively —
	 * confirm against {@link AnalysisEngineAction}).
	 *
	 * @return the action descriptor
	 */
	@Override
	public AnalysisEngineAction getAction() {
		return new AnalysisEngineAction(Collections.emptySet(), ImmutableSet.of(DocumentAnnotation.class));
	}

	/**
	 * Extracts the type from the last path segment of the source URI using the compiled pattern.
	 *
	 * @param sourceUri the document's source URI, may be null
	 * @return the text of the configured group, or the default type (never null) when there is
	 *         no source URI, no match, or the group did not participate in the match
	 */
	private String extractType(String sourceUri) {
		if (sourceUri != null) {
			Matcher m = typePattern.matcher(new File(sourceUri).getName());
			if (m.matches()) {
				// group() returns null when the group exists but did not take part in the match
				String matched = m.group(group);
				if (matched != null) {
					return matched;
				}
			}
		}
		return nullToEmpty(defaultType);
	}

	/** Returns the given string, or the empty string if it is null. */
	private static String nullToEmpty(String value) {
		return value == null ? "" : value;
	}
}