//Dstl (c) Crown Copyright 2017 package uk.gov.dstl.baleen.annotators.misc; import java.io.File; import java.util.Collections; import org.apache.uima.analysis_engine.AnalysisEngineProcessException; import org.apache.uima.fit.descriptor.ConfigurationParameter; import org.apache.uima.jcas.JCas; import org.apache.uima.jcas.tcas.DocumentAnnotation; import com.google.common.collect.ImmutableSet; import uk.gov.dstl.baleen.core.pipelines.orderers.AnalysisEngineAction; import uk.gov.dstl.baleen.uima.BaleenAnnotator; /** * Sets the document type based on the file location, stripping out a 'root' directory specified by the user * * * @baleen.javadoc */ public class DocumentTypeByLocation extends BaleenAnnotator { /** * The root directory to strip out of the path before setting the document type * * @baleen.config */ public static final String PARAM_BASE_DIRECTORY = "baseDirectory"; @ConfigurationParameter(name = PARAM_BASE_DIRECTORY, defaultValue="") private String baseDirectory; @Override public void doProcess(JCas aJCas) throws AnalysisEngineProcessException { DocumentAnnotation da = getDocumentAnnotation(aJCas); File f = new File(da.getSourceUri()); String type = f.getParentFile().getAbsolutePath(); if(baseDirectory != null && type.startsWith(baseDirectory)) type = type.substring(baseDirectory.length()); // remove leading and trailing slashes and backslashes using a regular expression type = type.replaceAll("^\\\\+|^\\/+|\\\\+$|\\/+$", ""); da.setDocType(type); } @Override public AnalysisEngineAction getAction() { return new AnalysisEngineAction(Collections.emptySet(), ImmutableSet.of(DocumentAnnotation.class)); } }