//Dstl (c) Crown Copyright 2017 package uk.gov.dstl.baleen.annotators.gazetteer; import java.util.Collections; import java.util.HashMap; import java.util.Map; import org.apache.uima.fit.descriptor.ConfigurationParameter; import org.apache.uima.fit.descriptor.ExternalResource; import com.google.common.collect.ImmutableSet; import uk.gov.dstl.baleen.annotators.gazetteer.helpers.AbstractAhoCorasickAnnotator; import uk.gov.dstl.baleen.core.pipelines.orderers.AnalysisEngineAction; import uk.gov.dstl.baleen.exceptions.BaleenException; import uk.gov.dstl.baleen.resources.SharedFileResource; import uk.gov.dstl.baleen.resources.gazetteer.FileGazetteer; import uk.gov.dstl.baleen.resources.gazetteer.IGazetteer; /** * Generic file-backed RadixTree Gazetteer annotator, that will use a file based gazetteer to find and annotate entities. * * * @baleen.javadoc */ public class File extends AbstractAhoCorasickAnnotator { /** * Connection to File Gazetteer * * @baleen.resource uk.gov.dstl.baleen.resources.SharedFileResource */ public static final String KEY_FILE = "fileGazetteer"; @ExternalResource(key = KEY_FILE) private SharedFileResource fileResource; /** * The file, which is expected to be a line separated gazetteer with aliases comma-separated (by default) on the same line, to use as the gazetteer * * @baleen.config gazetteer.txt */ public static final String PARAM_FILE_NAME = "fileName"; @ConfigurationParameter(name = PARAM_FILE_NAME, defaultValue = "gazetteer.txt") private String fileName; /** * An alias term separator string that will override the "," default value * * @baleen.config , */ public static final String PARAM_TERM_SEPARATOR = "termSeparator"; @ConfigurationParameter(name = PARAM_TERM_SEPARATOR, defaultValue = ",") private String termSeparator; /** * Constructor */ public File() { // Do nothing } @Override public IGazetteer configureGazetteer() throws BaleenException { Map<String, Object> config = new HashMap<>(); config.put(FileGazetteer.CONFIG_CASE_SENSITIVE, caseSensitive); config.put(FileGazetteer.CONFIG_FILE, fileName); config.put(FileGazetteer.CONFIG_TERM_SEPARATOR, termSeparator); IGazetteer gaz = new FileGazetteer(); gaz.init(fileResource, config); return gaz; } @Override public AnalysisEngineAction getAction() { return new AnalysisEngineAction(Collections.emptySet(), ImmutableSet.of(entityType)); } }