//Dstl (c) Crown Copyright 2017 package uk.gov.dstl.baleen.annotators.regex; import java.util.Collections; import java.util.regex.Matcher; import org.apache.uima.jcas.JCas; import com.google.common.collect.ImmutableSet; import uk.gov.dstl.baleen.annotators.regex.helpers.AbstractRegexAnnotator; import uk.gov.dstl.baleen.core.pipelines.orderers.AnalysisEngineAction; import uk.gov.dstl.baleen.types.common.DocumentReference; /** * Annotate document references that start with document, letter or resolution followed by a number * * @baleen.javadoc */ public class DocumentNumber extends AbstractRegexAnnotator<DocumentReference> { //TODO: Allow users to specify document number pattern (and prefixes) private static final String DOCUMENT_REGEX = "(document|letter|resolution|executive order)( \\d+|s \\d+(, \\d+)*( )?(and \\d+)?)"; /** New instance. * */ public DocumentNumber() { super(DOCUMENT_REGEX, false, 1.0); } @Override public AnalysisEngineAction getAction() { return new AnalysisEngineAction(Collections.emptySet(), ImmutableSet.of(DocumentReference.class)); } @Override protected DocumentReference create(JCas jCas, Matcher matcher) { //TODO: Annotate each document separately return new DocumentReference(jCas); } }