//Dstl (c) Crown Copyright 2017 package uk.gov.dstl.baleen.annotators.regex; import java.util.Collections; import java.util.regex.Matcher; import org.apache.uima.jcas.JCas; import com.google.common.collect.ImmutableSet; import uk.gov.dstl.baleen.annotators.regex.helpers.AbstractRegexAnnotator; import uk.gov.dstl.baleen.core.pipelines.orderers.AnalysisEngineAction; import uk.gov.dstl.baleen.types.common.CommsIdentifier; /** * Annotate e-mail addresses within a document using regular expressions * * <p>Look for text matching the following regular expression and annotate it as a CommsIdentifier with type 'email':</p> * <pre>[A-Z0-9._%+-]+@([A-Z0-9.-]+[.][A-Z]{2,6})</pre> * <p>This will capture the vast majority of valid e-mail addresses, although it will not capture every valid e-mail address as defined in RFC 2822. * No checking is done to determine whether extracted e-mail addresses exist or not.</p> * * */ public class Email extends AbstractRegexAnnotator<CommsIdentifier> { private static final String EMAIL_REGEX = "[A-Z0-9._%+-]+@([A-Z0-9.-]+[.][A-Z]{2,6})"; /** New instance. * */ public Email() { super(EMAIL_REGEX, false, 1.0); } @Override protected CommsIdentifier create(JCas jCas, Matcher matcher) { CommsIdentifier ci = new CommsIdentifier(jCas); ci.setSubType("email"); return ci; } @Override public AnalysisEngineAction getAction() { return new AnalysisEngineAction(Collections.emptySet(), ImmutableSet.of(CommsIdentifier.class)); } }