//Dstl (c) Crown Copyright 2017
package uk.gov.dstl.baleen.annotators.regex;
import java.util.Collections;
import java.util.regex.Matcher;
import org.apache.uima.jcas.JCas;
import com.google.common.collect.ImmutableSet;
import uk.gov.dstl.baleen.annotators.regex.helpers.AbstractRegexAnnotator;
import uk.gov.dstl.baleen.core.pipelines.orderers.AnalysisEngineAction;
import uk.gov.dstl.baleen.types.common.CommsIdentifier;
/**
 * Finds social media style handles, i.e. text of the form {@code @username}, and annotates them
 * as CommsIdentifiers with the sub type "username".
 *
 * @baleen.javadoc
 */
public class SocialMediaUsername extends AbstractRegexAnnotator<CommsIdentifier> {

  /** Sub type set on every CommsIdentifier built by {@link #create(JCas, Matcher)}. */
  private static final String SUB_TYPE = "username";

  /*
   * The leading \B means the '@' must not directly follow a word character, which stops the
   * domain half of an email address (e.g. name@example.com) from being picked up as a handle.
   * The trailing \b makes the match finish on a word boundary.
   */
  private static final String USERNAME_REGEX = "\\B@[A-Za-z0-9-_]+\\b";

  /**
   * New instance.
   *
   * <p>Hands the handle regex to the parent along with the arguments {@code false} and
   * {@code 1.0f}. NOTE(review): these are presumably the case-sensitivity flag and the confidence
   * assigned to matches - confirm against AbstractRegexAnnotator.
   */
  public SocialMediaUsername() {
    super(USERNAME_REGEX, false, 1.0f);
  }

  /**
   * Builds the annotation for a single regex match.
   *
   * @param jCas the CAS the new annotation is created in
   * @param matcher the matcher for the current match; not consulted here
   * @return a new CommsIdentifier whose sub type is "username"
   */
  @Override
  protected CommsIdentifier create(JCas jCas, Matcher matcher) {
    final CommsIdentifier identifier = new CommsIdentifier(jCas);
    identifier.setSubType(SUB_TYPE);
    return identifier;
  }

  /**
   * Describes this annotator for pipeline ordering.
   *
   * @return an action built from an empty set and a set holding only CommsIdentifier;
   *     NOTE(review): presumably these are the (inputs, outputs) of the annotator - confirm
   *     against AnalysisEngineAction
   */
  @Override
  public AnalysisEngineAction getAction() {
    return new AnalysisEngineAction(
        Collections.emptySet(),
        ImmutableSet.of(CommsIdentifier.class));
  }
}