//Dstl (c) Crown Copyright 2017 package uk.gov.dstl.baleen.annotators.regex; import java.util.Collections; import java.util.regex.Matcher; import org.apache.commons.lang.StringUtils; import org.apache.uima.jcas.JCas; import com.google.common.collect.ImmutableSet; import uk.gov.dstl.baleen.annotators.regex.helpers.AbstractRegexAnnotator; import uk.gov.dstl.baleen.core.pipelines.orderers.AnalysisEngineAction; import uk.gov.dstl.baleen.types.common.Chemical; /** * Identify chemicals by looking for CAS numbers and checking the check digit */ public class CasRegistryNumber extends AbstractRegexAnnotator<Chemical> { private static final String CAS_REGEX = "\\b(\\d{2,7})-(\\d{2})-(\\d{1})\\b"; /** * New instance. */ public CasRegistryNumber() { super(CAS_REGEX, false, 1.0f); } @Override protected Chemical create(JCas jCas, Matcher matcher) { //Check checksum Integer checkDigit = Integer.valueOf(matcher.group(3)); String part1 = matcher.group(1); String part2 = matcher.group(2); part1 = StringUtils.reverse(part1); Integer sum = Integer.valueOf(part2.substring(1, 2)) + (2*Integer.valueOf(part2.substring(0, 1))); Integer pos = 0; while(pos < part1.length()){ Integer x = Integer.valueOf(part1.substring(pos, pos + 1)); sum += (pos + 3)*x; pos++; } if(sum % 10 != checkDigit){ getMonitor().debug("Pattern matching CAS format found, but check digit is incorrect"); return null; } Chemical c = new Chemical(jCas); c.setSubType("CAS"); return c; } @Override public AnalysisEngineAction getAction() { return new AnalysisEngineAction(Collections.emptySet(), ImmutableSet.of(Chemical.class)); } }