//Dstl (c) Crown Copyright 2017
package uk.gov.dstl.baleen.annotators.grammatical;
import java.util.Arrays;
import java.util.List;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.fit.util.JCasUtil;
import org.apache.uima.jcas.JCas;
import com.google.common.collect.ImmutableSet;
import uk.gov.dstl.baleen.core.pipelines.orderers.AnalysisEngineAction;
import uk.gov.dstl.baleen.types.common.Chemical;
import uk.gov.dstl.baleen.types.language.PhraseChunk;
import uk.gov.dstl.baleen.uima.BaleenAnnotator;
/**
* Find noun phrases that contain an element (e.g. carbon), and annotate
* as a Chemical
*/
public class NPElement extends BaleenAnnotator {
private static final List<String> ELEMENTS = Arrays.asList(
"hydrogen", "helium", "lithium", "beryllium", "boron", "carbon", "nitrogen", "oxygen", "fluorine", "neon",
"sodium", "magnesium", "aluminium", "silicon", "phosphorus", "sulfur", "chlorine", "argon", "potassium", "calcium",
"scandium", "titanium", "vanadium", "chromium", "manganese", "iron", "cobalt", "nickel", "copper", "zinc",
"gallium", "germanium", "arsenic", "selenium", "bromine", "krypton", "rubidium", "strontium", "yttrium", "zirconium",
"niobium", "molybdenum", "technetium", "ruthenium", "rhodium", "palladium", "silver", "cadmium", "indium", "tin",
"antimony", "tellurium", "iodine", "xenon", "caesium", "barium", "lanthanum", "cerium", "praseodymium", "neodymium",
"promethium", "samarium", "europium", "gadolinium", "terbium", "dysprosium", "holmium", "erbium", "thulium", "ytterbium",
"lutetium", "hafnium", "tantalum", "tungsten", "rhenium", "osmium", "iridium", "platinum", "gold", "mercury",
"thallium", "lead", "bismuth", "polonium", "astatine", "radon", "francium", "radium", "actinium", "thorium",
"protactinium", "uranium", "neptunium", "plutonium", "americium", "curium", "berkelium", "californium", "einsteinium", "fermium",
"mendelevium", "neblium", "lawrencium", "rutherfordium", "dubnium", "seaborgium", "bohrium", "hassium", "meitnerium", "darmstadtium",
"roentgenium", "copernicium", "nihonium", "flerovium", "moscovium", "livermorium", "tennessine", "oganesson"
);
@Override
protected void doProcess(JCas jCas) throws AnalysisEngineProcessException {
for(PhraseChunk chunk : JCasUtil.select(jCas, PhraseChunk.class)){
if(!"NP".equals(chunk.getChunkType()))
continue;
String coveredText = chunk.getCoveredText().toLowerCase();
String[] parts = coveredText.split("[^a-z]");
for(String part : parts){
if(ELEMENTS.contains(part)){
Chemical c = new Chemical(jCas, chunk.getBegin(), chunk.getEnd());
addToJCasIndex(c);
}
}
}
}
@Override
public AnalysisEngineAction getAction() {
return new AnalysisEngineAction(ImmutableSet.of(PhraseChunk.class), ImmutableSet.of(Chemical.class));
}
}