//Dstl (c) Crown Copyright 2017 package uk.gov.dstl.baleen.annotators.grammatical; import java.util.ArrayList; import java.util.Arrays; import java.util.List; import java.util.regex.Matcher; import java.util.regex.Pattern; import org.apache.uima.UimaContext; import org.apache.uima.analysis_engine.AnalysisEngineProcessException; import org.apache.uima.fit.util.JCasUtil; import org.apache.uima.jcas.JCas; import org.apache.uima.resource.ResourceInitializationException; import org.atteo.evo.inflector.English; import com.google.common.collect.ImmutableSet; import uk.gov.dstl.baleen.core.pipelines.orderers.AnalysisEngineAction; import uk.gov.dstl.baleen.types.common.Organisation; import uk.gov.dstl.baleen.types.language.PhraseChunk; import uk.gov.dstl.baleen.uima.BaleenAnnotator; /** * Find noun phrases that contain a word that might indicate it's an organisation, * such as 'council' or 'forces'. */ public class NPOrganisation extends BaleenAnnotator { private static final List<String> ORGANISATION_KEYWORDS = Arrays.asList( "council", "government", "coalition", "agency", "community", "regime", "service", "group", "unit", "faculty", "division", "department", "institute", "university", "committee", "company", "organisation", "coalition", "tribe", "brotherhood","sisterhood", "force", "police", "army", "navy", "rebels", "insurgents", "militants", "united nations", "nato" ); private List<String> organisationInclPlurals; @Override public void doInitialize(UimaContext aContext) throws ResourceInitializationException { organisationInclPlurals = new ArrayList<>(ORGANISATION_KEYWORDS.size() * 2); for(String org : ORGANISATION_KEYWORDS){ organisationInclPlurals.add(org); organisationInclPlurals.add(English.plural(org)); } } @Override protected void doProcess(JCas jCas) throws AnalysisEngineProcessException { for(PhraseChunk chunk : JCasUtil.select(jCas, PhraseChunk.class)){ if(!"NP".equals(chunk.getChunkType())) continue; processNP(jCas, chunk); } } /** * Process a noun phrase to find entities */ public void processNP(JCas jCas, PhraseChunk chunk){ String coveredText = chunk.getCoveredText().toLowerCase(); for(String org : organisationInclPlurals){ if(coveredText.contains(org)){ //Check that word isn't part of another word Pattern p = Pattern.compile("\\b"+org+"\\b", Pattern.CASE_INSENSITIVE); Matcher m = p.matcher(coveredText); if(m.find()){ Organisation o = new Organisation(jCas); o.setBegin(chunk.getBegin()); o.setEnd(chunk.getEnd()); addToJCasIndex(o); return; //Phrase chunk shouldn't contain more than one organisation } } } } @Override public AnalysisEngineAction getAction() { return new AnalysisEngineAction(ImmutableSet.of(PhraseChunk.class), ImmutableSet.of(Organisation.class)); } }