package de.berlin.hu.uima.ae.filter; import de.berlin.hu.util.Constants; import org.apache.uima.UimaContext; import org.apache.uima.analysis_component.JCasAnnotator_ImplBase; import org.apache.uima.analysis_engine.AnalysisEngineProcessException; import org.apache.uima.cas.FSIndex; import org.apache.uima.jcas.JCas; import org.apache.uima.resource.ResourceInitializationException; import org.u_compare.shared.semantic.NamedEntity; import java.io.*; import java.util.ArrayList; import java.util.HashSet; import java.util.Iterator; import java.util.List; /** * Filters annotations according to a stopword list generated from the Google-N-Gram corpus. */ public class StopwordFilter extends JCasAnnotator_ImplBase { private List<NamedEntity> invalidChemicals = null; private HashSet<String> stopwords = new HashSet<String>(); @Override public void initialize(UimaContext aContext) throws ResourceInitializationException { super.initialize(aContext); InputStream stopwordFile = this.getClass().getClassLoader().getResourceAsStream("resources/chemspot_stop_words.txt"); try { BufferedReader reader = new BufferedReader(new InputStreamReader(stopwordFile)); String line = ""; while(null != (line = reader.readLine()) ) { stopwords.add(line); } } catch (FileNotFoundException e) { throw new ResourceInitializationException(e); } catch (IOException e) { throw new ResourceInitializationException(e); } } @Override public void process(JCas aJCas) throws AnalysisEngineProcessException { FSIndex chemicalIndex = aJCas.getAnnotationIndex(NamedEntity.type); Iterator chemicalIterator = chemicalIndex.iterator(); invalidChemicals = new ArrayList<NamedEntity>(); while (chemicalIterator.hasNext()) { NamedEntity chemical = (NamedEntity) chemicalIterator.next(); if (!Constants.GOLDSTANDARD.equals(chemical.getSource())) { if (stopwords.contains(chemical.getCoveredText().toLowerCase())) { invalidChemicals.add(chemical); } } } for (NamedEntity invalidChemical : invalidChemicals) { invalidChemical.removeFromIndexes(); } } }