package de.berlin.hu.uima.ae.tagger.drug; import java.io.IOException; import java.io.InputStream; import java.io.ObjectInputStream; import java.io.PrintStream; import java.util.List; import org.apache.commons.io.output.NullOutputStream; import org.apache.uima.UimaContext; import org.apache.uima.analysis_component.JCasAnnotator_ImplBase; import org.apache.uima.analysis_engine.AnalysisEngineProcessException; import org.apache.uima.jcas.JCas; import org.apache.uima.resource.ResourceInitializationException; import org.u_compare.shared.semantic.Chemical; import org.u_compare.shared.syntactic.Sentence; import org.u_compare.shared.syntactic.Token; import org.uimafit.util.JCasUtil; import scala.Tuple2; import scala.collection.Iterator; import simplexnlp.core.Entity; import de.berlin.hu.chemspot.ChemSpotConfiguration; import de.berlin.hu.eumed.Config; import de.berlin.hu.eumed.EntityTagger; import de.berlin.hu.types.PubmedDocument; import de.berlin.hu.util.Constants; import de.berlin.hu.util.Constants.ChemicalType; public class EumedNERTagger extends JCasAnnotator_ImplBase { public static final String PATH_TO_EUMED_MODEL = "pathToEumedModel"; private EntityTagger tagger = null; public void loadCRFAndPipe(InputStream in) throws IOException, ClassNotFoundException { ObjectInputStream ois = new ObjectInputStream(in); ois.close(); } @Override public void initialize(UimaContext aContext) throws ResourceInitializationException { String eumedModel = (String)aContext.getConfigParameterValue(PATH_TO_EUMED_MODEL); tagger = new EntityTagger(); tagger.add(new Tuple2<String, String>("path", eumedModel)); tagger.add(new Tuple2<String, Config>("config", new Config(new String[0]))); tagger.initialize(); } @Override public void process(JCas aJCas) throws AnalysisEngineProcessException { Iterable<PubmedDocument> documents = JCasUtil.iterate(aJCas, PubmedDocument.class); for (PubmedDocument doc : documents) { String docId = doc.getPmid(); simplexnlp.core.Document simplexDoc = new simplexnlp.core.Document(docId, doc.getCoveredText()); Iterable<Sentence> sentences = JCasUtil.selectCovered(aJCas, Sentence.class, doc); int i = 0; for (Sentence sentence: sentences) { simplexnlp.example.Sentence simplexSentence = new simplexnlp.example.Sentence(sentence.getBegin() - doc.getBegin(), sentence.getEnd() - doc.getBegin() - 1); simplexSentence.id_$eq(docId + ".s" + i++); simplexSentence.origId_$eq(simplexSentence.id()); simplexDoc.add(simplexSentence); //System.out.println("--- Sentence " + i + " ---"); List<Token> tokens = JCasUtil.selectCovered(aJCas, Token.class, sentence); for (Token token : tokens) { simplexnlp.core.Token simplexToken = new simplexnlp.core.Token(token.getBegin() - sentence.getBegin(), token.getEnd() - sentence.getBegin() - 1); simplexToken.pos_$eq(token.getLabel()); simplexSentence.add(simplexToken); //System.out.println(simplexToken.toString()); } } // ugly hack to avoid debug output from model synchronized(System.out) { PrintStream out = System.out; System.setOut(new PrintStream(new NullOutputStream())); tagger.process(simplexDoc); System.setOut(out); } Iterator<simplexnlp.core.Sentence> simplexSentences = simplexDoc.sentences().toIterator(); while (simplexSentences.hasNext()) { simplexnlp.core.Sentence sentence = simplexSentences.next(); Iterator<Entity> entities = sentence.entities().toIterator(); while (entities.hasNext()) { Entity entity = entities.next(); ChemicalType type = ChemicalType.fromString(entity.className()); if (ChemSpotConfiguration.isAnnotateEumed(type)) { createChemicalAnnotation(aJCas, sentence.start() + entity.start() + doc.getBegin(), sentence.start() + doc.getBegin() + entity.end() + 1, entity.className()); } } } } } private Chemical createChemicalAnnotation(JCas aJCas, int begin, int end, String type) { Chemical chemical = new Chemical(aJCas); chemical.setBegin(begin); chemical.setEnd(end); chemical.setSource(Constants.EUMED); chemical.setEntityType(type); chemical.addToIndexes(); //System.out.println("'" + drug.getCoveredText() + "'"); return chemical; } }