package de.berlin.hu.uima.ae.tagger.banner;
import banner.tagging.CRFTagger;
import banner.types.Mention;
import de.berlin.hu.uima.util.Util;
import de.berlin.hu.util.Constants;
import de.berlin.hu.util.Constants.ChemicalType;
import org.apache.commons.configuration.ConfigurationException;
import org.apache.commons.configuration.XMLConfiguration;
import org.apache.uima.UimaContext;
import org.apache.uima.analysis_component.JCasAnnotator_ImplBase;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.cas.FSIndex;
import org.apache.uima.jcas.JCas;
import org.apache.uima.jcas.tcas.Annotation;
import org.apache.uima.resource.ResourceInitializationException;
import org.u_compare.shared.semantic.NamedEntity;
import org.u_compare.shared.syntactic.Sentence;
import java.io.File;
import java.io.IOException;
import java.net.URL;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Set;
/**
* @author Tim Rocktäschel
*
* This is an UIMA Analysis Engine for tagging with BANNER.
* It loads a BANNER model file and tags tokenized sentences in the text of a CAS object.
*/
public class BannerTagger extends JCasAnnotator_ImplBase {
private static final String BANNER_MODEL_FILE_PARAM = "BannerModelFile";
private static final String BANNER_CONFIG_FILE_PARAM = "BannerConfigFile";
private static final String THRESHOLD_PARAM = "Threshold";
//FIXME: remove hardcoded switch to choose between loading model file from external resource or parameter
// private static final boolean USE_RESOURCE = true;
private static final boolean USE_RESOURCE = false;
private CRFTagger tagger;
// private CRFTaggerStochasticGradient tagger;
private URL bannerModelFile;
private File bannerConfigFile;
// Confidence-Threshold for N-Best-Tagger
private double threshold;
// N parameter for N-Best-Tagger
private static final int N = 10;
private int documentCounter;
private XMLConfiguration config;
@Override
public void initialize(UimaContext aContext)
throws ResourceInitializationException {
super.initialize(aContext);
try {
String pathToModelFile = aContext.getConfigParameterValue(BANNER_MODEL_FILE_PARAM).toString();
String pathToConfigFile = aContext.getConfigParameterValue(BANNER_CONFIG_FILE_PARAM).toString();
bannerModelFile = new URL(pathToModelFile);
bannerConfigFile = new File(pathToConfigFile);
try {
config = new XMLConfiguration(bannerConfigFile);
} catch (ConfigurationException e) {
e.printStackTrace();
throw new ResourceInitializationException(e);
}
threshold = Double.parseDouble(aContext.getConfigParameterValue(THRESHOLD_PARAM).toString());
tagger = CRFWrapper.load(bannerModelFile, null, null, null);
// tagger = NBestCRFTagger.load(bannerModelFile, lemmatiser, posTagger, null, N, true);
// tagger = NBestCRFTagger.load(bannerModelFile, LEMMATISER, POS_TAGGER, null, N, false); //then the sum is used for the same sequences
} catch (IOException e) {
throw new ResourceInitializationException(e);
}
documentCounter = 0;
}
@Override
public void process(JCas aJCas) throws AnalysisEngineProcessException {
FSIndex<Annotation> sentenceIndex = aJCas.getAnnotationIndex(Sentence.type);
Iterator<Annotation> sentenceIterator = sentenceIndex.iterator();
int sentenceCounter = 0;
Set<Mention> mentions = new HashSet<Mention>();
String docText = aJCas.getDocumentText();
long start = System.currentTimeMillis();
while (sentenceIterator.hasNext()) {
Sentence sentence = (Sentence) sentenceIterator.next();
// convert every sentence into a BANNER sentence
banner.types.Sentence bannerSentence = new banner.types.Sentence(sentenceCounter+"", documentCounter+"", sentence.getCoveredText());
// get tokens covered by the sentence
List<org.u_compare.shared.syntactic.Token> tokensInSentence = Util.getTokens(aJCas, sentence);
Util.tokenizeBannerSentence(bannerSentence, tokensInSentence);
assert sentence.getCoveredText().equals(bannerSentence.getText());
assert tokensInSentence.size() == bannerSentence.getTokens().size() : (tokensInSentence.size() + " != " + bannerSentence.getTokens().size());
try {
tagger.tag(bannerSentence);
} catch (ArrayIndexOutOfBoundsException e) {
System.err.println("ERROR!");
System.err.println("Corpus:\t" + sentence.getCoveredText());
System.err.println("BANNER:\t" + bannerSentence.getText());
}
// annotate found entities
mentions.addAll(createAnnotations(aJCas, sentence.getBegin(), bannerSentence));
sentenceCounter++;
}
/*
//propagate all findings...
for (Mention mention : mentions) {
String pattern = mention.getText();
int index = docText.indexOf(pattern);
while (index >= 0) {
int beginning = index;
int ending = beginning + pattern.length();
if (beginning != mention.getStart()) {
NamedEntity entity = new NamedEntity(aJCas);
entity.setBegin(beginning);
entity.setEnd(ending);
entity.setEntityType(mention.getEntityType().getText());
entity.setConfidence(mention.getProbability());
entity.setSource(Constants.CRF);
entity.addToIndexes();
System.out.println(mention.getText() + " != " + entity.getCoveredText());
//TODO: check position
}
index = docText.indexOf(pattern, index + 1);
}
}
*/
long time = System.currentTimeMillis() - start;
//System.out.println("Tagging " + sentenceCounter + " sentences with CRF took " + time + "ms (" + (time/sentenceCounter) + "ms per sentence)");
documentCounter++;
}
/**
* converts each mention into an UIMA annotation
*/
private Set<Mention> createAnnotations(JCas aJCas, int offset, banner.types.Sentence bannerSentence) {
List<Mention> mentions = bannerSentence.getMentions();
Set<Mention> mentionsToAdd = new HashSet<Mention>();
//FIXME: when using NBestCRFTagger: sort mentions first!
Mention lastMention = null;
for (Mention mention : mentions) {
if (mention.getProbability() >= threshold) {
// simple approach to resolve overlapping entities when using NBestCRFTagger
if (lastMention != null && mention.overlaps(lastMention)) {
if (mention.getProbability() > lastMention.getProbability()) {
mentionsToAdd.remove(lastMention);
mentionsToAdd.add(mention);
lastMention = mention;
}
} else {
mentionsToAdd.add(mention);
lastMention = mention;
}
}
}
for (Mention mention : mentionsToAdd) {
NamedEntity entity = new NamedEntity(aJCas);
int startOffset = 0;
int endOffset = 0;
startOffset = bannerSentence.getTokens().get(mention.getStart()).getStart();
endOffset = bannerSentence.getTokens().get(mention.getEnd() - 1).getEnd();
entity.setBegin(offset + startOffset);
entity.setEnd(offset + endOffset);
entity.setEntityType(mention.getEntityType().getText());
entity.setConfidence(mention.getProbability());
entity.setSource(Constants.CRF);
entity.setEntityType(ChemicalType.SYSTEMATIC.toString());
entity.addToIndexes();
assert entity.getCoveredText().equals(mention.getText());
}
return mentionsToAdd;
}
}