package de.berlin.hu.uima.ae;
import de.berlin.hu.chemspot.ChemSpotConfiguration;
import de.berlin.hu.types.PubmedDocument;
import de.berlin.hu.util.Constants;
import org.apache.uima.analysis_component.JCasAnnotator_ImplBase;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.jcas.JCas;
import org.u_compare.shared.semantic.NamedEntity;
import org.u_compare.shared.semantic.chemical.Chemical;
import org.u_compare.shared.syntactic.Sentence;
import org.uimafit.util.JCasUtil;
import java.util.*;
import java.util.regex.Pattern;
/**
* Merges annotations from the CRF and the dictionary, favoring entities extracted by the CRF over the dictionary.
*/
public class AnnotationMergerAE extends JCasAnnotator_ImplBase {
/**
 * Length threshold for dictionary matches: process() discards a dictionary-sourced entity whose
 * span (end - begin) is less than or equal to this value. The value is the one reported by
 * ChemSpotConfiguration.getDictionaryFilterLength(), unless that is -1, in which case 2 is used.
 */
private static final int DEFAULT_DICTIONARY_FILTER_LENGTH = ChemSpotConfiguration.getDictionaryFilterLength() != -1 ? ChemSpotConfiguration.getDictionaryFilterLength() : 2;
/**
 * Resolves overlapping {@link NamedEntity} annotations and converts the survivors into
 * {@link Chemical} annotations.
 *
 * <p>Every {@link PubmedDocument} in the CAS is processed on its own; if the CAS holds no
 * {@link PubmedDocument}, all entities of the CAS are processed in a single pass. Entities whose
 * source is {@code Constants.GOLDSTANDARD} are never touched. For each document the entities are
 * visited in order of their begin offset and
 * <ol>
 * <li>abbreviation entities are kept only if the name stored in their id matches the text of a
 *     non-abbreviation entity of the same document; otherwise they, and all already accepted
 *     entities with the same text, are dropped,</li>
 * <li>an entity crossing the previously accepted entity of the same sentence is resolved by
 *     source precedence (see the inline comments),</li>
 * <li>an entity separated from the previously accepted one by exactly one space is extended to
 *     cover both,</li>
 * <li>dictionary entities no longer than {@code DEFAULT_DICTIONARY_FILTER_LENGTH} are dropped.</li>
 * </ol>
 *
 * @param aJCas the CAS whose named entities are merged in place
 * @throws AnalysisEngineProcessException declared by the overridden method; not thrown by this
 *         implementation directly
 * @see org.apache.uima.analysis_component.JCasAnnotator_ImplBase#process(org.apache.uima.jcas.JCas)
 */
@Override
public void process(JCas aJCas) throws AnalysisEngineProcessException {
    List<PubmedDocument> documents = new ArrayList<PubmedDocument>();
    for (PubmedDocument doc : JCasUtil.iterate(aJCas, PubmedDocument.class)) {
        documents.add(doc);
    }
    if (documents.isEmpty()) {
        // No document annotations: a single null entry makes the loop below run once over the whole CAS.
        documents.add(null);
    }

    for (PubmedDocument document : documents) {
        Iterator<NamedEntity> entityIterator = document != null
                ? JCasUtil.iterator(document, NamedEntity.class, true, true)
                : JCasUtil.iterator(aJCas, NamedEntity.class);

        // All non-gold entities of this document, plus a lookup from normalized text to entities.
        List<NamedEntity> entities = new ArrayList<NamedEntity>();
        Map<String, List<NamedEntity>> chemicalsMap = new HashMap<String, List<NamedEntity>>();
        while (entityIterator.hasNext()) {
            NamedEntity namedEntity = entityIterator.next();
            if (!Constants.GOLDSTANDARD.equals(namedEntity.getSource())) {
                entities.add(namedEntity);
                String chemName = normalize(namedEntity.getCoveredText());
                List<NamedEntity> sameName = chemicalsMap.get(chemName);
                if (sameName == null) {
                    sameName = new ArrayList<NamedEntity>();
                    chemicalsMap.put(chemName, sameName);
                }
                sameName.add(namedEntity);
            }
        }
        // Stable sort: entities with the same begin offset keep their index order.
        Collections.sort(entities, new Comparator<NamedEntity>() {
            public int compare(NamedEntity m1, NamedEntity m2) {
                int b1 = m1.getBegin();
                int b2 = m2.getBegin();
                return b1 < b2 ? -1 : (b1 == b2 ? 0 : 1);
            }
        });

        // Normalized texts of abbreviations accepted / rejected so far in this document.
        Set<String> abbreviations = new HashSet<String>();
        Set<String> nonChemicalAbbreviations = new HashSet<String>();
        // Entities accepted so far; these are converted to Chemical annotations at the end.
        List<NamedEntity> chemicals = new ArrayList<NamedEntity>();
        //FIXME: use drug if it is identical to CRF match to obtain CAS Registry ID
        NamedEntity lastEntity = null;

        Iterator<Sentence> sentenceIterator = document != null
                ? JCasUtil.iterator(document, Sentence.class, true, true)
                : JCasUtil.iterator(aJCas, Sentence.class);
        Sentence sentence = sentenceIterator.hasNext() ? sentenceIterator.next() : null;

        for (NamedEntity entity : entities) {
            // true once this entity has been rejected
            boolean filtered = false;

            if (sentence != null) {
                // Advance to the sentence of this entity; overlaps are never resolved across sentences.
                while (sentence.getEnd() < entity.getBegin() && sentenceIterator.hasNext()) {
                    sentence = sentenceIterator.next();
                    lastEntity = null;
                }
            }

            // Computed once, before the merge step below may change the span of the entity.
            final String entityKey = normalize(entity.getCoveredText());

            if (nonChemicalAbbreviations.contains(entityKey)) {
                // Subsequent mention of an abbreviation that was already rejected.
                entity.removeFromIndexes(aJCas);
                filtered = true;
            }

            boolean isChemAbbreviation = abbreviations.contains(entityKey);
            if (!isChemAbbreviation && !filtered && Constants.ABBREV.equals(entity.getSource())) {
                // The id of an abbreviation entity is used as a name to look up and is then cleared
                // (presumably it holds the expanded long form - confirm against the abbreviation tagger).
                String name = null;
                if (entity.getId() != null) {
                    name = normalize(entity.getId());
                    entity.setId(null);
                }
                if (name != null && chemicalsMap.containsKey(name)) {
                    for (NamedEntity c : chemicalsMap.get(name)) {
                        if (!Constants.ABBREV.equals(c.getSource())) {
                            isChemAbbreviation = true;
                            break;
                        }
                    }
                }
                // A disabled variant of this check additionally accepted abbreviations whose own text
                // matched a non-abbreviation entity or a key of Normalizer.getIds().
                if (isChemAbbreviation) {
                    abbreviations.add(entityKey);
                } else {
                    entity.removeFromIndexes(aJCas);
                    filtered = true;
                    nonChemicalAbbreviations.add(entityKey);
                    // Also drop already accepted entities carrying the same text.
                    List<NamedEntity> filteredEntities = new ArrayList<NamedEntity>();
                    for (NamedEntity e : chemicals) {
                        if (entity.getCoveredText().trim().equalsIgnoreCase(e.getCoveredText().trim())) {
                            filteredEntities.add(e);
                            e.removeFromIndexes();
                        }
                    }
                    chemicals.removeAll(filteredEntities);
                }
            }

            if (!filtered && lastEntity != null && crosses(lastEntity, entity)) {
                if (Constants.ABBREV.equals(lastEntity.getSource())) {
                    // Previous entity is an abbreviation: it wins if isReplaceByAbbreviation says so,
                    // otherwise it is dropped in favor of the current entity.
                    if (isReplaceByAbbreviation(lastEntity, entity) && (!Constants.ABBREV.equals(entity.getSource()) || isChemAbbreviation)) {
                        entity.removeFromIndexes(aJCas);
                        filtered = true;
                    } else {
                        lastEntity.removeFromIndexes(aJCas);
                        chemicals.remove(lastEntity);
                    }
                } else if (Constants.ABBREV.equals(entity.getSource())) {
                    boolean isRemove = false;
                    if (isChemAbbreviation && isReplaceByAbbreviation(entity, lastEntity)) {
                        // The previous annotation is replaced by the abbreviation.
                        isRemove = true;
                    } else if (!isChemAbbreviation) {
                        // NOTE(review): an abbreviation that is not a chemical abbreviation has already
                        // been filtered above, so this branch looks unreachable - confirm.
                        isRemove = true;
                        filtered = true;
                    }
                    if (isRemove) {
                        lastEntity.removeFromIndexes(aJCas);
                        chemicals.remove(lastEntity);
                    }
                } else if (Constants.DICTIONARY.equals(lastEntity.getSource())
                        && !Constants.DICTIONARY.equals(entity.getSource())) {
                    // Dictionary matches lose against any other source.
                    lastEntity.removeFromIndexes(aJCas);
                    chemicals.remove(lastEntity);
                } else if (!Constants.DICTIONARY.equals(lastEntity.getSource())
                        && Constants.DICTIONARY.equals(entity.getSource())) {
                    entity.removeFromIndexes(aJCas);
                    filtered = true;
                } else if (Constants.SUM_TAGGER.equals(lastEntity.getSource())
                        && !Constants.SUM_TAGGER.equals(entity.getSource())) {
                    // SUM_TAGGER matches lose against the remaining sources.
                    lastEntity.removeFromIndexes(aJCas);
                    chemicals.remove(lastEntity);
                } else if (!Constants.SUM_TAGGER.equals(lastEntity.getSource())
                        && Constants.SUM_TAGGER.equals(entity.getSource())) {
                    entity.removeFromIndexes(aJCas);
                    filtered = true;
                } else if (Constants.CRF.equals(entity.getSource())) {
                    // A CRF match replaces whatever is left.
                    lastEntity.removeFromIndexes(aJCas);
                    chemicals.remove(lastEntity);
                } else {
                    // Same precedence: keep the strictly longer text, ties go to the current entity.
                    if (lastEntity.getCoveredText().length() > entity.getCoveredText().length()) {
                        entity.removeFromIndexes(aJCas);
                        filtered = true;
                    } else {
                        lastEntity.removeFromIndexes(aJCas);
                        chemicals.remove(lastEntity);
                    }
                }
            }

            if (lastEntity != null && !filtered && !crosses(lastEntity, entity) && entity.getBegin() - lastEntity.getEnd() < 10) {
                String gap = entity.getCAS().getDocumentText().substring(lastEntity.getEnd(), entity.getBegin());
                if (" ".equals(gap)) {
                    // Two entities separated by exactly one space are merged into the current one.
                    // The begin offset is a key of the annotation index, so the entity is taken out
                    // of the index while the offset changes and is re-added afterwards.
                    entity.removeFromIndexes(aJCas);
                    entity.setBegin(lastEntity.getBegin());
                    entity.addToIndexes(aJCas);
                    lastEntity.removeFromIndexes();
                    chemicals.remove(lastEntity);
                }
            }

            if (!filtered && Constants.DICTIONARY.equals(entity.getSource()) && entity.getEnd() - entity.getBegin() <= DEFAULT_DICTIONARY_FILTER_LENGTH) {
                // Very short dictionary matches are discarded.
                entity.removeFromIndexes(aJCas);
                filtered = true;
            }

            if (!filtered) {
                chemicals.add(entity);
                lastEntity = entity;
            }
        }
        convertNamedEntitiesToChemicals(aJCas, chemicals);
    }
}

/** Trims the text and lower-cases it independently of the default locale, for use as a lookup key. */
private static String normalize(String text) {
    return text.trim().toLowerCase(Locale.ROOT);
}
/**
 * Replaces each of the given entities by a {@link Chemical} annotation over the same span.
 * Source, id, confidence and entity type are copied to the new annotation, which is added to the
 * indexes; the original entity is removed from the indexes of the given CAS.
 *
 * @param aJCas     the CAS the new annotations are created in
 * @param chemicals the entities to convert
 */
private void convertNamedEntitiesToChemicals(JCas aJCas,
        List<NamedEntity> chemicals) {
    final Iterator<NamedEntity> remaining = chemicals.iterator();
    while (remaining.hasNext()) {
        final NamedEntity original = remaining.next();

        final int begin = original.getBegin();
        final int end = original.getEnd();
        final Chemical replacement = new Chemical(aJCas, begin, end);

        // Carry over the descriptive features unchanged.
        replacement.setSource(original.getSource());
        replacement.setId(original.getId());
        replacement.setConfidence(original.getConfidence());
        replacement.setEntityType(original.getEntityType());

        // Swap the annotations in the index.
        replacement.addToIndexes();
        original.removeFromIndexes(aJCas);
    }
}
/**
 * Tells whether two entities are considered to cross each other. This is the case if one span
 * contains the other, or if {@code entity} begins within the (inclusive) bounds of
 * {@code lastEntity}.
 *
 * @param lastEntity the earlier entity, may be {@code null}
 * @param entity     the later entity
 * @return {@code true} if the entities cross; {@code false} otherwise or if {@code lastEntity}
 *         is {@code null}
 */
private boolean crosses(NamedEntity lastEntity, NamedEntity entity) {
    if (lastEntity == null) {
        return false;
    }
    final int prevStart = lastEntity.getBegin();
    final int prevStop = lastEntity.getEnd();
    final int curStart = entity.getBegin();
    final int curStop = entity.getEnd();

    final boolean curInsidePrev = prevStart <= curStart && curStop <= prevStop;
    final boolean prevInsideCur = curStart <= prevStart && prevStop <= curStop;
    final boolean curStartsInPrev = prevStart <= curStart && curStart <= prevStop;

    return curInsidePrev || prevInsideCur || curStartsInPrev;
}
/**
 * Decides whether an abbreviation annotation should take the place of another annotation.
 *
 * @param abbr        the abbreviation annotation
 * @param otherEntity the competing annotation, may be {@code null}
 * @return {@code true} if there is no competitor; if the competitor is itself an abbreviation
 *         with a strictly shorter text; if the competitor's text is the abbreviation followed by
 *         a hyphen and lower-case letters or hyphens (e.g. "ATP-site-directed"); or if both
 *         annotations cover exactly the same span
 */
private boolean isReplaceByAbbreviation(NamedEntity abbr, NamedEntity otherEntity) {
    if (otherEntity == null) {
        return true;
    }

    final String shortForm = abbr.getCoveredText();
    final String rivalText = otherEntity.getCoveredText();

    // Between two abbreviations the longer one is preferred.
    if (Constants.ABBREV.equals(otherEntity.getSource())) {
        return shortForm.length() > rivalText.length();
    }

    // Compound annotations built on the abbreviation are replaced by it.
    final String compoundPattern = Pattern.quote(shortForm) + "-[a-z\\-]+";
    if (rivalText.matches(compoundPattern)) {
        return true;
    }

    // For identical spans the abbreviation annotation is the more meaningful one.
    final boolean sameStart = abbr.getBegin() == otherEntity.getBegin();
    return sameStart && abbr.getEnd() == otherEntity.getEnd();
}
}