/* * Copyright (c) 2012. Humboldt-Universität zu Berlin, Dept. of Computer Science and Dept. * of Wissensmanagement in der Bioinformatik * ------------------------------- * * THE ACCOMPANYING PROGRAM IS PROVIDED UNDER THE TERMS OF THIS COMMON PUBLIC * LICENSE ("AGREEMENT"). ANY USE, REPRODUCTION OR DISTRIBUTION OF THE PROGRAM * CONSTITUTES RECIPIENT'S ACCEPTANCE OF THIS AGREEMENT. * * http://www.opensource.org/licenses/cpl1.0 */ package de.berlin.hu.uima.ae.tagger.brics; import de.berlin.hu.chemspot.Mention; import de.berlin.hu.uima.ae.normalizer.Normalizer; import de.berlin.hu.util.Constants; import de.berlin.hu.util.Constants.ChemicalType; import org.apache.uima.UimaContext; import org.apache.uima.analysis_component.JCasAnnotator_ImplBase; import org.apache.uima.analysis_engine.AnalysisEngineProcessException; import org.apache.uima.jcas.JCas; import org.apache.uima.resource.ResourceInitializationException; import org.u_compare.shared.semantic.Chemical; import org.uimafit.util.JCasUtil; import java.io.BufferedReader; import java.io.FileNotFoundException; import java.io.IOException; import java.io.InputStreamReader; import java.util.*; /** * User: Tim Rocktaeschel * Date: 7/2/12 * Time: 2:11 PM */ public class BricsTagger extends JCasAnnotator_ImplBase { public static final String PATH_TO_DICTIONARY = "DrugBankMatcherDictionaryAutomat"; public static final String IDS = "Ids"; //list of invalid suffixes taken from Hettne et al. (2009) private Set<String> suffixes; //FIXME: implement another AE for that private BricsMatcher matcher; @Override public void initialize(UimaContext aContext) throws ResourceInitializationException { super.initialize(aContext); suffixes = new HashSet<String>(); if (aContext.getConfigParameterValue(PATH_TO_DICTIONARY) != null && !aContext.getConfigParameterValue(PATH_TO_DICTIONARY).toString().isEmpty()) { try { BufferedReader reader = new BufferedReader(new InputStreamReader(this.getClass().getClassLoader().getResourceAsStream("resources/suffixes.txt"))); String line = reader.readLine(); while (line != null) { suffixes.add(line); line = reader.readLine(); } matcher = new BricsMatcher(aContext.getConfigParameterValue(PATH_TO_DICTIONARY).toString()); } catch (FileNotFoundException e) { throw new ResourceInitializationException(e); } catch (IOException e) { throw new ResourceInitializationException(e); } catch (ClassNotFoundException e) { throw new ResourceInitializationException(e); } } else if (Normalizer.getIds() != null) { try { matcher = new BricsMatcher(); } catch (IOException e) { throw new ResourceInitializationException(e); } catch (ClassNotFoundException e) { throw new ResourceInitializationException(e); } } } /* @Override public void process(JCas jCas) throws AnalysisEngineProcessException { String docText = jCas.getDocumentText(); Collection<Mention> matches = matcher.match(docText); for (Mention match : matches) { Chemical annotation = new Chemical(jCas); annotation.setBegin(match.getStart()); annotation.setEnd(match.getEnd()); annotation.setSource(Constants.DICTIONARY); annotation.addToIndexes(); } } */ @Override public void process(JCas aJCas) throws AnalysisEngineProcessException { String docText = aJCas.getDocumentText(); List<Mention> matches = new ArrayList<Mention>(); try { long start = System.currentTimeMillis(); matches = new ArrayList<Mention>(matcher.match(docText)); long time = System.currentTimeMillis() - start; } catch (Error e) { throw new AnalysisEngineProcessException(e); } catch (IllegalStateException e) { if (e.toString().contains("Automaton matched the empty string")); //FIXME: What goes wrong here? else throw new AnalysisEngineProcessException(e); } Comparator<Mention> comp = new Comparator<Mention>() { public int compare(Mention m1, Mention m2) { return m1.getStart() - m2.getStart(); } }; //sort mentions by start position Collections.sort(matches, comp); Mention lastMention = null; Chemical lastDrug = null; for (Mention mention : matches) { int begin = mention.getStart(); int end = mention.getEnd(); //String id = mention.getIdsToString(); String id = ""; //filter mentions if (!filter(mention)) { //only keep mention if it is not included in the previous one if (!overlaps(lastMention, mention)) { if (overlaps(mention, lastMention)) { lastDrug.removeFromIndexes(); lastMention = null; } lastDrug = processMention(aJCas, docText, lastDrug, begin, end, id); } lastMention = mention; } } //test whether overlaps were resolved correctly! Comparator<Chemical> comp2 = new Comparator<Chemical>() { public int compare(Chemical m1, Chemical m2) { return m1.getBegin() - m2.getBegin(); } }; List<Chemical> entities = new ArrayList<Chemical>(JCasUtil.select(aJCas, Chemical.class)); Collections.sort(entities, comp2); List<Chemical> chemicalsToRemove = new ArrayList<Chemical>(); Chemical lastChemical = null; for (Chemical chemical : entities) { if (Constants.DICTIONARY.equals(chemical.getSource())) { //if they cross if (lastChemical != null && ( lastChemical.getBegin() <= chemical.getBegin() && chemical.getEnd() <= lastChemical.getEnd() || lastChemical.getBegin() <= chemical.getBegin() && chemical.getBegin() <= lastChemical.getEnd() )) { //keep the longer one if (lastChemical.getCoveredText().length() > chemical.getCoveredText().length()) { chemicalsToRemove.add(chemical); } else { chemicalsToRemove.add(lastChemical); } //throw new IllegalStateException(lastChemical.getCoveredText() + " overlaps " + chemical.getCoveredText()); } lastChemical = chemical; } } for (Chemical chemical : chemicalsToRemove) { chemical.removeFromIndexes(aJCas); } } //FIXME: implement match expansion and boundary correction in a separate AE private Chemical processMention(JCas aJCas, String docText, Chemical lastDrug, int begin, int end, String id) { int originalBegin = begin; int originalEnd = end; boolean matchExpansion = false; //expand mentions (by simulating a coarse tokenizer) if (matchExpansion) { begin = findLeftBorder(docText, begin); end = findRightBorder(docText, end); } //remove erroneous last character if ((docText.charAt(end-1)+"").matches("[.,;:]")) { end--; } if (docText.charAt(end-1) == '(') { end--; } if (docText.charAt(end-1) == '[') { end--; } if (docText.charAt(begin) == ')') { begin++; } if (docText.charAt(begin) == ']') { begin++; } if (docText.charAt(begin) == '(' && docText.charAt(end-1) == ')') { begin++; end--; } if (docText.charAt(begin) == '[' && docText.charAt(end-1) == ']') { begin++; end--; } int stack = 0; for (int i = begin; i < end; i++) { char c = docText.charAt(i); if (c == '(') { stack++; } else if (c == ')') { stack--; } } if (stack > 0 && docText.charAt(begin) == '(') { begin++; } if (stack < 0 && docText.charAt(end-1) == ')') { end--; } int stack2 = 0; for (int i = begin; i < end; i++) { char c = docText.charAt(i); if (c == '[') { stack2++; } else if (c == ']') { stack2--; } } if (stack2 > 0 && docText.charAt(begin) == '[') { begin++; } if (stack2 < 0 && docText.charAt(end-1) == ']') { end--; } String mentionText = docText.substring(begin, end); for (String suffix : suffixes) { if (mentionText.endsWith(suffix)) { end = end - suffix.length(); break; } } boolean borderHasChanged = (originalBegin != begin) || (originalEnd != end); //create new annotation if (lastDrug == null || lastDrug.getBegin() != begin) { lastDrug = createDrugAnnotation(aJCas, begin, end, id, borderHasChanged); } return lastDrug; } private Chemical createDrugAnnotation(JCas aJCas, int begin, int end, String id, boolean borderHasChanged) { Chemical drug = new Chemical(aJCas); drug.setBegin(begin); drug.setEnd(end); //ID is set by normalizer drug.setId(""); drug.setSource(Constants.DICTIONARY); drug.setEntityType(aJCas.getDocumentText().substring(begin, end).matches("\\[a-zA-Z]") ? ChemicalType.TRIVIAL.toString() : ChemicalType.SYSTEMATIC.toString()); drug.addToIndexes(); return drug; } private int findRightBorder(String docText, int end) { for (int i = end; i < docText.length(); i++) { if (docText.charAt(i) == ' ' || docText.charAt(i) == '\n' || docText.charAt(i) == '\r' || docText.charAt(i) == '\t') { return i; } } return docText.length(); } private int findLeftBorder(String docText, int begin) { for (int i = begin; i > 0; i--) { if (docText.charAt(i) == ' ' || docText.charAt(i) == '\n' || docText.charAt(i) == '\r' || docText.charAt(i) == '\t') { return i+1; } } return 0; } private boolean filter(Mention mention) { //forget about ambiguous one or two letter entities if (mention.getEnd() - mention.getStart() < 3) { return true; } //test if it is a real number only if (mention.getText().matches("[-0-9]+[.,]+[0-9.,]+|[0-9,]+")) { return true; } return false; } private boolean overlaps(Mention lastMention, Mention mention) { if (lastMention == null || mention == null) { return false; } if (lastMention.getStart() <= mention.getStart() && mention.getEnd() <= lastMention.getEnd()) { return true; } else { return false; } } }