BricsTagger.java example

Explorer

ChemSpot-master
- src
  - main
    - java
      - de
        berlin
        hu
        banner
        featuresets
        KlingerLikeFeatureSet.java
        LWhitespace.java
        RWhitespace.java
        util
        ConfigUtil.java
        chemspot
        App.java
        ChemSpot.java
        ChemSpotArguments.java
        ChemSpotConfiguration.java
        ChemSpotFactory.java
        ChemicalNEREvaluator.java
        Mention.java
        uima
        ae
        AnnotationImporterAE.java
        AnnotationMergerAE.java
        expander
        MentionExpander.java
        feature
        FeatureGeneratorApp.java
        FeatureToken.java
        FeatureTokenGenerator.java
        filter
        PosFilter.java
        StopwordFilter.java
        SuffixFilter.java
        normalizer
        Normalizer.java
        StringComparator.java
        tagger
        abbrev
        AbbreviationTagger.java
        ExtractAbbrev.java
        banner
        BannerTagger.java
        CRFWrapper.java
        brics
        BricsMatcher.java
        BricsTagger.java
        DictionaryUpdater.java
        drug
        EumedNERTagger.java
        simple
        ChemicalFormulaTagger.java
        tokenizer
        FineTokenizerAE.java
        cc
        banner
        trainer
        BannerTrainer.java
        eval
        ComparableAnnotation.java
        Evaluation.java
        SeparateEvaluation.java
        cr
        chemdner
        CHEMDNERReader.java
        craft
        CraftCR.java
        ddi
        DDICorpusCR.java
        parser
        DDICorpusContentHandlerImpl.java
        iob
        IOBDirectoryCollectionReader.java
        txt
        gz
        ZipFileCollectionReader.java
        xml
        NaCTeMCollectionReader.java
        PatentCorpusCollectionReader.java
        XMLCollectionReader.java
        util
        DDIToUCompareConverter.java
        OpenNLPToUCompareSentenceConverterAE.java
        OpenNLPToUCompareTokenConverterAE.java
        Util.java
        util
        Constants.java
        wbi
        common
        research
        EvalMeasures.java
        Evaluator.java
    - types

/*
 * Copyright (c) 2012. Humboldt-Universität zu Berlin, Dept. of Computer Science and Dept.
 * of Wissensmanagement in der Bioinformatik
 * -------------------------------
 *
 * THE ACCOMPANYING PROGRAM IS PROVIDED UNDER THE TERMS OF THIS COMMON PUBLIC
 * LICENSE ("AGREEMENT"). ANY USE, REPRODUCTION OR DISTRIBUTION OF THE PROGRAM
 * CONSTITUTES RECIPIENT'S ACCEPTANCE OF THIS AGREEMENT.
 *
 * http://www.opensource.org/licenses/cpl1.0
 */

package de.berlin.hu.uima.ae.tagger.brics;

import de.berlin.hu.chemspot.Mention;
import de.berlin.hu.uima.ae.normalizer.Normalizer;
import de.berlin.hu.util.Constants;
import de.berlin.hu.util.Constants.ChemicalType;

import org.apache.uima.UimaContext;
import org.apache.uima.analysis_component.JCasAnnotator_ImplBase;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.jcas.JCas;
import org.apache.uima.resource.ResourceInitializationException;
import org.u_compare.shared.semantic.Chemical;
import org.uimafit.util.JCasUtil;

import java.io.BufferedReader;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.*;

/**
 * Analysis engine that runs a {@link BricsMatcher} over the document text and adds a
 * {@link Chemical} annotation (source {@link Constants#DICTIONARY}) for every accepted match.
 *
 * <p>Processing happens in two passes: first the raw matches are filtered, boundary-corrected
 * and annotated, then all dictionary annotations in the CAS are re-checked and, for every
 * crossing pair, only the longer annotation is kept.
 *
 * User: Tim Rocktaeschel
 * Date: 7/2/12
 * Time: 2:11 PM
 */
public class BricsTagger extends JCasAnnotator_ImplBase {
    public static final String PATH_TO_DICTIONARY = "DrugBankMatcherDictionaryAutomat";
    public static final String IDS = "Ids";

    /** Classpath location of the suffix list. */
    private static final String SUFFIX_RESOURCE = "resources/suffixes.txt";
    /** Message fragment of the matcher failure that is tolerated in {@link #process(JCas)}. */
    private static final String EMPTY_MATCH_MESSAGE = "Automaton matched the empty string";
    /** Characters dropped when they are the last character of a match. */
    private static final String TRAILING_PUNCTUATION = ".,;:";
    /** Match expansion to whitespace borders is currently switched off. */
    private static final boolean MATCH_EXPANSION = false;

    /** Orders mentions by ascending start offset. */
    private static final Comparator<Mention> MENTIONS_BY_START = new Comparator<Mention>() {
        public int compare(Mention m1, Mention m2) {
            int s1 = m1.getStart();
            int s2 = m2.getStart();
            return s1 < s2 ? -1 : (s1 == s2 ? 0 : 1);
        }
    };

    /** Orders chemical annotations by ascending begin offset. */
    private static final Comparator<Chemical> CHEMICALS_BY_BEGIN = new Comparator<Chemical>() {
        public int compare(Chemical c1, Chemical c2) {
            int b1 = c1.getBegin();
            int b2 = c2.getBegin();
            return b1 < b2 ? -1 : (b1 == b2 ? 0 : 1);
        }
    };

    //list of invalid suffixes taken from Hettne et al. (2009)
    private Set<String> suffixes; //FIXME: implement another AE for that
    private BricsMatcher matcher;

    /**
     * Creates the matcher. When the {@link #PATH_TO_DICTIONARY} parameter is set and non-empty,
     * the suffix list is loaded and the matcher is built from that path; otherwise a default
     * matcher is built if {@code Normalizer.getIds()} is non-null. In any other case no matcher
     * is created.
     *
     * @param aContext the UIMA context holding the configuration parameters
     * @throws ResourceInitializationException if the suffix list or the matcher cannot be loaded
     */
    @Override
    public void initialize(UimaContext aContext) throws ResourceInitializationException {
        super.initialize(aContext);
        suffixes = new HashSet<String>();

        Object configuredPath = aContext.getConfigParameterValue(PATH_TO_DICTIONARY);
        String dictionaryPath = configuredPath == null ? "" : configuredPath.toString();

        try {
            if (!dictionaryPath.isEmpty()) {
                // NOTE(review): suffixes are only loaded on this path, as before - confirm that
                // the default-matcher path is meant to run without suffix stripping.
                loadSuffixes();
                matcher = new BricsMatcher(dictionaryPath);
            } else if (Normalizer.getIds() != null) {
                matcher = new BricsMatcher();
            }
        } catch (IOException e) {
            throw new ResourceInitializationException(e);
        } catch (ClassNotFoundException e) {
            throw new ResourceInitializationException(e);
        }
    }

    /**
     * Reads the suffix list from the classpath, one suffix per line.
     *
     * @throws IOException if the resource is missing or cannot be read
     */
    private void loadSuffixes() throws IOException {
        java.io.InputStream in = getClass().getClassLoader().getResourceAsStream(SUFFIX_RESOURCE);
        if (in == null) {
            throw new FileNotFoundException("Classpath resource not found: " + SUFFIX_RESOURCE);
        }
        BufferedReader reader = new BufferedReader(new InputStreamReader(in, "UTF-8"));
        try {
            String line;
            while ((line = reader.readLine()) != null) {
                // An empty suffix would "match" every mention and block all real suffixes.
                if (line.length() > 0) {
                    suffixes.add(line);
                }
            }
        } finally {
            reader.close();
        }
    }

    /**
     * Annotates all dictionary matches of the document text and afterwards removes the shorter
     * annotation of every crossing pair of dictionary annotations.
     *
     * @param aJCas the CAS to annotate
     * @throws AnalysisEngineProcessException if no matcher was configured or matching fails
     */
    @Override
    public void process(JCas aJCas) throws AnalysisEngineProcessException {
        if (matcher == null) {
            throw new AnalysisEngineProcessException(new IllegalStateException(
                    "No dictionary matcher available: parameter '" + PATH_TO_DICTIONARY
                    + "' is not set and the normalizer provides no ids"));
        }

        String docText = aJCas.getDocumentText();
        List<Mention> matches = findMatches(docText);

        //sort mentions by start position
        Collections.sort(matches, MENTIONS_BY_START);

        // Reference for containment checks: the last mention that was actually processed.
        // Using a skipped mention here would let a later mention remove the annotation of an
        // unrelated, longer mention.
        Mention lastKept = null;
        Chemical lastDrug = null;

        for (Mention mention : matches) {
            //only keep mention if it passes the filter and is not included in the previous one
            if (filter(mention) || overlaps(lastKept, mention)) {
                continue;
            }
            if (overlaps(mention, lastKept) && lastDrug != null) {
                // The new mention covers the previous one: replace its annotation. Resetting
                // lastDrug forces processMention to create the replacement even when both
                // start at the same offset.
                lastDrug.removeFromIndexes();
                lastDrug = null;
            }
            //ID is set by the normalizer, so an empty id is passed on
            lastDrug = processMention(aJCas, docText, lastDrug, mention.getStart(), mention.getEnd(), "");
            lastKept = mention;
        }

        removeCrossingAnnotations(aJCas);
    }

    /**
     * Runs the matcher on the text. A matcher failure whose description contains
     * {@link #EMPTY_MATCH_MESSAGE} is tolerated and yields no matches.
     */
    private List<Mention> findMatches(String docText) throws AnalysisEngineProcessException {
        try {
            return new ArrayList<Mention>(matcher.match(docText));
        } catch (Error e) {
            throw new AnalysisEngineProcessException(e);
        } catch (IllegalStateException e) {
            //FIXME: What goes wrong here?
            if (e.toString().contains(EMPTY_MATCH_MESSAGE)) {
                return new ArrayList<Mention>();
            }
            throw new AnalysisEngineProcessException(e);
        }
    }

    /**
     * Second pass over all dictionary annotations in the CAS: of two crossing annotations only
     * the longer one survives (on equal length the later one).
     */
    private void removeCrossingAnnotations(JCas aJCas) {
        List<Chemical> entities = new ArrayList<Chemical>(JCasUtil.select(aJCas, Chemical.class));
        Collections.sort(entities, CHEMICALS_BY_BEGIN);

        List<Chemical> chemicalsToRemove = new ArrayList<Chemical>();
        Chemical survivor = null;

        for (Chemical chemical : entities) {
            if (!Constants.DICTIONARY.equals(chemical.getSource())) {
                continue;
            }
            // Sorting guarantees survivor.begin <= chemical.begin, so the pair crosses as soon
            // as chemical starts at or before the survivor's end (containment included).
            // NOTE(review): "<=" also treats directly adjacent annotations as crossing, as the
            // original condition did - confirm this is intended.
            if (survivor != null && chemical.getBegin() <= survivor.getEnd()) {
                int survivorLength = survivor.getEnd() - survivor.getBegin();
                int chemicalLength = chemical.getEnd() - chemical.getBegin();
                if (survivorLength > chemicalLength) {
                    chemicalsToRemove.add(chemical);
                    // Keep comparing against the longer survivor, not the removed annotation.
                    continue;
                }
                chemicalsToRemove.add(survivor);
            }
            survivor = chemical;
        }

        for (Chemical chemical : chemicalsToRemove) {
            chemical.removeFromIndexes(aJCas);
        }
    }

    /**
     * Corrects the boundaries of a match and creates an annotation for it unless the previous
     * annotation already starts at the same (corrected) offset or the corrected span is empty.
     *
     * @param aJCas    the CAS to annotate
     * @param docText  the document text
     * @param lastDrug the previously created annotation, or {@code null}
     * @param begin    start offset of the match
     * @param end      end offset (exclusive) of the match
     * @param id       identifier passed through to the annotation factory
     * @return the new annotation, or {@code lastDrug} if none was created
     */
    //FIXME: implement match expansion and boundary correction in a separate AE
    private Chemical processMention(JCas aJCas, String docText, Chemical lastDrug,
            int begin, int end, String id) {
        final int originalBegin = begin;
        final int originalEnd = end;

        //expand mentions (by simulating a coarse tokenizer)
        if (MATCH_EXPANSION) {
            begin = findLeftBorder(docText, begin);
            end = findRightBorder(docText, end);
        }

        int[] span = trimBoundaries(docText, begin, end);
        begin = span[0];
        end = stripSuffix(docText, begin, span[1]);

        // Nothing left after boundary correction: do not create a zero-length annotation.
        if (begin >= end) {
            return lastDrug;
        }

        boolean borderHasChanged = (originalBegin != begin) || (originalEnd != end);

        //create new annotation
        if (lastDrug == null || lastDrug.getBegin() != begin) {
            lastDrug = createDrugAnnotation(aJCas, begin, end, id, borderHasChanged);
        }
        return lastDrug;
    }

    /**
     * Removes stray punctuation and unbalanced or enclosing brackets from both ends of the span.
     * Every step is applied at most once and only while the span is non-empty, so no offset
     * outside {@code [begin, end)} is ever read.
     *
     * @return a two-element array {@code {begin, end}} with the corrected offsets
     */
    private static int[] trimBoundaries(String text, int begin, int end) {
        //remove erroneous last character
        if (begin < end && TRAILING_PUNCTUATION.indexOf(text.charAt(end - 1)) >= 0) {
            end--;
        }
        if (begin < end && text.charAt(end - 1) == '(') {
            end--;
        }
        if (begin < end && text.charAt(end - 1) == '[') {
            end--;
        }

        if (begin < end && text.charAt(begin) == ')') {
            begin++;
        }
        if (begin < end && text.charAt(begin) == ']') {
            begin++;
        }

        //strip a pair of brackets enclosing the whole span
        if (end - begin >= 2 && text.charAt(begin) == '(' && text.charAt(end - 1) == ')') {
            begin++;
            end--;
        }
        if (end - begin >= 2 && text.charAt(begin) == '[' && text.charAt(end - 1) == ']') {
            begin++;
            end--;
        }

        //drop a leading/trailing bracket that makes the span unbalanced
        int parens = bracketBalance(text, begin, end, '(', ')');
        if (parens > 0 && text.charAt(begin) == '(') {
            begin++;
        }
        if (parens < 0 && text.charAt(end - 1) == ')') {
            end--;
        }

        int brackets = bracketBalance(text, begin, end, '[', ']');
        if (brackets > 0 && text.charAt(begin) == '[') {
            begin++;
        }
        if (brackets < 0 && text.charAt(end - 1) == ']') {
            end--;
        }

        return new int[] {begin, end};
    }

    /** Returns (number of {@code open}) - (number of {@code close}) within {@code [begin, end)}. */
    private static int bracketBalance(String text, int begin, int end, char open, char close) {
        int balance = 0;
        for (int i = begin; i < end; i++) {
            char c = text.charAt(i);
            if (c == open) {
                balance++;
            } else if (c == close) {
                balance--;
            }
        }
        return balance;
    }

    /**
     * Cuts a known invalid suffix off the span. If several suffixes match, the longest one is
     * removed so that the result does not depend on the iteration order of the suffix set.
     *
     * @return the corrected end offset
     */
    private int stripSuffix(String docText, int begin, int end) {
        if (begin >= end) {
            return end;
        }
        String mentionText = docText.substring(begin, end);
        int longest = 0;
        for (String suffix : suffixes) {
            if (suffix.length() > longest && mentionText.endsWith(suffix)) {
                longest = suffix.length();
            }
        }
        return end - longest;
    }

    /**
     * Creates a dictionary annotation for the span and adds it to the indexes. Spans made up of
     * ASCII letters only are typed {@code TRIVIAL}, everything else {@code SYSTEMATIC}.
     * The {@code id} and {@code borderHasChanged} arguments are currently not used.
     */
    private Chemical createDrugAnnotation(JCas aJCas, int begin, int end, String id, boolean borderHasChanged) {
        Chemical drug = new Chemical(aJCas);
        drug.setBegin(begin);
        drug.setEnd(end);
        //ID is set by normalizer
        drug.setId("");
        drug.setSource(Constants.DICTIONARY);
        // The former pattern "\\[a-zA-Z]" only matched the literal text "[a-zA-Z]", which made
        // the TRIVIAL branch unreachable.
        boolean lettersOnly = aJCas.getDocumentText().substring(begin, end).matches("[a-zA-Z]+");
        drug.setEntityType(lettersOnly ? ChemicalType.TRIVIAL.toString() : ChemicalType.SYSTEMATIC.toString());
        drug.addToIndexes();
        return drug;
    }

    /** Returns true for the characters that delimit a coarse token (space, newline, CR, tab). */
    private static boolean isTokenBorder(char c) {
        return c == ' ' || c == '\n' || c == '\r' || c == '\t';
    }

    /** Returns the offset of the first token border at or after {@code end}, or the text length. */
    private int findRightBorder(String docText, int end) {
        for (int i = end; i < docText.length(); i++) {
            if (isTokenBorder(docText.charAt(i))) {
                return i;
            }
        }
        return docText.length();
    }

    /**
     * Returns the offset right after the closest token border at or before {@code begin},
     * or 0 if there is none. Index 0 is inspected as well, so a border character at the very
     * start of the text is not swallowed into the expanded span.
     */
    private int findLeftBorder(String docText, int begin) {
        for (int i = begin; i >= 0; i--) {
            if (isTokenBorder(docText.charAt(i))) {
                return i + 1;
            }
        }
        return 0;
    }

    /**
     * Decides whether a mention is discarded.
     *
     * @return {@code true} if the mention is shorter than three characters or looks like a number
     */
    private boolean filter(Mention mention) {
        //forget about ambiguous one or two letter entities
        if (mention.getEnd() - mention.getStart() < 3) {
            return true;
        }

        //test if it is a real number only
        return mention.getText().matches("[-0-9]+[.,]+[0-9.,]+|[0-9,]+");
    }

    /**
     * Returns {@code true} if {@code mention} lies completely inside {@code lastMention};
     * {@code false} if either argument is {@code null}.
     */
    private boolean overlaps(Mention lastMention, Mention mention) {
        if (lastMention == null || mention == null) {
            return false;
        }
        return lastMention.getStart() <= mention.getStart() && mention.getEnd() <= lastMention.getEnd();
    }
}