ExtractAbbrev.java example

Explorer

ChemSpot-master
- src
  - main
    - java
      - de
        berlin
        hu
        banner
        featuresets
        KlingerLikeFeatureSet.java
        LWhitespace.java
        RWhitespace.java
        util
        ConfigUtil.java
        chemspot
        App.java
        ChemSpot.java
        ChemSpotArguments.java
        ChemSpotConfiguration.java
        ChemSpotFactory.java
        ChemicalNEREvaluator.java
        Mention.java
        uima
        ae
        AnnotationImporterAE.java
        AnnotationMergerAE.java
        expander
        MentionExpander.java
        feature
        FeatureGeneratorApp.java
        FeatureToken.java
        FeatureTokenGenerator.java
        filter
        PosFilter.java
        StopwordFilter.java
        SuffixFilter.java
        normalizer
        Normalizer.java
        StringComparator.java
        tagger
        abbrev
        AbbreviationTagger.java
        ExtractAbbrev.java
        banner
        BannerTagger.java
        CRFWrapper.java
        brics
        BricsMatcher.java
        BricsTagger.java
        DictionaryUpdater.java
        drug
        EumedNERTagger.java
        simple
        ChemicalFormulaTagger.java
        tokenizer
        FineTokenizerAE.java
        cc
        banner
        trainer
        BannerTrainer.java
        eval
        ComparableAnnotation.java
        Evaluation.java
        SeparateEvaluation.java
        cr
        chemdner
        CHEMDNERReader.java
        craft
        CraftCR.java
        ddi
        DDICorpusCR.java
        parser
        DDICorpusContentHandlerImpl.java
        iob
        IOBDirectoryCollectionReader.java
        txt
        gz
        ZipFileCollectionReader.java
        xml
        NaCTeMCollectionReader.java
        PatentCorpusCollectionReader.java
        XMLCollectionReader.java
        util
        DDIToUCompareConverter.java
        OpenNLPToUCompareSentenceConverterAE.java
        OpenNLPToUCompareTokenConverterAE.java
        Util.java
        util
        Constants.java
        wbi
        common
        research
        EvalMeasures.java
        Evaluator.java
    - types

package de.berlin.hu.uima.ae.tagger.abbrev;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.nio.MappedByteBuffer;
import java.nio.channels.FileChannel;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.StringTokenizer;
import java.util.Vector;

import de.berlin.hu.chemspot.Mention;
import de.berlin.hu.util.Constants;


/**
 * The ExtractAbbrev class implements a simple algorithm for
 * extraction of abbreviations and their definitions from biomedical text.
 * Abbreviations (short forms) are searched for around parenthesised expressions
 * in the input text; every abbreviation for which a definition (long form) is found
 * is returned as a {@link Mention} carrying the abbreviation's character offsets,
 * its text and the long form.
 *
 * A file consisting of short-form/long-form pairs (tab separated) can be specified
 * in tandem with the -testlist option for the purposes of evaluating the algorithm.
 * In that test mode every candidate pair is printed together with a TP/FP verdict
 * and no mentions are returned.
 *
 * @see <a href="http://biotext.berkeley.edu/papers/psb03.pdf">A Simple Algorithm for Identifying Abbreviation Definitions in Biomedical Text</a>
 * A.S. Schwartz, M.A. Hearst; Pacific Symposium on Biocomputing 8:451-462(2003)
 * for a detailed description of the algorithm.
 *
 * @author Ariel Schwartz
 * @version 03/12/03
 */
public class ExtractAbbrev {

	/** Gold-standard definitions loaded in test mode: short form -> accepted long forms. */
	Map<String, Vector<String>> mTestDefinitions = new HashMap<String, Vector<String>>();
	/** Not read or written anywhere in this class. */
	Map<String, Vector<String>> mStats = new HashMap<String, Vector<String>>();
	/** Evaluation counters; only the two "positives" counters are updated by this class. */
	int truePositives = 0, falsePositives = 0, falseNegatives = 0, trueNegatives = 0;
	/** Column separator of the test list and of the test-mode output. */
	char delimiter = '\t';
	/** When true, candidate pairs are scored against {@link #mTestDefinitions} and printed. */
	boolean testMode = false;

	/**
	 * Checks whether a string can serve as a short form: it must contain at least one
	 * letter and start with a letter, a digit or an opening parenthesis.
	 *
	 * @param str candidate short form (may be empty)
	 * @return {@code true} if the candidate is acceptable
	 */
	private boolean isValidShortForm(String str) {
		// hasLetter() is false for the empty string, so charAt(0) is never reached for it.
		if (!hasLetter(str)) {
			return false;
		}
		char first = str.charAt(0);
		return Character.isLetterOrDigit(first) || first == '(';
	}

	/**
	 * @param str string to inspect
	 * @return {@code true} if {@code str} contains at least one letter
	 */
	private boolean hasLetter(String str) {
		for (int i = 0; i < str.length(); i++) {
			if (Character.isLetter(str.charAt(i))) {
				return true;
			}
		}
		return false;
	}

	/**
	 * @param str string to inspect
	 * @return {@code true} if {@code str} contains at least one upper-case character
	 */
	private boolean hasCapital(String str) {
		for (int i = 0; i < str.length(); i++) {
			if (Character.isUpperCase(str.charAt(i))) {
				return true;
			}
		}
		return false;
	}

	/**
	 * Loads the gold-standard short-form/long-form pairs into {@link #mTestDefinitions}.
	 * Each line is split at the first {@link #delimiter}; lines without a delimiter are
	 * skipped. I/O problems are reported on the console and loading stops at that point.
	 *
	 * @param inFile path of the delimiter-separated definition list
	 */
	private void loadTrueDefinitions(String inFile) {
		String str = "";
		BufferedReader fin = null;

		try {
			fin = new BufferedReader(new FileReader(inFile));
			while ((str = fin.readLine()) != null) {
				int j = str.indexOf(delimiter);
				if (j < 0) {
					// Malformed line (no delimiter): skip it instead of aborting the whole load.
					continue;
				}
				String abbrString = str.substring(0, j).trim();
				String defnString = str.substring(j).trim();
				Vector<String> entry = mTestDefinitions.get(abbrString);
				if (entry == null) {
					entry = new Vector<String>();
					mTestDefinitions.put(abbrString, entry);
				}
				entry.add(defnString);
			}
		} catch (IOException e) {
			e.printStackTrace();
			System.out.println(str);
		} finally {
			if (fin != null) {
				try {
					fin.close();
				} catch (IOException ignored) {
					// Nothing useful can be done if closing a read-only stream fails.
				}
			}
		}
	}

	/**
	 * Checks a candidate pair against the loaded gold standard.
	 *
	 * @param shortForm abbreviation to look up (exact match)
	 * @param longForm  candidate definition (compared case-insensitively)
	 * @return {@code true} if the pair is listed in {@link #mTestDefinitions}
	 */
	private boolean isTrueDefinition(String shortForm, String longForm) {
		Vector<String> entry = mTestDefinitions.get(shortForm);
		if (entry == null) {
			return false;
		}
		for (String knownLongForm : entry) {
			if (knownLongForm.equalsIgnoreCase(longForm)) {
				return true;
			}
		}
		return false;
	}

	/**
	 * Reads a whole file into a string.
	 *
	 * @param file path of the file to read
	 * @return the file content decoded with the platform default charset
	 * @throws IOException if the file cannot be opened or mapped
	 */
	private String readFromFile(String file) throws IOException {
		FileInputStream stream = new FileInputStream(new File(file));
		try {
			FileChannel fc = stream.getChannel();
			MappedByteBuffer bb = fc.map(FileChannel.MapMode.READ_ONLY, 0, fc.size());
			/* Instead of using default, pass in a decoder. */
			return Charset.defaultCharset().decode(bb).toString();
		} finally {
			stream.close();
		}
	}

	/**
	 * Extracts abbreviation mentions from the content of a file.
	 *
	 * @param inFile path of the text file to analyse
	 * @return the mentions found; an empty list if the file does not exist
	 *         (the stack trace is printed in that case)
	 * @throws IOException if the file exists but cannot be read
	 */
	public List<Mention> getMentionsFromFile(String inFile) throws IOException {
		try {
			return getMentions(readFromFile(inFile));
		} catch (FileNotFoundException e) {
			e.printStackTrace();
		}

		return new ArrayList<Mention>();
	}

	/**
	 * Extracts abbreviation mentions from a text.
	 *
	 * The text is scanned for {@code " ("}. For every parenthesised expression the text
	 * between the preceding {@code ". "} / {@code ", "} (or the start of the unprocessed
	 * text) and the parenthesis is taken as the long-form candidate and the parenthesised
	 * content as the short-form candidate. If the parenthesised content has more than two
	 * tokens or is longer than the text before it, the roles are swapped: the content
	 * becomes the long form and the single word before the parenthesis the short form.
	 *
	 * @param text text to analyse
	 * @return one {@link Mention} per accepted abbreviation; begin/end are character offsets
	 *         of the abbreviation in {@code text} (end exclusive). Unexpected runtime
	 *         errors are printed and the mentions collected so far are returned. In test
	 *         mode the list stays empty because candidate pairs are only scored and printed.
	 */
	public List<Mention> getMentions(String text) {
		List<Mention> result = new ArrayList<Mention>();
		// Unprocessed tail of the text; 'offset' is its start position inside 'text'.
		String currSentence = "";
		int tmpIndex = -1;

		try {
			int offset = 0;
			int openParenIndex;
			currSentence = text;

			while ((openParenIndex = currSentence.indexOf(" (")) > -1) {
				openParenIndex++; // now points at '(' itself

				int closeParenIndex = currSentence.indexOf(')', openParenIndex);
				if (closeParenIndex == -1) {
					// No ')' anywhere in the remaining text, so no further pair can be found.
					break;
				}

				int sentenceEnd = Math.max(currSentence.lastIndexOf(". ", openParenIndex),
						currSentence.lastIndexOf(", ", openParenIndex));
				int longFormStart = (sentenceEnd == -1) ? 0 : sentenceEnd + 2;
				String longForm = currSentence.substring(longFormStart, openParenIndex);
				String shortForm = currSentence.substring(openParenIndex + 1, closeParenIndex);
				int begin = openParenIndex + 1;
				int end = closeParenIndex;

				if (shortForm.length() > 1 && longForm.length() > 1) {
					// Nested parenthesis inside the short form: extend to the next ')'.
					if (shortForm.indexOf('(') > -1) {
						int newCloseParenIndex = currSentence.indexOf(')', closeParenIndex + 1);
						if (newCloseParenIndex > -1) {
							shortForm = currSentence.substring(openParenIndex + 1, newCloseParenIndex);
							closeParenIndex = newCloseParenIndex;
							end = closeParenIndex;
						}
					}
					// Keep only the part before the first ", " and then before the first "; ".
					if ((tmpIndex = shortForm.indexOf(", ")) > -1) {
						shortForm = shortForm.substring(0, tmpIndex);
						end = begin + tmpIndex;
					}
					if ((tmpIndex = shortForm.indexOf("; ")) > -1) {
						shortForm = shortForm.substring(0, tmpIndex);
						end = begin + tmpIndex;
					}

					StringTokenizer shortTokenizer = new StringTokenizer(shortForm);
					if (shortTokenizer.countTokens() > 2 || shortForm.length() > longForm.length()) {
						// Long form in ( ): the word right before the parenthesis is the short form.
						tmpIndex = currSentence.lastIndexOf(" ", openParenIndex - 2);
						String precedingWord = currSentence.substring(tmpIndex + 1, openParenIndex - 1);
						longForm = shortForm;
						shortForm = precedingWord;
						begin = tmpIndex + 1;
						end = openParenIndex - 1;
						if (!hasCapital(shortForm)) {
							shortForm = "";
						}
					}

					if (isValidShortForm(shortForm)) {
						String abbreviation = currSentence.substring(begin, end);
						String bestLongForm = extractAbbrPair(shortForm.trim(), longForm.trim());
						if (bestLongForm != null && !bestLongForm.isEmpty()) {
							// Translate offsets from the unprocessed tail back into 'text'.
							Mention mention = new Mention(begin + offset, end + offset, abbreviation,
									bestLongForm, Constants.ABBREV, null);
							result.add(mention);
						}
					}
				}

				// Always continue behind the closing parenthesis, even when both candidate
				// forms were empty (e.g. ". ()"), so that later abbreviations are still found.
				currSentence = currSentence.substring(closeParenIndex + 1);
				offset += closeParenIndex + 1;
			}
		} catch (Exception ioe) {
			// Best effort: report the problem and return what has been collected so far.
			ioe.printStackTrace();
			System.out.println(currSentence);
			System.out.println(tmpIndex);
		}
		return result;
	}

	/**
	 * Finds the shortest suffix of {@code longForm} that matches {@code shortForm}.
	 * Both strings are walked from right to left; every letter or digit of the short form
	 * must occur (case-insensitively) in the long form in the same order, and the first
	 * character of the short form must match at a position that is not preceded by a
	 * letter or digit.
	 *
	 * @param shortForm abbreviation candidate
	 * @param longForm  text preceding (or enclosed by) the parenthesis
	 * @return the matching part of {@code longForm}, extended left to the preceding space
	 *         (or the start of the string), or {@code null} if the characters cannot be matched
	 */
	private String findBestLongForm(String shortForm, String longForm) {
		int lIndex = longForm.length() - 1;

		for (int sIndex = shortForm.length() - 1; sIndex >= 0; sIndex--) {
			char currChar = Character.toLowerCase(shortForm.charAt(sIndex));
			if (!Character.isLetterOrDigit(currChar)) {
				continue;
			}
			// Move left until the character matches; for the first short-form character the
			// match additionally must not be preceded by a letter or digit.
			while (((lIndex >= 0) && (Character.toLowerCase(longForm.charAt(lIndex)) != currChar)) ||
					((sIndex == 0) && (lIndex > 0) && (Character.isLetterOrDigit(longForm.charAt(lIndex - 1))))) {
				lIndex--;
			}
			if (lIndex < 0) {
				return null;
			}
			lIndex--;
		}
		lIndex = longForm.lastIndexOf(" ", lIndex) + 1;
		return longForm.substring(lIndex);
	}

	/**
	 * Validates a short-form/long-form candidate pair.
	 *
	 * The pair is rejected if the short form has a single character, if no long form can be
	 * aligned, if the long form is shorter than the short form, contains the short form as
	 * a word prefix or ends with it, if the long form has too many words relative to the
	 * number of alphanumeric short-form characters, or if the short form has more than ten
	 * alphanumeric characters.
	 *
	 * @param shortForm trimmed abbreviation candidate
	 * @param longForm  trimmed definition candidate
	 * @return the accepted long form; {@code null} if the pair is rejected; the empty string
	 *         in test mode, where the pair is only printed and counted as TP or FP
	 */
	private String extractAbbrPair(String shortForm, String longForm) {
		if (shortForm.length() == 1) {
			return null;
		}
		String bestLongForm = findBestLongForm(shortForm, longForm);
		if (bestLongForm == null) {
			return null;
		}

		// Number of words in the long form (hyphens separate words too).
		int longFormSize = new StringTokenizer(bestLongForm, " \t\n\r\f-").countTokens();
		// Number of alphanumeric characters in the short form.
		int shortFormSize = shortForm.length();
		for (int i = shortForm.length() - 1; i >= 0; i--) {
			if (!Character.isLetterOrDigit(shortForm.charAt(i))) {
				shortFormSize--;
			}
		}
		if (bestLongForm.length() < shortForm.length() ||
				bestLongForm.indexOf(shortForm + " ") > -1 ||
				bestLongForm.endsWith(shortForm) ||
				longFormSize > shortFormSize * 2 ||
				longFormSize > shortFormSize + 5 ||
				shortFormSize > 10) {
			return null;
		}

		if (!testMode) {
			return bestLongForm;
		}

		if (isTrueDefinition(shortForm, bestLongForm)) {
			System.out.println(shortForm + delimiter + bestLongForm + delimiter + "TP");
			truePositives++;
		} else {
			falsePositives++;
			System.out.println(shortForm + delimiter + bestLongForm + delimiter + "FP");
		}
		return "";
	}


	/** Prints the command line help to stderr and terminates the JVM with exit code 1. */
	private static void usage() {
		System.err.println("Usage: ExtractAbbrev [-options] <filename>");
		System.err.println("       <filename> contains text from which abbreviations are extracted" );
		System.err.println("       -testlist <file> = list of true abbreviation definition pairs");
		System.err.println("       -usage or -help = this message");
		System.exit(1);
	}

	/**
	 * Command line entry point: {@code ExtractAbbrev [-testlist <file>] <filename>}.
	 * The text file is read and analysed; with {@code -testlist} every candidate pair is
	 * printed with its TP/FP verdict, followed by the summary counters.
	 *
	 * @param args command line arguments; the text file name must be the last one
	 */
	public static void main(String[] args) {
		ExtractAbbrev extractAbbrev = new ExtractAbbrev();
		String filename = null;
		String testList = null;

		//parse arguments
		for (int i = 0; i < args.length; i++) {
			String arg = args[i];
			if (arg.equals("-testlist")) {
				if (i == args.length - 1) {
					usage();
				}
				testList = args[++i];
				extractAbbrev.testMode = true;
			} else if (arg.equals("-usage") || arg.equals("-help")) {
				usage();
			} else {
				filename = arg;
				// Must be last arg
				if (i != args.length - 1) {
					usage();
				}
			}
		}
		if (filename == null) {
			usage();
		}

		if (extractAbbrev.testMode) {
			extractAbbrev.loadTrueDefinitions(testList);
		}
		try {
			// Analyse the file's content; passing the name to getMentions() would scan the name itself.
			extractAbbrev.getMentionsFromFile(filename);
		} catch (IOException e) {
			System.err.println("Could not read " + filename + ": " + e.getMessage());
			System.exit(1);
		}
		if (extractAbbrev.testMode) {
			System.out.println("TP: " + extractAbbrev.truePositives + " FP: " + extractAbbrev.falsePositives +
					" FN: " + extractAbbrev.falseNegatives + " TN: " + extractAbbrev.trueNegatives);
		}
	}
}