package edu.cmu.minorthird.text;

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.StringTokenizer;
import java.util.Vector;
import java.util.regex.Pattern;

import org.apache.log4j.Logger;

/**
 * The ExtractAbbrev class implements a simple algorithm for
 * extraction of abbreviations and their definitions from biomedical
 * text. Abbreviations (short forms) are extracted from the input
 * file, and those abbreviations for which a definition (long form) is
 * found are printed out, along with that definition, one per line.
 *
 * See:
 * A Simple Algorithm for Identifying Abbreviation Definitions in Biomedical Text
 * A.S. Schwartz, M.A. Hearst; Pacific Symposium on Biocomputing 8:451-462(2003)
 * for a detailed description of the algorithm. (http://biotext.berkeley.edu/papers/psb03.pdf)
 *
 * <p>The class can be used in two ways: as an annotator (via
 * {@link #doAnnotate(MonotonicTextLabels)}), in which case the extracted pairs
 * are recorded as span types and properties, or from the command line (via
 * {@link #main(String[])}), in which case the pairs are printed to standard
 * output.
 *
 * <p>Instances keep per-call state in fields ({@code accum},
 * {@code annotationMode}), so a single instance must not be used by several
 * threads at once.
 *
 * @author Ariel Schwartz and William Cohen
 * @version 03/12/03 and 11/04
 */
public class ExtractAbbrev extends AbstractAnnotator {

    private static final Logger log = Logger.getLogger(ExtractAbbrev.class);

    /** The annotation type provided by this annotator. */
    public static final String PROVIDED_ANNOTATION = "abbrev";

    /** The type asserted for extracted acronyms */
    public static final String SHORT_FORM_TYPE = "abbrevShort";

    /** The type asserted for expansions of extracted acronyms */
    public static final String LONG_FORM_TYPE = "abbrevLong";

    /** This property links an extracted acronym to its expansion */
    public static final String LONG_FORM_PROP = "expansion";

    /** This property links an expansion to its acronym */
    public static final String SHORT_FORM_PROP = "acronym";

    /** Separator between the short and long form, both in the test list and in printed output. */
    private static final char DELIMITER = '\t';

    /**
     * Sentence separator used when chopping a document: a period followed by
     * exactly two whitespace characters (three characters in total).
     */
    private static final Pattern SENTENCE_BREAK = Pattern.compile("\\.\\s{2}");

    /** Number of characters consumed by one {@link #SENTENCE_BREAK} match. */
    private static final int SENTENCE_BREAK_LENGTH = 3;

    /** Known-correct definitions, keyed by short form; only filled in test mode. */
    private Map<String,Vector<String>> mTestDefinitions = new HashMap<String,Vector<String>>();

    // Evaluation counters printed by main() in test mode. Only truePositives and
    // falsePositives are ever incremented by this class; the other two stay 0.
    private int truePositives = 0, falsePositives = 0, falseNegatives = 0, trueNegatives = 0;

    /** When true, every extracted pair is checked against {@link #mTestDefinitions}. */
    private boolean testMode = false;

    //
    // added by wcohen - implements the AbstractAnnotator interface
    //

    /** Accumulator used by Schwartz's code: alternating short form, long form entries. */
    private List<StringSpan> accum = new ArrayList<StringSpan>();

    /** Flag used by Schwartz's code: when true, pairs go to {@link #accum} instead of stdout. */
    private boolean annotationMode = false;

    /**
     * Runs the extraction over every document span of the text base behind
     * {@code labels} and records each short/long pair that was found.
     *
     * <p>For each pair the short span is added to {@link #SHORT_FORM_TYPE} and
     * the long span to {@link #LONG_FORM_TYPE}. Both spans of a pair are tagged
     * with the same running index (as a decimal string), stored under
     * {@link #SHORT_FORM_PROP} on the short span and {@link #LONG_FORM_PROP} on
     * the long span.
     *
     * @param labels the labels to extend; marked as annotated by
     *               {@link #PROVIDED_ANNOTATION} on successful completion
     */
    @Override
    protected void doAnnotate(MonotonicTextLabels labels) {
        annotationMode = true;
        try {
            int pairIndex = 0;
            for (Iterator<Span> i = labels.getTextBase().documentSpanIterator(); i.hasNext(); ) {
                accum.clear();
                Span doc = i.next();
                String contents = doc.getDocumentContents();

                // call Schwartz's code to fill up accum with short,long pairs
                extractAbbrPairsFromString(contents);

                // build annotations based on the contents of the accumulator;
                // entries are always added two at a time, hence the double next()
                for (Iterator<StringSpan> j = accum.iterator(); j.hasNext(); ) {
                    StringSpan shortForm = j.next();
                    StringSpan longForm = j.next();

                    Span shortSpan = doc.charIndexSubSpan(shortForm.lo, shortForm.hi);
                    if (log.isDebugEnabled()) {
                        log.debug("shortSpan["+shortForm.lo+".."+shortForm.hi+"] of doc: near '"+
                                  contents.substring(shortForm.lo,shortForm.hi)+"'");
                        log.debug("shortForm='"+shortForm.asString()+"' shortSpan='"+shortSpan.asString()+"'");
                    }
                    Span longSpan = doc.charIndexSubSpan(longForm.lo, longForm.hi);

                    labels.addToType(shortSpan, SHORT_FORM_TYPE);
                    labels.addToType(longSpan, LONG_FORM_TYPE);

                    pairIndex++;
                    // NOTE(review): the Javadoc of SHORT_FORM_PROP / LONG_FORM_PROP reads as if
                    // the properties should sit on the opposite spans - confirm against
                    // consumers before changing; behavior is kept as it always was.
                    labels.setProperty(shortSpan, SHORT_FORM_PROP, Integer.toString(pairIndex));
                    labels.setProperty(longSpan, LONG_FORM_PROP, Integer.toString(pairIndex));
                }
            }
        } finally {
            // Always leave the instance in command-line mode and drop references
            // to the last document, even if extraction or labelling failed.
            annotationMode = false;
            accum.clear();
        }
        labels.setAnnotatedBy(PROVIDED_ANNOTATION);
    }

    /** Explanations are not supported; always returns a fixed message. */
    @Override
    public String explainAnnotation(TextLabels labels, Span span) {
        return "No explanation implemented.";
    }

    /**
     * A candidate short form is valid when it contains at least one letter and
     * starts with a letter, a digit or an opening parenthesis.
     */
    private boolean isValidShortForm(String str) {
        // hasLetter() is false for the empty string, so charAt(0) is safe here.
        return (hasLetter(str) &&
                (Character.isLetterOrDigit(str.charAt(0)) || (str.charAt(0) == '(')));
    }

    /** Returns true if {@code str} contains at least one letter. */
    private boolean hasLetter(String str) {
        for (int i = 0; i < str.length(); i++) {
            if (Character.isLetter(str.charAt(i))) {
                return true;
            }
        }
        return false;
    }

    /** Returns true if {@code str} contains at least one upper-case character. */
    private boolean hasCapital(String str) {
        for (int i = 0; i < str.length(); i++) {
            if (Character.isUpperCase(str.charAt(i))) {
                return true;
            }
        }
        return false;
    }

    /**
     * Loads the list of known-correct definitions used in test mode.
     *
     * <p>Each line is expected to hold a short form, a {@link #DELIMITER} and
     * the long form. Lines without a delimiter (for instance blank lines) are
     * skipped instead of aborting the whole load. Read failures are reported
     * and whatever was loaded up to that point is kept.
     *
     * @param inFile path of the definition list
     */
    private void loadTrueDefinitions(String inFile) {
        BufferedReader fin = null;
        try {
            fin = new BufferedReader(new FileReader(inFile));
            String line;
            while ((line = fin.readLine()) != null) {
                int split = line.indexOf(DELIMITER);
                if (split < 0) {
                    if (line.trim().length() > 0) {
                        log.warn("ignoring line without delimiter in '" + inFile + "': " + line);
                    }
                    continue;
                }
                String abbrString = line.substring(0, split).trim();
                String defnString = line.substring(split).trim();
                Vector<String> entry = mTestDefinitions.get(abbrString);
                if (entry == null) {
                    entry = new Vector<String>();
                    mTestDefinitions.put(abbrString, entry);
                }
                entry.add(defnString);
            }
        } catch (IOException e) {
            reportReadFailure(inFile, e);
        } finally {
            closeQuietly(fin);
        }
    }

    /**
     * Returns true if {@code longForm} is listed (ignoring case) as a
     * definition of {@code shortForm} in the loaded test definitions.
     */
    private boolean isTrueDefinition(String shortForm, String longForm) {
        Vector<String> entry = mTestDefinitions.get(shortForm);
        if (entry == null) {
            return false;
        }
        for (String definition : entry) {
            if (definition.equalsIgnoreCase(longForm)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Reads the whole file, joining its lines with single spaces, and extracts
     * abbreviation pairs from the result. Read failures are reported and no
     * extraction is attempted.
     *
     * @param inFile path of the text file to process
     */
    private void extractAbbrPairsFromFile(String inFile) {
        StringBuilder content = new StringBuilder();
        BufferedReader fin = null;
        try {
            fin = new BufferedReader(new FileReader(inFile));
            String line;
            while ((line = fin.readLine()) != null) {
                content.append(line).append(' ');
            }
        } catch (IOException e) {
            reportReadFailure(inFile, e);
            return;
        } finally {
            closeQuietly(fin);
        }
        extractAbbrPairsFromString(content.toString());
    }

    /** Reports a failed read on stderr (for command-line use) and through the logger. */
    private static void reportReadFailure(String file, IOException e) {
        System.err.println("ExtractAbbrev: error reading '" + file + "': " + e);
        log.error("error reading '" + file + "'", e);
    }

    /** Closes {@code reader} if it was opened; a failure to close is deliberately ignored. */
    private static void closeQuietly(BufferedReader reader) {
        if (reader == null) {
            return;
        }
        try {
            reader.close();
        } catch (IOException ignored) {
            // nothing useful can be done about a failed close of a read-only stream
        }
    }

    /**
     * Splits {@code currString} at {@link #SENTENCE_BREAK} and runs the
     * extraction on each piece, keeping track of where the piece sits in
     * {@code currString} so that extracted spans carry document offsets.
     */
    private void extractAbbrPairsFromString(String currString) {
        String[] sentence = SENTENCE_BREAK.split(currString);
        int offset = 0;
        for (int i = 0; i < sentence.length; i++) {
            int end = offset + sentence[i].length();
            extractAbbrPairsFromSentence(new StringSpan(currString, offset, end));
            // skip the period and the two following whitespace characters
            offset = end + SENTENCE_BREAK_LENGTH;
        }
    }

    /**
     * Holds a substring of a longer string.
     *
     * <p>{@code lo} and {@code hi} are offsets into {@code base}; every index
     * accepted or returned by the methods below is relative to {@code lo}.
     */
    static private class StringSpan {

        static public final StringSpan EMPTY = new StringSpan("",0,0);

        String base;
        int lo,hi;
        String mySubstring;

        /** Span covering {@code b[lo..hi)}. */
        public StringSpan(String b,int lo,int hi) {
            this.base = b;
            this.lo = lo;
            this.hi = hi;
            this.mySubstring = base.substring(lo,hi);
        }

        /** Span covering positions {@code [lo..hi)} of {@code ss}, expressed over the same base. */
        public StringSpan(StringSpan ss,int lo,int hi) {
            this.base = ss.base;
            this.lo = ss.lo+lo;
            this.hi = ss.lo+hi;
            this.mySubstring = this.base.substring(this.lo,this.hi);
        }

        public int length() { return hi-lo; }

        public char charAt(int i) { return mySubstring.charAt(i); }

        public int indexOf(char ch) { return mySubstring.indexOf(ch); }

        public int indexOf(char ch,int fromIndex) { return mySubstring.indexOf(ch,fromIndex); }

        public int indexOf(String s) { return mySubstring.indexOf(s); }

        public int lastIndexOf(String s) { return mySubstring.lastIndexOf(s); }

        public int lastIndexOf(String s,int fromIndex) { return mySubstring.lastIndexOf(s,fromIndex); }

        public String asString() { return mySubstring; }

        public StringSpan substring(int newLo,int newHi) { return new StringSpan(base,lo+newLo,lo+newHi); }

        public StringSpan substring(int newLo) { return new StringSpan(base,lo+newLo,hi); }

        /** Returns a new span with leading and trailing whitespace excluded; this span is not modified. */
        public StringSpan trim() {
            StringSpan ss = new StringSpan(base,lo,hi);
            while (ss.lo<ss.hi && Character.isWhitespace(ss.base.charAt(ss.lo))) ss.lo++;
            while (ss.hi>ss.lo && Character.isWhitespace(ss.base.charAt(ss.hi-1))) ss.hi--;
            ss.mySubstring = ss.base.substring(ss.lo,ss.hi);
            return ss;
        }
    }

    /**
     * Scans one sentence for parenthesised expressions and hands every
     * plausible (short form, long form candidate) pair to
     * {@link #extractAbbrPair(StringSpan, StringSpan)}.
     *
     * <p>The sentence is consumed left to right: after each parenthesised
     * expression has been handled, {@code currSentence} is replaced by the text
     * following its closing parenthesis.
     */
    private void extractAbbrPairsFromSentence(StringSpan currSentence) {
        StringSpan longForm = StringSpan.EMPTY;
        StringSpan shortForm = StringSpan.EMPTY;
        int openParenIndex;
        int closeParenIndex = -1;
        int sentenceEnd;
        int newCloseParenIndex;
        int tmpIndex;
        StringTokenizer shortTokenizer;

        if (log.isDebugEnabled()) {
            log.debug("finding pairs in '"+currSentence.asString()+"'");
        }

        openParenIndex = currSentence.indexOf(" (");
        do {
            // indexOf(" (") points at the blank; move on to the parenthesis itself
            if (openParenIndex > -1) {
                openParenIndex++;
            }
            sentenceEnd = Math.max(currSentence.lastIndexOf(". "), currSentence.lastIndexOf(", "));
            if ((openParenIndex == -1) && (sentenceEnd == -1)) {
                // Do nothing
            } else if (openParenIndex == -1) {
                currSentence = currSentence.substring(sentenceEnd + 2);
            } else if ((closeParenIndex = currSentence.indexOf(')', openParenIndex)) > -1) {
                // the long form candidate starts after the last ". " or ", " before the parenthesis
                sentenceEnd = Math.max(currSentence.lastIndexOf(". ", openParenIndex),
                                       currSentence.lastIndexOf(", ", openParenIndex));
                if (sentenceEnd == -1) {
                    sentenceEnd = -2; // so that sentenceEnd + 2 is the start of the sentence
                }
                longForm = new StringSpan(currSentence, sentenceEnd + 2, openParenIndex);
                shortForm = new StringSpan(currSentence, openParenIndex + 1, closeParenIndex);
            }

            if (shortForm.length() > 0 || longForm.length() > 0) {
                if (shortForm.length() > 1 && longForm.length() > 1) {
                    // nested parentheses: extend the short form to the next closing parenthesis
                    if ((shortForm.indexOf('(') > -1) &&
                        ((newCloseParenIndex = currSentence.indexOf(')', closeParenIndex + 1)) > -1)) {
                        shortForm = new StringSpan(currSentence, openParenIndex + 1, newCloseParenIndex);
                        closeParenIndex = newCloseParenIndex;
                    }
                    // keep only the part before the first ", " and before the first "; "
                    if ((tmpIndex = shortForm.indexOf(", ")) > -1) {
                        shortForm = shortForm.substring(0, tmpIndex);
                    }
                    if ((tmpIndex = shortForm.indexOf("; ")) > -1) {
                        shortForm = shortForm.substring(0, tmpIndex);
                    }
                    shortTokenizer = new StringTokenizer(shortForm.asString());
                    if (shortTokenizer.countTokens() > 2 || shortForm.length() > longForm.length()) {
                        // Long form in ( ): the short form is the word just before the parenthesis
                        tmpIndex = currSentence.lastIndexOf(" ", openParenIndex - 2);
                        StringSpan tmpStr = new StringSpan(currSentence, tmpIndex + 1, openParenIndex - 1);
                        longForm = shortForm;
                        shortForm = tmpStr;
                        if (!hasCapital(shortForm.asString())) {
                            shortForm = StringSpan.EMPTY;
                        }
                    }
                    if (isValidShortForm(shortForm.asString())) {
                        extractAbbrPair(shortForm.trim(), longForm.trim());
                    }
                }
                currSentence = currSentence.substring(closeParenIndex + 1);
            } else if (openParenIndex > -1) {
                // Matching close paren was not found: give up on this sentence.
                // (The code used to shorten currSentence here when more than 200
                // characters followed the parenthesis, but the result was never
                // read because of the unconditional break.)
                break;
            }
            shortForm = StringSpan.EMPTY;
            longForm = StringSpan.EMPTY;
        } while ((openParenIndex = currSentence.indexOf(" (")) > -1);
    }

    /**
     * Finds the shortest tail of {@code longForm} that accounts for every
     * letter or digit of {@code shortForm}, matching right to left and
     * ignoring case. The first character of the short form must match at the
     * start of a word. The returned span is widened to begin at a word start.
     *
     * @return the matching tail of {@code longForm}, or {@code null} if the
     *         characters of the short form cannot all be matched
     */
    private StringSpan findBestLongForm(StringSpan shortForm, StringSpan longForm) {
        int sIndex = shortForm.length() - 1;
        int lIndex = longForm.length() - 1;

        for ( ; sIndex >= 0; sIndex--) {
            char currChar = Character.toLowerCase(shortForm.charAt(sIndex));
            if (!Character.isLetterOrDigit(currChar)) {
                continue;
            }
            // Move left until the character matches; for the first short form
            // character also require that the match is not preceded by a letter/digit.
            while (((lIndex >= 0) && (Character.toLowerCase(longForm.charAt(lIndex)) != currChar)) ||
                   ((sIndex == 0) && (lIndex > 0) && (Character.isLetterOrDigit(longForm.charAt(lIndex - 1))))) {
                lIndex--;
            }
            if (lIndex < 0) {
                return null;
            }
            lIndex--;
        }
        lIndex = longForm.lastIndexOf(" ", lIndex) + 1;
        return longForm.substring(lIndex);
    }

    /**
     * Tries to find the definition of {@code shortForm} inside
     * {@code longForm} and, if a plausible one is found, reports the pair:
     * into {@link #accum} in annotation mode, and/or on standard output
     * (with a TP/FP tag in test mode).
     */
    private void extractAbbrPair(StringSpan shortForm, StringSpan longForm) {
        if (log.isDebugEnabled()) {
            log.debug("finding long form for '"+shortForm.asString()+"' and '"+longForm.asString()+"'");
        }

        if (shortForm.length() == 1) {
            return;
        }
        StringSpan bestLongForm = findBestLongForm(shortForm, longForm);
        if (bestLongForm == null) {
            return;
        }

        // number of words in the long form (hyphens separate words too)
        StringTokenizer tokenizer = new StringTokenizer(bestLongForm.asString(), " \t\n\r\f-");
        int longFormSize = tokenizer.countTokens();

        // number of letters/digits in the short form
        int shortFormSize = shortForm.length();
        for (int i = shortFormSize - 1; i >= 0; i--) {
            if (!Character.isLetterOrDigit(shortForm.charAt(i))) {
                shortFormSize--;
            }
        }

        if (bestLongForm.length() < shortForm.length() ||
            bestLongForm.indexOf(shortForm.asString() + " ") > -1 ||
            bestLongForm.asString().endsWith(shortForm.asString()) ||
            longFormSize > shortFormSize * 2 ||
            longFormSize > shortFormSize + 5 ||
            shortFormSize > 10) {
            return;
        }

        // at this point we have bestLongForm as expansion of shortForm
        if (annotationMode) {
            accum.add(shortForm);
            accum.add(bestLongForm);
        }

        if (testMode) {
            if (isTrueDefinition(shortForm.asString(), bestLongForm.asString())) {
                System.out.println(shortForm.asString() + DELIMITER + bestLongForm.asString() + DELIMITER + "TP");
                truePositives++;
            } else {
                falsePositives++;
                System.out.println(shortForm.asString() + DELIMITER + bestLongForm.asString() + DELIMITER + "FP");
            }
        } else if (!annotationMode) {
            System.out.println(shortForm.asString() + DELIMITER + bestLongForm.asString());
        }
    }

    /** Prints the command-line help on stderr and terminates the JVM with exit status 1. */
    private static void usage() {
        System.err.println("Usage: ExtractAbbrev [-options] <filename>");
        System.err.println(" <filename> contains text from which abbreviations are extracted" );
        System.err.println(" -testlist <file> = list of true abbreviation definition pairs");
        System.err.println(" -usage or -help = this message");
        System.exit(1);
    }

    /**
     * Command-line entry point; see {@link #usage()} for the accepted
     * arguments. Extracted pairs are printed to standard output, followed by
     * the evaluation counters when {@code -testlist} was given.
     */
    public static void main(String[] args) {
        ExtractAbbrev extractAbbrev = new ExtractAbbrev();
        String filename = null;
        String testList = null;

        // parse arguments
        for (int i = 0; i < args.length; i++) {
            if (args[i].equals("-testlist")) {
                if (i == args.length - 1) {
                    usage();
                }
                testList = args[++i];
                extractAbbrev.testMode = true;
            } else if (args[i].equals("-usage")) {
                usage();
            } else if (args[i].equals("-help")) {
                usage();
            } else {
                filename = args[i];
                // Must be last arg
                if (i != args.length - 1) {
                    usage();
                }
            }
        }
        if (filename == null) {
            usage();
        }

        if (extractAbbrev.testMode) {
            extractAbbrev.loadTrueDefinitions(testList);
        }
        extractAbbrev.extractAbbrPairsFromFile(filename);
        if (extractAbbrev.testMode) {
            System.out.println("TP: " + extractAbbrev.truePositives + " FP: " + extractAbbrev.falsePositives +
                               " FN: " + extractAbbrev.falseNegatives + " TN: " + extractAbbrev.trueNegatives);
        }
    }
}