package de.berlin.hu.uima.ae.tagger.abbrev;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.nio.MappedByteBuffer;
import java.nio.channels.FileChannel;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.StringTokenizer;
import java.util.Vector;
import de.berlin.hu.chemspot.Mention;
import de.berlin.hu.util.Constants;
/**
* The ExtractAbbrev class implements a simple algorithm for
* extraction of abbreviations and their definitions from biomedical text.
* Abbreviations (short forms) are extracted from the input text, and those abbreviations
* for which a definition (long form) is found are returned as {@link Mention} objects.
* In test mode each short-form/long-form pair is instead printed, one per line, labelled TP or FP.
*
* A file consisting of short-form/long-form pairs (tab separated) can be specified
* in tandem with the -testlist option for the purposes of evaluating the algorithm.
*
* @see <a href="http://biotext.berkeley.edu/papers/psb03.pdf">A Simple Algorithm for Identifying Abbreviation Definitions in Biomedical Text</a>
* A.S. Schwartz, M.A. Hearst; Pacific Symposium on Biocomputing 8:451-462(2003)
* for a detailed description of the algorithm.
*
* @author Ariel Schwartz
* @version 03/12/03
*/
public class ExtractAbbrev {

    /** Known-correct definitions loaded by {@link #loadTrueDefinitions(String)}: short form -> long forms. */
    Map<String, Vector<String>> mTestDefinitions = new HashMap<String, Vector<String>>();

    /** Not read or written anywhere in this class. */
    Map<String, Vector<String>> mStats = new HashMap<String, Vector<String>>();

    /** Evaluation counters. Only {@code truePositives} and {@code falsePositives} are updated by this class. */
    int truePositives = 0, falsePositives = 0, falseNegatives = 0, trueNegatives = 0;

    /** Column separator used for the test list and for test-mode output. */
    char delimiter = '\t';

    /** When true, {@link #extractAbbrPair(String, String)} prints TP/FP lines instead of returning the long form. */
    boolean testMode = false;

    /**
     * A short form is accepted when it contains at least one letter and starts with
     * a letter, a digit or an opening parenthesis.
     */
    private boolean isValidShortForm(String str) {
        // hasLetter() is false for the empty string, so charAt(0) is never reached on "".
        return hasLetter(str)
                && (Character.isLetterOrDigit(str.charAt(0)) || str.charAt(0) == '(');
    }

    /** Returns true if {@code str} contains at least one letter. */
    private boolean hasLetter(String str) {
        for (int i = 0; i < str.length(); i++) {
            if (Character.isLetter(str.charAt(i))) {
                return true;
            }
        }
        return false;
    }

    /** Returns true if {@code str} contains at least one upper-case character. */
    private boolean hasCapital(String str) {
        for (int i = 0; i < str.length(); i++) {
            if (Character.isUpperCase(str.charAt(i))) {
                return true;
            }
        }
        return false;
    }

    /**
     * Loads short-form/long-form pairs (one pair per line, separated by {@link #delimiter})
     * into {@link #mTestDefinitions}.
     *
     * Lines that do not contain the delimiter are skipped (non-blank ones are reported on
     * stderr) instead of aborting the whole load. I/O failures are reported and whatever
     * was read up to that point is kept.
     *
     * @param inFile path of the test-list file
     */
    private void loadTrueDefinitions(String inFile) {
        String str = "";
        BufferedReader fin = null;
        try {
            fin = new BufferedReader(new FileReader(inFile));
            while ((str = fin.readLine()) != null) {
                int j = str.indexOf(delimiter);
                if (j < 0) {
                    if (str.trim().length() > 0) {
                        System.err.println("Skipping test-list line without delimiter: " + str);
                    }
                    continue;
                }
                String abbrString = str.substring(0, j).trim();
                // Start after the delimiter so it never leaks into the definition,
                // even if the delimiter is changed to a non-whitespace character.
                String defnString = str.substring(j + 1).trim();
                Vector<String> entry = mTestDefinitions.get(abbrString);
                if (entry == null) {
                    entry = new Vector<String>();
                    mTestDefinitions.put(abbrString, entry);
                }
                entry.add(defnString);
            }
        } catch (IOException e) {
            e.printStackTrace();
            System.out.println(str);
        } finally {
            if (fin != null) {
                try {
                    fin.close();
                } catch (IOException ignored) {
                    // Nothing useful can be done if closing a read-only stream fails.
                }
            }
        }
    }

    /**
     * Returns true if {@code longForm} is listed (ignoring case) as a definition of
     * {@code shortForm} in {@link #mTestDefinitions}.
     */
    private boolean isTrueDefinition(String shortForm, String longForm) {
        Vector<String> entry = mTestDefinitions.get(shortForm);
        if (entry == null) {
            return false;
        }
        for (String definition : entry) {
            if (definition.equalsIgnoreCase(longForm)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Reads the whole file into a string, decoding it with the platform default charset.
     *
     * @param file path of the file to read
     * @return the decoded file content
     * @throws IOException if the file cannot be opened or mapped
     */
    private String readFromFile(String file) throws IOException {
        FileInputStream stream = new FileInputStream(new File(file));
        try {
            FileChannel fc = stream.getChannel();
            MappedByteBuffer bb = fc.map(FileChannel.MapMode.READ_ONLY, 0, fc.size());
            // NOTE(review): the platform default charset is used here; pass an explicit
            // charset instead if the input encoding is known.
            return Charset.defaultCharset().decode(bb).toString();
        } finally {
            stream.close();
        }
    }

    /**
     * Reads {@code inFile} and extracts abbreviation mentions from its content.
     *
     * @param inFile path of the text file to scan
     * @return the mentions found; an empty list if the file does not exist
     *         (the {@link FileNotFoundException} is printed, not rethrown)
     * @throws IOException on any other I/O failure while reading the file
     */
    public List<Mention> getMentionsFromFile(String inFile) throws IOException {
        try {
            return getMentions(readFromFile(inFile));
        } catch (FileNotFoundException e) {
            e.printStackTrace();
        }
        return new ArrayList<Mention>();
    }

    /**
     * Scans {@code text} for parenthesised expressions introduced by {@code " ("} and, for
     * each one, tries to pair a short form with a long form taken from the text between the
     * preceding {@code ". "} / {@code ", "} and the parenthesis (or the other way round when
     * the parenthesised part looks like the long form).
     *
     * The text is consumed from the front: after each parenthesis the processed prefix is cut
     * off {@code currSentence} and {@code offset} accumulates the number of characters cut,
     * so that the begin/end positions stored in each {@link Mention} are offsets into
     * {@code text}.
     *
     * Any exception raised while scanning is printed together with the remaining text, and
     * the mentions collected so far are returned.
     *
     * @param text the text to scan
     * @return the abbreviation mentions found, in order of appearance
     */
    public List<Mention> getMentions(String text) {
        String tmpStr, longForm = "", shortForm = "";
        String currSentence = "";
        int openParenIndex, closeParenIndex = -1, sentenceEnd, newCloseParenIndex, tmpIndex = -1;
        StringTokenizer shortTokenizer;
        List<Mention> result = new ArrayList<Mention>();
        try {
            int offset = 0;
            currSentence = text;
            openParenIndex = currSentence.indexOf(" (");
            do {
                int begin = 0, end = 0;
                if (openParenIndex > -1) {
                    // Move from the leading space onto the '(' itself.
                    openParenIndex++;
                }
                if (openParenIndex == -1) {
                    // Only reachable on the first pass, when the text has no " (" at all:
                    // drop everything up to the last ". " / ", " if there is one.
                    sentenceEnd = Math.max(currSentence.lastIndexOf(". "), currSentence.lastIndexOf(", "));
                    if (sentenceEnd != -1) {
                        currSentence = currSentence.substring(sentenceEnd + 2);
                        offset += sentenceEnd + 2;
                    }
                } else if ((closeParenIndex = currSentence.indexOf(')', openParenIndex)) > -1) {
                    // Candidate long form: from the last ". " / ", " before the '(' up to the '('.
                    sentenceEnd = Math.max(currSentence.lastIndexOf(". ", openParenIndex),
                            currSentence.lastIndexOf(", ", openParenIndex));
                    if (sentenceEnd == -1) {
                        // No boundary found: -2 makes "sentenceEnd + 2" below start at index 0.
                        sentenceEnd = -2;
                    }
                    longForm = currSentence.substring(sentenceEnd + 2, openParenIndex);
                    shortForm = currSentence.substring(openParenIndex + 1, closeParenIndex);
                    begin = openParenIndex + 1;
                    end = closeParenIndex;
                }
                if (shortForm.length() > 0 || longForm.length() > 0) {
                    if (shortForm.length() > 1 && longForm.length() > 1) {
                        // Nested parenthesis inside the short form: extend to the next ')'.
                        if ((shortForm.indexOf('(') > -1) &&
                                ((newCloseParenIndex = currSentence.indexOf(')', closeParenIndex + 1)) > -1)) {
                            shortForm = currSentence.substring(openParenIndex + 1, newCloseParenIndex);
                            closeParenIndex = newCloseParenIndex;
                            begin = openParenIndex + 1;
                            end = closeParenIndex;
                        }
                        // Keep only the part before the first ", " and then before the first "; ".
                        if ((tmpIndex = shortForm.indexOf(", ")) > -1) {
                            shortForm = shortForm.substring(0, tmpIndex);
                            end = begin + tmpIndex;
                        }
                        if ((tmpIndex = shortForm.indexOf("; ")) > -1) {
                            shortForm = shortForm.substring(0, tmpIndex);
                            end = begin + tmpIndex;
                        }
                        shortTokenizer = new StringTokenizer(shortForm);
                        if (shortTokenizer.countTokens() > 2 || shortForm.length() > longForm.length()) {
                            // Long form in ( ): the word right before the '(' becomes the short form.
                            tmpIndex = currSentence.lastIndexOf(" ", openParenIndex - 2);
                            tmpStr = currSentence.substring(tmpIndex + 1, openParenIndex - 1);
                            longForm = shortForm;
                            shortForm = tmpStr;
                            begin = tmpIndex + 1;
                            end = openParenIndex - 1;
                            if (!hasCapital(shortForm)) {
                                shortForm = "";
                            }
                        }
                        if (isValidShortForm(shortForm)) {
                            String abbreviation = currSentence.substring(begin, end);
                            String bestLongForm = extractAbbrPair(shortForm.trim(), longForm.trim());
                            if (bestLongForm != null && !bestLongForm.isEmpty()) {
                                // Translate positions in currSentence into positions in text.
                                begin += offset;
                                end += offset;
                                Mention mention = new Mention(begin, end, abbreviation, bestLongForm, Constants.ABBREV, null);
                                result.add(mention);
                            }
                        }
                    }
                    // Continue after the closing parenthesis.
                    currSentence = currSentence.substring(closeParenIndex + 1);
                    offset += closeParenIndex + 1;
                } else if (openParenIndex > -1) {
                    if (closeParenIndex == -1) {
                        // No ')' after this '(' means no later parenthesis can be closed either.
                        break;
                    }
                    // Empty "()" with no text before it: skip it and keep scanning the rest
                    // instead of abandoning the remaining text.
                    currSentence = currSentence.substring(closeParenIndex + 1);
                    offset += closeParenIndex + 1;
                }
                shortForm = "";
                longForm = "";
            } while ((openParenIndex = currSentence.indexOf(" (")) > -1);
        } catch (Exception ioe) {
            // Best effort: report the failure and return what was collected so far.
            ioe.printStackTrace();
            System.out.println(currSentence);
            System.out.println(tmpIndex);
        }
        return result;
    }

    /**
     * Matches the alphanumeric characters of {@code shortForm}, right to left and ignoring
     * case, against characters of {@code longForm}, also right to left. The first character
     * of the short form must additionally match at a position in the long form that is not
     * preceded by a letter or digit.
     *
     * @return the tail of {@code longForm} beginning at the word where the match starts,
     *         or {@code null} if the short form cannot be matched
     */
    private String findBestLongForm(String shortForm, String longForm) {
        int sIndex = shortForm.length() - 1;
        int lIndex = longForm.length() - 1;
        char currChar;
        for ( ; sIndex >= 0; sIndex--) {
            currChar = Character.toLowerCase(shortForm.charAt(sIndex));
            if (!Character.isLetterOrDigit(currChar)) {
                continue;
            }
            // Walk left until the character matches; for the first short-form character
            // also keep walking while the match is not at the start of a word.
            while (((lIndex >= 0) && (Character.toLowerCase(longForm.charAt(lIndex)) != currChar)) ||
                    ((sIndex == 0) && (lIndex > 0) && (Character.isLetterOrDigit(longForm.charAt(lIndex - 1))))) {
                lIndex--;
            }
            if (lIndex < 0) {
                return null;
            }
            lIndex--;
        }
        lIndex = longForm.lastIndexOf(" ", lIndex) + 1;
        return longForm.substring(lIndex);
    }

    /**
     * Finds the best long form for {@code shortForm} within {@code longForm} and applies
     * length-based plausibility filters to the pair.
     *
     * @return {@code null} if no acceptable long form exists; otherwise the long form, except
     *         in test mode where the pair is printed with a TP/FP label, the matching counter
     *         is incremented and the empty string is returned
     */
    private String extractAbbrPair(String shortForm, String longForm) {
        if (shortForm.length() == 1) {
            return null;
        }
        String bestLongForm = findBestLongForm(shortForm, longForm);
        if (bestLongForm == null) {
            return null;
        }
        StringTokenizer tokenizer = new StringTokenizer(bestLongForm, " \t\n\r\f-");
        int longFormSize = tokenizer.countTokens();
        // Count only the letters and digits of the short form.
        int shortFormSize = shortForm.length();
        for (int i = shortFormSize - 1; i >= 0; i--) {
            if (!Character.isLetterOrDigit(shortForm.charAt(i))) {
                shortFormSize--;
            }
        }
        if (bestLongForm.length() < shortForm.length() ||
                bestLongForm.indexOf(shortForm + " ") > -1 ||
                bestLongForm.endsWith(shortForm) ||
                longFormSize > shortFormSize * 2 ||
                longFormSize > shortFormSize + 5 ||
                shortFormSize > 10) {
            return null;
        }
        if (!testMode) {
            return bestLongForm;
        }
        if (isTrueDefinition(shortForm, bestLongForm)) {
            System.out.println(shortForm + delimiter + bestLongForm + delimiter + "TP");
            truePositives++;
        } else {
            falsePositives++;
            System.out.println(shortForm + delimiter + bestLongForm + delimiter + "FP");
        }
        return "";
    }

    /** Prints the command-line help to stderr and terminates the JVM with exit code 1. */
    private static void usage() {
        System.err.println("Usage: ExtractAbbrev [-options] <filename>");
        System.err.println(" <filename> contains text from which abbreviations are extracted" );
        System.err.println(" -testlist <file> = list of true abbreviation definition pairs");
        System.err.println(" -usage or -help = this message");
        System.exit(1);
    }

    /**
     * Command-line entry point: {@code ExtractAbbrev [-testlist <file>] <filename>}.
     *
     * Reads {@code <filename>} and runs the extraction on its content. With
     * {@code -testlist}, each extracted pair is printed with a TP/FP label and the
     * counters are printed at the end. Without it, the returned mentions are not used
     * here, so nothing is printed.
     */
    public static void main(String[] args) {
        ExtractAbbrev extractAbbrev = new ExtractAbbrev();
        String filename = null;
        String testList = null;
        //parse arguments
        for (int i = 0; i < args.length; i++) {
            if (args[i].equals("-testlist")) {
                if (i == args.length - 1) {
                    usage();
                }
                testList = args[++i];
                extractAbbrev.testMode = true;
            } else if (args[i].equals("-usage")) {
                usage();
            } else if (args[i].equals("-help")) {
                usage();
            } else {
                filename = args[i];
                // Must be last arg
                if (i != args.length - 1) {
                    usage();
                }
            }
        }
        if (filename == null) {
            usage();
        }
        if (extractAbbrev.testMode) {
            extractAbbrev.loadTrueDefinitions(testList);
        }
        try {
            // The argument is a file name, so its content has to be read first;
            // getMentions(String) would treat the name itself as the text to scan.
            extractAbbrev.getMentionsFromFile(filename);
        } catch (IOException e) {
            System.err.println("Could not read " + filename + ": " + e.getMessage());
            System.exit(1);
        }
        if (extractAbbrev.testMode) {
            System.out.println("TP: " + extractAbbrev.truePositives + " FP: " + extractAbbrev.falsePositives +
                    " FN: " + extractAbbrev.falseNegatives + " TN: " + extractAbbrev.trueNegatives);
        }
    }
}