/*
* Copyright (c) 2012. Humboldt-Universität zu Berlin, Dept. of Computer Science and Dept.
* of Wissensmanagement in der Bioinformatik
* -------------------------------
*
* THE ACCOMPANYING PROGRAM IS PROVIDED UNDER THE TERMS OF THIS COMMON PUBLIC
* LICENSE ("AGREEMENT"). ANY USE, REPRODUCTION OR DISTRIBUTION OF THE PROGRAM
* CONSTITUTES RECIPIENT'S ACCEPTANCE OF THIS AGREEMENT.
*
* http://www.opensource.org/licenses/cpl1.0
*/
package de.berlin.hu.uima.ae.tagger.brics;
import de.berlin.hu.chemspot.Mention;
import de.berlin.hu.uima.ae.normalizer.Normalizer;
import dk.brics.automaton.Automaton;
import dk.brics.automaton.AutomatonMatcher;
import dk.brics.automaton.RunAutomaton;
import dk.brics.automaton.State;
import dk.brics.automaton.StringUnionOperations;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.Enumeration;
import java.util.HashSet;
import java.util.List;
import java.util.zip.ZipEntry;
import java.util.zip.ZipFile;
/**
* User: Tim Rocktaeschel
* Date: 7/2/12
* Time: 2:20 PM
*/
public class BricsMatcher {
    /** Default maximum number of dictionary terms compiled into a single automaton. */
    public static final int DEFAULT_TERMS_PER_AUTOMATON = 100000;

    /** Stand-in for a context character that lies outside the text; it is not a letter. */
    private static final char NO_CHAR = ' ';

    /** Automata that are each run over the input text by {@link #match(String)}. */
    private final Collection<RunAutomaton> matchers = new ArrayList<RunAutomaton>();

    /**
     * Builds the matchers from the key set of {@code Normalizer.getIds()}, using at most
     * {@link #DEFAULT_TERMS_PER_AUTOMATON} terms per automaton.
     * @throws IOException declared for callers; propagated from the delegate constructor
     * @throws ClassNotFoundException declared for callers; propagated from the delegate constructor
     */
    public BricsMatcher() throws IOException, ClassNotFoundException {
        this(Normalizer.getIds().keySet(), DEFAULT_TERMS_PER_AUTOMATON);
    }

    /**
     * Builds one single automaton that recognizes all given terms.
     * @param chemicals Dictionary terms to recognize.
     * @throws IOException kept in the signature for backward compatibility with existing callers
     * @throws ClassNotFoundException kept in the signature for backward compatibility with existing callers
     */
    public BricsMatcher(Collection<String> chemicals) throws IOException, ClassNotFoundException {
        System.out.print("Creating brics automaton...");
        matchers.add(BricsMatcher.createAutomaton(chemicals));
        System.out.println("Done.");
    }

    /**
     * Builds several automata, each covering at most {@code termsPerAutomaton} of the given terms.
     * @param chemicals Dictionary terms to recognize.
     * @param termsPerAutomaton Maximum number of terms compiled into one automaton.
     * @throws IOException kept in the signature for backward compatibility with existing callers
     * @throws ClassNotFoundException kept in the signature for backward compatibility with existing callers
     */
    public BricsMatcher(Collection<String> chemicals, int termsPerAutomaton) throws IOException, ClassNotFoundException {
        System.out.print("Creating brics automata");
        matchers.addAll(BricsMatcher.createAutomata(chemicals, termsPerAutomaton));
        System.out.println("Created " + matchers.size() + " brics automata.");
    }

    /**
     * BricsMatcher loads a set of brics dictionary matchers packed in a zip file. If the path does not
     * end with ".zip", the file itself is loaded as one single serialized matcher.
     * @param pathToZippedBinaries Path to the zip file containing a set of brics dictionary matchers.
     * @throws IOException if the file cannot be opened or read
     * @throws ClassNotFoundException if a serialized matcher cannot be deserialized
     */
    public BricsMatcher(String pathToZippedBinaries) throws IOException, ClassNotFoundException {
        if (pathToZippedBinaries.endsWith(".zip")) {
            ZipFile zipFile = new ZipFile(pathToZippedBinaries);
            try {
                Enumeration<? extends ZipEntry> entries = zipFile.entries();
                while (entries.hasMoreElements()) {
                    ZipEntry entry = entries.nextElement();
                    if (entry.isDirectory()) {
                        // a directory entry carries no serialized automaton
                        continue;
                    }
                    matchers.add(RunAutomaton.load(zipFile.getInputStream(entry)));
                }
            } finally {
                // closing the zip file also closes every entry stream obtained from it
                zipFile.close();
            }
        } else {
            FileInputStream in = new FileInputStream(pathToZippedBinaries);
            try {
                matchers.add(RunAutomaton.load(in));
            } finally {
                in.close();
            }
        }
        System.out.println("Loaded " + matchers.size() + " brics automata.");
    }

    /**
     * Splits the terms, in iteration order, into batches of {@code termsPerAutomaton} and compiles
     * one automaton per batch. Prints a "." for every full batch and " Done." after a trailing
     * partial batch.
     * @param chemicals Dictionary terms to recognize.
     * @param termsPerAutomaton Number of terms after which a batch is compiled.
     * @return the automata in batch order; empty if {@code chemicals} is empty.
     */
    public static List<RunAutomaton> createAutomata(Collection<String> chemicals, int termsPerAutomaton) {
        List<RunAutomaton> result = new ArrayList<RunAutomaton>();
        List<String> terms = new ArrayList<String>();
        for (String chemical : chemicals) {
            terms.add(chemical);
            if (terms.size() >= termsPerAutomaton) {
                result.add(createAutomaton(terms));
                System.out.print(".");
                // safe to reuse the buffer: createAutomaton works on its own copy
                terms.clear();
            }
        }
        if (!terms.isEmpty()) {
            result.add(createAutomaton(terms));
            System.out.println(" Done.");
        }
        return result;
    }

    /**
     * Compiles the given terms into a single run automaton.
     * @param chemicals Dictionary terms to recognize; the collection itself is not modified.
     * @return a run automaton built from the union of the terms.
     */
    public static RunAutomaton createAutomaton(Collection<String> chemicals) {
        // the terms are sorted with the library's own comparator before being handed to build()
        List<String> sortedList = new ArrayList<String>(chemicals);
        Collections.sort(sortedList, StringUnionOperations.LEXICOGRAPHIC_ORDER);
        String[] sortedArray = sortedList.toArray(new String[sortedList.size()]);
        sortedList = null; // drop the list copy before the automaton is built
        State state = StringUnionOperations.build(sortedArray);
        Automaton automaton = new Automaton();
        automaton.setInitialState(state);
        return new RunAutomaton(automaton);
    }

    /**
     * Uses the set of brics dictionary matchers to extract mentions of chemical entities in natural language text.
     * A hit is kept only if it is longer than two characters, is not preceded by a letter and is
     * followed either by a non-letter or by a plural "s" that itself ends the word. In the plural
     * case the end offset of the mention includes the "s" while its text does not.
     * @param text Input natural language text.
     * @return a collection of Mentions.
     */
    public Collection<Mention> match(String text) {
        // NOTE(review): de-duplication across automata presumably relies on Mention.equals/hashCode - confirm
        Collection<Mention> matches = new HashSet<Mention>();
        for (RunAutomaton automaton : matchers) {
            AutomatonMatcher matcher = automaton.newMatcher(text);
            while (matcher.find()) {
                int start = matcher.start();
                int end = matcher.end();
                if (end - start <= 2) {
                    continue;
                }
                char left = charAtOrSpace(text, start - 1);
                if (Character.isLetter(left)) {
                    continue;
                }
                char right = charAtOrSpace(text, end);
                char nright = charAtOrSpace(text, end + 1);
                // A trailing 's' counts as a plural only when the word ends right after it.
                // Requiring a letter after the 's' would accept terms embedded in longer words
                // and reject ordinary plurals.
                boolean pluralS = right == 's' && !Character.isLetter(nright);
                if (!Character.isLetter(right) || pluralS) {
                    matches.add(new Mention(start, pluralS ? end + 1 : end, text.substring(start, end)));
                }
            }
        }
        return matches;
    }

    /** Returns the character at {@code index}, or a blank if the index lies outside the text. */
    private static char charAtOrSpace(String text, int index) {
        return (index >= 0 && index < text.length()) ? text.charAt(index) : NO_CHAR;
    }
}