/*
* Copyright (c) 2012. Humboldt-Universität zu Berlin, Dept. of Computer Science and Dept.
* of Wissensmanagement in der Bioinformatik
* -------------------------------
*
* THE ACCOMPANYING PROGRAM IS PROVIDED UNDER THE TERMS OF THIS COMMON PUBLIC
* LICENSE ("AGREEMENT"). ANY USE, REPRODUCTION OR DISTRIBUTION OF THE PROGRAM
* CONSTITUTES RECIPIENT'S ACCEPTANCE OF THIS AGREEMENT.
*
* http://www.opensource.org/licenses/cpl1.0
*/
package de.berlin.hu.chemspot;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;
import org.apache.uima.UIMAException;
import org.apache.uima.UIMAFramework;
import org.apache.uima.cas.impl.XmiCasSerializer;
import org.apache.uima.collection.CollectionException;
import org.apache.uima.collection.CollectionReader;
import org.apache.uima.examples.SourceDocumentInformation;
import org.apache.uima.examples.xmi.XmiCollectionReader;
import org.apache.uima.jcas.JCas;
import org.apache.uima.resource.metadata.TypeSystemDescription;
import org.apache.uima.tools.components.FileSystemCollectionReader;
import org.apache.uima.util.CasCopier;
import org.apache.uima.util.XMLInputSource;
import org.apache.uima.util.XMLSerializer;
import org.u_compare.shared.semantic.NamedEntity;
import org.uimafit.factory.CollectionReaderFactory;
import org.uimafit.factory.JCasFactory;
import org.uimafit.util.JCasUtil;
import org.xml.sax.SAXException;
import de.berlin.hu.chemspot.ChemSpotConfiguration.Corpus;
import de.berlin.hu.chemspot.ChemSpotConfiguration.Component;
import de.berlin.hu.types.PubmedDocument;
import de.berlin.hu.uima.ae.tagger.brics.DictionaryUpdater;
import de.berlin.hu.uima.cr.chemdner.CHEMDNERReader;
import de.berlin.hu.uima.cr.ddi.DDICorpusCR;
import de.berlin.hu.util.Constants;
import de.berlin.hu.util.Constants.ChemicalType;
import uk.co.flamingpenguin.jewel.cli.ArgumentValidationException;
import uk.co.flamingpenguin.jewel.cli.CliFactory;
/**
 * Command-line entry point of ChemSpot.
 *
 * <p>Settings are collected from an optional configuration file and from the command line,
 * and are kept in the static fields below. Depending on those settings, {@link #main(String[])}
 * tags a string given on the command line, a single text file, or a whole corpus.
 */
public class App {
// path to the CRF model file
private static String pathToModelFile;
// path to the sentence model file
private static String pathToSentenceFile;
// path to the zipped dictionary; defaults to "dict.zip"
private static String pathToDictionaryFile = "dict.zip";
// path to the zipped ids file; defaults to "ids.zip"
private static String pathToIDsFile = "ids.zip";
// path to the model handed to ChemSpot as its last constructor argument; defaults to "multiclass.bin"
private static String pathToEumedModel = "multiclass.bin";
// output file or directory for the tagging results; null if nothing was configured
private static String pathToOutputFile;
// if true, results are serialized in IOB format
private static boolean convertToIOB = false;
// the parsed command line arguments
private static ChemSpotArguments arguments;
// if true, tagged documents are passed to ChemSpot's evaluator
private static boolean evaluate = false;
// if true, detailed evaluation results are written once a collection has been processed
private static boolean detailedEvaluation = false;
// whether a document collection is processed by a thread pool
private static boolean threaded = false;
// number of worker threads used when threaded is true
private static int threadNr = 1;
// path to a single text file that should be tagged
private static String pathToTextFile;
// text passed directly on the command line that should be tagged
private static String tagFromCommandLine;
// all corpora that are available for tagging, mapped to their location
private static Map<Corpus, String> corpora = new HashMap<Corpus, String>();
// the corpus that was selected for tagging
private static Corpus corpus;
// directory that receives XMI serializations of the tagged documents; null if none was configured
private static String pathToXMIOutput;
// JCas instances that are currently free to be used by a threaded run; only set in threaded mode
private static List<JCas> jcases = null;
/**
 * Initializes {@link ChemSpotConfiguration} from a properties file and copies its values
 * into the static settings of this class.
 *
 * <p>If the file is missing or cannot be read, an error is printed and the settings are left
 * untouched. Otherwise the configured corpora are registered (those whose path does not exist
 * are only reported), and deactivated components and disabled annotation types are listed on
 * standard output.
 *
 * @param pathToPropertiesFile location of the configuration file
 */
private static void initializeFromConfigurationFile(String pathToPropertiesFile) {
    System.out.println("Loading configuration file...");
    try {
        ChemSpotConfiguration.initialize(pathToPropertiesFile);
    } catch (FileNotFoundException missingFile) {
        System.out.println("ERROR: The configuration file \"" + pathToPropertiesFile + "\" was not found.");
        return;
    } catch (IOException readFailure) {
        System.out.println("ERROR: A problem occurred while reading the properties file \"" + pathToPropertiesFile + "\"");
        readFailure.printStackTrace();
        return;
    }

    // take over the configured values
    pathToSentenceFile = ChemSpotConfiguration.getSentenceModelPath();
    pathToModelFile = ChemSpotConfiguration.getCRFModelPath();
    pathToDictionaryFile = ChemSpotConfiguration.getDictionaryPath();
    pathToEumedModel = ChemSpotConfiguration.getDrugModelPath();
    pathToOutputFile = ChemSpotConfiguration.getOutputPath();
    pathToXMIOutput = ChemSpotConfiguration.getXMIOutputPath();
    convertToIOB = ChemSpotConfiguration.isConvertToIob();
    evaluate = ChemSpotConfiguration.isEvaluate();
    detailedEvaluation = ChemSpotConfiguration.isDetailedEvaluation();
    threaded = ChemSpotConfiguration.isThreading();
    threadNr = ChemSpotConfiguration.getNumberOfThreads();
    pathToIDsFile = ChemSpotConfiguration.getIdsFilePath();

    // sort the configured corpora into those that exist on disk and those that do not
    Map<Corpus, String> missing = new HashMap<Corpus, String>();
    for (Corpus candidate : Corpus.values()) {
        String location = ChemSpotConfiguration.getPathToCorpus(candidate);
        if (location == null) {
            continue;
        }
        Map<Corpus, String> target = new File(location).exists() ? corpora : missing;
        target.put(candidate, location);
    }

    // warn about corpora that are configured but cannot be found
    if (!missing.isEmpty()) {
        int defined = corpora.size() + missing.size();
        String existing = corpora.isEmpty() ? "none" : "only " + corpora.size();
        System.out.printf("WARNING: %d corpora were defined, but %s actually exist. Please check your configuration file at \"%s\"%n",
                defined, existing, pathToPropertiesFile);
        if (!corpora.isEmpty()) {
            System.out.println("Non-existing corpora:");
            for (Map.Entry<Corpus, String> entry : missing.entrySet()) {
                System.out.println(" " + entry.getKey() + " --> " + entry.getValue());
            }
        }
    }

    // report components that are switched off
    for (Component component : Component.values()) {
        if (ChemSpotConfiguration.useComponent(component)) {
            continue;
        }
        String readableName = component.toString().replace('_', ' ').toLowerCase();
        System.out.printf("%s component is deactivated%n", readableName);
    }

    // report annotation types that are switched off, either completely or for the eumed tagger only
    for (ChemicalType type : ChemicalType.values()) {
        if (!ChemSpotConfiguration.isAnnotate(type)) {
            System.out.printf("Annotation of %s is disabled%n", type.toString());
            continue;
        }
        if (!ChemSpotConfiguration.isAnnotateEumed(type)) {
            System.out.printf("Annotation of %s is disabled for eumed tagger%n", type.toString());
        }
    }
}
/**
 * Asks the user on standard input which of the defined corpora should be processed.
 *
 * <p>The answer may either be the index shown in the printed list or the name of the corpus
 * (case-insensitive). The question is repeated until a valid answer is given.
 *
 * @return the corpus selected by the user, never {@code null}
 * @throws IOException if no corpora are defined, if reading from standard input fails, or if
 *         standard input ends before a valid choice was made
 */
private static Corpus promptForCorpus() throws IOException {
    if (corpora.isEmpty()) {
        throw new IOException("There are no corpora defined.");
    }

    // deliberately not closed: closing the reader would close System.in as well
    BufferedReader reader = new BufferedReader(new InputStreamReader(System.in));
    List<Corpus> definedCorpora = new ArrayList<Corpus>(corpora.keySet());
    Collections.sort(definedCorpora);

    Corpus result = null;
    while (result == null) {
        System.out.println();
        System.out.println("There are several corpora defined. Which one would you like to use?");
        int i = 1;
        for (Corpus definedCorpus : definedCorpora) {
            System.out.printf("%d: %s%n", i++, definedCorpus);
        }
        System.out.println();

        String input = reader.readLine();
        if (input == null) {
            // end of stream (e.g. piped or closed stdin): asking again can never succeed
            throw new IOException("No corpus was selected: standard input ended before a valid choice was made.");
        }
        input = input.trim();

        // first interpretation: the input is an index into the printed list
        try {
            int k = Integer.parseInt(input);
            if (k > 0 && k <= definedCorpora.size()) {
                result = definedCorpora.get(k - 1);
            } else {
                System.out.println(k + " is not a valid index. Please try again.");
            }
            continue;
        } catch (NumberFormatException e) {
            // not a number, fall through and try to interpret the input as a corpus name
        }

        // second interpretation: the input is the name of a corpus
        try {
            // Locale.ROOT keeps the conversion independent of the default locale
            // (e.g. "iob" must not become "\u0130OB" under a Turkish locale)
            Corpus namedCorpus = Corpus.valueOf(input.toUpperCase(java.util.Locale.ROOT));
            if (definedCorpora.contains(namedCorpus)) {
                result = namedCorpus;
            } else {
                System.out.println("The " + namedCorpus + " corpus is not defined. Please try again.");
            }
            continue;
        } catch (IllegalArgumentException e) {
            // not a corpus name either, report below
        }

        System.out.println((input.isEmpty() ? "Your input" : input) + " is neither a valid index nor corpus name. Please try again.");
    }
    return result;
}
/**
 * Runs ChemSpot from the command line.
 *
 * <p>Settings are read from the configuration file (if one is given) and are then overridden
 * by individual command line parameters. Afterwards exactly one of the following is tagged:
 * a text given on the command line, a single text file, or a document collection (corpus).
 *
 * @param args the command line arguments
 * @throws UIMAException if a UIMA component cannot be created or fails
 * @throws IOException if a resource cannot be read or no corpus could be selected
 */
public static void main(String[] args) throws UIMAException, IOException {
    try {
        // read arguments
        arguments = CliFactory.parseArguments(ChemSpotArguments.class, args);

        // load properties from file, if there is one
        if (arguments.isPathToPropertiesFile()) {
            initializeFromConfigurationFile(arguments.getPathToPropertiesFile());
        }

        // read command line parameters; they take precedence over the configuration file
        if (arguments.isPathToCRFModelFile()) {
            pathToModelFile = arguments.getPathToCRFModelFile();
        }
        if (arguments.isPathToSentenceModelFile()) {
            pathToSentenceFile = arguments.getPathToSentenceModelFile();
        }
        if (arguments.isPathToXMIOutput()) {
            // this is an output location; it must not be registered as an input corpus
            pathToXMIOutput = arguments.getPathToXMIOutput();
        }
        if (arguments.isPathToDictionary()) {
            pathToDictionaryFile = arguments.getPathToDictionary();
        }
        if (arguments.isPathToIDs()) {
            pathToIDsFile = arguments.getPathToIDs();
        }
        if (arguments.isPathToEumedModelFile()) {
            pathToEumedModel = arguments.getPathToEumedModelFile();
        }
        if (arguments.isThreadNr()) {
            threaded = true;
            threadNr = arguments.getThreadNr();
        }

        if (arguments.isPathToTextFile()) {
            pathToTextFile = arguments.getPathToTextFile();
        } else if (arguments.isTagCommandLine()) {
            tagFromCommandLine = arguments.getTagCommandLine();
        } else {
            // neither a text file nor a text was given, so a corpus has to be processed
            if (arguments.isPathToIOBCorpora()) {
                corpora.put(Corpus.IOB, arguments.getPathToIOBCorpora());
            }
            if (arguments.isPathToGZCorpus()) {
                corpora.put(Corpus.GZ, arguments.getPathToGZCorpus());
            }
            if (arguments.isPathToCRAFTCorpus()) {
                corpora.put(Corpus.CRAFT, arguments.getPathToCRAFTCorpus());
            }
            if (arguments.isPathToXMICorpus()) {
                corpora.put(Corpus.XMI, arguments.getPathToXMICorpus());
            }
            if (arguments.isPathToNaCTeMCorpus()) {
                corpora.put(Corpus.NACTEM, arguments.getPathToNaCTeMCorpus());
            }
            if (arguments.isPathToPatentCorpus()) {
                corpora.put(Corpus.PATENT, arguments.getPathToPatentCorpus());
            }
            if (arguments.isPathToDDICorpus()) {
                corpora.put(Corpus.DDI, arguments.getPathToDDICorpus());
            }
            if (arguments.isPathToTextCorpus()) {
                corpora.put(Corpus.TXT, arguments.getPathToTextCorpus());
            }
            if (arguments.isPathToCHEMDNERCorpus()) {
                corpora.put(Corpus.CHEMDNER, arguments.getPathToCHEMDNERCorpus());
            }

            if (arguments.isUpdate()) {
                if (pathToDictionaryFile != null && pathToIDsFile != null) {
                    try {
                        DictionaryUpdater.initialize();
                        DictionaryUpdater.updateFiles(new File(pathToDictionaryFile), new File(pathToIDsFile), ChemSpotConfiguration.isRemoveTemporaryUpdateFiles());
                        System.out.println("Update successful.");
                    } catch (IOException e) {
                        System.out.println("Update failed.");
                        e.printStackTrace();
                    }
                } else {
                    System.out.println("You need to specify a dictionary and id file for update");
                }
            }

            if (corpora.isEmpty()) {
                if (arguments.isUpdate()) {
                    // a pure update run, there is nothing to tag
                    System.exit(0);
                } else {
                    System.out.println("At least one corpus, a text file or a command line argument has to be provided!");
                    usage();
                }
            }

            if (corpora.size() == 1) {
                corpus = corpora.keySet().iterator().next();
            } else {
                corpus = promptForCorpus();
            }
        }

        // flags given on the command line can only switch a feature on, never off
        detailedEvaluation = arguments.isDetailedEvaluation() ? true : detailedEvaluation;
        evaluate = detailedEvaluation || arguments.isRunEvaluation() ? true : evaluate;
        convertToIOB = arguments.isConvertToIOB() ? true : convertToIOB;

        if (arguments.isPathToOutputFile()) {
            pathToOutputFile = arguments.getPathToOutputFile();
        } else if (pathToOutputFile != null && corpus != null) {
            // configured output path: use one sub-directory per corpus
            pathToOutputFile = pathToOutputFile + corpus + "/";
        }
    } catch (ArgumentValidationException e) {
        System.out.println(e);
        usage();
        System.exit(0);
    }

    //initializing ChemSpot with a CRF model file and an LINNAEUS automaton (the latter is optional)
    ChemSpot chemspot = new ChemSpot(pathToModelFile, pathToDictionaryFile, pathToSentenceFile, pathToIDsFile, pathToEumedModel);
    TypeSystemDescription typeSystem = UIMAFramework.getXMLParser().parseTypeSystemDescription(new XMLInputSource(chemspot.getClass().getClassLoader().getResource("desc/TypeSystem.xml")));

    if (tagFromCommandLine != null) {
        // tag from command line and print the mentions as tab-separated lines
        List<Mention> mentions = runChemSpot(chemspot, typeSystem, tagFromCommandLine, pathToOutputFile, false);
        for (Mention mention : mentions) {
            System.out.printf("%d\t%d\t%s\t%s\t%s\n",
                    mention.getStart(), mention.getEnd(), mention.getText(),
                    mention.getCHID(), mention.getSource());
        }
    } else if (arguments.isPathToTextFile()) {
        // tag a single (optionally zipped) text file
        JCas jcas = JCasFactory.createJCas(typeSystem);
        if (arguments.isZippedTextFile()) {
            ChemSpot.readGZFile(jcas, pathToTextFile);
        } else {
            ChemSpot.readFile(jcas, pathToTextFile);
        }
        runChemSpot(chemspot, jcas, pathToOutputFile, false);
    } else if (corpus != null) {
        // tag document collection
        CollectionReader reader = createCorpusReader(corpus, corpora.get(corpus), typeSystem);
        tagCollection(chemspot, typeSystem, reader, threaded, threadNr);
    }
}

/**
 * Creates the collection reader that matches the format of the given corpus.
 *
 * @param corpusType the format of the corpus
 * @param pathToCorpus the location of the corpus
 * @param typeSystem the type system whose class loader is used to locate reader descriptors
 * @return a reader over the documents of the corpus
 * @throws UIMAException if the reader cannot be created
 * @throws IOException if a descriptor cannot be read or the corpus format is unknown
 */
private static CollectionReader createCorpusReader(Corpus corpusType, String pathToCorpus, TypeSystemDescription typeSystem) throws UIMAException, IOException {
    switch (corpusType) {
    case IOB:
        return createReaderFromDescriptor(typeSystem, "desc/cr/ScaiCorpusCR.xml",
                "InputDirectory", pathToCorpus, "UseGoldStandardAnnotations", true, "GoldstandardTypeSuffix", "",
                "BrowseSubdirectories", true, "IncludeSuffixes", new String[]{"iob", "iob2"});
    case GZ:
        return createReaderFromDescriptor(typeSystem, "desc/cr/ZipFileCR.xml", "InputDirectory", pathToCorpus);
    case CRAFT:
        return createReaderFromDescriptor(typeSystem, "desc/cr/CraftCR.xml", XmiCollectionReader.PARAM_INPUTDIR, pathToCorpus);
    case NACTEM:
        return createReaderFromDescriptor(typeSystem, "desc/cr/NaCTeMCollectionReader.xml", XmiCollectionReader.PARAM_INPUTDIR, pathToCorpus);
    case PATENT:
        return createReaderFromDescriptor(typeSystem, "desc/cr/PatentCorpusCollectionReader.xml", XmiCollectionReader.PARAM_INPUTDIR, pathToCorpus);
    case TXT:
        return CollectionReaderFactory.createCollectionReader(FileSystemCollectionReader.class, FileSystemCollectionReader.PARAM_INPUTDIR, pathToCorpus);
    case XMI:
        return CollectionReaderFactory.createCollectionReader(XmiCollectionReader.class, XmiCollectionReader.PARAM_INPUTDIR, pathToCorpus);
    case DDI:
        return createReaderFromDescriptor(typeSystem, "desc/cr/DDICorpusCR.xml", DDICorpusCR.PARAM_INPUTDIR, pathToCorpus, DDICorpusCR.PARAM_SUBDIR, true);
    case CHEMDNER:
        return createReaderFromDescriptor(typeSystem, "desc/cr/CHEMDNERCorpusCR.xml", CHEMDNERReader.PARAM_INPUTDIR, pathToCorpus);
    default:
        throw new IOException("Corpus " + corpusType + " does not match any known format.");
    }
}

/**
 * Creates a collection reader from an XML descriptor that is looked up as a resource.
 *
 * @param typeSystem the type system whose class loader is used to locate the descriptor
 * @param descriptor the resource path of the collection reader descriptor
 * @param parameters configuration data as alternating parameter names and values
 * @return the configured collection reader
 * @throws UIMAException if the descriptor is invalid or the reader cannot be initialized
 * @throws IOException if the descriptor cannot be read
 */
private static CollectionReader createReaderFromDescriptor(TypeSystemDescription typeSystem, String descriptor, Object... parameters) throws UIMAException, IOException {
    return CollectionReaderFactory.createCollectionReader(
            UIMAFramework.getXMLParser().parseCollectionReaderDescription(
                    new XMLInputSource(typeSystem.getClass().getClassLoader().getResource(descriptor))),
            parameters);
}
/**
 * Tags a plain text with ChemSpot.
 *
 * <p>The text is wrapped into a fresh JCas together with a {@link PubmedDocument} annotation
 * (with an empty pubmed id) that spans the complete text. The actual work is delegated to
 * {@link #runChemSpot(ChemSpot, JCas, String, boolean)}.
 *
 * @param chemspot the ChemSpot instance used for tagging
 * @param typeSystem the type system the JCas is created from
 * @param text the text to tag
 * @param outputPath the file the result is written to, may be {@code null}
 * @param evaluate whether the result should be evaluated
 * @return the mentions that were found; an empty list if the JCas could not be created
 */
private static List<Mention> runChemSpot(ChemSpot chemspot, TypeSystemDescription typeSystem, String text, String outputPath, boolean evaluate) {
    JCas textCas;
    try {
        textCas = JCasFactory.createJCas(typeSystem);
    } catch (UIMAException creationFailure) {
        creationFailure.printStackTrace();
        return new ArrayList<Mention>();
    }

    textCas.setDocumentText(text);

    // a single document annotation that covers the whole text
    PubmedDocument wholeText = new PubmedDocument(textCas);
    wholeText.setBegin(0);
    wholeText.setEnd(text.length());
    wholeText.setPmid("");
    wholeText.addToIndexes(textCas);

    List<Mention> found = runChemSpot(chemspot, textCas, outputPath, evaluate);
    return found;
}
/**
 * Removes all named entities that do not belong to the gold standard from the indexes of a JCas.
 *
 * <p>The distinct sources of the removed entities are reported on standard output.
 *
 * @param jcas the JCas to clean up
 * @return the entities that were removed
 */
private static List<NamedEntity> removeOtherEntities(JCas jcas) {
    List<NamedEntity> removed = new ArrayList<NamedEntity>();
    List<String> origins = new ArrayList<String>();

    // collect first and remove afterwards, so that the index is not modified while it is iterated
    for (Iterator<NamedEntity> it = JCasUtil.iterator(jcas, NamedEntity.class); it.hasNext();) {
        NamedEntity candidate = it.next();
        if (!Constants.GOLDSTANDARD.equals(candidate.getSource())) {
            if (!origins.contains(candidate.getSource())) {
                origins.add(candidate.getSource());
            }
            removed.add(candidate);
        }
    }

    for (NamedEntity obsolete : removed) {
        obsolete.removeFromIndexes();
    }

    if (!origins.isEmpty()) {
        System.out.println("found pre-exisiting entities from: " + origins);
    }
    return removed;
}
// evaluates entities that were already present in a document before ChemSpot tags it;
// shared by all runs, its results are written during the detailed evaluation
private static ChemicalNEREvaluator otherEvaluator = new ChemicalNEREvaluator();
/**
 * Tags the document contained in a JCas with ChemSpot and writes the configured outputs.
 *
 * <p>Entities that are already present and do not belong to the gold standard are evaluated
 * with {@code otherEvaluator} and removed before tagging. If the JCas contains no
 * {@link PubmedDocument}, one that spans the complete text is added. After tagging, the
 * annotations are written to {@code outputPath} (if an output path is configured) and an XMI
 * serialization is written to the XMI output directory (if one is configured). Failures while
 * writing are reported but do not abort the run.
 *
 * @param chemspot the ChemSpot instance used for tagging
 * @param jcas the JCas containing the document
 * @param outputPath the file the result is written to; {@code null} disables all file output
 * @param evaluate whether the result should be passed to ChemSpot's evaluator
 * @return the mentions found by ChemSpot
 */
private static List<Mention> runChemSpot(ChemSpot chemspot, JCas jcas, String outputPath, boolean evaluate) {
    boolean hasOtherEntities = false;
    for (NamedEntity ne : JCasUtil.iterate(jcas, NamedEntity.class)) {
        if (!Constants.GOLDSTANDARD.equals(ne.getSource())) {
            hasOtherEntities = true;
            break;
        }
    }
    if (hasOtherEntities) {
        System.out.println("Pre-existing entities found in document. Evaluating and removing them.");
        otherEvaluator.evaluate(jcas);
        removeOtherEntities(jcas);
    }

    if (!JCasUtil.iterator(jcas, PubmedDocument.class).hasNext()) {
        PubmedDocument pd = new PubmedDocument(jcas);
        pd.setBegin(0);
        pd.setEnd(jcas.getDocumentText().length());
        pd.setPmid("");
        pd.addToIndexes(jcas);
    }

    List<Mention> mentions = chemspot.tag(jcas);
    if (evaluate) {
        chemspot.getEvaluator().evaluate(jcas);
    }

    // annotation output; skipped if only an XMI output directory was configured
    if (pathToOutputFile != null && outputPath != null) {
        String output = convertToIOB ? ChemSpot.convertToIOB(jcas) : ChemSpot.serializeAnnotations(jcas);
        try {
            FileWriter outputFile = new FileWriter(new File(outputPath));
            try {
                outputFile.write(output);
            } finally {
                // always release the file handle, even if writing failed
                outputFile.close();
            }
            System.out.println("Output written to: " + outputPath);
        } catch (IOException e) {
            System.err.println("Error while writing ChemSpot output");
            e.printStackTrace();
        }
    }

    // XMI output, named after the output file but placed in the XMI output directory
    String xmiDirectory = pathToXMIOutput;
    if (xmiDirectory != null && outputPath != null) {
        // work on a local copy: the shared setting must not be modified by (possibly concurrent) runs
        if (!xmiDirectory.endsWith("/") && !xmiDirectory.endsWith("\\")) {
            xmiDirectory += "/";
        }
        // strip the directory and the file extension from the output path
        String baseName = outputPath.replaceFirst(".*/", "").replaceFirst("\\.[^\\.]+$", "");
        File xmiOutputFile = new File(xmiDirectory + baseName + ".xmi");
        try {
            File parent = xmiOutputFile.getParentFile();
            if (parent != null) {
                parent.mkdirs();
            }
            OutputStream out = new FileOutputStream(xmiOutputFile);
            try {
                XmiCasSerializer serializer = new XmiCasSerializer(jcas.getTypeSystem());
                XMLSerializer xmlSerializer = new XMLSerializer(out, false);
                serializer.serialize(jcas.getCas(), xmlSerializer.getContentHandler());
            } finally {
                // always release the file handle, even if serialization failed
                out.close();
            }
            System.out.println("XMI file written to: " + xmiOutputFile.getCanonicalPath());
        } catch (SAXException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
    return mentions;
}
/**
 * Tags all documents provided by a collection reader, either sequentially or with a thread pool.
 *
 * <p>In threaded mode a fixed set of {@code threads} JCas instances is shared with the workers:
 * each document is copied into a free JCas, which the worker returns when it has finished.
 * If detailed evaluation is enabled, the evaluation results are written after all documents
 * have been processed.
 *
 * @param chemspot the ChemSpot instance used for tagging
 * @param typeSystem the type system used to create JCas instances
 * @param reader the reader providing the documents
 * @param threaded whether documents are processed by a thread pool
 * @param threads the number of worker threads (only used if {@code threaded} is set)
 * @throws CollectionException if the reader fails
 * @throws UIMAException if a JCas cannot be created
 * @throws IOException if a document cannot be read or an evaluation file cannot be written
 */
private static void tagCollection(ChemSpot chemspot, TypeSystemDescription typeSystem, CollectionReader reader, boolean threaded, int threads) throws CollectionException, UIMAException, IOException {
    ExecutorService threadPool = threaded ? Executors.newFixedThreadPool(threads) : null;
    int runNr = 1;

    // determine output path (if there is one) and separate it into directory and filename;
    // without an output path the XMI output directory is used to derive the file names
    String outputLocation = pathToOutputFile != null ? pathToOutputFile : pathToXMIOutput;
    File outputPath = outputLocation != null ? new File(outputLocation) : null;
    String filename = null;
    String outputPathString = null;
    if (outputPath != null) {
        // a name containing a dot is taken to be a file, anything else a directory
        if (outputPath.getName().contains(".")) {
            filename = outputPath.getName();
            outputPath = outputPath.getAbsoluteFile().getParentFile();
        }
        if (!outputPath.exists()) {
            outputPath.mkdirs();
        }
        outputPathString = outputPath.getCanonicalPath().replaceAll("\\\\", "/");
        outputPathString = !outputPathString.endsWith("/") ? outputPathString + "/" : outputPathString;
    }

    JCas jcas = JCasFactory.createJCas(typeSystem);
    if (threaded) {
        // one JCas per worker thread
        jcases = new ArrayList<JCas>();
        for (int i = 0; i < threads; i++) {
            jcases.add(JCasFactory.createJCas(typeSystem));
        }
    }

    String fileType = convertToIOB ? ".iob" : ".chem";
    try {
        while (reader.hasNext()) {
            jcas.reset();
            reader.getNext(jcas.getCas());

            // prepare output file
            String outputFilePath = null;
            if (outputPath != null) {
                outputFilePath = determineOutputFilePath(jcas, reader, outputPathString, filename, fileType, runNr);
            }

            if (threaded) {
                // hand a private copy of the document to a worker, the reader's JCas is reused
                JCas threadJCas = acquireThreadJCas();
                threadJCas.reset();
                CasCopier.copyCas(jcas.getCas(), threadJCas.getCas(), true);
                threadPool.submit(new ChemSpotRun(runNr, chemspot, threadJCas, outputFilePath, evaluate));
            } else {
                runChemSpot(chemspot, jcas, outputFilePath, evaluate);
            }
            runNr++;
            System.out.println();
        }
    } finally {
        // also reached on failure, so that idle worker threads cannot keep the JVM alive
        if (threaded) {
            threadPool.shutdown();
        }
    }

    if (threaded) {
        // block until all submitted runs have terminated
        try {
            threadPool.awaitTermination(Long.MAX_VALUE, TimeUnit.DAYS);
        } catch (InterruptedException e) {
            e.printStackTrace();
            Thread.currentThread().interrupt();
        }
    }

    if (detailedEvaluation) {
        // NOTE(review): outputPathString is null if no output location is configured - confirm that
        // writeDetailedEvaluationResults accepts that
        chemspot.getEvaluator().writeDetailedEvaluationResults(outputPathString);
        if (otherEvaluator.getTP() + otherEvaluator.getFN() + otherEvaluator.getFP() > 0) {
            // without an output directory the files are written to the working directory
            String directory = outputPathString != null ? outputPathString : "";

            File normalizedFoundFile = new File(directory + "normalizations-correct-by-ChemSpot.txt");
            writeNormalizationComparison(chemspot, normalizedFoundFile, true);
            System.out.println("Pre-existing normalized entities found by ChemSpot written to: " + normalizedFoundFile.getName());

            File notNormalizedFoundFile = new File(directory + "normalizations-not-correct-by-ChemSpot.txt");
            writeNormalizationComparison(chemspot, notNormalizedFoundFile, false);
            System.out.println("Pre-existing normalized entities not found by ChemSpot written to: " + notNormalizedFoundFile.getName());
        }
    }
}

/**
 * Determines the file the result of the current document is written to.
 *
 * @param jcas the JCas containing the current document
 * @param reader the reader the document was obtained from
 * @param outputPathString the output directory, ending with a slash
 * @param filename the configured output file name, or {@code null} if a directory was configured
 * @param fileType the extension (including the dot) of generated file names
 * @param runNr the one-based number of the current document
 * @return the path of the output file
 * @throws CollectionException if the reader fails
 * @throws IOException if the reader fails
 */
private static String determineOutputFilePath(JCas jcas, CollectionReader reader, String outputPathString, String filename, String fileType, int runNr) throws CollectionException, IOException {
    // no file name configured: name the output after the source document
    Iterator<SourceDocumentInformation> srcIterator = JCasUtil.iterator(jcas, SourceDocumentInformation.class);
    if (filename == null && srcIterator.hasNext()) {
        String uri = srcIterator.next().getUri();
        // plain string handling, so that '$' or '\' in the output directory are taken literally
        return outputPathString + uri.substring(uri.lastIndexOf('/') + 1) + fileType;
    }

    // simply use the filename if we are just tagging one file
    if (filename != null && runNr == 1 && !reader.hasNext()) {
        return outputPathString + filename;
    }

    // otherwise try using the pubmed id of the document as filename
    Collection<PubmedDocument> documents = JCasUtil.select(jcas, PubmedDocument.class);
    if (documents.size() == 1) {
        String pmId = documents.iterator().next().getPmid();
        if (pmId != null && !pmId.isEmpty()) {
            return outputPathString + pmId + fileType;
        }
    }

    // or generate a generic filename from the configured name (without extension) and the run number
    String prefix = "";
    if (filename != null) {
        int prefixPos = filename.indexOf('.') > -1 ? filename.indexOf('.') : filename.length();
        prefix = filename.substring(0, prefixPos);
    }
    return String.format("%s%s%04d%s", outputPathString, prefix, runNr, fileType);
}

/**
 * Takes a free JCas out of the shared pool, waiting until a worker has returned one if necessary.
 *
 * @return a JCas that is not used by any worker
 */
private static JCas acquireThreadJCas() {
    boolean interrupted = false;
    try {
        while (true) {
            // workers add to the pool under the same lock, so check and removal must hold it as well
            synchronized (jcases) {
                if (!jcases.isEmpty()) {
                    return jcases.remove(0);
                }
            }
            try {
                Thread.sleep(1000);
            } catch (InterruptedException e) {
                // keep waiting, but remember the interruption for the caller
                interrupted = true;
            }
        }
    } finally {
        if (interrupted) {
            Thread.currentThread().interrupt();
        }
    }
}

/**
 * Writes the normalizations of pre-existing entities, restricted by what ChemSpot normalized correctly.
 *
 * @param chemspot the ChemSpot instance whose evaluator provides the correctly normalized mentions
 * @param target the file to write
 * @param foundByChemSpot if {@code true}, only mentions that ChemSpot normalized correctly are kept;
 *        otherwise exactly those are removed
 * @throws UIMAException declared so that any checked exception of the evaluator propagates to tagCollection
 * @throws IOException if the file cannot be written
 */
private static void writeNormalizationComparison(ChemSpot chemspot, File target, boolean foundByChemSpot) throws UIMAException, IOException {
    List<Mention> normalizedAll = new ArrayList<Mention>(otherEvaluator.getNormalizedAll());
    List<Mention> normalized = new ArrayList<Mention>(otherEvaluator.getNormalized());
    List<Mention> normalizedCorrect = new ArrayList<Mention>(otherEvaluator.getNormalizedCorrect());
    if (foundByChemSpot) {
        normalized.retainAll(chemspot.getEvaluator().getNormalizedCorrect());
        normalizedAll.retainAll(chemspot.getEvaluator().getNormalizedCorrect());
        normalizedCorrect.retainAll(chemspot.getEvaluator().getNormalizedCorrect());
    } else {
        normalized.removeAll(chemspot.getEvaluator().getNormalizedCorrect());
        normalizedAll.removeAll(chemspot.getEvaluator().getNormalizedCorrect());
        normalizedCorrect.removeAll(chemspot.getEvaluator().getNormalizedCorrect());
    }

    FileOutputStream writer = new FileOutputStream(target);
    try {
        otherEvaluator.writeNormalizations(writer, normalizedAll, normalized, normalizedCorrect);
    } finally {
        // always release the file handle, even if writing failed
        writer.close();
    }
}
/**
 * A single tagging run that is executed by a worker thread.
 *
 * <p>When the run has ended, successfully or not, its JCas is handed back to the shared pool
 * of free JCas instances.
 */
private static class ChemSpotRun implements Runnable {
    private final int runNr;
    private final ChemSpot chemspot;
    private final JCas jCas;
    private final String outputFile;
    private final boolean evaluate;

    /**
     * @param runNr the number of the run, used for progress messages
     * @param chemspot the ChemSpot instance used for tagging
     * @param jCas the JCas containing the document; returned to the pool after the run
     * @param outputFile the file the result is written to, may be {@code null}
     * @param evaluate whether the result should be evaluated
     */
    public ChemSpotRun (int runNr, ChemSpot chemspot, JCas jCas, String outputFile, boolean evaluate) {
        this.runNr = runNr;
        this.chemspot = chemspot;
        this.jCas = jCas;
        this.outputFile = outputFile;
        this.evaluate = evaluate;
    }

    @Override
    public void run() {
        System.out.println("Starting run " + runNr);
        try {
            runChemSpot(chemspot, jCas, outputFile, evaluate);
            System.out.println("Run " + runNr + " finished");
        } catch (RuntimeException e) {
            // a submitted task would otherwise fail silently
            System.err.println("Run " + runNr + " failed");
            e.printStackTrace();
        } finally {
            // always give the JCas back, otherwise the producer waits forever for a free one
            synchronized(jcases) {
                jcases.add(jCas);
            }
        }
    }
}
/**
 * Prints a summary of the command line options to standard output and terminates the JVM
 * with exit code 0.
 */
private static void usage() {
    // an empty entry produces a blank line between the sections
    String[] helpLines = {
        "usage:",
        " arguments:",
        "\t-m path to a CRF model file (internal default model file will be used if not provided)",
        "\t-s path to a OpenNLP sentence model file (internal default model file will be used if not provided)",
        "\t-d path to a zipped set of brics dictionary automata (parameter defaults to 'dict.zip' if not provided)",
        "\t-i path to a zipped tab-separated text file representing a map of terms to ids (parameter defaults to 'ids.zip' if not provided)",
        "\t-M path to a multi-class model file (parameter defaults to 'multiclass.bin' if not provided)",
        "",
        " flags:",
        "\t-e if this flag is set, the performance of ChemSpot on an IOB gold-standard corpus (cf. -c) is evaluated",
        "\t-u if this flag is set, ChemSpot will update the dictionary and ids file",
        "\t-T number of threads to create when processing a document collection",
        "",
        " input control:",
        "\t-c path to a directory containing corpora in IOB format",
        "\t-g path to a directory containing gzipped text files",
        "\t-t path to a text file",
        "\t-f path to a directory of text files",
        "",
        " output control:",
        "\t-o path to output file",
        "\t-I if this flag is set, the output will be converted into the IOB format"
    };
    for (String helpLine : helpLines) {
        System.out.println(helpLine);
    }
    System.exit(0);
}
}