App.java example

Explorer
ChemSpot-master
- src
  - main
    - java
      - de
        berlin
        hu
        banner
        featuresets
        KlingerLikeFeatureSet.java
        LWhitespace.java
        RWhitespace.java
        util
        ConfigUtil.java
        chemspot
        App.java
        ChemSpot.java
        ChemSpotArguments.java
        ChemSpotConfiguration.java
        ChemSpotFactory.java
        ChemicalNEREvaluator.java
        Mention.java
        uima
        ae
        AnnotationImporterAE.java
        AnnotationMergerAE.java
        expander
        MentionExpander.java
        feature
        FeatureGeneratorApp.java
        FeatureToken.java
        FeatureTokenGenerator.java
        filter
        PosFilter.java
        StopwordFilter.java
        SuffixFilter.java
        normalizer
        Normalizer.java
        StringComparator.java
        tagger
        abbrev
        AbbreviationTagger.java
        ExtractAbbrev.java
        banner
        BannerTagger.java
        CRFWrapper.java
        brics
        BricsMatcher.java
        BricsTagger.java
        DictionaryUpdater.java
        drug
        EumedNERTagger.java
        simple
        ChemicalFormulaTagger.java
        tokenizer
        FineTokenizerAE.java
        cc
        banner
        trainer
        BannerTrainer.java
        eval
        ComparableAnnotation.java
        Evaluation.java
        SeparateEvaluation.java
        cr
        chemdner
        CHEMDNERReader.java
        craft
        CraftCR.java
        ddi
        DDICorpusCR.java
        parser
        DDICorpusContentHandlerImpl.java
        iob
        IOBDirectoryCollectionReader.java
        txt
        gz
        ZipFileCollectionReader.java
        xml
        NaCTeMCollectionReader.java
        PatentCorpusCollectionReader.java
        XMLCollectionReader.java
        util
        DDIToUCompareConverter.java
        OpenNLPToUCompareSentenceConverterAE.java
        OpenNLPToUCompareTokenConverterAE.java
        Util.java
        util
        Constants.java
        wbi
        common
        research
        EvalMeasures.java
        Evaluator.java
    - types
/*
 * Copyright (c) 2012. Humboldt-Universität zu Berlin, Dept. of Computer Science and Dept.
 * of Wissensmanagement in der Bioinformatik
 * -------------------------------
 *
 * THE ACCOMPANYING PROGRAM IS PROVIDED UNDER THE TERMS OF THIS COMMON PUBLIC
 * LICENSE ("AGREEMENT"). ANY USE, REPRODUCTION OR DISTRIBUTION OF THE PROGRAM
 * CONSTITUTES RECIPIENT'S ACCEPTANCE OF THIS AGREEMENT.
 *
 * http://www.opensource.org/licenses/cpl1.0
 */

package de.berlin.hu.chemspot;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;

import org.apache.uima.UIMAException;
import org.apache.uima.UIMAFramework;
import org.apache.uima.cas.impl.XmiCasSerializer;
import org.apache.uima.collection.CollectionException;
import org.apache.uima.collection.CollectionReader;
import org.apache.uima.examples.SourceDocumentInformation;
import org.apache.uima.examples.xmi.XmiCollectionReader;
import org.apache.uima.jcas.JCas;
import org.apache.uima.resource.metadata.TypeSystemDescription;
import org.apache.uima.tools.components.FileSystemCollectionReader;
import org.apache.uima.util.CasCopier;
import org.apache.uima.util.XMLInputSource;
import org.apache.uima.util.XMLSerializer;
import org.u_compare.shared.semantic.NamedEntity;
import org.uimafit.factory.CollectionReaderFactory;
import org.uimafit.factory.JCasFactory;
import org.uimafit.util.JCasUtil;
import org.xml.sax.SAXException;

import de.berlin.hu.chemspot.ChemSpotConfiguration.Corpus;
import de.berlin.hu.chemspot.ChemSpotConfiguration.Component;
import de.berlin.hu.types.PubmedDocument;
import de.berlin.hu.uima.ae.tagger.brics.DictionaryUpdater;
import de.berlin.hu.uima.cr.chemdner.CHEMDNERReader;
import de.berlin.hu.uima.cr.ddi.DDICorpusCR;
import de.berlin.hu.util.Constants;
import de.berlin.hu.util.Constants.ChemicalType;

import uk.co.flamingpenguin.jewel.cli.ArgumentValidationException;
import uk.co.flamingpenguin.jewel.cli.CliFactory;

public class App {
	// Path to the CRF model file; filled from the configuration file and/or the command line.
	private static String pathToModelFile;
	// Path to the sentence model file; filled from the configuration file and/or the command line.
	private static String pathToSentenceFile;
	// Path to the dictionary archive; defaults to "dict.zip" unless overridden.
	private static String pathToDictionaryFile = "dict.zip";
	// Path to the ids archive; defaults to "ids.zip" unless overridden.
	private static String pathToIDsFile = "ids.zip";
	// Path to the eumed (multi-class) model; defaults to "multiclass.bin" unless overridden.
	private static String pathToEumedModel = "multiclass.bin";
	// Output file or directory for tagging results; null disables regular output.
	private static String pathToOutputFile;
	// When true, results are written via ChemSpot.convertToIOB instead of ChemSpot.serializeAnnotations.
	private static boolean convertToIOB = false;
	// Parsed command line arguments (assigned at the start of main).
	private static ChemSpotArguments arguments;
	// When true, the ChemSpot evaluator is run on every tagged document of a collection.
	private static boolean evaluate = false;
	// When true, detailed evaluation results are written after a collection has been processed; implies evaluate.
	private static boolean detailedEvaluation = false;
	// Whether document collections are processed by a thread pool.
	private static boolean threaded = false;
	// Number of worker threads used when threaded is true.
	private static int threadNr = 1;
	// Path to a single text file to tag (alternative to a corpus).
	private static String pathToTextFile;
    // Text passed directly on the command line to tag (alternative to a file or corpus).
    private static String tagFromCommandLine;
    // Corpora that are available for tagging, mapped to their location on disk.
    private static Map<Corpus, String> corpora = new HashMap<Corpus, String>();
    // The corpus selected for this run; null when tagging a text file or command line text.
    private static Corpus corpus;
    // Directory receiving XMI serializations of tagged documents; null disables XMI output.
    private static String pathToXMIOutput;
    
    // Pool of reusable JCas instances shared with the worker threads; access is guarded by synchronizing on the list.
    private static List<JCas> jcases = null;

    /**
     * Initializes {@link ChemSpotConfiguration} from the given properties file and copies
     * its settings into the static fields of this class.
     *
     * If the file cannot be found or read, an error is printed and the method returns
     * without touching any field. Corpora whose configured path does not exist are not
     * registered; a warning about them is printed instead. Finally, every deactivated
     * component and every disabled annotation type is reported on standard output.
     *
     * @param pathToPropertiesFile location of the configuration file
     */
    private static void initializeFromConfigurationFile(String pathToPropertiesFile) {
        System.out.println("Loading configuration file...");
        try {
            ChemSpotConfiguration.initialize(pathToPropertiesFile);
        } catch (FileNotFoundException missingFile) {
            System.out.println("ERROR: The configuration file \"" + pathToPropertiesFile + "\" was not found.");
            return;
        } catch (IOException readFailure) {
            System.out.println("ERROR: A problem occurred while reading the properties file \"" + pathToPropertiesFile + "\"");
            readFailure.printStackTrace();
            return;
        }

        // model and resource locations
        pathToSentenceFile = ChemSpotConfiguration.getSentenceModelPath();
        pathToModelFile = ChemSpotConfiguration.getCRFModelPath();
        pathToDictionaryFile = ChemSpotConfiguration.getDictionaryPath();
        pathToEumedModel = ChemSpotConfiguration.getDrugModelPath();

        // output settings
        pathToOutputFile = ChemSpotConfiguration.getOutputPath();
        pathToXMIOutput = ChemSpotConfiguration.getXMIOutputPath();
        convertToIOB = ChemSpotConfiguration.isConvertToIob();

        // evaluation settings
        evaluate = ChemSpotConfiguration.isEvaluate();
        detailedEvaluation = ChemSpotConfiguration.isDetailedEvaluation();

        // threading settings
        threaded = ChemSpotConfiguration.isThreading();
        threadNr = ChemSpotConfiguration.getNumberOfThreads();

        pathToIDsFile = ChemSpotConfiguration.getIdsFilePath();

        // sort every configured corpus into "exists on disk" and "missing"
        Map<Corpus, String> missing = new HashMap<Corpus, String>();
        for (Corpus candidate : Corpus.values()) {
            String location = ChemSpotConfiguration.getPathToCorpus(candidate);
            if (location == null) {
                continue;
            }
            Map<Corpus, String> target = new File(location).exists() ? corpora : missing;
            target.put(candidate, location);
        }

        // warn about configured corpora that could not be found
        if (!missing.isEmpty()) {
            int definedCount = corpora.size() + missing.size();
            String existing = corpora.isEmpty() ? "none" : "only " + corpora.size();
            System.out.printf("WARNING: %d corpora were defined, but %s actually exist. Please check your configuration file at \"%s\"%n",
                    definedCount, existing, pathToPropertiesFile);
            // the individual entries are only listed when at least one corpus does exist
            if (!corpora.isEmpty()) {
                System.out.println("Non-existing corpora:");
                for (Map.Entry<Corpus, String> entry : missing.entrySet()) {
                    System.out.println("  " + entry.getKey() + " --> " + entry.getValue());
                }
            }
        }

        // report deactivated components
        for (Component component : Component.values()) {
            if (ChemSpotConfiguration.useComponent(component)) {
                continue;
            }
            String readableName = component.toString().replace('_', ' ').toLowerCase();
            System.out.printf("%s component is deactivated%n", readableName);
        }

        // report annotation types that are disabled completely or only for the eumed tagger
        for (ChemicalType type : ChemicalType.values()) {
            if (!ChemSpotConfiguration.isAnnotate(type)) {
                System.out.printf("Annotation of %s is disabled%n", type.toString());
                continue;
            }
            if (!ChemSpotConfiguration.isAnnotateEumed(type)) {
                System.out.printf("Annotation of %s is disabled for eumed tagger%n", type.toString());
            }
        }
    }
    
    /**
     * Interactively asks the user on standard input which of the registered corpora
     * should be used.
     *
     * The answer may be either the 1-based index shown in the menu or the (case
     * insensitive) name of a corpus. Surrounding whitespace is ignored. The menu is
     * repeated until a valid choice was made.
     *
     * @return the corpus chosen by the user, never {@code null}
     * @throws IOException if no corpora are registered, if reading from standard input
     *         fails, or if standard input is closed before a valid choice was made
     */
    private static Corpus promptForCorpus() throws IOException {
        if (corpora.isEmpty()) {
            throw new IOException("There are no corpora defined.");
        }

        // System.in is deliberately not closed: it must stay usable for the rest of the program
        BufferedReader reader = new BufferedReader(new InputStreamReader(System.in));

        List<Corpus> definedCorpora = new ArrayList<Corpus>(corpora.keySet());
        Collections.sort(definedCorpora);

        while (true) {
            System.out.println();
            System.out.println("There are several corpora defined. Which one would you like to use?");

            int i = 1;
            for (Corpus candidate : definedCorpora) {
                System.out.printf("%d: %s%n", i++, candidate);
            }
            System.out.println();

            String line = reader.readLine();
            if (line == null) {
                // end of stream (e.g. piped or closed stdin): asking again can never succeed
                throw new IOException("No corpus was selected: standard input was closed.");
            }
            String input = line.trim();

            // first interpretation: the input is an index into the menu
            try {
                int k = Integer.parseInt(input);

                if (k > 0 && k <= definedCorpora.size()) {
                    return definedCorpora.get(k - 1);
                }
                System.out.println(k + " is not a valid index. Please try again.");
                continue;
            } catch (NumberFormatException e) {
                // not a number, fall through and try to interpret the input as a corpus name
            }

            // second interpretation: the input is the name of a corpus
            try {
                Corpus named = Corpus.valueOf(input.toUpperCase());

                if (definedCorpora.contains(named)) {
                    return named;
                }
                System.out.println("The " + named + " corpus is not defined. Please try again.");
                continue;
            } catch (IllegalArgumentException e) {
                // not a corpus name either, report below and ask again
            }

            System.out.println((input.isEmpty() ? "Your input" : input) + " is neither a valid index nor corpus name. Please try again.");
        }
    }
    
    /**
     * Command line entry point of ChemSpot.
     *
     * Settings are taken from the configuration file (if one is given) and then
     * overridden by explicit command line options. Afterwards exactly one of the
     * following is tagged: text given on the command line, a single text file, or a
     * document collection (corpus). If more than one corpus is registered, the user is
     * asked which one to use.
     *
     * @param args the command line arguments, parsed into {@link ChemSpotArguments}
     * @throws UIMAException if a UIMA component (type system, reader, CAS) cannot be created or fails
     * @throws IOException if a resource cannot be read or the corpus selection fails
     */
    public static void main(String[] args) throws UIMAException, IOException {
        try {
            // read arguments
            arguments = CliFactory.parseArguments(ChemSpotArguments.class, args);

            // load properties from file, if there is one
            if (arguments.isPathToPropertiesFile()) {
                initializeFromConfigurationFile(arguments.getPathToPropertiesFile());
            }

            // command line parameters take precedence over the configuration file
            if (arguments.isPathToCRFModelFile()) {
                pathToModelFile = arguments.getPathToCRFModelFile();
            }
            if (arguments.isPathToSentenceModelFile()) {
                pathToSentenceFile = arguments.getPathToSentenceModelFile();
            }
            if (arguments.isPathToXMIOutput()) {
                // this is an output location, it must not be registered as an input corpus
                pathToXMIOutput = arguments.getPathToXMIOutput();
            }
            if (arguments.isPathToDictionary()) {
                pathToDictionaryFile = arguments.getPathToDictionary();
            }
            if (arguments.isPathToIDs()) {
                pathToIDsFile = arguments.getPathToIDs();
            }
            if (arguments.isPathToEumedModelFile()) {
                pathToEumedModel = arguments.getPathToEumedModelFile();
            }
            if (arguments.isThreadNr()) {
                threaded = true;
                threadNr = arguments.getThreadNr();
            }
            if (arguments.isPathToTextFile()) {
                pathToTextFile = arguments.getPathToTextFile();
            } else if (arguments.isTagCommandLine()) {
                tagFromCommandLine = arguments.getTagCommandLine();
            } else {
                if (arguments.isPathToIOBCorpora()) {
                    corpora.put(Corpus.IOB, arguments.getPathToIOBCorpora());
                }
                if (arguments.isPathToGZCorpus()) {
                    corpora.put(Corpus.GZ, arguments.getPathToGZCorpus());
                }
                if (arguments.isPathToCRAFTCorpus()) {
                    corpora.put(Corpus.CRAFT, arguments.getPathToCRAFTCorpus());
                }
                if (arguments.isPathToXMICorpus()) {
                    corpora.put(Corpus.XMI, arguments.getPathToXMICorpus());
                }
                if (arguments.isPathToNaCTeMCorpus()) {
                    corpora.put(Corpus.NACTEM, arguments.getPathToNaCTeMCorpus());
                }
                if (arguments.isPathToPatentCorpus()) {
                    corpora.put(Corpus.PATENT, arguments.getPathToPatentCorpus());
                }
                if (arguments.isPathToDDICorpus()) {
                    corpora.put(Corpus.DDI, arguments.getPathToDDICorpus());
                }
                if (arguments.isPathToTextCorpus()) {
                    corpora.put(Corpus.TXT, arguments.getPathToTextCorpus());
                }
                if (arguments.isPathToCHEMDNERCorpus()) {
                    corpora.put(Corpus.CHEMDNER, arguments.getPathToCHEMDNERCorpus());
                }
                if (arguments.isUpdate()) {
                    if (pathToDictionaryFile != null && pathToIDsFile != null) {
                        try {
                            DictionaryUpdater.initialize();
                            DictionaryUpdater.updateFiles(new File(pathToDictionaryFile), new File(pathToIDsFile), ChemSpotConfiguration.isRemoveTemporaryUpdateFiles());
                            System.out.println("Update successful.");
                        } catch (IOException e) {
                            System.out.println("Update failed.");
                            e.printStackTrace();
                        }
                    } else {
                        System.out.println("You need to specify a dictionary and id file for update");
                    }
                }

                if (corpora.isEmpty()) {
                    if (arguments.isUpdate()) {
                        // an update without anything to tag is a complete run
                        System.exit(0);
                    } else {
                        System.out.println("At least one corpus, a text file or a command line argument has to be provided!");
                        usage();
                    }
                }

                if (corpora.size() == 1) {
                    corpus = corpora.keySet().iterator().next();
                } else {
                    corpus = promptForCorpus();
                }
            }

            // flags given on the command line can only switch features on, never off
            detailedEvaluation = arguments.isDetailedEvaluation() ? true : detailedEvaluation;
            evaluate = detailedEvaluation || arguments.isRunEvaluation() ? true : evaluate;
            convertToIOB = arguments.isConvertToIOB() ? true : convertToIOB;

            if (arguments.isPathToOutputFile()) {
                pathToOutputFile = arguments.getPathToOutputFile();
            } else if (pathToOutputFile != null && corpus != null) {
                // a configured output path gets one sub directory per corpus
                pathToOutputFile = pathToOutputFile + corpus + "/";
            }
        } catch (ArgumentValidationException e) {
            System.out.println(e);
            usage();
            System.exit(0);
        }

        //initializing ChemSpot with a CRF model file and an LINNAEUS automaton (the latter is optional)
        ChemSpot chemspot = new ChemSpot(pathToModelFile, pathToDictionaryFile, pathToSentenceFile, pathToIDsFile, pathToEumedModel);

        TypeSystemDescription typeSystem = UIMAFramework.getXMLParser().parseTypeSystemDescription(new XMLInputSource(chemspot.getClass().getClassLoader().getResource("desc/TypeSystem.xml")));

        if (tagFromCommandLine != null) {
            // tag from command line and print one tab separated line per mention
            List<Mention> mentions = runChemSpot(chemspot, typeSystem, tagFromCommandLine, pathToOutputFile, false);
            for (Mention mention : mentions) {
                System.out.printf("%d\t%d\t%s\t%s\t%s\n",
                     mention.getStart(), mention.getEnd(), mention.getText(),
                     mention.getCHID(), mention.getSource());
            }
        } else if (arguments.isPathToTextFile()) {
            // tag a single (optionally gzipped) text file
            JCas jcas = JCasFactory.createJCas(typeSystem);

            if (arguments.isZippedTextFile()) {
                ChemSpot.readGZFile(jcas, pathToTextFile);
            } else {
                ChemSpot.readFile(jcas, pathToTextFile);
            }

            runChemSpot(chemspot, jcas, pathToOutputFile, false);
        } else if (corpus != null) {
            // tag document collection
            String pathToCorpus = corpora.get(corpus);

            CollectionReader reader = null;
            switch (corpus) {
            case IOB:
                reader = createReaderFromDescriptor(typeSystem, "desc/cr/ScaiCorpusCR.xml",
                        "InputDirectory", pathToCorpus, "UseGoldStandardAnnotations", true, "GoldstandardTypeSuffix", "",
                        "BrowseSubdirectories", true, "IncludeSuffixes", new String[]{"iob", "iob2"});
                break;
            case GZ:
                reader = createReaderFromDescriptor(typeSystem, "desc/cr/ZipFileCR.xml", "InputDirectory", pathToCorpus);
                break;
            case CRAFT:
                reader = createReaderFromDescriptor(typeSystem, "desc/cr/CraftCR.xml", XmiCollectionReader.PARAM_INPUTDIR, pathToCorpus);
                break;
            case NACTEM:
                reader = createReaderFromDescriptor(typeSystem, "desc/cr/NaCTeMCollectionReader.xml", XmiCollectionReader.PARAM_INPUTDIR, pathToCorpus);
                break;
            case PATENT:
                reader = createReaderFromDescriptor(typeSystem, "desc/cr/PatentCorpusCollectionReader.xml", XmiCollectionReader.PARAM_INPUTDIR, pathToCorpus);
                break;
            case TXT:
                reader = CollectionReaderFactory.createCollectionReader(FileSystemCollectionReader.class, FileSystemCollectionReader.PARAM_INPUTDIR, pathToCorpus);
                break;
            case XMI:
                reader = CollectionReaderFactory.createCollectionReader(XmiCollectionReader.class, XmiCollectionReader.PARAM_INPUTDIR, pathToCorpus);
                break;
            case DDI:
                reader = createReaderFromDescriptor(typeSystem, "desc/cr/DDICorpusCR.xml", DDICorpusCR.PARAM_INPUTDIR, pathToCorpus, DDICorpusCR.PARAM_SUBDIR, true);
                break;
            case CHEMDNER:
                reader = createReaderFromDescriptor(typeSystem, "desc/cr/CHEMDNERCorpusCR.xml", CHEMDNERReader.PARAM_INPUTDIR, pathToCorpus);
                break;
            default:
                throw new IOException("Corpus " + corpus + " does not match any known format.");
            }

            tagCollection(chemspot, typeSystem, reader, threaded, threadNr);
        }
    }

    /**
     * Creates a collection reader from an XML descriptor that is looked up as a class
     * path resource.
     *
     * @param typeSystem the type system whose class loader is used to locate the descriptor
     * @param descriptorResource resource path of the collection reader descriptor
     * @param parameters configuration data as alternating parameter names and values
     * @return the configured collection reader
     * @throws UIMAException if the descriptor is invalid or the reader cannot be initialized
     * @throws IOException if the descriptor cannot be read
     */
    private static CollectionReader createReaderFromDescriptor(TypeSystemDescription typeSystem, String descriptorResource, Object... parameters) throws UIMAException, IOException {
        XMLInputSource descriptor = new XMLInputSource(typeSystem.getClass().getClassLoader().getResource(descriptorResource));
        return CollectionReaderFactory.createCollectionReader(UIMAFramework.getXMLParser().parseCollectionReaderDescription(descriptor), parameters);
    }
    
    /**
     * Tags a plain text string.
     *
     * A fresh JCas is created for the text and a single {@link PubmedDocument}
     * annotation with an empty PubMed id spanning the whole text is added before the
     * actual tagging is delegated to the JCas based overload.
     *
     * @param chemspot the tagger to run
     * @param typeSystem type system used to create the JCas
     * @param text the text to tag
     * @param outputPath file the results are written to, passed on unchanged
     * @param evaluate whether to evaluate against gold standard annotations, passed on unchanged
     * @return the mentions found; an empty list if the JCas could not be created
     */
    private static List<Mention> runChemSpot(ChemSpot chemspot, TypeSystemDescription typeSystem, String text, String outputPath, boolean evaluate) {
        JCas textCas = null;
        try {
            textCas = JCasFactory.createJCas(typeSystem);
        } catch (UIMAException creationFailure) {
            creationFailure.printStackTrace();
        }
        if (textCas == null) {
            // without a CAS nothing can be tagged
            return new ArrayList<Mention>();
        }

        textCas.setDocumentText(text);

        // one document annotation covering the complete text
        PubmedDocument wholeText = new PubmedDocument(textCas);
        wholeText.setBegin(0);
        wholeText.setEnd(text.length());
        wholeText.setPmid("");
        wholeText.addToIndexes(textCas);

        return runChemSpot(chemspot, textCas, outputPath, evaluate);
    }
    
    /**
     * Removes all named entities that do not belong to the gold standard from the
     * indexes of the given JCas.
     *
     * The distinct sources of the removed entities are printed, in order of first
     * occurrence, if there are any.
     *
     * @param jcas the CAS to clean up
     * @return the entities that were removed from the indexes
     */
    private static List<NamedEntity> removeOtherEntities(JCas jcas) {
        List<NamedEntity> removed = new ArrayList<NamedEntity>();
        List<String> seenSources = new ArrayList<String>();

        // collect first, remove afterwards: the index must not change while it is iterated
        for (Iterator<NamedEntity> it = JCasUtil.iterator(jcas, NamedEntity.class); it.hasNext();) {
            NamedEntity candidate = it.next();
            String source = candidate.getSource();
            if (!Constants.GOLDSTANDARD.equals(source)) {
                if (!seenSources.contains(source)) {
                    seenSources.add(source);
                }
                removed.add(candidate);
            }
        }

        for (NamedEntity entity : removed) {
            entity.removeFromIndexes();
        }

        if (!seenSources.isEmpty()) {
            System.out.println("found pre-exisiting entities from: " + seenSources);
        }

        return removed;
    }
    
    // Evaluator for entities that were already present in a document before ChemSpot tagged it.
    private static ChemicalNEREvaluator otherEvaluator = new ChemicalNEREvaluator();

    /**
     * Tags the document contained in the given JCas and writes the configured outputs.
     *
     * Entities that are already present and do not stem from the gold standard are
     * evaluated with {@code otherEvaluator} and removed before tagging. If the CAS has
     * no {@link PubmedDocument} annotation, one with an empty PubMed id spanning the
     * whole document text is added.
     *
     * The regular output (IOB or serialized annotations) is written to
     * {@code outputPath} if an output path is configured. If an XMI output directory
     * is configured, the CAS is additionally serialized to
     * {@code <xmi directory>/<base name of outputPath>.xmi}. I/O problems while writing
     * are reported on the console and do not abort tagging.
     *
     * @param chemspot the tagger to run
     * @param jcas the CAS holding the document
     * @param outputPath file the results are written to; {@code null} disables all file output
     * @param evaluate whether the tagged document is passed to the ChemSpot evaluator
     * @return the mentions found by ChemSpot
     */
    private static List<Mention> runChemSpot(ChemSpot chemspot, JCas jcas, String outputPath, boolean evaluate) {
        boolean hasOtherEntities = false;

        for (NamedEntity ne : JCasUtil.iterate(jcas, NamedEntity.class)) {
            if (!Constants.GOLDSTANDARD.equals(ne.getSource())) {
                hasOtherEntities = true;
                break;
            }
        }

        if (hasOtherEntities) {
            System.out.println("Pre-existing entities found in document. Evaluating and removing them.");
            otherEvaluator.evaluate(jcas);
            removeOtherEntities(jcas);
        }

        if (!JCasUtil.iterator(jcas, PubmedDocument.class).hasNext()) {
            PubmedDocument pd = new PubmedDocument(jcas);
            pd.setBegin(0);
            pd.setEnd(jcas.getDocumentText().length());
            pd.setPmid("");
            pd.addToIndexes(jcas);
        }

        List<Mention> mentions = chemspot.tag(jcas);
        if (evaluate) {
            chemspot.getEvaluator().evaluate(jcas);
        }

        if (pathToOutputFile != null && outputPath != null) {
            String output = convertToIOB ? ChemSpot.convertToIOB(jcas) : ChemSpot.serializeAnnotations(jcas);
            try {
                FileWriter outputFile = new FileWriter(new File(outputPath));
                try {
                    outputFile.write(output);
                } finally {
                    // release the file handle even if writing failed
                    outputFile.close();
                }
                System.out.println("Output written to: " + outputPath);
            } catch (IOException e) {
                System.err.println("Error while writing ChemSpot output");
                e.printStackTrace();
            }
        }

        if (pathToXMIOutput != null && outputPath != null) {
            // work on a local copy: this method may run concurrently and must not modify the shared setting
            String xmiDirectory = pathToXMIOutput;
            if (!xmiDirectory.endsWith("/") && !xmiDirectory.endsWith("\\")) {
                xmiDirectory += "/";
            }

            try {
                // <xmi directory>/<output file name without directory and extension>.xmi
                File xmiOutputFile = new File(xmiDirectory + outputPath.replaceFirst(".*/", "").replaceFirst("\\.[^\\.]+$", "") + ".xmi");
                xmiOutputFile.getParentFile().mkdirs();

                OutputStream out = new FileOutputStream(xmiOutputFile);
                try {
                    XmiCasSerializer serializer = new XmiCasSerializer(jcas.getTypeSystem());
                    XMLSerializer xmlSerializer = new XMLSerializer(out, false);
                    serializer.serialize(jcas.getCas(), xmlSerializer.getContentHandler());
                } finally {
                    // release the file handle even if serialization failed
                    out.close();
                }

                System.out.println("XMI file written to: " + xmiOutputFile.getCanonicalPath());
            } catch (SAXException e) {
                e.printStackTrace();
            } catch (IOException e) {
                // also covers FileNotFoundException
                e.printStackTrace();
            }
        }

        return mentions;
    }
    
    /**
     * Tags every document delivered by the given collection reader.
     *
     * For each document an output file name is derived, in this order of preference,
     * from the document's source URI, from the configured output file name (only if
     * the collection consists of a single document), from the PubMed id of the
     * document, or from a running number. Documents are tagged either sequentially or,
     * if {@code threaded} is set, by a fixed size thread pool that works on a pool of
     * reusable JCas instances. After all documents have been processed, detailed
     * evaluation results are written if detailed evaluation is enabled.
     *
     * @param chemspot the tagger to run
     * @param typeSystem type system used to create the JCas instances
     * @param reader source of the documents
     * @param threaded whether to tag the documents in parallel
     * @param threads number of worker threads and pooled JCas instances when {@code threaded} is set
     * @throws CollectionException if the reader fails to deliver a document
     * @throws UIMAException if a JCas cannot be created
     * @throws IOException if reading a document or writing evaluation results fails
     */
    private static void tagCollection(ChemSpot chemspot, TypeSystemDescription typeSystem, CollectionReader reader, boolean threaded, int threads) throws CollectionException, UIMAException, IOException {
        ExecutorService threadPool = threaded ? Executors.newFixedThreadPool(threads) : null;
        int runNr = 1;

        // determine output path (if there is one) and separate it into directory and filename;
        // the XMI output location serves as fallback when no regular output path is configured
        String configuredOutput = pathToOutputFile != null ? pathToOutputFile : pathToXMIOutput;
        File outputPath = configuredOutput != null ? new File(configuredOutput) : null;
        String filename = null;
        String outputPathString = null;
        if (outputPath != null) {
            // a name containing a dot is taken to be a file, everything else a directory
            if (outputPath.getName().contains(".")) {
                filename = outputPath.getName();
                outputPath = outputPath.getAbsoluteFile().getParentFile();
            }

            if (!outputPath.exists()) {
                outputPath.mkdirs();
            }
            outputPathString = outputPath.getCanonicalPath().replaceAll("\\\\", "/");
            outputPathString = !outputPathString.endsWith("/") ? outputPathString + "/" : outputPathString;
        }

        JCas jcas = JCasFactory.createJCas(typeSystem);

        if (threaded) {
            // one reusable JCas per worker thread
            jcases = new ArrayList<JCas>();
            for (int i = 0; i < threads; i++) {
                jcases.add(JCasFactory.createJCas(typeSystem));
            }
        }

        String fileType = convertToIOB ? ".iob" : ".chem";

        try {
            while (reader.hasNext()) {
                jcas.reset();
                reader.getNext(jcas.getCas());

                String outputFilePath = null;

                // prepare output file
                if (outputPath != null) {
                    Iterator<SourceDocumentInformation> srcIterator = JCasUtil.iterator(jcas, SourceDocumentInformation.class);
                    if (filename == null && srcIterator.hasNext()) {
                        SourceDocumentInformation src = srcIterator.next();
                        // replace the directory part of the source URI by the output directory;
                        // the replacement is quoted so that '$' or '\' in the path are taken literally
                        outputFilePath = src.getUri().replaceFirst(".*/", java.util.regex.Matcher.quoteReplacement(outputPathString)) + fileType;
                    } else {
                        if (runNr == 1 && !reader.hasNext() && filename != null) {
                            // simply use the filename if we are just tagging one file
                            outputFilePath = outputPathString + filename;
                        } else {
                            // otherwise try using the pubmed id of the document as filename
                            Collection<PubmedDocument> documents = JCasUtil.select(jcas, PubmedDocument.class);
                            if (documents.size() == 1) {
                                String pmId = documents.iterator().next().getPmid();

                                if (pmId != null && !pmId.isEmpty()) {
                                    outputFilePath = outputPathString + pmId + fileType;
                                }
                            }
                        }

                        // or generate a generic filename
                        if (outputFilePath == null) {
                            String prefix = "";
                            if (filename != null) {
                                int prefixPos = filename.indexOf('.') > -1 ? filename.indexOf('.') : filename.length();
                                prefix = filename.substring(0, prefixPos);
                            }
                            outputFilePath = String.format("%s%s%04d%s", outputPathString, prefix, runNr, fileType);
                        }
                    }
                }

                if (threaded) {
                    // wait for a free JCas; the pool is only ever inspected while holding its lock,
                    // which is the same lock the workers hold when they return an instance
                    JCas threadJCas = null;
                    while (threadJCas == null) {
                        synchronized (jcases) {
                            if (!jcases.isEmpty()) {
                                threadJCas = jcases.remove(0);
                            }
                        }
                        if (threadJCas == null) {
                            try {
                                Thread.sleep(1000);
                            } catch (InterruptedException ignored) {
                                // keep waiting: a document that was already read must not be dropped
                            }
                        }
                    }

                    // the instance is exclusively ours now, so it can be filled outside the lock
                    threadJCas.reset();
                    CasCopier.copyCas(jcas.getCas(), threadJCas.getCas(), true);

                    ChemSpotRun run = new ChemSpotRun(runNr, chemspot, threadJCas, outputFilePath, evaluate);
                    threadPool.submit(run);
                } else {
                    runChemSpot(chemspot, jcas, outputFilePath, evaluate);
                }

                runNr++;
                System.out.println();
            }
        } finally {
            if (threadPool != null) {
                // shut down thread pool and block until termination, also when reading failed,
                // so that no worker threads are left behind
                threadPool.shutdown();
                try {
                    threadPool.awaitTermination(Long.MAX_VALUE, TimeUnit.DAYS);
                } catch (InterruptedException e) {
                    e.printStackTrace();
                    Thread.currentThread().interrupt();
                }
            }
        }

        if (detailedEvaluation) {
            chemspot.getEvaluator().writeDetailedEvaluationResults(outputPathString);

            if (otherEvaluator.getTP() + otherEvaluator.getFN() + otherEvaluator.getFP() > 0) {
                // without an output directory the reports go to the working directory
                String reportDirectory = outputPathString != null ? outputPathString : "";

                File normalizedFoundFile = new File(reportDirectory + "normalizations-correct-by-ChemSpot.txt");
                writeNormalizationComparison(chemspot, normalizedFoundFile, true);
                System.out.println("Pre-existing normalized entities found by ChemSpot written to: " + normalizedFoundFile.getName());

                File notNormalizedFoundFile = new File(reportDirectory + "normalizations-not-correct-by-ChemSpot.txt");
                writeNormalizationComparison(chemspot, notNormalizedFoundFile, false);
                System.out.println("Pre-existing normalized entities not found by ChemSpot written to: " + notNormalizedFoundFile.getName());
            }
        }
    }

    /**
     * Writes the normalizations of the pre-existing entities, restricted by whether
     * ChemSpot normalized them correctly as well, to the given file.
     *
     * @param chemspot the tagger whose evaluator supplies the correctly normalized mentions
     * @param target the file to write
     * @param foundByChemSpot {@code true} to keep only mentions that are also contained in
     *        ChemSpot's correct normalizations, {@code false} to keep only those that are not
     * @throws UIMAException if the evaluator fails
     * @throws IOException if the file cannot be written
     */
    private static void writeNormalizationComparison(ChemSpot chemspot, File target, boolean foundByChemSpot) throws UIMAException, IOException {
        List<Mention> normalizedAll = new ArrayList<Mention>(otherEvaluator.getNormalizedAll());
        List<Mention> normalized = new ArrayList<Mention>(otherEvaluator.getNormalized());
        List<Mention> normalizedCorrect = new ArrayList<Mention>(otherEvaluator.getNormalizedCorrect());

        if (foundByChemSpot) {
            normalized.retainAll(chemspot.getEvaluator().getNormalizedCorrect());
            normalizedAll.retainAll(chemspot.getEvaluator().getNormalizedCorrect());
            normalizedCorrect.retainAll(chemspot.getEvaluator().getNormalizedCorrect());
        } else {
            normalized.removeAll(chemspot.getEvaluator().getNormalizedCorrect());
            normalizedAll.removeAll(chemspot.getEvaluator().getNormalizedCorrect());
            normalizedCorrect.removeAll(chemspot.getEvaluator().getNormalizedCorrect());
        }

        FileOutputStream writer = new FileOutputStream(target);
        try {
            otherEvaluator.writeNormalizations(writer, normalizedAll, normalized, normalizedCorrect);
        } finally {
            // release the file handle even if writing failed
            writer.close();
        }
    }
    
    /**
     * A single tagging job for the thread pool: tags one document held in a pooled
     * JCas and hands the JCas back to the pool afterwards.
     */
    private static class ChemSpotRun implements Runnable {
        private final int runNr;
        private final ChemSpot chemspot;
        private final JCas jCas;
        private final String outputFile;
        private final boolean evaluate;

        /**
         * @param runNr running number of the document, used for console messages only
         * @param chemspot the tagger to run
         * @param jCas pooled JCas holding the document; returned to the pool when the run ends
         * @param outputFile file the results are written to, may be {@code null}
         * @param evaluate whether the tagged document is evaluated
         */
        public ChemSpotRun (int runNr, ChemSpot chemspot, JCas jCas, String outputFile, boolean evaluate) {
            this.runNr = runNr;
            this.chemspot = chemspot;
            this.jCas = jCas;
            this.outputFile = outputFile;
            this.evaluate = evaluate;
        }

        /**
         * Tags the document. A failure is reported on the console instead of being lost in
         * the unobserved result of the thread pool, and the JCas is returned to the pool in
         * any case so that the producer never waits for an instance that will not come back.
         */
        public void run() {
            System.out.println("Starting run " + runNr);
            try {
                runChemSpot(chemspot, jCas, outputFile, evaluate);
                System.out.println("Run " + runNr + " finished");
            } catch (RuntimeException e) {
                System.err.println("Run " + runNr + " failed");
                e.printStackTrace();
            } finally {
                synchronized(jcases) {
                    jcases.add(jCas);
                }
            }
        }
    }

	/**
	 * Prints the command line help to standard output and terminates the JVM with
	 * exit code 0.
	 */
	private static void usage() {
		// one entry per output line; an empty entry yields a blank line
		final String[] helpLines = {
			"usage:",
			"  arguments:",
			"\t-m path to a CRF model file (internal default model file will be used if not provided)",
			"\t-s path to a OpenNLP sentence model file (internal default model file will be used if not provided)",
			"\t-d path to a zipped set of brics dictionary automata (parameter defaults to 'dict.zip' if not provided)",
			"\t-i path to a zipped tab-separated text file representing a map of terms to ids (parameter defaults to 'ids.zip' if not provided)",
			"\t-M path to a multi-class model file (parameter defaults to 'multiclass.bin' if not provided)",
			"",
			"  flags:",
			"\t-e if this flag is set, the performance of ChemSpot on an IOB gold-standard corpus (cf. -c) is evaluated",
			"\t-u if this flag is set, ChemSpot will update the dictionary and ids file",
			"\t-T number of threads to create when processing a document collection",
			"",
			"  input control:",
			"\t-c path to a directory containing corpora in IOB format",
			"\t-g path to a directory containing gzipped text files",
			"\t-t path to a text file",
			"\t-f path to a directory of text files",
			"",
			"  output control:",
			"\t-o path to output file",
			"\t-I if this flag is set, the output will be converted into the IOB format"
		};

		for (String helpLine : helpLines) {
			System.out.println(helpLine);
		}

		System.exit(0);
	}
}