ChemSpot.java example

Explorer
ChemSpot-master
- src
  - main
    - java
      - de
        berlin
        hu
        banner
        featuresets
        KlingerLikeFeatureSet.java
        LWhitespace.java
        RWhitespace.java
        util
        ConfigUtil.java
        chemspot
        App.java
        ChemSpot.java
        ChemSpotArguments.java
        ChemSpotConfiguration.java
        ChemSpotFactory.java
        ChemicalNEREvaluator.java
        Mention.java
        uima
        ae
        AnnotationImporterAE.java
        AnnotationMergerAE.java
        expander
        MentionExpander.java
        feature
        FeatureGeneratorApp.java
        FeatureToken.java
        FeatureTokenGenerator.java
        filter
        PosFilter.java
        StopwordFilter.java
        SuffixFilter.java
        normalizer
        Normalizer.java
        StringComparator.java
        tagger
        abbrev
        AbbreviationTagger.java
        ExtractAbbrev.java
        banner
        BannerTagger.java
        CRFWrapper.java
        brics
        BricsMatcher.java
        BricsTagger.java
        DictionaryUpdater.java
        drug
        EumedNERTagger.java
        simple
        ChemicalFormulaTagger.java
        tokenizer
        FineTokenizerAE.java
        cc
        banner
        trainer
        BannerTrainer.java
        eval
        ComparableAnnotation.java
        Evaluation.java
        SeparateEvaluation.java
        cr
        chemdner
        CHEMDNERReader.java
        craft
        CraftCR.java
        ddi
        DDICorpusCR.java
        parser
        DDICorpusContentHandlerImpl.java
        iob
        IOBDirectoryCollectionReader.java
        txt
        gz
        ZipFileCollectionReader.java
        xml
        NaCTeMCollectionReader.java
        PatentCorpusCollectionReader.java
        XMLCollectionReader.java
        util
        DDIToUCompareConverter.java
        OpenNLPToUCompareSentenceConverterAE.java
        OpenNLPToUCompareTokenConverterAE.java
        Util.java
        util
        Constants.java
        wbi
        common
        research
        EvalMeasures.java
        Evaluator.java
    - types
/*
 * Copyright (c) 2012. Humboldt-Universität zu Berlin, Dept. of Computer Science and Dept.
 * of Wissensmanagement in der Bioinformatik
 * -------------------------------
 *
 * THE ACCOMPANYING PROGRAM IS PROVIDED UNDER THE TERMS OF THIS COMMON PUBLIC
 * LICENSE ("AGREEMENT"). ANY USE, REPRODUCTION OR DISTRIBUTION OF THE PROGRAM
 * CONSTITUTES RECIPIENT'S ACCEPTANCE OF THIS AGREEMENT.
 *
 * http://www.opensource.org/licenses/cpl1.0
 */

package de.berlin.hu.chemspot;

import de.berlin.hu.chemspot.ChemSpotConfiguration.Component;
import de.berlin.hu.types.PubmedDocument;
import de.berlin.hu.uima.ae.feature.FeatureTokenGenerator;
import de.berlin.hu.uima.ae.feature.FeatureTokenGenerator.Feature_Phase;
import de.berlin.hu.uima.ae.tagger.brics.BricsTagger;
import de.berlin.hu.uima.ae.tagger.drug.EumedNERTagger;
import de.berlin.hu.util.Constants;
import de.berlin.hu.util.Constants.ChemicalID;

import org.apache.uima.UIMAException;
import org.apache.uima.UIMAFramework;
import org.apache.uima.analysis_engine.AnalysisEngine;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.cas.CAS;
import org.apache.uima.examples.SourceDocumentInformation;
import org.apache.uima.jcas.JCas;
import org.apache.uima.resource.metadata.TypeSystemDescription;
import org.apache.uima.util.XMLInputSource;
import org.u_compare.shared.semantic.NamedEntity;
import org.u_compare.shared.syntactic.Token;
import org.uimafit.factory.AnalysisEngineFactory;
import org.uimafit.factory.JCasFactory;
import org.uimafit.util.JCasUtil;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.MappedByteBuffer;
import java.nio.channels.FileChannel;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.zip.GZIPInputStream;

public class ChemSpot {
	// Classpath location of the CRF model that the constructor falls back to when no CRF model path is given.
	private static final String CRF_MODEL_RESOURCE_PATH = "resources/banner/model.bin";
	// Classpath location of the sentence model that the constructor falls back to when no sentence model path is given.
	private static final String SENTENCE_MODEL_RESOURCE_PATH = "resources/genia/SentDetectGenia.bin.gz";
	
	// map for holding jCas objects for threads that are using the tag(String) method
	// (keyed by thread id; the map is static and therefore shared by all ChemSpot instances)
    private static Map<Long, JCas> jCases = new HashMap<Long, JCas>();
	
    // Type system parsed from desc/TypeSystem.xml; used to create the JCas objects for tag(String).
    private TypeSystemDescription typeSystem;
    // Pipeline components. Each engine is only created by the constructor when its component is
    // enabled in ChemSpotConfiguration (and, where applicable, its model/dictionary file exists);
    // otherwise the field stays null and tag(JCas) skips that step.
    private AnalysisEngine posTagger;
    private AnalysisEngine sentenceDetector;
    private AnalysisEngine sentenceConverter;
    private AnalysisEngine tokenConverter;
    private AnalysisEngine crfTagger;
    private AnalysisEngine dictionaryTagger;
    private AnalysisEngine chemicalFormulaTagger;
    private AnalysisEngine abbrevTagger;
    private AnalysisEngine drugTagger;
    private AnalysisEngine annotationMerger;
    private AnalysisEngine fineTokenizer;
    private AnalysisEngine stopwordFilter;
    private AnalysisEngine mentionExpander;
    private AnalysisEngine normalizer;
    private FeatureTokenGenerator featureGenerator;
    
    // Evaluator instance; set at the end of construction and replaceable via setEvaluator.
    private ChemicalNEREvaluator evaluator;

    /**
     * Initializes ChemSpot with the CRF and sentence models bundled on the classpath and without
     * a dictionary file, an ids file or a multi-class model file.
     */
    public ChemSpot() {
        this(null, null, null, null, null);
    }

    /**
     * Initializes ChemSpot without a dictionary file, an ids file and a multi-class model file.
     *
     * @param pathToCRFModelFile the path to a CRF model, or {@code null} to use the bundled model
     * @param pathToSentenceModelFile the path to a sentence model, or {@code null} to use the bundled model
     */
    public ChemSpot(String pathToCRFModelFile, String pathToSentenceModelFile) {
        this(pathToCRFModelFile, null, pathToSentenceModelFile, null, null);
    }

    /**
     * Initializes ChemSpot without an ids file and a multi-class model file.
     *
     * @param pathToCRFModelFile the path to a CRF model, or {@code null} to use the bundled model
     * @param pathToDictionaryFile the path to a dictionary automaton, or {@code null} for none
     * @param pathToSentenceModelFile the path to a sentence model, or {@code null} to use the bundled model
     */
    public ChemSpot(String pathToCRFModelFile, String pathToDictionaryFile, String pathToSentenceModelFile) {
        this(pathToCRFModelFile, pathToDictionaryFile, pathToSentenceModelFile, null, null);
    }

    /**
     * Initializes ChemSpot without a multi-class model file.
     *
     * @param pathToCRFModelFile the path to a CRF model, or {@code null} to use the bundled model
     * @param pathToDictionaryFile the path to a dictionary automaton, or {@code null} for none
     * @param pathToSentenceModelFile the path to a sentence model, or {@code null} to use the bundled model
     * @param pathToIDs the path to the ids file handed to the normalizer, or {@code null} for none
     */
    public ChemSpot(String pathToCRFModelFile, String pathToDictionaryFile, String pathToSentenceModelFile, String pathToIDs) {
        this(pathToCRFModelFile, pathToDictionaryFile, pathToSentenceModelFile, pathToIDs, null);
    }

    /**
     * Initializes ChemSpot and creates one analysis engine for every component that is enabled in
     * {@link ChemSpotConfiguration}.
     * <p>
     * Components that need an external file (dictionary, multi-class tagger, normalizer) are only
     * created when the corresponding path is non-null and the file exists; otherwise a message is
     * printed and the component is left out. A {@link UIMAException} or {@link IOException} raised
     * during initialization is printed to {@code System.err} and not rethrown, so the instance may
     * be left with only the components that were created before the failure.
     *
     * @param pathToCRFModelFile the path to a CRF model, or {@code null} to use the bundled model
     * @param pathToDictionaryFile the path to a dictionary automaton, or {@code null} for none
     * @param pathToSentenceModelFile the path to an OpenNLP sentence model, or {@code null} to use the bundled model
     * @param pathToIDs the path to the ids file handed to the normalizer, or {@code null} for none
     * @param pathToEumedModel the path to the multi-class tagger model, or {@code null} for none
     * @throws IllegalStateException if a required classpath resource (bundled model or descriptor) is missing
     */
    public ChemSpot(String pathToCRFModelFile, String pathToDictionaryFile, String pathToSentenceModelFile, String pathToIDs, String pathToEumedModel) {
        try {
            // converting CRF and sentence model paths to URLs to allow loading of models from jar file
            pathToCRFModelFile = toModelUrl(pathToCRFModelFile, CRF_MODEL_RESOURCE_PATH);
            pathToSentenceModelFile = toModelUrl(pathToSentenceModelFile, SENTENCE_MODEL_RESOURCE_PATH);

            typeSystem = UIMAFramework.getXMLParser().parseTypeSystemDescription(new XMLInputSource(requireResource("desc/TypeSystem.xml")));

            if (ChemSpotConfiguration.useComponent(Component.TOKENIZER)) {
                fineTokenizer = AnalysisEngineFactory.createAnalysisEngine(loadDescription("desc/ae/tokenizer/FineGrainedTokenizerAE.xml"), CAS.NAME_DEFAULT_SOFA);
                tokenConverter = AnalysisEngineFactory.createPrimitive(loadDescription("desc/ae/converter/OpenNLPToUCompareTokenConverterAE.xml"));
            }

            if (ChemSpotConfiguration.useComponent(Component.POS_TAGGER)) {
                posTagger = AnalysisEngineFactory.createAnalysisEngine(loadDescription("desc/ae/tagger/opennlp/PosTagger.xml"), CAS.NAME_DEFAULT_SOFA);
            }

            if (ChemSpotConfiguration.useComponent(Component.SENTENCE_DETECTOR)) {
                sentenceDetector = AnalysisEngineFactory.createPrimitive(loadDescription("desc/ae/tagger/opennlp/SentenceDetector.xml"), "opennlp.uima.ModelName", pathToSentenceModelFile);
                sentenceConverter = AnalysisEngineFactory.createAnalysisEngine(loadDescription("desc/ae/converter/OpenNLPToUCompareSentenceConverterAE.xml"), CAS.NAME_DEFAULT_SOFA);
            }

            if (ChemSpotConfiguration.useComponent(Component.CRF)) {
                System.out.println("Loading CRF...");
                crfTagger = AnalysisEngineFactory.createPrimitive(loadDescription("desc/banner/tagger/BANNERTaggerAE.xml"), "BannerModelFile", pathToCRFModelFile);
            }

            if (ChemSpotConfiguration.useComponent(Component.DICTIONARY)) {
                if (pathToDictionaryFile != null) {
                    if (new File(pathToDictionaryFile).exists()) {
                        System.out.println("Loading dictionary...");
                        dictionaryTagger = AnalysisEngineFactory.createPrimitive(loadDescription("desc/ae/tagger/BricsTaggerAE.xml"), "DrugBankMatcherDictionaryAutomat", pathToDictionaryFile);
                    } else {
                        System.out.println("Dictionary file '" + pathToDictionaryFile +  "' does not exist. Tagging without dictionary...");
                    }
                } else {
                    System.out.println("No dictionary location specified! Tagging without dictionary...");
                }
            }

            if (ChemSpotConfiguration.useComponent(Component.SUM_TAGGER)) {
                chemicalFormulaTagger = AnalysisEngineFactory.createAnalysisEngine(loadDescription("desc/ae/tagger/ChemicalFormulaTaggerAE.xml"), CAS.NAME_DEFAULT_SOFA);
            }

            if (ChemSpotConfiguration.useComponent(Component.ABBREV)) {
                abbrevTagger = AnalysisEngineFactory.createAnalysisEngine(loadDescription("desc/ae/tagger/AbbreviationTaggerAE.xml"), CAS.NAME_DEFAULT_SOFA);
            }

            if (ChemSpotConfiguration.useComponent(Component.EUMED_TAGGER)) {
                if (pathToEumedModel != null) {
                    if (new File(pathToEumedModel).exists()) {
                        System.out.println("Initializing multi-class tagger...");
                        drugTagger = AnalysisEngineFactory.createPrimitive(loadDescription("desc/ae/tagger/EumedTaggerAE.xml"), EumedNERTagger.PATH_TO_EUMED_MODEL, pathToEumedModel);
                    } else {
                        System.out.println("Multi-class model file '" + pathToEumedModel +  "' does not exist. Tagging without multi-class tagger...");
                    }
                } else {
                    System.out.println("No multi-class model location specified! Tagging without multi-class tagger...");
                }
            }

            if (ChemSpotConfiguration.useComponent(Component.MENTION_EXPANDER)) {
                mentionExpander = AnalysisEngineFactory.createAnalysisEngine(loadDescription("desc/ae/expander/MentionExpanderAE.xml"), CAS.NAME_DEFAULT_SOFA);
            }

            if (ChemSpotConfiguration.useComponent(Component.ANNOTATION_MERGER)) {
                annotationMerger = AnalysisEngineFactory.createAnalysisEngine(loadDescription("desc/ae/AnnotationMergerAE.xml"), CAS.NAME_DEFAULT_SOFA);
            }

            if (ChemSpotConfiguration.useComponent(Component.NORMALIZER) || ChemSpotConfiguration.useComponent(Component.CHEMHITS)) {
                if (pathToIDs != null) {
                    if (new File(pathToIDs).exists()) {
                        normalizer = AnalysisEngineFactory.createPrimitive(loadDescription("desc/ae/normalizer/NormalizerAE.xml"), "PathToIDs", pathToIDs);
                        // replaces any dictionary tagger created above with one configured with an empty dictionary path
                        if (ChemSpotConfiguration.useComponent(Component.DICTIONARY) && ChemSpotConfiguration.initializeDictionaryFromNormalizer()) {
                            dictionaryTagger = AnalysisEngineFactory.createPrimitive(loadDescription("desc/ae/tagger/BricsTaggerAE.xml"), BricsTagger.PATH_TO_DICTIONARY, "");
                        }
                    } else {
                        System.out.println("Normalization ids file '" + pathToIDs +  "' does not exist. Tagging without subsequent normalization...");
                    }
                } else {
                    System.out.println("No location for ids specified! Tagging without subsequent normalization...");
                }
            }

            if (ChemSpotConfiguration.useComponent(Component.STOPWORD_FILTER)) {
                stopwordFilter = AnalysisEngineFactory.createAnalysisEngine(loadDescription("desc/ae/filter/StopwordFilterAE.xml"), CAS.NAME_DEFAULT_SOFA);
            }

            if (ChemSpotConfiguration.useComponent(Component.FEATURE_GENERATOR)) {
                featureGenerator = new FeatureTokenGenerator();
            }

            setEvaluator(new ChemicalNEREvaluator());

            System.out.println("Finished initializing ChemSpot.");
        } catch (UIMAException e) {
            System.err.println("Failed initializing ChemSpot.");
            e.printStackTrace();
        } catch (IOException e) {
            System.err.println("Failed initializing ChemSpot.");
            e.printStackTrace();
        }
    }

    /**
     * Looks up a classpath resource through this class's class loader and fails with a
     * descriptive message instead of a bare NullPointerException when it is missing.
     */
    private java.net.URL requireResource(String resourcePath) {
        java.net.URL url = this.getClass().getClassLoader().getResource(resourcePath);
        if (url == null) {
            throw new IllegalStateException("Required classpath resource not found: " + resourcePath);
        }
        return url;
    }

    /**
     * Returns the URL string of a model: the given file path if non-null, otherwise the bundled
     * classpath resource.
     */
    private String toModelUrl(String path, String defaultResourcePath) throws IOException {
        if (path == null) {
            return requireResource(defaultResourcePath).toString();
        }
        return new File(path).toURI().toURL().toString();
    }

    /** Parses the analysis engine descriptor stored at the given classpath location. */
    private org.apache.uima.analysis_engine.AnalysisEngineDescription loadDescription(String resourcePath) throws UIMAException, IOException {
        return UIMAFramework.getXMLParser().parseAnalysisEngineDescription(new XMLInputSource(requireResource(resourcePath)));
    }

    /**
     * Collects the mentions produced by the pipeline: every named entity of the jcas whose source
     * is not the goldstandard and whose mention type is enabled for annotation in
     * {@link ChemSpotConfiguration}.
     *
     * @param jcas the jcas to read the named entities from
     * @return the matching mentions, in the iteration order of the named entity index
     */
    public static List<Mention> getMentions(JCas jcas) {
        List<Mention> collected = new ArrayList<Mention>();
        for (Iterator<NamedEntity> it = JCasUtil.iterator(jcas, NamedEntity.class); it.hasNext();) {
            NamedEntity candidate = it.next();
            if (Constants.GOLDSTANDARD.equals(candidate.getSource())) {
                // gold-standard annotations are not pipeline output
                continue;
            }
            Mention mention = new Mention(candidate);
            if (ChemSpotConfiguration.isAnnotate(mention.getType())) {
                collected.add(mention);
            }
        }
        return collected;
    }
    
    /**
     * Collects the goldstandard annotations: every named entity of the jcas whose source equals
     * {@link Constants#GOLDSTANDARD}, wrapped in a {@link Mention}.
     *
     * @param jcas the jcas to read the named entities from
     * @return the goldstandard mentions, in the iteration order of the named entity index
     */
    public static List<Mention> getGoldstandardAnnotations(JCas jcas) {
        List<Mention> gold = new ArrayList<Mention>();
        for (Iterator<NamedEntity> it = JCasUtil.iterator(jcas, NamedEntity.class); it.hasNext();) {
            NamedEntity candidate = it.next();
            if (!Constants.GOLDSTANDARD.equals(candidate.getSource())) {
                continue;
            }
            gold.add(new Mention(candidate));
        }
        return gold;
    }
    
    /**
     * Loads the whole content of a text file into the provided jcas.
     * <p>
     * The file is decoded with the platform default charset. Besides the document text, one
     * {@code PubmedDocument} with an empty pmid spanning the whole text and one
     * {@code SourceDocumentInformation} describing the file are added to the indexes.
     *
     * @param jcas the jcas that receives the document text and annotations
     * @param pathToFile the path to the text file
     * @throws IOException if the file cannot be opened or read
     */
    public static void readFile(JCas jcas, String pathToFile) throws IOException {
        File source = new File(pathToFile);
        String content = null;
        FileInputStream in = new FileInputStream(source);
        try {
            // map the complete file and decode it in one go
            FileChannel channel = in.getChannel();
            MappedByteBuffer mapped = channel.map(FileChannel.MapMode.READ_ONLY, 0, channel.size());
            content = Charset.defaultCharset().decode(mapped).toString();
        } finally {
            in.close();
        }
        int length = content.length();

        jcas.setDocumentText(content);
        PubmedDocument document = new PubmedDocument(jcas);
        document.setBegin(0);
        document.setEnd(length);
        document.setPmid("");
        document.addToIndexes(jcas);

        SourceDocumentInformation info = new SourceDocumentInformation(jcas);
        info.setUri(source.getAbsoluteFile().toURI().toString());
        info.setOffsetInSource(0);
        info.setDocumentSize((int) source.length());
        info.setBegin(0);
        info.setEnd(length);
        info.addToIndexes();
    }
    
    /**
     * Reads a gzipped file with one document per line and puts the content into the provided jcas.
     * <p>
     * Every line must have the form {@code <pmid>\t<text>}. The document text of the jcas is the
     * concatenation of the text parts (each one including its leading tab character, followed by
     * a newline); for every line a {@code PubmedDocument} with the line's pmid is added that
     * covers the text part without the trailing newline. Empty lines are skipped. The stream is
     * decoded with the platform default charset.
     *
     * @param jcas the jcas that receives the document text and annotations
     * @param pathToFile the path to the gzipped text file
     * @throws IOException if the file cannot be read or a non-empty line contains no tab character
     */
    public static void readGZFile(JCas jcas, String pathToFile) throws IOException {
        File file = new File(pathToFile);
        StringBuilder textBuffer = new StringBuilder();

        FileInputStream fileStream = new FileInputStream(file);
        BufferedReader reader = null;
        try {
            reader = new BufferedReader(new InputStreamReader(new GZIPInputStream(fileStream)));

            int lineNumber = 0;
            String line;
            // readLine() returning null is the reliable end-of-stream signal (ready() is not)
            while ((line = reader.readLine()) != null) {
                lineNumber++;
                if (line.isEmpty()) {
                    continue;
                }

                //split line into pmid and text
                int tabIndex = line.indexOf('\t');
                if (tabIndex < 0) {
                    throw new IOException("Malformed line " + lineNumber + " in '" + pathToFile
                            + "': expected '<pmid>\\t<text>'");
                }
                String pmid = line.substring(0, tabIndex);
                // the text part deliberately keeps its leading tab; serializeAnnotations
                // compensates for it when computing offsets
                String annot = line.substring(tabIndex);

                PubmedDocument pmdoc = new PubmedDocument(jcas);
                pmdoc.setPmid(pmid);
                pmdoc.setBegin(textBuffer.length());
                textBuffer.append(annot);
                pmdoc.setEnd(textBuffer.length());
                textBuffer.append("\n");
                pmdoc.addToIndexes();
            }
        } finally {
            // closing the reader also closes the wrapped streams; fall back to the raw stream
            // if wrapping it failed (e.g. the file is not in gzip format)
            if (reader != null) {
                reader.close();
            } else {
                fileStream.close();
            }
        }

        String text = textBuffer.toString();

        //put document in CAS
        jcas.setDocumentText(text);
        SourceDocumentInformation srcDocInfo = new SourceDocumentInformation(jcas);
        srcDocInfo.setUri(file.getAbsoluteFile().toURI().toString());
        srcDocInfo.setOffsetInSource(0);
        srcDocInfo.setDocumentSize((int) file.length());
        srcDocInfo.setBegin(0);
        // end excludes the final newline; clamp so that an empty file does not yield -1
        srcDocInfo.setEnd(Math.max(0, text.length() - 1));
        srcDocInfo.addToIndexes();
    }
    
    // Time stamp (in ms) of the last profiling checkpoint. The field is static and accessed
    // without synchronization, so it is shared by all threads and instances.
    private static long start = 0;

    /**
     * Prints the time that has passed since the last profiling checkpoint and starts a new
     * checkpoint. Does nothing unless the profiler component is enabled.
     *
     * @param action the label printed in front of the elapsed time
     */
    public static void printTime(String action) {
        if (!ChemSpotConfiguration.useComponent(Component.PROFILER)) {
            return;
        }
        double elapsedSeconds = (System.currentTimeMillis() - start) / 1000.0;
        System.out.printf("%s: %.1f s%n", action, elapsedSeconds);
        start = System.currentTimeMillis();
    }
    
    /**
     * Resets the profiling checkpoint: to the current time when the profiler component is
     * enabled (announcing the start on standard output), to 0 otherwise.
     */
    private static void startTimer() {
        if (ChemSpotConfiguration.useComponent(Component.PROFILER)) {
            start = System.currentTimeMillis();
        } else {
            start = 0;
        }
        if (start != 0) {
            System.out.println("start profiling...");
        }
    }
    
    /**
     * Finds chemical entities in the document of a {@code JCas} object and returns a list of mentions.
     * <p>
     * The components that were created during initialization are run on the jcas in a fixed
     * order; components that are {@code null} are skipped. The sentence detector and the POS
     * tagger run inside a block synchronized on this instance. If an engine fails with an
     * {@link AnalysisEngineProcessException}, the error is printed to {@code System.err}, the
     * remaining steps are skipped and the mentions found so far are returned.
     *
     * @param jcas contains the document text
     * @return a list of mentions
     */
    public List<Mention> tag(JCas jcas) {
        startTimer();
        try {
            runEngine(fineTokenizer, jcas, "tokenization");

            synchronized (this) {
                runEngine(sentenceDetector, jcas, "sentence detector");
                runEngine(posTagger, jcas, "POS tagger");
            }

            runEngine(tokenConverter, jcas, "token converter");
            runEngine(sentenceConverter, jcas, "sentence converter");
            runEngine(crfTagger, jcas, "crf tagger");
            runEngine(dictionaryTagger, jcas, "dictionary tagger");
            runEngine(chemicalFormulaTagger, jcas, "chemical formula tagger");
            runEngine(abbrevTagger, jcas, "abbreviation tagger");
            runEngine(drugTagger, jcas, "drug tagger");

            if (featureGenerator != null) {
                // preliminary normalization run, only done when features are generated
                if (normalizer != null) {
                    normalizer.process(jcas);
                }
                featureGenerator.process(jcas, Feature_Phase.PHASE1);
                printTime("feature generation phase 1 (+ preliminary normalization run)");
            }
            if (stopwordFilter != null) {
                // the stopword filter itself is currently not run in this pipeline (its
                // process call is disabled); only the profiling checkpoint is kept
                printTime("stopword filter");
            }
            runEngine(mentionExpander, jcas, "mention expander");
            if (featureGenerator != null) {
                featureGenerator.process(jcas, Feature_Phase.PHASE2);
                printTime("feature generator phase 2");
            }
            runEngine(annotationMerger, jcas, "annotation merger");
            if (featureGenerator != null) {
                featureGenerator.process(jcas, Feature_Phase.PHASE3);
                printTime("feature generator phase 3");
            }
            runEngine(normalizer, jcas, "normalizer");
            if (featureGenerator != null) {
                featureGenerator.process(jcas, Feature_Phase.PHASE4);
                printTime("feature generator phase 4");
            }
        } catch (AnalysisEngineProcessException e) {
            System.err.println("Failed to extract chemicals from text.");
            e.printStackTrace();
        }

        return getMentions(jcas);
    }

    /**
     * Runs a single analysis engine on the jcas and records a profiling checkpoint for it;
     * does nothing if the engine was not created.
     */
    private static void runEngine(AnalysisEngine engine, JCas jcas, String action) throws AnalysisEngineProcessException {
        if (engine != null) {
            engine.process(jcas);
            printTime(action);
        }
    }

    /**
     * Finds chemical entities in a {@code text} and returns a list of mentions.
     * <p>
     * Each calling thread gets its own {@code JCas}, which is created on the thread's first call,
     * cached in a static map under the thread id and reset and reused on later calls.
     *
     * @param text natural language text from which ChemSpot shall extract chemical entities
     * @return a list of mentions
     * @throws RuntimeException if the {@code JCas} for the calling thread cannot be created
     *         (wraps the underlying {@link UIMAException})
     */
    public List<Mention> tag(String text) {
        // get JCas object for currently executed thread
        Long threadId = Thread.currentThread().getId();

        JCas jcas;
        // The map is a plain HashMap shared by all threads, so every access (not only the
        // insertion) has to happen while holding the lock.
        synchronized (jCases) {
            jcas = jCases.get(threadId);
            // create new jcas if necessary (i.e. a thread calls this method for the first time)
            if (jcas == null) {
                try {
                    jcas = JCasFactory.createJCas(typeSystem);
                } catch (UIMAException e) {
                    throw new RuntimeException(e);
                }
                jCases.put(threadId, jcas);
            }
        }
        jcas.reset();

        // TODO: for applications that create an excessive amount of threads it would be best
        // to release the jcas object once a calling thread dies in order to reduce memory consumption.
        // This would probably require a new thread for each one that calls this method (to call its
        // thread.join() method), which seems like a bit of an overkill for applications with few threads.

        jcas.setDocumentText(text);
        PubmedDocument pd = new PubmedDocument(jcas);
        pd.setBegin(0);
        pd.setEnd(text.length());
        pd.setPmid("");
        pd.addToIndexes(jcas);
        return tag(jcas);
    }

    /**
     * Converts all annotations from jcas to the IOB format.
     * <p>
     * For every {@code PubmedDocument} a header line {@code "### <pmid>"} is written, followed by
     * one line per covered token: token text, begin and end offset relative to the document
     * begin, an empty column and {@code |<label>}. Tokens start with the label {@code O}; the
     * tokens covered by a non-goldstandard named entity are labelled {@code B-<type>} (first
     * token, followed by the tab-separated ids of the entity if there are any) and
     * {@code I-<type>} (remaining tokens). Goldstandard entities do not influence the output.
     * The labels are written to the tokens of the jcas as a side effect.
     *
     * @param jcas the jcas
     * @return the IOB representation of all documents in the jcas
     */
    public static String convertToIOB(JCas jcas) {
        StringBuilder sb = new StringBuilder();

        System.out.println("Converting annotations to IOB format...");

        Iterator<PubmedDocument> abstracts = JCasUtil.iterator(jcas, PubmedDocument.class);
        while (abstracts.hasNext()) {
            PubmedDocument pubmedAbstract = abstracts.next();
            sb.append("### ").append(pubmedAbstract.getPmid()).append("\n");
            int offset = pubmedAbstract.getBegin();

            List<Token> tokens = JCasUtil.selectCovered(Token.class, pubmedAbstract);
            for (Token token : tokens) {
                token.setLabel("O");
            }

            for (NamedEntity entity : JCasUtil.selectCovered(NamedEntity.class, pubmedAbstract)) {
                if (!Constants.GOLDSTANDARD.equals(entity.getSource())) {
                    labelEntityTokens(entity);
                }
            }

            boolean firstToken = true;
            for (Token token : tokens) {
                // pad the gap between the document begin and the first token with a blank pseudo token
                if (firstToken && (token.getBegin() - offset) != 0) {
                    sb.append(" \t0\t").append(token.getBegin() - offset).append("\t\t|O\n");
                }
                firstToken = false;
                sb.append(token.getCoveredText()).append("\t").append(token.getBegin() - offset).append("\t").append(token.getEnd() - offset).append("\t\t|").append(token.getLabel()).append("\n");
            }
        }

        return sb.toString();
    }

    /**
     * Writes the B-/I- labels of a pipeline entity to the tokens it covers. The ids of the
     * entity are appended, tab-separated, to the label of the first token.
     */
    private static void labelEntityTokens(NamedEntity entity) {
        Mention mention = new Mention(entity);

        // ids are joined with tabs; empty ids in front of the first non-empty id leave no trace
        StringBuilder ids = new StringBuilder();
        for (ChemicalID type : ChemicalID.values()) {
            String value = mention.getId(type);
            if (ids.length() > 0) {
                ids.append('\t');
            }
            if (value != null) {
                ids.append(value);
            }
        }
        String id = ids.toString();
        String labelName = mention.getType().toString();

        int firstTokenBegin = 0;
        int lastTokenEnd = 0;
        boolean first = true;
        for (Token token : JCasUtil.selectCovered(Token.class, entity)) {
            if (first) {
                token.setLabel(id.isEmpty() ? "B-" + labelName : "B-" + labelName + "\t" + id);
                first = false;
                firstTokenBegin = token.getBegin();
            } else {
                token.setLabel("I-" + labelName);
            }
            lastTokenEnd = token.getEnd();
        }
        // entity boundaries are expected to coincide with token boundaries
        assert entity.getBegin() == firstTokenBegin : (id + ": " + entity.getBegin() + " -> " + firstTokenBegin);
        assert entity.getEnd() == lastTokenEnd : (id + ": " + entity.getEnd() + " -> " + lastTokenEnd);
    }
    
    /**
     * Serializes the pipeline annotations of all documents in the jcas as tab-separated lines.
     * <p>
     * Every non-goldstandard named entity yields one line with pmid, begin, end, covered text,
     * mention type and one column per {@code ChemicalID} (empty if the id is missing). Begin and
     * end are relative to the document begin and shifted by -1 and -2 respectively (offset fix
     * for GeneView). A document without any named entity yields a single placeholder line.
     *
     * @param jcas the jcas
     * @return the serialized annotations
     */
    public static String serializeAnnotations(JCas jcas) {
        StringBuilder out = new StringBuilder();
        for (Iterator<PubmedDocument> documents = JCasUtil.iterator(jcas, PubmedDocument.class); documents.hasNext();) {
            PubmedDocument document = documents.next();
            int offset = document.getBegin();
            String pmid = document.getPmid();

            // counts every entity of the document, goldstandard ones included
            int entityCount = 0;
            Iterator<NamedEntity> entities = JCasUtil.iterator(document, NamedEntity.class, true, true);
            while (entities.hasNext()) {
                NamedEntity entity = entities.next();
                entityCount++;
                if (Constants.GOLDSTANDARD.equals(entity.getSource())) {
                    continue;
                }

                //offset fix for GeneView
                int begin = entity.getBegin() - offset - 1;
                int end = entity.getEnd() - offset - 2;
                String text = entity.getCoveredText();

                Mention mention = new Mention(entity);
                StringBuilder ids = new StringBuilder();
                for (ChemicalID type : ChemicalID.values()) {
                    String value = mention.getId(type);
                    ids.append("\t");
                    if (value != null && !value.isEmpty()) {
                        ids.append(value);
                    }
                }
                String typeName = mention.getType().toString();

                out.append(pmid).append("\t")
                        .append(begin).append("\t")
                        .append(end).append("\t")
                        .append(text).append("\t")
                        .append(typeName)
                        .append(ids.toString())
                        .append("\n");
            }

            if (entityCount == 0) {
                out.append(pmid).append("\t-1\t-1\t\\N\t\\N\t\\N");
                // one empty column per chemical id type
                for (int i = 0; i < ChemicalID.values().length; i++) {
                    out.append("\t");
                }
                out.append("\n");
            }
        }

        return out.toString();
    }

	/**
	 * Returns the evaluator currently attached to this instance.
	 *
	 * @return the evaluator; may be {@code null} if initialization failed before it was set
	 *         or if {@code null} was passed to {@link #setEvaluator(ChemicalNEREvaluator)}
	 */
	public ChemicalNEREvaluator getEvaluator() {
		return evaluator;
	}

	/**
	 * Replaces the evaluator attached to this instance. Also called by the constructor with a
	 * fresh {@code ChemicalNEREvaluator}.
	 *
	 * @param evaluator the evaluator to store
	 */
	public void setEvaluator(ChemicalNEREvaluator evaluator) {
		this.evaluator = evaluator;
	}
	
	/**
	 * Returns the feature token generator used by {@code tag(JCas)}.
	 *
	 * @return the feature generator, or {@code null} if the feature generator component was
	 *         not created during initialization
	 */
	public FeatureTokenGenerator getFeatureTokenGenerator() {
		return featureGenerator;
	}
}