/* * Copyright (c) 2012. Humboldt-Universität zu Berlin, Dept. of Computer Science and Dept. * of Wissensmanagement in der Bioinformatik * ------------------------------- * * THE ACCOMPANYING PROGRAM IS PROVIDED UNDER THE TERMS OF THIS COMMON PUBLIC * LICENSE ("AGREEMENT"). ANY USE, REPRODUCTION OR DISTRIBUTION OF THE PROGRAM * CONSTITUTES RECIPIENT'S ACCEPTANCE OF THIS AGREEMENT. * * http://www.opensource.org/licenses/cpl1.0 */ package de.berlin.hu.chemspot; import de.berlin.hu.chemspot.ChemSpotConfiguration.Component; import de.berlin.hu.types.PubmedDocument; import de.berlin.hu.uima.ae.feature.FeatureTokenGenerator; import de.berlin.hu.uima.ae.feature.FeatureTokenGenerator.Feature_Phase; import de.berlin.hu.uima.ae.tagger.brics.BricsTagger; import de.berlin.hu.uima.ae.tagger.drug.EumedNERTagger; import de.berlin.hu.util.Constants; import de.berlin.hu.util.Constants.ChemicalID; import org.apache.uima.UIMAException; import org.apache.uima.UIMAFramework; import org.apache.uima.analysis_engine.AnalysisEngine; import org.apache.uima.analysis_engine.AnalysisEngineProcessException; import org.apache.uima.cas.CAS; import org.apache.uima.examples.SourceDocumentInformation; import org.apache.uima.jcas.JCas; import org.apache.uima.resource.metadata.TypeSystemDescription; import org.apache.uima.util.XMLInputSource; import org.u_compare.shared.semantic.NamedEntity; import org.u_compare.shared.syntactic.Token; import org.uimafit.factory.AnalysisEngineFactory; import org.uimafit.factory.JCasFactory; import org.uimafit.util.JCasUtil; import java.io.BufferedReader; import java.io.File; import java.io.FileInputStream; import java.io.IOException; import java.io.InputStreamReader; import java.nio.MappedByteBuffer; import java.nio.channels.FileChannel; import java.nio.charset.Charset; import java.util.ArrayList; import java.util.HashMap; import java.util.Iterator; import java.util.List; import java.util.Map; import java.util.zip.GZIPInputStream; public class ChemSpot { private static final String CRF_MODEL_RESOURCE_PATH = "resources/banner/model.bin"; private static final String SENTENCE_MODEL_RESOURCE_PATH = "resources/genia/SentDetectGenia.bin.gz"; // map for holding jCas objects for threads that are using the tag(String) method private static Map<Long, JCas> jCases = new HashMap<Long, JCas>(); private TypeSystemDescription typeSystem; private AnalysisEngine posTagger; private AnalysisEngine sentenceDetector; private AnalysisEngine sentenceConverter; private AnalysisEngine tokenConverter; private AnalysisEngine crfTagger; private AnalysisEngine dictionaryTagger; private AnalysisEngine chemicalFormulaTagger; private AnalysisEngine abbrevTagger; private AnalysisEngine drugTagger; private AnalysisEngine annotationMerger; private AnalysisEngine fineTokenizer; private AnalysisEngine stopwordFilter; private AnalysisEngine mentionExpander; private AnalysisEngine normalizer; private FeatureTokenGenerator featureGenerator; private ChemicalNEREvaluator evaluator; public ChemSpot() { this(null, null, null, null); } /** * Initializes ChemSpot without a dictionary automaton and a normalizer. * @param pathToCRFModelFile the Path to a CRF model */ public ChemSpot(String pathToCRFModelFile, String pathToSentenceModelFile) { this(pathToCRFModelFile, null, pathToSentenceModelFile, null); } /** * Initializes ChemSpot without a normalizer. * @param pathToCRFModelFile the Path to a CRF model */ public ChemSpot(String pathToCRFModelFile, String pathToDictionaryFile, String pathToSentenceModelFile) { this(pathToCRFModelFile, pathToDictionaryFile, pathToSentenceModelFile, null); } /** * Initializes ChemSpot without a normalizer. * @param pathToCRFModelFile the Path to a CRF model */ public ChemSpot(String pathToCRFModelFile, String pathToDictionaryFile, String pathToSentenceModelFile, String pathToIDs) { this(pathToCRFModelFile, pathToDictionaryFile, pathToSentenceModelFile, pathToIDs, null); } /** * Initializes ChemSpot with a CRF model, an OpenNLP sentence model and a dictionary automaton. * @param pathToCRFModelFile the path to a CRF model * @param pathToDictionaryFile the path to a dictionary automaton */ public ChemSpot(String pathToCRFModelFile, String pathToDictionaryFile, String pathToSentenceModelFile, String pathToIDs, String pathToEumedModel) { try { // converting CRF and sentence model paths to URLs to allow loading of models from jar file pathToCRFModelFile = pathToCRFModelFile == null ? this.getClass().getClassLoader().getResource(CRF_MODEL_RESOURCE_PATH).toString() : new File(pathToCRFModelFile).toURI().toURL().toString(); pathToSentenceModelFile = pathToSentenceModelFile == null ? this.getClass().getClassLoader().getResource(SENTENCE_MODEL_RESOURCE_PATH).toString() : new File(pathToSentenceModelFile).toURI().toURL().toString(); typeSystem = UIMAFramework.getXMLParser().parseTypeSystemDescription(new XMLInputSource(this.getClass().getClassLoader().getResource("desc/TypeSystem.xml"))); if (ChemSpotConfiguration.useComponent(Component.TOKENIZER)) { fineTokenizer = AnalysisEngineFactory.createAnalysisEngine(UIMAFramework.getXMLParser().parseAnalysisEngineDescription(new XMLInputSource(this.getClass().getClassLoader() .getResource("desc/ae/tokenizer/FineGrainedTokenizerAE.xml"))), CAS.NAME_DEFAULT_SOFA); tokenConverter = AnalysisEngineFactory.createPrimitive(UIMAFramework.getXMLParser().parseAnalysisEngineDescription(new XMLInputSource(this.getClass().getClassLoader() .getResource("desc/ae/converter/OpenNLPToUCompareTokenConverterAE.xml")))); } if (ChemSpotConfiguration.useComponent(Component.POS_TAGGER)) { posTagger = AnalysisEngineFactory.createAnalysisEngine(UIMAFramework.getXMLParser().parseAnalysisEngineDescription(new XMLInputSource(this.getClass().getClassLoader() .getResource("desc/ae/tagger/opennlp/PosTagger.xml"))), CAS.NAME_DEFAULT_SOFA); } if (ChemSpotConfiguration.useComponent(Component.SENTENCE_DETECTOR)) { sentenceDetector = AnalysisEngineFactory.createPrimitive(UIMAFramework.getXMLParser().parseAnalysisEngineDescription(new XMLInputSource(this.getClass().getClassLoader() .getResource("desc/ae/tagger/opennlp/SentenceDetector.xml"))), "opennlp.uima.ModelName", pathToSentenceModelFile); sentenceConverter = AnalysisEngineFactory.createAnalysisEngine(UIMAFramework.getXMLParser().parseAnalysisEngineDescription(new XMLInputSource(this.getClass().getClassLoader() .getResource("desc/ae/converter/OpenNLPToUCompareSentenceConverterAE.xml"))), CAS.NAME_DEFAULT_SOFA); } if (ChemSpotConfiguration.useComponent(Component.CRF)) { System.out.println("Loading CRF..."); crfTagger = AnalysisEngineFactory.createPrimitive(UIMAFramework.getXMLParser().parseAnalysisEngineDescription(new XMLInputSource(this.getClass().getClassLoader() .getResource("desc/banner/tagger/BANNERTaggerAE.xml"))), "BannerModelFile", pathToCRFModelFile); } if (ChemSpotConfiguration.useComponent(Component.DICTIONARY)) { if (pathToDictionaryFile != null) { if (new File(pathToDictionaryFile).exists()) { System.out.println("Loading dictionary..."); dictionaryTagger = AnalysisEngineFactory.createPrimitive(UIMAFramework.getXMLParser().parseAnalysisEngineDescription(new XMLInputSource(this.getClass().getClassLoader() .getResource("desc/ae/tagger/BricsTaggerAE.xml"))), "DrugBankMatcherDictionaryAutomat", pathToDictionaryFile); } else { System.out.println("Dictionary file '" + pathToDictionaryFile + "' does not exist. Tagging without dictionary..."); } } else { System.out.println("No dictionary location specified! Tagging without dictionary..."); } } if (ChemSpotConfiguration.useComponent(Component.SUM_TAGGER)) { chemicalFormulaTagger = AnalysisEngineFactory.createAnalysisEngine(UIMAFramework.getXMLParser().parseAnalysisEngineDescription(new XMLInputSource(this.getClass().getClassLoader() .getResource("desc/ae/tagger/ChemicalFormulaTaggerAE.xml"))), CAS.NAME_DEFAULT_SOFA); } if (ChemSpotConfiguration.useComponent(Component.ABBREV)) { abbrevTagger = AnalysisEngineFactory.createAnalysisEngine(UIMAFramework.getXMLParser().parseAnalysisEngineDescription(new XMLInputSource(this.getClass().getClassLoader() .getResource("desc/ae/tagger/AbbreviationTaggerAE.xml"))), CAS.NAME_DEFAULT_SOFA); } if (ChemSpotConfiguration.useComponent(Component.EUMED_TAGGER)) { if (pathToEumedModel != null) { if (new File(pathToEumedModel).exists()) { System.out.println("Initializing multi-class tagger..."); drugTagger = AnalysisEngineFactory.createPrimitive(UIMAFramework.getXMLParser().parseAnalysisEngineDescription(new XMLInputSource(this.getClass().getClassLoader() .getResource("desc/ae/tagger/EumedTaggerAE.xml"))), EumedNERTagger.PATH_TO_EUMED_MODEL, pathToEumedModel); } else { System.out.println("Multi-class model file '" + pathToEumedModel + "' does not exist. Tagging without multi-class tagger..."); } } else { System.out.println("No multi-class model location specified! Tagging without multi-class tagger..."); } } if (ChemSpotConfiguration.useComponent(Component.MENTION_EXPANDER)) { mentionExpander = AnalysisEngineFactory.createAnalysisEngine(UIMAFramework.getXMLParser().parseAnalysisEngineDescription(new XMLInputSource(this.getClass().getClassLoader() .getResource("desc/ae/expander/MentionExpanderAE.xml"))), CAS.NAME_DEFAULT_SOFA); } if (ChemSpotConfiguration.useComponent(Component.ANNOTATION_MERGER)) { annotationMerger = AnalysisEngineFactory.createAnalysisEngine(UIMAFramework.getXMLParser().parseAnalysisEngineDescription(new XMLInputSource(this.getClass().getClassLoader() .getResource("desc/ae/AnnotationMergerAE.xml"))), CAS.NAME_DEFAULT_SOFA); } if (ChemSpotConfiguration.useComponent(Component.NORMALIZER) || ChemSpotConfiguration.useComponent(Component.CHEMHITS)) { if (pathToIDs != null) { if (new File(pathToIDs).exists()) { normalizer = AnalysisEngineFactory.createPrimitive(UIMAFramework.getXMLParser().parseAnalysisEngineDescription(new XMLInputSource(this.getClass().getClassLoader() .getResource("desc/ae/normalizer/NormalizerAE.xml"))), "PathToIDs", pathToIDs); if (ChemSpotConfiguration.useComponent(Component.DICTIONARY) && ChemSpotConfiguration.initializeDictionaryFromNormalizer()) { dictionaryTagger = AnalysisEngineFactory.createPrimitive(UIMAFramework.getXMLParser().parseAnalysisEngineDescription(new XMLInputSource(this.getClass().getClassLoader() .getResource("desc/ae/tagger/BricsTaggerAE.xml"))), BricsTagger.PATH_TO_DICTIONARY, ""); } } else { System.out.println("Normalization ids file '" + pathToIDs + "' does not exist. Tagging without subsequent normalization..."); } } else System.out.println("No location for ids specified! Tagging without subsequent normalization..."); } if (ChemSpotConfiguration.useComponent(Component.STOPWORD_FILTER)) { stopwordFilter = AnalysisEngineFactory.createAnalysisEngine(UIMAFramework.getXMLParser().parseAnalysisEngineDescription(new XMLInputSource(this.getClass().getClassLoader() .getResource("desc/ae/filter/StopwordFilterAE.xml"))), CAS.NAME_DEFAULT_SOFA); } if (ChemSpotConfiguration.useComponent(Component.FEATURE_GENERATOR)) { featureGenerator = new FeatureTokenGenerator(); } setEvaluator(new ChemicalNEREvaluator()); System.out.println("Finished initializing ChemSpot."); } catch (UIMAException e) { System.err.println("Failed initializing ChemSpot."); e.printStackTrace(); } catch (IOException e) { System.err.println("Failed initializing ChemSpot."); e.printStackTrace(); } } /** * Returns all mentions (non-goldstandard entities) from a jcas object. * * @param jcas the jcas * @return */ public static List<Mention> getMentions(JCas jcas) { List<Mention> mentions = new ArrayList<Mention>(); Iterator<NamedEntity> entities = JCasUtil.iterator(jcas, NamedEntity.class); while (entities.hasNext()) { NamedEntity entity = entities.next(); //disregards gold-standard mentions if (!Constants.GOLDSTANDARD.equals(entity.getSource())) { Mention mention = new Mention(entity); if (ChemSpotConfiguration.isAnnotate(mention.getType())) { mentions.add(mention); } } } return mentions; } /** * Returns all goldstandard entities from a jcas object. * * @param jcas the jcas * @return */ public static List<Mention> getGoldstandardAnnotations(JCas jcas) { List<Mention> result = new ArrayList<Mention>(); Iterator<NamedEntity> entities = JCasUtil.iterator(jcas, NamedEntity.class); while (entities.hasNext()) { NamedEntity entity = entities.next(); if (Constants.GOLDSTANDARD.equals(entity.getSource())) { result.add(new Mention(entity)); } } return result; } /** * Reads a text from a file and puts the content into the provided jcas. * * @param jcas the jcas * @param pathToFile the path to the text file * @throws IOException */ public static void readFile(JCas jcas, String pathToFile) throws IOException { FileInputStream stream = new FileInputStream(new File(pathToFile)); String text = null; try { FileChannel fc = stream.getChannel(); MappedByteBuffer bb = fc.map(FileChannel.MapMode.READ_ONLY, 0, fc.size()); text = Charset.defaultCharset().decode(bb).toString(); } finally { stream.close(); } jcas.setDocumentText(text); PubmedDocument pd = new PubmedDocument(jcas); pd.setBegin(0); pd.setEnd(text.length()); pd.setPmid(""); pd.addToIndexes(jcas); SourceDocumentInformation srcDocInfo = new SourceDocumentInformation(jcas); srcDocInfo.setUri(new File(pathToFile).getAbsoluteFile().toURI().toString()); srcDocInfo.setOffsetInSource(0); srcDocInfo.setDocumentSize((int) new File(pathToFile).length()); srcDocInfo.setBegin(0); srcDocInfo.setEnd(text.length()); srcDocInfo.addToIndexes(); } /** * Reads a text from a gzipped file and puts the content into the provided jcas. * * @param jcas the jcas * @param pathToFile the path to the text file * @throws IOException */ public static void readGZFile(JCas jcas, String pathToFile) throws IOException { File file = new File(pathToFile); String text; BufferedReader reader = new BufferedReader( new InputStreamReader( new GZIPInputStream( new FileInputStream(file)) ) ); StringBuilder textBuffer = new StringBuilder(); Integer currindex = -1; while(reader.ready()){ PubmedDocument pmdoc = new PubmedDocument(jcas); String s = reader.readLine(); if (s != null) { //split line into pmid and text String pmid = s.substring(0, s.indexOf("\t")); String annot = s.substring(s.indexOf("\t")); //two = splitFirst(s, "\t"); pmdoc.setPmid(pmid); //append text textBuffer.append(annot).append("\n"); pmdoc.setBegin(currindex + 1); Integer len = annot.length(); currindex = currindex + len + 1; pmdoc.setEnd(currindex); pmdoc.addToIndexes(); } } text = textBuffer.toString(); //put document in CAS jcas.setDocumentText(text); SourceDocumentInformation srcDocInfo = new SourceDocumentInformation(jcas); srcDocInfo.setUri(file.getAbsoluteFile().toURI().toString()); srcDocInfo.setOffsetInSource(0); srcDocInfo.setDocumentSize((int) file.length()); srcDocInfo.setBegin(0); srcDocInfo.setEnd(currindex); srcDocInfo.addToIndexes(); } private static long start = 0; public static void printTime(String action) { if (ChemSpotConfiguration.useComponent(Component.PROFILER)) { System.out.printf("%s: %.1f s%n", action, (System.currentTimeMillis() - start) / 1000.0); start = System.currentTimeMillis(); } } private static void startTimer() { start = ChemSpotConfiguration.useComponent(Component.PROFILER) ? System.currentTimeMillis() : 0; if (start != 0) { System.out.println("start profiling..."); } } /** * Finds chemical entities in the document of a {@code JCas} object and returns a list of mentions. * @param jcas contains the document text * @return a list of mentions */ public List<Mention> tag(JCas jcas) { List<NamedEntity> otherEntities = null; startTimer(); try { if (fineTokenizer != null) { fineTokenizer.process(jcas); printTime("tokenization"); } synchronized (this) { if (sentenceDetector != null) { sentenceDetector.process(jcas); printTime("sentence detector"); } if (posTagger != null) { posTagger.process(jcas); printTime("POS tagger"); } } if (tokenConverter != null) { tokenConverter.process(jcas); printTime("token converter"); } if (sentenceConverter != null) { sentenceConverter.process(jcas); printTime("sentence converter"); } if (crfTagger != null) { crfTagger.process(jcas); printTime("crf tagger"); } if (dictionaryTagger != null) { dictionaryTagger.process(jcas); printTime("dictionary tagger"); } if (chemicalFormulaTagger != null) { chemicalFormulaTagger.process(jcas); printTime("chemical formula tagger"); } if (abbrevTagger != null) { abbrevTagger.process(jcas); printTime("abbreviation tagger"); } if (drugTagger != null) { drugTagger.process(jcas); printTime("drug tagger"); } if (featureGenerator != null) { if (normalizer != null) { normalizer.process(jcas); } featureGenerator.process(jcas, Feature_Phase.PHASE1); printTime("feature generation phase 1 (+ preliminary normalization run)"); } if (stopwordFilter != null) { //stopwordFilter.process(jcas); printTime("stopword filter"); } if (mentionExpander != null) { mentionExpander.process(jcas); printTime("mention expander"); } if (featureGenerator != null) { featureGenerator.process(jcas, Feature_Phase.PHASE2); printTime("feature generator phase 2"); } if (annotationMerger != null) { annotationMerger.process(jcas); printTime("annotation merger"); } if (featureGenerator != null) { featureGenerator.process(jcas, Feature_Phase.PHASE3); printTime("feature generator phase 3"); } if (normalizer != null) { normalizer.process(jcas); printTime("normalizer"); } if (featureGenerator != null) { featureGenerator.process(jcas, Feature_Phase.PHASE4); printTime("feature generator phase 4"); } } catch (AnalysisEngineProcessException e) { System.err.println("Failed to extract chemicals from text."); e.printStackTrace(); } finally { if (otherEntities != null && !otherEntities.isEmpty()) { for (NamedEntity ne : otherEntities) { ne.addToIndexes(); } } } return getMentions(jcas); /*Oscar oscar = new Oscar(); ChemicalEntityRecogniser recogniser = new MEMMRecogniser(new PubMedModel(), OntologyTerms.getDefaultInstance(), new ChemNameDictRegistry(Locale.ENGLISH)); List<PubmedDocument> documents = new ArrayList<PubmedDocument>(); for (PubmedDocument doc : JCasUtil.iterate(jcas, PubmedDocument.class)) { documents.add(doc); } if (documents.isEmpty()) { PubmedDocument doc = new PubmedDocument(jcas); doc.setBegin(0); doc.setEnd(jcas.getDocumentText().length()); doc.setPmid(""); doc.addToIndexes(jcas); documents.add(doc); } for (PubmedDocument doc : documents) { List<uk.ac.cam.ch.wwmm.oscar.document.NamedEntity> entities = recogniser.findNamedEntities(oscar.tokenise(doc.getCoveredText()), ResolutionMode.REMOVE_BLOCKED); for (uk.ac.cam.ch.wwmm.oscar.document.NamedEntity rne : entities) { if (!rne.getType().isInstance(NamedEntityType.COMPOUND)){ continue; } NamedEntity entity = new NamedEntity(jcas); entity.setBegin(doc.getBegin() + rne.getStart()); entity.setEnd(doc.getBegin() + rne.getEnd()); for (String id : rne.getOntIds()) { if (id.contains("CHEBI:")) { entity.setId("," + id); } } entity.setSource("OSCAR"); entity.addToIndexes(); } } return null;*/ } /** * Finds chemical entities in a {@code text} and returns a list of mentions. * @param text natural language text from which ChemSpot shall extract chemical entities * @return a list of mentions * @throws UIMAException */ public List<Mention> tag(String text) { // get JCas object for currently executed thread long threadId = Thread.currentThread().getId(); // create new jcas if necessary (i.e. a thread calls this method for the first time) if (!jCases.containsKey(threadId)) { synchronized (jCases) { try { jCases.put(threadId, JCasFactory.createJCas(typeSystem)); } catch (UIMAException e) { throw new RuntimeException(e); } } } // get jcas JCas jcas = jCases.get(threadId); jcas.reset(); // TODO: for applications that create an excessive amount of threads it would be best // to release the jcas object once a calling thread dies in order to reduce memory consumption. // This would probably require a new thread for each one that calls this method (to call its // thread.join() method), which seems like a bit of an overkill for applications with few threads. jcas.setDocumentText(text); PubmedDocument pd = new PubmedDocument(jcas); pd.setBegin(0); pd.setEnd(text.length()); pd.setPmid(""); pd.addToIndexes(jcas); return tag(jcas); } /** * Converts all annotations from jcas to the IOB format * * @param jcas the jcas * @return */ public static String convertToIOB(JCas jcas) { StringBuilder sb = new StringBuilder(); HashMap<String, ArrayList<NamedEntity>> goldAnnotations = new HashMap<String, ArrayList<NamedEntity>>(); HashMap<String, ArrayList<NamedEntity>> pipelineAnnotations = new HashMap<String, ArrayList<NamedEntity>>(); System.out.println("Converting annotations to IOB format..."); Iterator<PubmedDocument> abstracts = JCasUtil.iterator(jcas, PubmedDocument.class); while (abstracts.hasNext()) { PubmedDocument pubmedAbstract = abstracts.next(); sb.append("### ").append(pubmedAbstract.getPmid()).append("\n"); int offset = pubmedAbstract.getBegin(); String pmid = pubmedAbstract.getPmid(); List<Token> tokens = JCasUtil.selectCovered(Token.class, pubmedAbstract); for (Token token : tokens) { token.setLabel("O"); } List<NamedEntity> entities = JCasUtil.selectCovered(NamedEntity.class, pubmedAbstract); for (NamedEntity entity : entities) { int firstTokenBegin = 0; int lastTokenEnd = 0; String id = ""; Mention m = new Mention(entity); for (ChemicalID type : ChemicalID.values()) { String tempId = m.getId(type); id += (!id.isEmpty() ? "\t" : "") + (tempId != null && !tempId.isEmpty() ? tempId : ""); } if (!Constants.GOLDSTANDARD.equals(entity.getSource())) { if (pipelineAnnotations.containsKey(pmid)) { pipelineAnnotations.get(pmid).add(entity); } else { ArrayList<NamedEntity> tempArray = new ArrayList<NamedEntity>(); tempArray.add(entity); pipelineAnnotations.put(pmid, tempArray); } String labelName = m.getType().toString(); List<Token> entityTokens = JCasUtil.selectCovered(Token.class, entity); boolean first = true; for (Token token : entityTokens) { if (first) { if (id.isEmpty()) token.setLabel("B-" + labelName); else token.setLabel("B-" + labelName + "\t" + id); first = false; firstTokenBegin = token.getBegin(); } else { token.setLabel("I-" + labelName); } lastTokenEnd = token.getEnd(); } assert entity.getBegin() == firstTokenBegin : (id + ": " + entity.getBegin() + " -> " + firstTokenBegin); assert entity.getEnd() == lastTokenEnd : (id + ": " + entity.getEnd() + " -> " + lastTokenEnd); } else { if (goldAnnotations.containsKey(pmid)) { goldAnnotations.get(pmid).add(entity); } else { ArrayList<NamedEntity> tempArray = new ArrayList<NamedEntity>(); tempArray.add(entity); goldAnnotations.put(pmid, tempArray); } } } List<Token> tokensToPrint = JCasUtil.selectCovered(Token.class, pubmedAbstract); boolean firstToken = true; for (Token token : tokensToPrint) { if (firstToken && (token.getBegin() - offset) != 0) { sb.append(" " + "\t" + 0 + "\t").append(token.getBegin() - offset).append("\t\t|O\n"); } firstToken = false; sb.append(token.getCoveredText()).append("\t").append(token.getBegin() - offset).append("\t").append(token.getEnd() - offset).append("\t\t|").append(token.getLabel()).append("\n"); } } return sb.toString(); } public static String serializeAnnotations(JCas jcas) { int offset; StringBuilder sb = new StringBuilder(); Iterator<PubmedDocument> documentIterator = JCasUtil.iterator(jcas, PubmedDocument.class); while (documentIterator.hasNext()) { PubmedDocument document = documentIterator.next(); offset = document.getBegin(); String pmid = document.getPmid(); int numberOfEntities = 0; Iterator<NamedEntity> entityIterator = JCasUtil.iterator(document, NamedEntity.class, true, true); while (entityIterator.hasNext()) { NamedEntity entity = entityIterator.next(); if (!Constants.GOLDSTANDARD.equals(entity.getSource())) { //offset fix for GeneView //int begin = entity.getBegin() - offset; int begin = entity.getBegin() - offset - 1; //int end = entity.getEnd() - offset - 1; int end = entity.getEnd() - offset - 2; String text = entity.getCoveredText(); String id = ""; Mention m = new Mention(entity); for (ChemicalID type : ChemicalID.values()) { String tempId = m.getId(type); id += "\t" + (tempId != null && !tempId.isEmpty() ? tempId : ""); } sb.append(pmid + "\t" + begin + "\t" + end + "\t" + text + "\t" + m.getType().toString() + id + "\n"); } numberOfEntities++; } if (numberOfEntities == 0) { sb.append(pmid + "\t-1\t-1\t\\N\t\\N\t\\N" + new String(new char[ChemicalID.values().length]).replace("\0", "\t") + "\n"); } } return sb.toString(); } public ChemicalNEREvaluator getEvaluator() { return evaluator; } public void setEvaluator(ChemicalNEREvaluator evaluator) { this.evaluator = evaluator; } public FeatureTokenGenerator getFeatureTokenGenerator() { return featureGenerator; } }