/* * Concept profile generation tool suite * Copyright (C) 2015 Biosemantics Group, Erasmus University Medical Center, * Rotterdam, The Netherlands * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published * by the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License * along with this program. If not, see <http://www.gnu.org/licenses/> */ package org.erasmusmc.ontology.ontologyutilities.evaluationScripts; import java.util.ArrayList; import java.util.HashMap; import java.util.HashSet; import java.util.Iterator; import java.util.List; import java.util.Map; import java.util.Set; import org.erasmusmc.peregrine.ConceptPeregrine; import org.erasmusmc.peregrine.ResultConcept; import org.erasmusmc.peregrine.ResultTerm; import org.erasmusmc.peregrine.Tokenizer; import org.erasmusmc.peregrine.disambiguator.GeneDisambiguator; public class FullTextAdapter { public static Map<Integer, Context> process(List<String> lines, ConceptPeregrine peregrine, GeneDisambiguator disambiguator){ List<String> sections = toSections(lines); return index(sections, peregrine, disambiguator); } private static Map<Integer, Context> index(List<String> sections, ConceptPeregrine peregrine, GeneDisambiguator disambiguator) { Map<Integer, Context> id2context = new HashMap<Integer, Context>(); List<String> tokens = new ArrayList<String>(); Map<Integer, ResultConcept> id2concept = new HashMap<Integer, ResultConcept>(); for (String section : sections){ peregrine.index(section); if (disambiguator != null) disambiguator.disambiguate(peregrine); removeNonCooccurring(peregrine, id2context); //Change word ids: for (ResultTerm term : peregrine.resultTerms) for (int i = 0; i < term.words.length; i++) term.words[i] = term.words[i] + tokens.size(); //Store found terms: for (ResultConcept concept : peregrine.resultConcepts){ ResultConcept existingConcept = id2concept.get(concept.conceptId); if (existingConcept == null) id2concept.put(concept.conceptId, concept); else existingConcept.terms.addAll(concept.terms); } tokens.addAll(peregrine.tokenizer.tokens); } peregrine.resultConcepts = new ArrayList<ResultConcept>(id2concept.values()); peregrine.tokenizer.tokens = tokens; return id2context; } private static void removeNonCooccurring(ConceptPeregrine peregrine, Map<Integer,Context> cid2context) { Set<Integer> conceptWords = new HashSet<Integer>(); Map<Integer, Set<Integer>> cid2sentenceIDs = new HashMap<Integer, Set<Integer>>(); Set<Integer> singleSentences = new HashSet<Integer>(); Set<Integer> cooccurringSentences = new HashSet<Integer>(); for (ResultConcept concept : peregrine.resultConcepts){ Set<Integer> sentenceIDs = new HashSet<Integer>(); cid2sentenceIDs.put(concept.conceptId, sentenceIDs); for (ResultTerm term : concept.terms){ int sentenceID = getSentenceID(peregrine.tokenizer, term); if (sentenceIDs.add(sentenceID)) if (!singleSentences.add(sentenceID)) cooccurringSentences.add(sentenceID); for (int word : term.words) conceptWords.add(word); } } List<String> lines = filterLines(peregrine.tokenizer, conceptWords); //remove all non cooccurring concepts: Iterator<ResultConcept> conceptIterator = peregrine.resultConcepts.iterator(); while (conceptIterator.hasNext()){ ResultConcept concept = conceptIterator.next(); boolean cooccurrence = false; for (Integer sentenceID : cid2sentenceIDs.get(concept.conceptId)) if (cooccurringSentences.contains(sentenceID)){ cooccurrence = true; break; } if (!cooccurrence) conceptIterator.remove(); } //Build contexts: for (ResultConcept concept : peregrine.resultConcepts){ Context context = cid2context.get(concept.conceptId); if (context == null){ context = new Context(); cid2context.put(concept.conceptId, context); } for (Integer sentenceID : cid2sentenceIDs.get(concept.conceptId)){ String sentence = lines.get(sentenceID); context.all.append(sentence); if (cooccurringSentences.contains(sentenceID)) context.cooccurring.append(sentence); } cid2context.put(concept.conceptId, context); } } public static class Context { public StringBuilder cooccurring = new StringBuilder(); public StringBuilder all = new StringBuilder(); } private static List<String> filterLines(Tokenizer tokenizer, Set<Integer> conceptWords) { List<String> lines = new ArrayList<String>(); int start = 0; for (int eos : tokenizer.endOfSentence){ StringBuilder sb = new StringBuilder(); for (int i = start; i < eos; i++) if (!conceptWords.contains(i)){ sb.append(tokenizer.tokens.get(i)); sb.append(' '); } else sb.append("proteinX "); sb.append("\n"); lines.add(sb.toString()); start = eos; } return lines; } private static int getSentenceID(Tokenizer tokenizer, ResultTerm term) { int start = 0; int termStart = term.words[0]; for (int i = 0; i < tokenizer.endOfSentence.size(); i++){ int eos = tokenizer.endOfSentence.get(i); if (termStart >= start && termStart < eos) return i; start = eos; } return -1; } public static List<String> toSections(List<String> lines) { String buffer = ""; List<String> sections = new ArrayList<String>(); for (String line : lines){ if (line.length() == 0){ sections.add(buffer); buffer = ""; } else buffer = buffer + "\n" + line; } return sections; } public static String getSentence(String text, Tokenizer tokenizer, ResultTerm term) { int termStart = term.words[0]; int termEnd = term.words[term.words.length-1]; int termStartPos = tokenizer.startpositions.get(termStart); int termEndPos = tokenizer.endpositions.get(termEnd); int sos = 0; StringBuilder sentence = new StringBuilder(); for (int eos : tokenizer.endOfSentence){ if (termStart >= sos && termStart < eos){ if (termStart != sos){ sentence.append(text.substring(tokenizer.startpositions.get(sos), termStartPos)); if (termEnd != eos-1) sentence.append(" and "); } if (termEnd != eos-1) sentence.append(text.substring(termEndPos+1,tokenizer.endpositions.get(eos-1))); sentence.append(". "); } sos = eos; } return sentence.toString(); } }