/* * Concept profile generation tool suite * Copyright (C) 2015 Biosemantics Group, Erasmus University Medical Center, * Rotterdam, The Netherlands * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published * by the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License * along with this program. If not, see <http://www.gnu.org/licenses/> */ package org.erasmusmc.peregrine; import java.util.ArrayList; import java.util.HashSet; import java.util.List; import java.util.Set; import org.erasmusmc.utilities.StringUtilities; public class StatementFilterTokenizer extends SBDtokenizer { private static final long serialVersionUID = 6227202980575935265L; public boolean questionmarkFilter = true; public boolean slashFilter = true; public boolean negationWordFilter = true; public boolean speculationWordFilter = true; public boolean alternativeWordFilter = true; public Set<String> speculationWords = new HashSet<String>(); public Set<String> negationWords = new HashSet<String>(); public Set<String> alternativesWords = new HashSet<String>(); public StatementFilterTokenizer(){ setSpeculationWords(); setNegationWords(); setAlternativesWords(); } public void tokenize(String string){ super.tokenize(string); if (questionmarkFilter) applyCharSentenceFilter('?'); if (slashFilter) applyCharAroundFilter('/'); if (alternativeWordFilter) applyWordFilter(alternativesWords, true); if (negationWordFilter) applyWordFilter(negationWords, false); if (speculationWordFilter) applyWordFilter(speculationWords, false); } private void applyWordFilter(Set<String> words, boolean bothSides) { List<Integer> indices = findAll(words); for (int index : indices){ deleteSentenceContaining(index, bothSides); } } private void applyCharSentenceFilter(char ch) { List<Integer> charPos = findAll(ch); for (int pos : charPos){ deleteSentenceContaining(pos); } } private void applyCharAroundFilter(char ch) { List<Integer> charPos = findAll(ch); for (int pos : charPos){ deleteTokensAround(pos); } } private void deleteTokensAround(int pos) { for (int i = 0; i < startpositions.size(); i++) if (startpositions.get(i) > pos){ if (i != 0) tokens.set(i-1, ""); tokens.set(i, ""); break; } } private List<Integer> findAll(Set<String> words) { List<Integer> positions = new ArrayList<Integer>(); for (int i = 0; i < tokens.size(); i++) if (words.contains(StringUtilities.firstLetterToLowerCase(tokens.get(i)))) positions.add(i); return positions; } private List<Integer> findAll(char ch) { List<Integer> positions = new ArrayList<Integer>(); for (int i = 0; i < string.length(); i++) if (string.charAt(i) == ch) positions.add(i); return positions; } private void deleteSentenceContaining(int pos) { int sos = 0; for (int i = 0; i < endOfSentence.size(); i++){ int eos = endOfSentence.get(i); int startPos = startpositions.get(sos); int endPos; if (eos == startpositions.size()) endPos = string.length(); else endPos = startpositions.get(eos); if (pos >= startPos && pos < endPos) deleteSentence(sos,eos); sos = eos; } } private void deleteSentenceContaining(int index, boolean bothSides) { int sos = 0; for (int i = 0; i < endOfSentence.size(); i++){ int eos = endOfSentence.get(i); if (index >= sos && index < eos){ if (bothSides) deleteSentence(sos,eos); else deleteSentence(index, eos); } sos = eos; } } private void deleteSentence(int sos, int eos) { for (int i = sos; i < eos; i++) tokens.set(i, ""); } private void setSpeculationWords(){ speculationWords.add("can"); speculationWords.add("either"); speculationWords.add("may"); speculationWords.add("might"); speculationWords.add("would"); speculationWords.add("likely"); speculationWords.add("should"); speculationWords.add("could"); speculationWords.add("probably"); speculationWords.add("probable"); speculationWords.add("possible"); speculationWords.add("possibly"); speculationWords.add("suggestion"); speculationWords.add("suggesting"); speculationWords.add("suggestive"); speculationWords.add("unsure"); speculationWords.add("rule"); speculationWords.add("question"); speculationWords.add("questions"); speculationWords.add("questionable"); speculationWords.add("most"); speculationWords.add("sometimes"); speculationWords.add("unless"); speculationWords.add("evaluate"); speculationWords.add("suggests"); speculationWords.add("probable"); speculationWords.add("favor"); speculationWords.add("favored"); speculationWords.add("presumes"); speculationWords.add("presumed"); speculationWords.add("presume"); speculationWords.add("suspect"); speculationWords.add("suspected"); speculationWords.add("suspecting"); speculationWords.add("consistent"); speculationWords.add("will"); } private void setNegationWords(){ negationWords.add("cannot"); negationWords.add("no"); negationWords.add("without"); negationWords.add("not"); } private void setAlternativesWords(){ alternativesWords.add("vs"); alternativesWords.add("versus"); speculationWords.add("or"); } }