/*
 * Concept profile generation tool suite
 * Copyright (C) 2015 Biosemantics Group, Erasmus University Medical Center,
 *  Rotterdam, The Netherlands
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License as published
 * by the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Affero General Public License for more details.
 *
 * You should have received a copy of the GNU Affero General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>
 */

package org.erasmusmc.peregrine;

import java.io.Serializable;
import java.util.ArrayList;
import java.util.List;

/**
 * Tokenizer based on the SBDtokenizer that also detects sub-sentence boundaries (e.g. commas).
 *
 * <p>After {@link #tokenize(String)} the list returned by {@link #getSubEndOfSentences()} holds
 * the merged, ascending contents of the inherited {@code endOfSentence} list and the
 * sub-sentence boundaries found by this class.
 *
 * @author martijn
 */
public class SubSentenceTokenizer extends SBDtokenizer implements Serializable {

  private static final long serialVersionUID = 1L;

  /**
   * The list of characters that can form a subsentence boundary.
   */
  public char[] subSentenceDividers = new char[]{',', ';', ':'};

  /** Cursor into the inherited {@code endpositions} list; only moves forward during one tokenize run. */
  private int lastDivider;

  /** Sub-sentence boundaries; after {@link #tokenize(String)} also contains the sentence boundaries. */
  protected List<Integer> subEndOfSentence = new ArrayList<Integer>();

  /**
   * Tokenizes the text with the superclass tokenizer and then records sub-sentence boundaries.
   *
   * <p>A boundary is recorded when one of {@link #subSentenceDividers} is immediately followed
   * by a whitespace character. A divider that is the last character of the text, or that is
   * followed by a non-whitespace character, is ignored.
   *
   * @param string the text to tokenize
   */
  public void tokenize(String string) {
    super.tokenize(string);
    subEndOfSentence.clear();
    lastDivider = 0;

    // True when the previous character was one of the divider characters.
    boolean hitDivider = false;
    for (int i = 0; i < string.length(); i++) {
      char ch = string.charAt(i);
      if (hitDivider && Character.isWhitespace(ch)) {
        // i - 1 is the position of the divider character itself.
        storeDivider(i - 1);
      }
      hitDivider = false;
      for (char divider : subSentenceDividers) {
        if (ch == divider) {
          hitDivider = true;
          break;
        }
      }
    }
    merge();
  }

  /**
   * Merges the subEndOfSentence and endOfSentence lists into a single ascending list and
   * stores the result in {@link #subEndOfSentence}.
   *
   * <p>Duplicate values in the sub-sentence list are dropped. This includes duplicates of a
   * value that was just emitted because it also occurred in {@code endOfSentence}; previously
   * such a value was emitted twice because {@code previous} was not updated in that case.
   * Both lists are assumed to be in ascending order (NOTE(review): ordering of the inherited
   * {@code endOfSentence} list is not visible here - confirm).
   */
  private void merge() {
    List<Integer> newList = new ArrayList<Integer>();
    int index1 = 0;
    int index2 = 0;
    // Last sub-sentence value that was consumed; used to skip repeated sub-sentence entries.
    int previous = -1;
    int value1;
    int value2;

    while (index1 < subEndOfSentence.size() && index2 < endOfSentence.size()) {
      value1 = subEndOfSentence.get(index1);
      value2 = endOfSentence.get(index2);
      if (value1 < value2) {
        if (value1 != previous) { // also delete duplicates in subEOS list
          newList.add(value1);
        }
        previous = value1;
        index1++;
      } else if (value1 > value2) {
        newList.add(value2);
        index2++;
      } else {
        // Same boundary in both lists: emit once and remember it, so that a repeated
        // sub-sentence entry with this value is not emitted a second time.
        newList.add(value1);
        previous = value1;
        index1++;
        index2++;
      }
    }

    // Drain whatever is left of the sub-sentence list, still skipping duplicates.
    while (index1 < subEndOfSentence.size()) {
      value1 = subEndOfSentence.get(index1);
      if (value1 != previous) { // also delete duplicates in subEOS list
        newList.add(value1);
      }
      previous = value1;
      index1++;
    }

    // Drain whatever is left of the sentence list unchanged.
    while (index2 < endOfSentence.size()) {
      value2 = endOfSentence.get(index2);
      newList.add(value2);
      index2++;
    }

    subEndOfSentence = newList;
  }

  /**
   * Records a sub-sentence boundary for the divider character at the given position.
   *
   * <p>Advances {@link #lastDivider} to the first entry of the inherited {@code endpositions}
   * list that is greater than {@code pos}. That index is added to {@link #subEndOfSentence}
   * only if it is not the first entry and the preceding entry is smaller than {@code pos},
   * i.e. the divider lies strictly between two consecutive entries. The cursor is not advanced
   * past the matching entry, so several dividers in the same gap add the same index more than
   * once; {@link #merge()} removes those duplicates.
   *
   * @param pos position of the divider character in the tokenized text
   */
  private void storeDivider(int pos) {
    for (; lastDivider < endpositions.size(); lastDivider++) {
      if (endpositions.get(lastDivider) > pos) {
        if (lastDivider != 0 && endpositions.get(lastDivider - 1) < pos) {
          subEndOfSentence.add(lastDivider);
        }
        break;
      }
    }
  }

  /**
   * Removes a token via the superclass and shifts the stored boundaries accordingly.
   *
   * <p>Every boundary value greater than {@code index} is decremented by one. The list is
   * walked from the end and the walk stops at the first value that is not greater than
   * {@code index}, which relies on the list being in ascending order.
   *
   * @param index index of the token to remove
   */
  public void removeToken(int index) {
    super.removeToken(index);
    int value;
    for (int i = subEndOfSentence.size() - 1; i >= 0; i--) {
      value = subEndOfSentence.get(i);
      if (value > index) {
        subEndOfSentence.set(i, value - 1);
      } else {
        break;
      }
    }
  }

  /**
   * Similar to the endOfSentences, but now for subsentences.
   *
   * @return the internal (live, mutable) list of boundaries built by the last call to
   *         {@link #tokenize(String)}; it contains both sentence and sub-sentence boundaries
   */
  public List<Integer> getSubEndOfSentences() {
    return subEndOfSentence;
  }
}