/* * Concept profile generation tool suite * Copyright (C) 2015 Biosemantics Group, Erasmus University Medical Center, * Rotterdam, The Netherlands * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published * by the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License * along with this program. If not, see <http://www.gnu.org/licenses/> */ package org.erasmusmc.peregrine; import java.io.Serializable; import java.util.ArrayList; import java.util.HashMap; import java.util.Iterator; import java.util.List; import java.util.Map; import org.erasmusmc.utilities.StringUtilities; /**Tokenizer that uses an algorithm based on the work of Meehan to detect sentence boundaries.*/ public class SBDtokenizer extends Tokenizer implements Serializable{ public SBDtokenizer(){ super(); } public SBDtokenizer(Tokenizer tokenizer) { super(tokenizer); } public void tokenize(String string){ this.string = string; word2data.clear(); tokens.clear(); startpositions.clear(); endpositions.clear(); endOfSentence.clear(); ambiguousEOS.clear(); potentialEosFlag = false; boolean inParenthesis = false; int start = 0; int i = 0; for (; i < string.length(); i++){ char ch = string.charAt(i); if (!Character.isLetterOrDigit(ch) && !(ch == '\'' && i>0 && Character.isLetter(string.charAt(i-1)) && string.length()-1 > i && string.charAt(i+1) == 's' && (string.length()-2 == i || !Character.isLetterOrDigit(string.charAt(i+2))))){ //leaves ' in possesive pattern if (start != i) { AddToken(start, i); } if (ch == '(') inParenthesis = true; else if (ch == ')') inParenthesis = false; if (tokens.size() != 0){ //Detect (potential) End Of Sentence: potentialEosFlag = false; if ((int)ch == 10 || ch == '!' || ch == '?'){ //single char unambiguous patterns potentialEosFlag = true; } else if ((ch == ']' || ch == ')') && i < string.length()-1 && string.charAt(i+1) == '.'){ potentialEosFlag = true; i++; } if (!inParenthesis && ch == '.' && i < string.length()-2 && string.charAt(i+1) == ' ') { int ord = (int)string.charAt(i+2); if (ord<97 || ord>122){ //anything but lowercase potentialEosFlag = true; ambiguousEOS.add(endOfSentence.size()); i++; } } if (potentialEosFlag) { endOfSentence.add(tokens.size()); } } start = i+1; } } if (start != i) { AddToken(start, i); } if (ambiguousEOS.size() != 0){ checkForMoreAbbreviations(); disambiguateEOS(); } //Add end of sentence at end of document: endOfSentence.add(tokens.size()); removeDuplicates(endOfSentence); } private void removeDuplicates(List<Integer> endOfSentence){ Iterator<Integer> eosIterator = endOfSentence.iterator(); int previous = -1; while (eosIterator.hasNext()){ int eos = eosIterator.next(); if (eos == previous) eosIterator.remove(); else previous = eos; } } private void disambiguateEOS() { for (int i = ambiguousEOS.size()-1; i >= 0; i--){ int eosIndex = ambiguousEOS.get(i); int tokenIndex = endOfSentence.get(eosIndex); if (tokenIndex > 0 && tokenIndex < tokens.size()-1){ WordData precedingWordData = word2data.get(tokens.get(tokenIndex-1).toLowerCase()); WordData nextWordData = word2data.get(tokens.get(tokenIndex).toLowerCase()); if (!(nextWordData.isNotProperNoun || (precedingWordData.isNotAbbreviation) || (!precedingWordData.isAbbreviation && !nextWordData.isProperNoun)) || (!precedingWordData.isNotAbbreviation && nextWordData.isNumber )){ endOfSentence.remove(eosIndex); } } } } private void checkForMoreAbbreviations() { String word; WordData wordData; for (Map.Entry<String, WordData> entry : word2data.entrySet()){ wordData = entry.getValue(); if (!wordData.isNotAbbreviation){ word = entry.getKey(); if (StringUtilities.containsNumber(word)) wordData.isNotAbbreviation = true; else { if (word.length() == 1 || noVowels(word)) wordData.isAbbreviation = true; } } } } private boolean noVowels(String word) { for (char c : word.toCharArray()){ if (c == 'a' || c == 'e' || c == 'i' || c == 'o' || c == 'u' || c == 'y'){ return false; } } return true; } private void AddToken(int start, int end) { String word = string.substring(start,end); String lcword = word.toLowerCase(); tokens.add(word); startpositions.add(start); endpositions.add(end-1); //add to word list: WordData wordData; wordData = word2data.get(lcword); if (wordData == null) { wordData = new WordData(); word2data.put(lcword, wordData); } //check for proper noun: int ord = (int)word.charAt(0); if (ord<91 && ord>64) { // first char is a capital if (!potentialEosFlag && tokens.size() != 0) wordData.isProperNoun = true; } else { wordData.isNotProperNoun = true; } //check for abbreviation: if (word.length() > 4 || (end < string.length()-1 && !(string.charAt(end) == '.'))){ wordData.isNotAbbreviation = true; } //check for number: if (StringUtilities.isNumber(word)) wordData.isNumber = true; } private class WordData implements Serializable{ private static final long serialVersionUID = -6853979045253442261L; boolean isAbbreviation = false; boolean isNotAbbreviation = false; boolean isProperNoun = false; boolean isNotProperNoun = false; boolean isNumber = false; } protected String string; private boolean potentialEosFlag; private final Map<String, WordData> word2data = new HashMap<String, WordData>(); private static final long serialVersionUID = 1L; private List<Integer> ambiguousEOS = new ArrayList<Integer>(); }