SBDtokenizer.java example

Explorer
GeneDiseasePaper-master
- java
/*
 * Concept profile generation tool suite
 * Copyright (C) 2015 Biosemantics Group, Erasmus University Medical Center,
 *  Rotterdam, The Netherlands
 * 
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License as published
 * by the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 * 
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Affero General Public License for more details.
 * 
 * You should have received a copy of the GNU Affero General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>
 */

package org.erasmusmc.peregrine;

import java.io.Serializable;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;

import org.erasmusmc.utilities.StringUtilities;

/**
 * Tokenizer that uses an algorithm based on the work of Meehan to detect sentence boundaries.
 *
 * <p>{@link #tokenize(String)} cuts the text into runs of letters and digits (an apostrophe
 * that is part of a trailing possessive {@code 's} stays inside the token) and fills the
 * {@code tokens}, {@code startpositions}, {@code endpositions} and {@code endOfSentence}
 * collections, which are declared outside this class. A period followed by a space and
 * anything but a lowercase ASCII letter is only a <em>candidate</em> boundary; candidates
 * are re-examined once the whole text has been read, using per-word statistics collected
 * while tokenizing.
 *
 * <p>Instances keep per-document state in fields, so one instance must not tokenize
 * from several threads at the same time.
 */
public class SBDtokenizer extends Tokenizer implements Serializable{

	private static final long serialVersionUID = 1L;

	/** The text handed to the most recent {@link #tokenize(String)} call. */
	protected String string;

	/** True while the most recently processed separator was flagged as a (potential) end of sentence. */
	private boolean potentialEosFlag;

	/** Evidence gathered per lowercased word during the current tokenize call. */
	private final Map<String, WordData> word2data = new HashMap<String, WordData>();

	/** Indices into {@code endOfSentence} of the boundaries that still need disambiguation. */
	private final List<Integer> ambiguousEOS = new ArrayList<Integer>();

	public SBDtokenizer(){
		super();
	}

	public SBDtokenizer(Tokenizer tokenizer) {
		super(tokenizer);
	}

	/**
	 * Tokenizes the given text and determines its sentence boundaries.
	 *
	 * <p>All results of a previous call are discarded first. After the call
	 * {@code endOfSentence} holds, in ascending order and without repeats, the token counts
	 * at which a sentence ends; the total number of tokens is always the last entry.
	 *
	 * @param string the text to tokenize; must not be {@code null}
	 */
	public void tokenize(String string){
		this.string = string;
		word2data.clear();
		tokens.clear();
		startpositions.clear();
		endpositions.clear();
		endOfSentence.clear();
		ambiguousEOS.clear();
		potentialEosFlag = false;
		boolean inParenthesis = false;

		final int length = string.length();
		int start = 0;
		int i = 0;
		for (; i < length; i++){
			char ch = string.charAt(i);
			if (Character.isLetterOrDigit(ch) || isPossessiveApostrophe(i)) {
				continue; // still inside a token
			}

			// ch is a separator: close the token that ends here, if any.
			if (start != i) {
				addToken(start, i);
			}
			if (ch == '(') {
				inParenthesis = true;
			} else if (ch == ')') {
				inParenthesis = false;
			}

			if (tokens.size() != 0){
				// Detect (potential) end of sentence. The flag is reset only after addToken
				// above has run, because addToken needs the flag left by the previous separator.
				potentialEosFlag = false;
				if (ch == '\n' || ch == '!' || ch == '?'){ // single char unambiguous patterns
					potentialEosFlag = true;
				} else if ((ch == ']' || ch == ')') && i < length-1 && string.charAt(i+1) == '.'){
					potentialEosFlag = true;
					i++; // consume the period that follows the closing bracket
				}
				if (!inParenthesis && ch == '.' && i < length-2 && string.charAt(i+1) == ' ') {
					char following = string.charAt(i+2);
					if (following < 'a' || following > 'z'){ // anything but lowercase
						potentialEosFlag = true;
						// Remember the slot this boundary is about to occupy in endOfSentence.
						ambiguousEOS.add(endOfSentence.size());
						i++; // consume the space after the period
					}
				}
				if (potentialEosFlag) {
					endOfSentence.add(tokens.size());
				}
			}
			start = i+1;
		}
		if (start != i) {
			addToken(start, i); // token running up to the end of the text
		}
		if (ambiguousEOS.size() != 0){
			checkForMoreAbbreviations();
			disambiguateEOS();
		}
		// Add end of sentence at end of document:
		endOfSentence.add(tokens.size());

		removeDuplicates(endOfSentence);
	}

	/**
	 * Tells whether the character at {@code i} is an apostrophe in the possessive pattern
	 * "letter, apostrophe, s, non-alphanumeric or end of text", in which case it is kept
	 * inside the token instead of acting as a separator.
	 */
	private boolean isPossessiveApostrophe(int i) {
		return string.charAt(i) == '\''
				&& i > 0
				&& Character.isLetter(string.charAt(i-1))
				&& string.length()-1 > i
				&& string.charAt(i+1) == 's'
				&& (string.length()-2 == i || !Character.isLetterOrDigit(string.charAt(i+2)));
	}

	/** Removes every entry that equals its direct predecessor (the list is built in ascending order). */
	private void removeDuplicates(List<Integer> endOfSentence){
		Iterator<Integer> eosIterator = endOfSentence.iterator();
		int previous = -1;
		while (eosIterator.hasNext()){
			int eos = eosIterator.next();
			if (eos == previous) {
				eosIterator.remove();
			} else {
				previous = eos;
			}
		}
	}

	/**
	 * Re-examines every ambiguous boundary and drops those that the collected word
	 * evidence does not support.
	 */
	private void disambiguateEOS() {
		// Walk backwards so that removing an entry does not shift the indices still to visit.
		for (int i = ambiguousEOS.size()-1; i >= 0; i--){
			int eosIndex = ambiguousEOS.get(i);
			int tokenIndex = endOfSentence.get(eosIndex);
			// NOTE(review): the upper bound skips a boundary directly before the last token,
			// although tokens.get(tokenIndex) would be valid there - confirm this is intended.
			if (tokenIndex > 0 && tokenIndex < tokens.size()-1){
				WordData precedingWordData = word2data.get(tokens.get(tokenIndex-1).toLowerCase());
				WordData nextWordData = word2data.get(tokens.get(tokenIndex).toLowerCase());
				boolean boundarySupported = nextWordData.isNotProperNoun
						|| precedingWordData.isNotAbbreviation
						|| (!precedingWordData.isAbbreviation && !nextWordData.isProperNoun);
				boolean possibleAbbreviationBeforeNumber =
						!precedingWordData.isNotAbbreviation && nextWordData.isNumber;
				if (!boundarySupported || possibleAbbreviationBeforeNumber){
					// eosIndex is a primitive int, so this is List.remove(int index).
					endOfSentence.remove(eosIndex);
				}
			}
		}
	}

	/**
	 * Refines the abbreviation evidence of every word that has not been ruled out yet:
	 * words containing a number are ruled out, single-character words and words without
	 * vowels are marked as abbreviations.
	 */
	private void checkForMoreAbbreviations() {
		for (Map.Entry<String, WordData> entry : word2data.entrySet()){
			WordData wordData = entry.getValue();
			if (wordData.isNotAbbreviation) {
				continue;
			}
			String word = entry.getKey();
			if (StringUtilities.containsNumber(word)) {
				wordData.isNotAbbreviation = true;
			} else if (word.length() == 1 || noVowels(word)) {
				wordData.isAbbreviation = true;
			}
		}
	}

	/** Returns true if the word contains none of the lowercase letters a, e, i, o, u, y. */
	private boolean noVowels(String word) {
		for (char c : word.toCharArray()){
			if (c == 'a' || c == 'e' || c == 'i' || c == 'o' || c == 'u' || c == 'y'){
				return false;
			}
		}
		return true;
	}

	/**
	 * Registers the token {@code string.substring(start, end)} and updates the evidence
	 * kept for its lowercased form.
	 *
	 * @param start index of the first character of the token
	 * @param end index just past the last character of the token
	 */
	private void addToken(int start, int end) {
		String word = string.substring(start,end);
		String lcword = word.toLowerCase();
		tokens.add(word);
		startpositions.add(start);
		endpositions.add(end-1);

		// add to word list:
		WordData wordData = word2data.get(lcword);
		if (wordData == null) {
			wordData = new WordData();
			word2data.put(lcword, wordData);
		}

		// check for proper noun:
		char first = word.charAt(0);
		if (first >= 'A' && first <= 'Z') { // first char is a capital
			// A capital only counts as proper-noun evidence when it cannot be explained by
			// the start of a sentence. The token was already added above, so size() > 1
			// means "not the first token of the text"; testing size() != 0 here would
			// always succeed and wrongly count the first word of the document.
			if (!potentialEosFlag && tokens.size() > 1) {
				wordData.isProperNoun = true;
			}
		} else {
			wordData.isNotProperNoun = true;
		}

		// check for abbreviation: long words, and words seen followed by something other
		// than a period, are not abbreviations. charAt(end) is valid for every
		// end < length, including the last character of the text.
		if (word.length() > 4 || (end < string.length() && string.charAt(end) != '.')){
			wordData.isNotAbbreviation = true;
		}

		// check for number:
		if (StringUtilities.isNumber(word)) {
			wordData.isNumber = true;
		}
	}

	/**
	 * Evidence collected for one lowercased word. Positive and negative flags are kept
	 * separately because both, one or neither may have been observed.
	 * Static: it never needs the enclosing tokenizer instance.
	 */
	private static class WordData implements Serializable{
		private static final long serialVersionUID = -6853979045253442261L;
		boolean isAbbreviation = false;
		boolean isNotAbbreviation = false;
		boolean isProperNoun = false;
		boolean isNotProperNoun = false;
		boolean isNumber = false;
	}
}