// ExperimentalTokenizer.java example
//
// Explorer
// GeneDiseasePaper-master
// - java
/*
 * Concept profile generation tool suite
 * Copyright (C) 2015 Biosemantics Group, Erasmus University Medical Center,
 *  Rotterdam, The Netherlands
 * 
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License as published
 * by the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 * 
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Affero General Public License for more details.
 * 
 * You should have received a copy of the GNU Affero General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>
 */

package org.erasmusmc.peregrine;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;

import org.erasmusmc.utilities.StringUtilities;

public class ExperimentalTokenizer extends Tokenizer implements Serializable{

	// When true, trimSplitAndAddToken splits a token at its last hyphen if the
	// part after the hyphen is one of the configured suffixes.
	public boolean removeSuffixes = true; //Release takes longer if true
	// Suffixes (stored without a leading '-') that trigger the split described above.
	private Set<String> suffixes = getDefaultSuffixes();
	// The text currently being tokenized; TokenRef offsets index into this string.
	private String string;
	// True while the next token added would be the first token after a (potential)
	// sentence boundary; addToken uses it to ignore sentence-initial capitals
	// when collecting proper-noun evidence.
	private boolean potentialEosFlag;
	// Per-document statistics keyed by the lower-cased token text; cleared on every tokenize() call.
	private final Map<String, WordData> word2data = new HashMap<String, WordData>();
	private static final long serialVersionUID = 1L;
	// Indices into endOfSentence of the boundaries produced by the ". " rule,
	// which disambiguateEOS() may later remove.
	private List<Integer> ambiguousEOS = new ArrayList<Integer>();

	/**
	 * Demo entry point: tokenizes a fixed sample text and prints the input
	 * followed by one line per detected sentence, with tokens joined by a space.
	 *
	 * @param args ignored
	 */
	public static void main(String[] args){
		ExperimentalTokenizer demo = new ExperimentalTokenizer();

		String sample = "MMP (8%). Once upon a time, there was a 5,3-monoxoy-something. Alzheimer's disease. Disease, Alzheimer's. One [3,5] blaat [blaat]. One (3,5) blaat (blaat)";
		System.out.println(sample);
		demo.tokenize(sample);

		// Each entry of endOfSentence is the (exclusive) token index at which a sentence ends.
		int sentenceStart = 0;
		for (int sentenceEnd : demo.endOfSentence){
			System.out.println(StringUtilities.join(demo.tokens.subList(sentenceStart, sentenceEnd)," "));
			sentenceStart = sentenceEnd;
		}
	}

	/**
	 * Creates a tokenizer by delegating to the no-argument superclass constructor.
	 */
	public ExperimentalTokenizer(){
		super();
	}

	/**
	 * Creates a tokenizer by passing the given tokenizer to the superclass
	 * constructor (presumably a copy constructor; see Tokenizer to confirm).
	 *
	 * @param tokenizer the tokenizer handed to the superclass constructor
	 */
	public ExperimentalTokenizer(Tokenizer tokenizer) {
		super(tokenizer);
	}

	/**
	 * Tokenizes the given text and detects sentence boundaries.
	 * <p>
	 * All per-document state is reset first. The results are written to
	 * {@code tokens}, {@code startpositions}, {@code endpositions} and
	 * {@code endOfSentence}, which are not declared in this class (presumably
	 * inherited from {@code Tokenizer}). Each entry of {@code endOfSentence} is the
	 * token count at the moment a sentence end was detected, i.e. the exclusive
	 * token index of the sentence end.
	 * <p>
	 * Boundaries found by the "period, space, non-lowercase character" rule are
	 * ambiguous (the period may belong to an abbreviation) and are re-examined
	 * after the scan by {@code checkForMoreAbbreviations()} and
	 * {@code disambiguateEOS()}.
	 *
	 * @param string the text to tokenize
	 */
	public void tokenize(String string){
		this.string = string;
		word2data.clear();
		tokens.clear(); 
		startpositions.clear(); 
		endpositions.clear();  
		endOfSentence.clear();
		ambiguousEOS.clear();
		// The first token of the text is at a sentence start.
		potentialEosFlag = true;
		boolean inParenthesis = false;

		// start: offset where the pending (not yet emitted) token begins.
		int start = 0;
		int i = 0;
		// Offset of the most recent '-' seen; used for suffix splitting.
		int lastHyphen = -1;
		for (; i < string.length(); i++){
			char ch = string.charAt(i);
			// No braces: the whole if / else-if / else chain below is the single
			// statement controlled by this condition, so all three branches only
			// run for separator characters.
			if (isWordSeperator(ch))
				// A hyphen does not end the pending token; only its position is remembered.
				if (ch == '-')
					lastHyphen = i;
				else if (ch == '.'){
					// A period ends the token only when it is outside parentheses, followed
					// by a space, and the character after that space exists and is not a
					// lowercase ASCII letter. Any other period stays inside the pending token.
					if(!inParenthesis && i < string.length()-2 && string.charAt(i+1) == ' ') {
						int ord = (int)string.charAt(i+2);
						if (ord<97 || ord>122){ //anything but lowercase   
							trimSplitAndAddToken(new TokenRef(start, i), lastHyphen); 
							// Remember which endOfSentence entry this is so it can be revoked later.
							ambiguousEOS.add(endOfSentence.size());      
							endOfSentence.add(tokens.size()); 
							potentialEosFlag = true;
							// Skip the space after the period (the loop increment then moves past it).
							i++;
							start = i+1;
						}
					}
				} else {  
					// Any other separator ends the pending token.
					trimSplitAndAddToken(new TokenRef(start, i), lastHyphen);        

					if (ch == '(') 
						inParenthesis = true;
					else if (ch == ')')
						inParenthesis = false;

					if (tokens.size() != 0){ //Detect End Of Sentence:  
						potentialEosFlag = false;
						// Code point 10 is the line feed character.
						if ((int)ch == 10 || ch == '!' || ch == '?') //single char unambiguous patterns
							potentialEosFlag = true;
						// Closing bracket immediately followed by a period.
						// NOTE(review): isWordSeperator() returns false for ']', so the
						// ch == ']' alternative looks unreachable in this branch - confirm.
						else if ((ch == ']' || ch == ')') && i < string.length()-1 && string.charAt(i+1) == '.'){
							potentialEosFlag = true;
							if (ch == ')'){
								// Emit ')' now, before the sentence end is recorded, and overwrite
								// ch so the punctuation check after this chain does not add it again.
								addToken(new TokenRef(i,i+1));
								ch = '@';
							}
							// Skip the period.
							i++;
						}
						if (potentialEosFlag)
						  endOfSentence.add(tokens.size());
					}
					start = i+1;
				}
			// Runs for every character (it is outside the separator chain above):
			// commas and parentheses become single-character tokens.
			if (ch == ',' || ch == '(' || ch == ')')
				addToken(new TokenRef(i,i+1));
		}
		// Emit whatever is still pending after the last separator.
		if (start != i) {
			trimSplitAndAddToken(new TokenRef(start, i), lastHyphen);   
		} 
		// Only re-examine boundaries when the period rule produced any.
		if (ambiguousEOS.size() != 0){
			checkForMoreAbbreviations();
			disambiguateEOS();
		}
		//Add end of sentence at end of document:
		endOfSentence.add(tokens.size());

		// Collapse consecutive identical boundary entries.
		removeDuplicates(endOfSentence);
	}

	/**
	 * Removes, in place, every entry that equals the most recently kept entry,
	 * so that runs of identical consecutive values collapse to one. The
	 * comparison starts from -1, so leading entries equal to -1 are dropped too.
	 *
	 * @param endOfSentence the list to clean up (modified in place)
	 */
	private void removeDuplicates(List<Integer> endOfSentence){
		int lastKept = -1;
		int position = 0;
		while (position < endOfSentence.size()){
			int current = endOfSentence.get(position);
			if (current == lastKept) {
				// int argument: removes by index; the next entry slides into this slot.
				endOfSentence.remove(position);
			} else {
				lastKept = current;
				position++;
			}
		}
	}

	/**
	 * Re-examines every ambiguous sentence boundary and removes those that the
	 * collected word statistics do not support.
	 * <p>
	 * Boundaries are visited from last to first so that removing an entry from
	 * {@code endOfSentence} does not shift the indices still to be processed.
	 */
	private void disambiguateEOS() {
		int candidate = ambiguousEOS.size();
		while (--candidate >= 0){
			int eosIndex = ambiguousEOS.get(candidate);
			int boundary = endOfSentence.get(eosIndex);
			// A boundary needs a token on both sides to be judged.
			if (boundary <= 0 || boundary >= tokens.size())
				continue;

			WordData before = word2data.get(tokens.get(boundary-1).toLowerCase());
			WordData after = word2data.get(tokens.get(boundary).toLowerCase());

			// Evidence in favour of a real sentence end.
			boolean supported = after.isNotProperNoun
					|| before.isNotAbbreviation
					|| (!before.isAbbreviation && !after.isProperNoun);
			// A number following a word not marked as non-abbreviation overrides that evidence.
			boolean numberAfterPossibleAbbreviation = !before.isNotAbbreviation && after.isNumber;

			if (!supported || numberAfterPossibleAbbreviation){
				// int argument: removes the entry at that index, not the value.
				endOfSentence.remove(eosIndex);
			}
		}
	}

	/**
	 * Refines the abbreviation flags of every word not yet marked as
	 * "not an abbreviation": words containing a number are marked as not being
	 * abbreviations; otherwise single-character words and words without vowels
	 * are marked as abbreviations.
	 */
	private void checkForMoreAbbreviations() {
		for (Map.Entry<String, WordData> entry : word2data.entrySet()){
			WordData data = entry.getValue();
			if (data.isNotAbbreviation)
				continue;

			String candidate = entry.getKey();
			if (StringUtilities.containsNumber(candidate)) {
				data.isNotAbbreviation = true;
			} else if (candidate.length() == 1 || noVowels(candidate)) {
				data.isAbbreviation = true;
			}
		}
	}

	/**
	 * Tells whether the word is free of the lowercase letters a, e, i, o, u and y.
	 * Uppercase letters are not treated as vowels.
	 *
	 * @param word the word to inspect
	 * @return {@code true} if none of the characters is one of those letters
	 */
	private boolean noVowels(String word) {
		final String vowels = "aeiouy";
		for (int i = 0; i < word.length(); i++){
			if (vowels.indexOf(word.charAt(i)) >= 0)
				return false;
		}
		return true;
	}

	/**
	 * Trims the given span and adds it as a token, unless it is empty after trimming.
	 * <p>
	 * When suffix removal is enabled, the last hyphen lies strictly inside the
	 * trimmed span, and the text after that hyphen is a known suffix (compared
	 * in lower case), the span is added as two tokens: the part before the
	 * hyphen and the part after it. The hyphen itself is in neither token.
	 *
	 * @param tokenRef   the candidate span
	 * @param lastHyphen offset of the most recent hyphen seen in the text, or -1
	 */
	private void trimSplitAndAddToken(TokenRef tokenRef, int lastHyphen) {
		TokenRef trimmed = trim(tokenRef);
		if (trimmed.length() == 0)
			return;

		// The hyphen must have at least one character on either side within the span.
		boolean hyphenInside = lastHyphen > trimmed.start && lastHyphen < trimmed.end-1;
		if (removeSuffixes && hyphenInside){
			String tail = string.substring(lastHyphen+1, trimmed.end).toLowerCase();
			if (suffixes.contains(tail)){
				addToken(new TokenRef(trimmed.start, lastHyphen));
				addToken(new TokenRef(lastHyphen + 1, trimmed.end));
				return;
			}
		}
		addToken(trimmed);
	}


	/**
	 * Shrinks the span, in place, by repeatedly stripping one layer of
	 * surrounding punctuation until no rule applies or fewer than two
	 * characters remain. Per round, the first matching rule wins:
	 * a leading period; a trailing period; a matching pair of enclosing square
	 * or curly brackets; a leading or trailing bracket whose counterpart does
	 * not occur anywhere in the span.
	 *
	 * @param tokenRef the span to trim (modified)
	 * @return the same {@code tokenRef} instance, after trimming
	 */
	private TokenRef trim(TokenRef tokenRef) {
		while (tokenRef.length() >= 2) {
			char first = string.charAt(tokenRef.start);
			char last = string.charAt(tokenRef.end - 1);

			if (first == '.') {
				tokenRef.start++;
			} else if (last == '.') {
				tokenRef.end--;
			} else if ((first == '[' && last == ']') || (first == '{' && last == '}')) {
				// Enclosing pair: drop both brackets.
				tokenRef.start++;
				tokenRef.end--;
			} else if (first == '[' && !contains(tokenRef, ']')) {
				tokenRef.start++;
			} else if (last == ']' && !contains(tokenRef, '[')) {
				tokenRef.end--;
			} else if (first == '{' && !contains(tokenRef, '}')) {
				tokenRef.start++;
			} else if (last == '}' && !contains(tokenRef, '{')) {
				tokenRef.end--;
			} else {
				// Nothing left to strip.
				break;
			}
		}
		return tokenRef;
	}

	/**
	 * Tells whether the character occurs within the given span of the current text.
	 *
	 * @param tokenPos the span to search (start inclusive, end exclusive)
	 * @param ch       the character to look for
	 * @return {@code true} if {@code ch} occurs at an offset inside the span
	 */
	private boolean contains(TokenRef tokenPos, char ch) {
		// First occurrence at or after the span start; it only counts if it lies before the span end.
		int hit = string.indexOf(ch, tokenPos.start);
		return hit >= 0 && hit < tokenPos.end;
	}

	/**
	 * Appends the span as a token, records its start and (inclusive) end
	 * offsets, and updates the statistics kept for its lower-cased form.
	 *
	 * @param tokenRef the non-empty span to add
	 */
	private void addToken(TokenRef tokenRef){
		String word = tokenRef.toString();
		String key = word.toLowerCase();

		tokens.add(word);
		startpositions.add(tokenRef.start);
		// TokenRef.end is exclusive; the stored end position is inclusive.
		endpositions.add(tokenRef.end-1);

		// Look up, or create, the statistics record for this word.
		WordData data = word2data.get(key);
		if (data == null) {
			data = new WordData();
			word2data.put(key, data);
		}

		// Proper-noun evidence: an ASCII capital counts only when the token is not
		// right after a potential sentence boundary; anything else counts against.
		char initial = word.charAt(0);
		boolean startsWithCapital = initial >= 'A' && initial <= 'Z';
		if (!startsWithCapital) {
			data.isNotProperNoun = true;
		} else if (!potentialEosFlag) {
			data.isProperNoun = true;
		}

		// Abbreviation evidence: longer than four characters, or (when the token
		// ends before the last character of the text) not directly followed by a period.
		boolean tooLong = word.length() > 4;
		if (tooLong || (tokenRef.end < string.length()-1 && string.charAt(tokenRef.end) != '.')){
			data.isNotAbbreviation = true;
		}

		if (StringUtilities.isNumber(word)) {
			data.isNumber = true;
		}
	}

	/**
	 * Tells whether the character separates words. Letters and digits never do,
	 * nor do {@code + ] [ } {} and the apostrophe; every other character does.
	 * Parentheses are separators.
	 *
	 * @param ch the character to classify
	 * @return {@code true} if {@code ch} is a word separator
	 */
	private boolean isWordSeperator(char ch){
		if (Character.isLetterOrDigit(ch))
			return false;
		switch (ch) {
			case '+':
			case ']':
			case '[':
			case '}':
			case '{':
			case '\'':
				return false;
			default:
				return true;
		}
	}  

	/**
	 * Loads the default suffix set from the classpath resource
	 * {@code chemicalSuffixes.txt}, resolved relative to
	 * {@code ChemicalSBDtokenizer}. Each line is one suffix; a single leading
	 * '-' is stripped before the suffix is stored.
	 * <p>
	 * Reading is best-effort: an {@link IOException} is reported with a stack
	 * trace and the suffixes read so far are returned.
	 *
	 * @return a new mutable set with the suffixes that could be read
	 */
	private static Set<String>  getDefaultSuffixes() {
		Set<String>  suffixes = new HashSet<String>();
		BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(ChemicalSBDtokenizer.class.getResourceAsStream("chemicalSuffixes.txt")));
		try {
			// Read until readLine() reports end of stream. ready() only says whether a
			// read would block, so it must not be used as an end-of-file test.
			String suffix;
			while ((suffix = bufferedReader.readLine()) != null) {
				if (suffix.startsWith("-"))
					suffix = suffix.substring(1);
				suffixes.add(suffix);
			}
		} catch (IOException e) {
			e.printStackTrace();
		} finally {
			// Always release the underlying resource stream.
			try {
				bufferedReader.close();
			} catch (IOException e) {
				e.printStackTrace();
			}
		}
		return suffixes;
	}
	
	/**
	 * Returns the suffix set used for splitting hyphenated tokens. The internal
	 * set itself is returned, not a copy, so changes made by the caller affect
	 * this tokenizer.
	 *
	 * @return the current suffix set
	 */
	public Set<String> getSuffixes(){
		return suffixes;
	}
	
	/**
	 * Replaces the suffix set used for splitting hyphenated tokens. The given
	 * set is stored as-is (no copy). Lookups lower-case the candidate suffix
	 * first, so entries are expected in lower case and without a leading '-'.
	 *
	 * @param suffixes the new suffix set
	 */
	public void setSuffixes(Set<String> suffixes){
		this.suffixes = suffixes;
	}

	private class WordData implements Serializable{
		private static final long serialVersionUID = -6853979045253442261L;
		boolean isAbbreviation = false;
		boolean isNotAbbreviation = false;
		boolean isProperNoun = false;
		boolean isNotProperNoun = false;
		boolean isNumber = false;
	}

	/**
	 * A mutable half-open span {@code [start, end)} of offsets into the text
	 * currently being tokenized. This stays a (non-static) inner class because
	 * {@link #toString()} reads the enclosing tokenizer's current text.
	 */
	private class TokenRef {
		int start; //inclusive
		int end; //exclusive

		public TokenRef(int start, int end){
			this.start = start;
			this.end = end;
		}

		/** Returns the text covered by this span. */
		@Override
		public String toString(){
			return ExperimentalTokenizer.this.string.substring(this.start, this.end);
		}

		/** Returns the number of characters covered by this span. */
		public int length(){
			return this.end - this.start;
		}
	}
}