/*
* Concept profile generation tool suite
* Copyright (C) 2015 Biosemantics Group, Erasmus University Medical Center,
* Rotterdam, The Netherlands
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as published
* by the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>
*/
package org.erasmusmc.peregrine;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import org.erasmusmc.utilities.StringUtilities;
public class ChemicalSBDtokenizer extends Tokenizer implements Serializable{
boolean removeSuffixes = true; //Release takes longer if true
public ChemicalSBDtokenizer(){
super();
}
public ChemicalSBDtokenizer(Tokenizer tokenizer) {
super(tokenizer);
}
public void tokenize(String string){
this.string = string;
word2data.clear();
tokens.clear();
startpositions.clear();
endpositions.clear();
endOfSentence.clear();
ambiguousEOS.clear();
potentialEosFlag = false;
boolean inParenthesis = false;
int start = 0;
int i = 0;
for (; i < string.length(); i++){
char ch = string.charAt(i);
if (!Character.isLetterOrDigit(ch) &&
!(ch == '.') &&
!(ch == ',') &&
!(ch == '+') &&
!(ch == ']') &&
!(ch == '[') &&
!(ch == ')') &&
!(ch == '(') &&
!(ch == '}') &&
!(ch == '{') &&
!(ch == '-') &&
!(ch == '\'')){
// !(ch == '\'' && i>0 && Character.isLetter(string.charAt(i-1)) && string.length()-1 > i && string.charAt(i+1) == 's' && (string.length()-2 == i || !Character.isLetterOrDigit(string.charAt(i+2))))){ //leaves ' in possesive pattern
if (start != i) {
AddToken(start, i);
}
if (ch == '(')
inParenthesis = true;
else if (ch == ')')
inParenthesis = false;
if (tokens.size() != 0){
//Detect (potential) End Of Sentence:
potentialEosFlag = false;
if ((int)ch == 10 || ch == '!' || ch == '?'){ //single char unambiguous patterns
potentialEosFlag = true;
} else if ((ch == ']' || ch == ')') && i < string.length()-1 && string.charAt(i+1) == '.'){
potentialEosFlag = true;
i++;
}
if (!inParenthesis && ch == '.' && i < string.length()-2 && string.charAt(i+1) == ' ') {
int ord = (int)string.charAt(i+2);
if (ord<97 || ord>122){ //anything but lowercase
potentialEosFlag = true;
ambiguousEOS.add(endOfSentence.size());
i++;
}
}
if (potentialEosFlag) {
endOfSentence.add(tokens.size());
}
}
start = i+1;
}
}
if (start != i) {
AddToken(start, i);
}
if (ambiguousEOS.size() != 0){
checkForMoreAbbreviations();
disambiguateEOS();
}
//Add end of sentence at end of document:
endOfSentence.add(tokens.size());
removeDuplicates(endOfSentence);
}
private void removeDuplicates(List<Integer> endOfSentence){
Iterator<Integer> eosIterator = endOfSentence.iterator();
int previous = -1;
while (eosIterator.hasNext()){
int eos = eosIterator.next();
if (eos == previous)
eosIterator.remove();
else
previous = eos;
}
}
private void disambiguateEOS() {
for (int i = ambiguousEOS.size()-1; i >= 0; i--){
int eosIndex = ambiguousEOS.get(i);
int tokenIndex = endOfSentence.get(eosIndex);
if (tokenIndex > 0 && tokenIndex < tokens.size()-1){
WordData precedingWordData = word2data.get(tokens.get(tokenIndex-1).toLowerCase());
WordData nextWordData = word2data.get(tokens.get(tokenIndex).toLowerCase());
if (!(nextWordData.isNotProperNoun ||
(precedingWordData.isNotAbbreviation) ||
(!precedingWordData.isAbbreviation && !nextWordData.isProperNoun)) ||
(!precedingWordData.isNotAbbreviation && nextWordData.isNumber )){
endOfSentence.remove(eosIndex);
}
}
}
}
private void checkForMoreAbbreviations() {
String word;
WordData wordData;
for (Map.Entry<String, WordData> entry : word2data.entrySet()){
wordData = entry.getValue();
if (!wordData.isNotAbbreviation){
word = entry.getKey();
if (StringUtilities.containsNumber(word))
wordData.isNotAbbreviation = true;
else {
if (word.length() == 1 || noVowels(word))
wordData.isAbbreviation = true;
}
}
}
}
private boolean noVowels(String word) {
for (char c : word.toCharArray()){
if (c == 'a' || c == 'e' || c == 'i' || c == 'o' || c == 'u' || c == 'y'){
return false;
}
}
return true;
}
private void AddToken(int start, int end) {
String word = string.substring(start,end);
//Start change oct 22, 2008
if (word.length()>1){
if (word.startsWith(".") || word.startsWith(",")){
word = word.substring(1);
start = start+1;
}
if (word.endsWith(".") || word.endsWith(",")){
word = word.substring(0, word.length()-1);
end = end-1;
if (end==-1) end = 0;
}
if (word.startsWith("(") && word.endsWith(")")){
word = word.substring(1, word.length()-1);
start = start+1;
end = end-1;
if (end==-1) end = 0;
}
if (word.startsWith("(") && !word.contains(")")){
word = word.substring(1, word.length());
start = start+1;
}
if (word.endsWith(")") && !word.contains("(")){
word = word.substring(0, word.length()-1);
end = end-1;
if (end==-1) end = 0;
}
if (word.startsWith("[") && word.endsWith("]")){
word = word.substring(1, word.length()-1);
start = start+1;
end = end-1;
if (end==-1) end = 0;
}
if (word.startsWith("[") && !word.contains("]")){
word = word.substring(1, word.length());
start = start+1;
}
if (word.endsWith("]") && !word.contains("[")){
word = word.substring(0, word.length()-1);
end = end-1;
if (end==-1) end = 0;
}
if (word.startsWith("{") && word.endsWith("}")){
word = word.substring(1, word.length()-1);
start = start+1;
end = end-1;
if (end==-1) end = 0;
}
if (word.startsWith("{") && !word.contains("}")){
word = word.substring(1, word.length());
start = start+1;
}
if (word.endsWith("}") && !word.contains("{")){
word = word.substring(0, word.length()-1);
end = end-1;
if (end==-1) end = 0;
}
}
if (removeSuffixes){
// This code checks for suffixes at the end of words
List<String> words = new ArrayList<String>();
int start1 = -10;
int end1 = -10;
int start2 = -10;
int end2 = -10;
if (word.length()!=0){
String suffixPart = "";
for (String suffix: suffixes){
if (word.toLowerCase().endsWith(suffix) && !word.toLowerCase().equals(suffix)){
suffixPart = suffix;
}
}
if (suffixPart.length()!=0){
String word1 = word.substring(0, word.length() - suffixPart.length());
start1 = start;
end1 = end-suffixPart.length();
String word2 = suffixPart.substring(1);
start2 = start+word1.length()+2;
end2 = end;
words.add(word1);
words.add(word2);
}
}
if (!words.isEmpty()){
int i =1;
for (String wordPart: words){
if (i==1){
start = start1;
end = end1-1;
} else if (i==2){
start = start2;
end = end2-1;
}
String lcword = wordPart.toLowerCase();
tokens.add(wordPart);
startpositions.add(start);
endpositions.add(end);
//add to word list:
WordData wordData;
wordData = word2data.get(lcword);
if (wordData == null) {
wordData = new WordData();
word2data.put(lcword, wordData);
}
//check for proper noun:
if (wordPart.length()==0) System.out.println("Something is wrong "+words.toString());
int ord = (int)wordPart.charAt(0);
if (ord<91 && ord>64) { // first char is a capital
if (!potentialEosFlag && tokens.size() != 0) wordData.isProperNoun = true;
} else {
wordData.isNotProperNoun = true;
}
//check for abbreviation:
if (wordPart.length() > 4 || (end < string.length()-1 && !(string.charAt(end) == '.'))){
wordData.isNotAbbreviation = true;
}
//check for number:
if (StringUtilities.isNumber(wordPart))
wordData.isNumber = true;
i++;
}
} else {
if (word.length()!=0){
// End change oct 22, 2008
String lcword = word.toLowerCase();
tokens.add(word);
startpositions.add(start);
endpositions.add(end-1);
//add to word list:
WordData wordData;
wordData = word2data.get(lcword);
if (wordData == null) {
wordData = new WordData();
word2data.put(lcword, wordData);
}
//check for proper noun:
int ord = (int)word.charAt(0);
if (ord<91 && ord>64) { // first char is a capital
if (!potentialEosFlag && tokens.size() != 0) wordData.isProperNoun = true;
} else {
wordData.isNotProperNoun = true;
}
//check for abbreviation:
if (word.length() > 4 || (end < string.length()-1 && !(string.charAt(end) == '.'))){
wordData.isNotAbbreviation = true;
}
//check for number:
if (StringUtilities.isNumber(word))
wordData.isNumber = true;
}
}
} else {
if (word.length()!=0){
// End change oct 22, 2008
String lcword = word.toLowerCase();
tokens.add(word);
startpositions.add(start);
endpositions.add(end-1);
//add to word list:
WordData wordData;
wordData = word2data.get(lcword);
if (wordData == null) {
wordData = new WordData();
word2data.put(lcword, wordData);
}
//check for proper noun:
int ord = (int)word.charAt(0);
if (ord<91 && ord>64) { // first char is a capital
if (!potentialEosFlag && tokens.size() != 0) wordData.isProperNoun = true;
} else {
wordData.isNotProperNoun = true;
}
//check for abbreviation:
if (word.length() > 4 || (end < string.length()-1 && !(string.charAt(end) == '.'))){
wordData.isNotAbbreviation = true;
}
//check for number:
if (StringUtilities.isNumber(word))
wordData.isNumber = true;
}
}
}
private static Set<String> suffixes = getSuffixes();
private static Set<String> getSuffixes() {
Set<String> suffixes = new HashSet<String>();
BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(ChemicalSBDtokenizer.class.getResourceAsStream("chemicalSuffixes.txt")));
try {
while (bufferedReader.ready()) {
suffixes.add(bufferedReader.readLine());
}
} catch (IOException e) {
e.printStackTrace();
}
return suffixes;
}
private class WordData implements Serializable{
private static final long serialVersionUID = -6853979045253442261L;
boolean isAbbreviation = false;
boolean isNotAbbreviation = false;
boolean isProperNoun = false;
boolean isNotProperNoun = false;
boolean isNumber = false;
}
private String string;
private boolean potentialEosFlag;
private final Map<String, WordData> word2data = new HashMap<String, WordData>();
private static final long serialVersionUID = 1L;
private List<Integer> ambiguousEOS = new ArrayList<Integer>();
}