/* * Concept profile generation tool suite * Copyright (C) 2015 Biosemantics Group, Erasmus University Medical Center, * Rotterdam, The Netherlands * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published * by the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License * along with this program. If not, see <http://www.gnu.org/licenses/> */ package org.erasmusmc.peregrine; import java.io.BufferedReader; import java.io.IOException; import java.io.InputStreamReader; import java.io.Serializable; import java.util.ArrayList; import java.util.HashMap; import java.util.HashSet; import java.util.Iterator; import java.util.List; import java.util.Map; import java.util.Set; import org.erasmusmc.utilities.StringUtilities; public class UMLSGeneChemTokenizer extends Tokenizer implements Serializable{ public boolean removeSuffixes = true; //Release takes longer if true private Set<String> suffixes = getDefaultSuffixes(); private String string; private boolean potentialEosFlag; private final Map<String, WordData> word2data = new HashMap<String, WordData>(); private static final long serialVersionUID = 1L; private List<Integer> ambiguousEOS = new ArrayList<Integer>(); public static void main(String[] args){ UMLSGeneChemTokenizer tokenizer = new UMLSGeneChemTokenizer(); String text = "A John. A dr. John. a-backed a,abc-def a, a.bcd ab(cd). Abc) abc. Abc- (abc. def). A-b-c-backed dr. John"; System.out.println(text); //tokenizer.tokenize("a-backed abc-def a, John a.bcd ab(cd). Abc) abc. Abc- (abc. def). A-b-c-backed dr. John"); tokenizer.tokenize(text); int start = 0; for (int eos : tokenizer.endOfSentence){ System.out.println(StringUtilities.join(tokenizer.tokens.subList(start, eos)," ")); start = eos; } } public UMLSGeneChemTokenizer(){ super(); } public UMLSGeneChemTokenizer(Tokenizer tokenizer) { super(tokenizer); } public void tokenize(String string){ this.string = string; word2data.clear(); tokens.clear(); startpositions.clear(); endpositions.clear(); endOfSentence.clear(); ambiguousEOS.clear(); potentialEosFlag = true; boolean inParenthesis = false; int start = 0; int i = 0; int lastHyphen = -1; for (; i < string.length(); i++){ char ch = string.charAt(i); if (isWordSeperator(ch)) if (ch == '-') lastHyphen = i; else if (ch == '.'){ if(!inParenthesis && i < string.length()-2 && string.charAt(i+1) == ' ') { int ord = (int)string.charAt(i+2); if (ord<97 || ord>122){ //anything but lowercase trimSplitAndAddToken(new TokenRef(start, i), lastHyphen); ambiguousEOS.add(endOfSentence.size()); endOfSentence.add(tokens.size()); potentialEosFlag = true; i++; start = i+1; } } } else { trimSplitAndAddToken(new TokenRef(start, i), lastHyphen); if (ch == '(') inParenthesis = true; else if (ch == ')') inParenthesis = false; if (tokens.size() != 0){ //Detect End Of Sentence: potentialEosFlag = false; if ((int)ch == 10 || ch == '!' || ch == '?') //single char unambiguous patterns potentialEosFlag = true; else if ((ch == ']' || ch == ')') && i < string.length()-1 && string.charAt(i+1) == '.'){ potentialEosFlag = true; i++; } if (potentialEosFlag) endOfSentence.add(tokens.size()); } start = i+1; } if (ch == ',') addToken(new TokenRef(i,i+1)); } if (start != i) { trimSplitAndAddToken(new TokenRef(start, i), lastHyphen); } if (ambiguousEOS.size() != 0){ checkForMoreAbbreviations(); disambiguateEOS(); } //Add end of sentence at end of document: endOfSentence.add(tokens.size()); removeDuplicates(endOfSentence); } private void removeDuplicates(List<Integer> endOfSentence){ Iterator<Integer> eosIterator = endOfSentence.iterator(); int previous = -1; while (eosIterator.hasNext()){ int eos = eosIterator.next(); if (eos == previous) eosIterator.remove(); else previous = eos; } } private void disambiguateEOS() { for (int i = ambiguousEOS.size()-1; i >= 0; i--){ int eosIndex = ambiguousEOS.get(i); int tokenIndex = endOfSentence.get(eosIndex); if (tokenIndex > 0 && tokenIndex < tokens.size()){ WordData precedingWordData = word2data.get(tokens.get(tokenIndex-1).toLowerCase()); WordData nextWordData = word2data.get(tokens.get(tokenIndex).toLowerCase()); if (!(nextWordData.isNotProperNoun || (precedingWordData.isNotAbbreviation) || (!precedingWordData.isAbbreviation && !nextWordData.isProperNoun)) || (!precedingWordData.isNotAbbreviation && nextWordData.isNumber )){ endOfSentence.remove(eosIndex); } } } } private void checkForMoreAbbreviations() { String word; WordData wordData; for (Map.Entry<String, WordData> entry : word2data.entrySet()){ wordData = entry.getValue(); if (!wordData.isNotAbbreviation){ word = entry.getKey(); if (StringUtilities.containsNumber(word)) wordData.isNotAbbreviation = true; else { if (word.length() == 1 || noVowels(word)) wordData.isAbbreviation = true; } } } } private boolean noVowels(String word) { for (char c : word.toCharArray()){ if (c == 'a' || c == 'e' || c == 'i' || c == 'o' || c == 'u' || c == 'y'){ return false; } } return true; } private void trimSplitAndAddToken(TokenRef tokenRef, int lastHyphen) { tokenRef = trim(tokenRef); if (tokenRef.length() == 0) return; if (removeSuffixes && lastHyphen > tokenRef.start && lastHyphen < tokenRef.end-1){ String suffix = string.substring(lastHyphen+1, tokenRef.end); if (suffixes.contains(suffix.toLowerCase())){// || StringUtilities.isNumber(suffix) || StringUtilities.isRomanNumeral(suffix)){ addToken(new TokenRef(tokenRef.start, lastHyphen)); addToken(new TokenRef(lastHyphen + 1, tokenRef.end)); return; } } addToken(tokenRef); } private TokenRef trim(TokenRef tokenRef) { if (tokenRef.length() < 2) return tokenRef; char startChar = string.charAt(tokenRef.start); char endChar = string.charAt(tokenRef.end - 1); if (startChar == '.') tokenRef.start++; else if (endChar == '.') tokenRef.end--; else if (startChar == '(' && endChar == ')') { tokenRef.start++; tokenRef.end--; } else if (startChar == '[' && endChar == ']') { tokenRef.start++; tokenRef.end--; } else if (startChar == '{' && endChar == '}') { tokenRef.start++; tokenRef.end--; } else if (startChar == '(' && !contains(tokenRef, ')')) tokenRef.start++; else if (endChar == ')' && !contains(tokenRef, '(')) tokenRef.end--; else if (startChar == '[' && !contains(tokenRef, ']')) tokenRef.start++; else if (endChar == ']' && !contains(tokenRef, '[')) tokenRef.end--; else if (startChar == '{' && !contains(tokenRef, '}')) tokenRef.start++; else if (endChar == '}' && !contains(tokenRef, '{')) tokenRef.end--; else return tokenRef; return trim(tokenRef); } private boolean contains(TokenRef tokenPos, char ch) { for (int i = tokenPos.start; i < tokenPos.end; i++) if (string.charAt(i) == ch) return true; return false; } private void addToken(TokenRef tokenRef){ String word = tokenRef.toString(); String lcword = word.toLowerCase(); tokens.add(word); startpositions.add(tokenRef.start); endpositions.add(tokenRef.end-1); //add to word list: WordData wordData; wordData = word2data.get(lcword); if (wordData == null) { wordData = new WordData(); word2data.put(lcword, wordData); } //check for proper noun: int ord = (int)word.charAt(0); if (ord<91 && ord>64) { // first char is a capital if (!potentialEosFlag) wordData.isProperNoun = true; } else { wordData.isNotProperNoun = true; } //check for abbreviation: if (word.length() > 4 || (tokenRef.end < string.length()-1 && !(string.charAt(tokenRef.end) == '.'))){ wordData.isNotAbbreviation = true; } //check for number: if (StringUtilities.isNumber(word)) wordData.isNumber = true; } private boolean isWordSeperator(char ch){ return (!Character.isLetterOrDigit(ch) && !(ch == '+') && !(ch == ']') && !(ch == '[') && !(ch == ')') && !(ch == '(') && !(ch == '}') && !(ch == '{') && !(ch == '\'')); } private static Set<String> getDefaultSuffixes() { Set<String> suffixes = new HashSet<String>(); BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(ChemicalSBDtokenizer.class.getResourceAsStream("chemicalSuffixes.txt"))); try { while (bufferedReader.ready()) { String suffix = bufferedReader.readLine(); if (suffix.startsWith("-")) suffix = suffix.substring(1); suffixes.add(suffix); } } catch (IOException e) { e.printStackTrace(); } return suffixes; } public Set<String> getSuffixes(){ return suffixes; } public void setSuffixes(Set<String> suffixes){ this.suffixes = suffixes; } private class WordData implements Serializable{ private static final long serialVersionUID = -6853979045253442261L; boolean isAbbreviation = false; boolean isNotAbbreviation = false; boolean isProperNoun = false; boolean isNotProperNoun = false; boolean isNumber = false; } private class TokenRef { int start; //inclusive int end; //exclusive public TokenRef(int start, int end){ this.start = start; this.end = end; } public String toString(){ return string.substring(start, end); } public int length(){ return end-start; } } }