/**
 * Copyright 2014, Emory University
 * 
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * 
 *     http://www.apache.org/licenses/LICENSE-2.0
 *     
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package edu.emory.clir.clearnlp.tokenization;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import edu.emory.clir.clearnlp.collection.set.CharHashSet;
import edu.emory.clir.clearnlp.dictionary.AbstractDTTokenizer;
import edu.emory.clir.clearnlp.dictionary.universal.DTCurrency;
import edu.emory.clir.clearnlp.dictionary.universal.DTEmoticon;
import edu.emory.clir.clearnlp.dictionary.universal.DTUnit;
import edu.emory.clir.clearnlp.util.CharUtils;
import edu.emory.clir.clearnlp.util.DSUtils;
import edu.emory.clir.clearnlp.util.IOUtils;
import edu.emory.clir.clearnlp.util.MetaUtils;
import edu.emory.clir.clearnlp.util.PatternUtils;
import edu.emory.clir.clearnlp.util.StringUtils;
import edu.emory.clir.clearnlp.util.constant.CharConst;
import edu.emory.clir.clearnlp.util.constant.StringConst;

/**
 * Base class for tokenizers.
 * <p>
 * Tokenization proceeds in stages: the input is split on white spaces
 * ({@link #tokenizeWhiteSpaces(String)}), each chunk is checked for emoticons and hyperlinks
 * ({@link #tokenizeMetaInfo(List, String)}), the remaining text is split around symbols
 * ({@link #tokenizeSymbols(List, String)}), non-symbol spans are split further
 * ({@link #addMorphemes(List, String)}), and the resulting token list is post-processed
 * ({@link #finalize(List)}). Subclasses supply the abstract hooks declared here.
 *
 * @since 1.1.0
 * @author Jinho D. Choi ({@code jinho.choi@emory.edu})
 */
public abstract class AbstractTokenizer {
    /** Symbols that {@link #isSymbolInBetween(char)} accepts in addition to brackets, arrows, double quotes and hyphens. */
    private static final CharHashSet S_SYMBOL_IN_BETWEEN = new CharHashSet(CharConst.SEMICOLON, CharConst.COMMA, CharConst.TILDA, CharConst.EQUAL, CharConst.PLUS, CharConst.AMPERSAND, CharConst.PIPE, CharConst.FW_SLASH);
    /** Used by {@link #preservePeriod(char[], int, String)} to decide whether a trailing period stays attached. */
    private static final Pattern P_ABBREVIATION = PatternUtils.createClosedPattern("\\p{Alnum}([\\.|-]\\p{Alnum})*");
    /** Used by {@link #adjustFirstNonSymbolIndex(char[], int, String)} to keep a leading apostrophe on two-digit years. */
    private static final Pattern P_YEAR = PatternUtils.createClosedPattern("\\d\\d['\u2019]?[sS]?");

    private final DTEmoticon d_emoticon;
    private final DTCurrency d_currency;
    private final DTUnit d_unit;

    public AbstractTokenizer() {
        d_emoticon = new DTEmoticon();
        d_currency = new DTCurrency();
        d_unit = new DTUnit();
    }

//  ----------------------------------- Public methods -----------------------------------

    public abstract List<List<String>> segmentize(InputStream in);

    /**
     * Tokenizes the input stream line by line.
     * The reader created over {@code in} is always closed, including when reading fails.
     * An {@link IOException} is reported via {@code printStackTrace()} and the tokens collected
     * so far are returned.
     *
     * @param in the input stream to read lines from.
     * @return a list of tokens in the specific input stream.
     */
    public List<String> tokenize(InputStream in) {
        ArrayList<String> tokens = new ArrayList<>();

        // try-with-resources guarantees the reader is closed even if readLine() throws.
        try (BufferedReader reader = IOUtils.createBufferedReader(in)) {
            String line;

            while ((line = reader.readLine()) != null) {
                tokens.addAll(tokenizeWhiteSpaces(line));
            }
        } catch (IOException e) {
            e.printStackTrace();
        }

        tokens.trimToSize();
        return tokens;
    }

    /**
     * @param s the string to tokenize.
     * @return a list of tokens in the specific string.
     */
    public List<String> tokenize(String s) {
        return tokenizeWhiteSpaces(s);
    }

//  ----------------------------------- Tokenize -----------------------------------

    /**
     * Splits the string on white spaces, tokenizes each non-empty chunk, and finalizes the
     * token list if it is not empty.
     * Called by {@link #tokenize(InputStream)} and {@link #tokenize(String)}.
     */
    private List<String> tokenizeWhiteSpaces(String s) {
        List<String> tokens = new ArrayList<>();
        int i, len = s.length(), bIndex = 0;
        char[] cs = s.toCharArray();

        for (i = 0; i < len; i++) {
            if (CharUtils.isWhiteSpace(cs[i])) {
                if (bIndex < i) {
                    tokenizeMetaInfo(tokens, s.substring(bIndex, i));
                }

                bIndex = i + 1;
            }
        }

        if (bIndex < len) {
            tokenizeMetaInfo(tokens, s.substring(bIndex));
        }

        if (!tokens.isEmpty()) {
            finalize(tokens);
        }

        return tokens;
    }

    /**
     * Tokenizes hyperlinks, emoticons.
     * If a meta range is found, it is added as a single token and the text before and after
     * it is passed to {@link #tokenizeSymbols(List, String)}; otherwise the whole string is.
     * Called by {@link #tokenizeWhiteSpaces(String)}.
     */
    private void tokenizeMetaInfo(List<String> tokens, String s) {
        int[] ps = getMetaRange(s);

        if (ps == null) {
            tokenizeSymbols(tokens, s);
            return;
        }

        int bIndex = ps[0], eIndex = ps[1], len = s.length();

        if (0 < bIndex) {
            tokenizeSymbols(tokens, s.substring(0, bIndex));
        }

        tokens.add(s.substring(bIndex, eIndex));

        if (eIndex < len) {
            tokenizeSymbols(tokens, s.substring(eIndex));
        }
    }

    /**
     * @return the {begin (inclusive), end (exclusive)} range of an emoticon if one is found,
     * else of the first hyperlink match; {@code null} if neither is found.
     * Called by {@link #tokenizeMetaInfo(List, String)}.
     */
    private int[] getMetaRange(String s) {
        int[] ps = d_emoticon.getEmoticonRange(s);

        if (ps != null) {
            return ps;
        }

        Matcher m = MetaUtils.HYPERLINK.matcher(s);

        if (m.find()) {
            return new int[]{m.start(), m.end()};
        }

        return null;
    }

    /**
     * Splits the string into its leading symbols, inner spans and trailing symbols.
     * Called by {@link #tokenizeMetaInfo(List, String)}.
     */
    private void tokenizeSymbols(List<String> tokens, String s) {
        char[] cs = s.toCharArray();
        int len = s.length();
        int bIndex = getFirstNonSymbolIndex(cs);

        // The string consists of symbols only.
        if (bIndex == len) {
            addSymbols(tokens, s);
            return;
        }

        int eIndex = getLastSymbolSequenceIndex(cs);
        List<int[]> indices = new ArrayList<>();

        // Each entry is a {begin, end} symbol range; the first is the leading range and the last is the trailing range.
        indices.add(new int[]{0, bIndex});
        addNextSymbolSequenceIndices(indices, cs, bIndex + 1, eIndex - 1);
        indices.add(new int[]{eIndex, len});

        tokenizeSymbolsAux(tokens, s, cs, indices);
    }

    /**
     * @return the index of the first character in {@code cs} that is not a symbol
     * ({@code 0} if the first character is not a symbol;
     * {@code cs.length} if all characters in {@code cs} are symbols).
     * Called by {@link #tokenizeSymbols(List, String)}.
     */
    private int getFirstNonSymbolIndex(char[] cs) {
        int i, len = cs.length;

        for (i = 0; i < len; i++) {
            if (!isSymbol(cs[i])) {
                return i;
            }
        }

        return i;
    }

    /**
     * @return the index at which the trailing run of symbols in {@code cs} begins
     * ({@code cs.length} if the last character is not a symbol;
     * {@code 0} if all characters in {@code cs} are symbols).
     * Called by {@link #tokenizeSymbols(List, String)}.
     */
    private int getLastSymbolSequenceIndex(char[] cs) {
        int i;

        for (i = cs.length - 1; i >= 0; i--) {
            if (!isSymbol(cs[i])) {
                return i + 1;
            }
        }

        return i + 1;
    }

    /**
     * Collects the {begin, end} ranges of symbol sequences in {@code [bIndex, eIndex)} that
     * are to be split off, skipping positions that a {@code preserve*} check keeps intact.
     * Called by {@link #tokenizeSymbols(List, String)}.
     */
    private void addNextSymbolSequenceIndices(List<int[]> indices, char[] cs, int bIndex, int eIndex) {
        int i, j;

        for (i = bIndex; i < eIndex; i++) {
            if (preserveSymbolInBetween(cs, i) || preserveSymbolInDigits(cs, i) || preserveSymbolInAlphabets(cs, i)) {
                continue;
            }

            if (isEllipsis(cs, i) || isSymbolInBetween(cs[i]) || (i + 1 < eIndex && isSymbolInBetween(cs[i + 1]) && CharUtils.isFinalMark(cs[i]))) {
                j = getSpanIndex(cs, i, eIndex, false);
                indices.add(new int[]{i, j});
                // Continue right after the span (the loop increment adds 1).
                i = j - 1;
            }
        }
    }

    /**
     * Adjusts the boundaries of the symbol ranges in {@code indices}, then emits the symbol
     * ranges and the spans between them as tokens.
     * Called by {@link #tokenizeSymbols(List, String)}.
     */
    private void tokenizeSymbolsAux(List<String> tokens, String s, char[] cs, List<int[]> indices) {
        int i, pg, ng, bIndex, eIndex, size = indices.size() - 1;
        boolean pb, nb;
        int[] pi, ni;
        String t;

        // 1st pass: adjust the boundaries between each span and its neighboring symbol ranges.
        for (i = 0; i < size; i++) {
            pi = indices.get(i);
            ni = indices.get(i + 1);

            bIndex = pi[1];
            eIndex = ni[0];

            if (bIndex < eIndex) {
                t = s.substring(bIndex, eIndex);
                pg = pi[1] - pi[0];
                ng = ni[1] - ni[0];

                // The outermost ranges qualify when non-empty; inner ranges only when exactly one character long.
                pb = (i == 0) ? pg > 0 : pg == 1;
                nb = (i + 1 == size) ? ng > 0 : ng == 1;

                if (pb) {
                    pi[1] = adjustFirstNonSymbolIndex(cs, bIndex, t);
                }

                if (nb) {
                    ni[0] = adjustLastSymbolSequenceIndex(cs, eIndex, t);
                }
            }
        }

        // 2nd pass: emit each symbol range followed by the span up to the next symbol range.
        for (i = 0; i < size; i++) {
            pi = indices.get(i);
            ni = indices.get(i + 1);

            bIndex = pi[0];
            eIndex = pi[1];

            if (bIndex < eIndex) {
                t = s.substring(bIndex, eIndex);

                if (i == 0) {
                    addSymbols(tokens, t);
                } else {
                    tokens.add(t);
                }
            }

            bIndex = pi[1];
            eIndex = ni[0];

            if (bIndex < eIndex) {
                t = s.substring(bIndex, eIndex);
                addMorphemes(tokens, t);
            }
        }

        // Trailing symbol range.
        ni = indices.get(size);

        bIndex = ni[0];
        eIndex = ni[1];

        if (bIndex < eIndex) {
            addSymbols(tokens, s.substring(bIndex, eIndex));
        }
    }

    /**
     * @return {@code beginIndex}, possibly decreased so that preceding symbol(s) stay attached
     * to the span {@code t} that begins at {@code beginIndex}.
     * Called by {@link #tokenizeSymbolsAux(List, String, char[], List)}.
     */
    private int adjustFirstNonSymbolIndex(char[] cs, int beginIndex, String t) {
        char sym = cs[beginIndex - 1], curr = cs[beginIndex];
        int gap;

        if ((gap = adjustFirstNonSymbolGap(cs, beginIndex, t)) > 0) {
            beginIndex -= gap;
        } else if (CharUtils.isPreDigitSymbol(sym)) {
            if (CharUtils.isDigit(curr)) {
                beginIndex--;   // -1, .1, +1
            }
        } else if ((sym == CharConst.AT || sym == CharConst.POUND)) {
            if (CharUtils.isAlphabet(curr)) {
                beginIndex--;   // @A, #A
            }
        } else if (CharUtils.isApostrophe(sym)) {
            if (P_YEAR.matcher(t).find()) {
                beginIndex--;
            }
        }

        return beginIndex;
    }

    /**
     * @return {@code endIndex}, possibly increased so that following symbol(s) stay attached
     * to the span {@code t} that ends at {@code endIndex}.
     * Called by {@link #tokenizeSymbolsAux(List, String, char[], List)}.
     */
    protected int adjustLastSymbolSequenceIndex(char[] cs, int endIndex, String t) {
        char sym = cs[endIndex];
        int gap;

        if ((gap = adjustLastSymbolSequenceGap(cs, endIndex, t)) > 0) {
            endIndex += gap;
        } else if (sym == CharConst.DOLLAR) {
            // The lower-cased form is needed only for the currency lookup.
            if (d_currency.isCurrencyDollar(StringUtils.toLowerCase(t))) {
                endIndex++;
            }
        } else if (sym == CharConst.PERIOD) {
            if (preservePeriod(cs, endIndex, t)) {
                endIndex++;
            }
        }

        return endIndex;
    }

    /**
     * @return the number of characters by which the begin index is to be decreased;
     * a value that is not positive lets the default rules apply.
     * Called by {@link #adjustFirstNonSymbolIndex(char[], int, String)}.
     */
    protected abstract int adjustFirstNonSymbolGap(char[] cs, int beginIndex, String t);

    /**
     * @return the number of characters by which the end index is to be increased;
     * a value that is not positive lets the default rules apply.
     * Called by {@link #adjustLastSymbolSequenceIndex(char[], int, String)}.
     */
    protected abstract int adjustLastSymbolSequenceGap(char[] cs, int endIndex, String t);

//  ----------------------------------- Add symbols -----------------------------------

    /**
     * Adds the symbol string to the tokens, splitting it into runs.
     * Called by {@link #tokenizeSymbols(List, String)} and {@link #tokenizeSymbolsAux(List, String, char[], List)}.
     */
    private void addSymbols(List<String> tokens, String s) {
        if (s.length() == 1) {
            tokens.add(s);
            return;
        }

        int i, j, flag, len = s.length(), bIndex = 0;
        char[] cs = s.toCharArray();

        for (i = 0; i < len; i = j) {
            flag = getSymbolFlag(cs[i]);
            j = getSpanIndex(cs, i, len, flag == 1);

            // Split off flagged symbols and any run longer than one character.
            if (0 < flag || i + 1 < j) {
                if (bIndex < i) {
                    tokens.add(s.substring(bIndex, i));
                }

                tokens.add(s.substring(i, j));
                bIndex = j;
            }
        }

        if (bIndex < len) {
            tokens.add(s.substring(bIndex));
        }
    }

    /**
     * @return the right-most index in the span (exclusive).
     * Called by {@link #addSymbols(List, String)} and {@link #addNextSymbolSequenceIndices(List, char[], int, int)}.
     */
    private int getSpanIndex(char[] cs, int index, int rightBound, boolean finalMark) {
        char c = cs[index];
        int i;

        for (i = index + 1; i < rightBound; i++) {
            if (!isConsecutive(cs, i, c, finalMark)) {
                return i;
            }
        }

        return i;
    }

//  ----------------------------------- Add morphemes -----------------------------------

    /**
     * Adds the span to the tokens, trying in order the currency dictionary, the unit
     * dictionary, {@link #tokenizeDigit(List, String, char[])} and
     * {@link #tokenizeWordsMore(List, String, String, char[])}; if none applies, the span is
     * added as a single token.
     * Called by {@link #tokenizeSymbolsAux(List, String, char[], List)}.
     */
    private void addMorphemes(List<String> tokens, String s) {
        if (s.length() == 1) {
            tokens.add(s);
            return;
        }

        char[] lcs = s.toCharArray();
        String lower = CharUtils.toLowerCase(lcs) ? new String(lcs) : s;

        if (!tokenize(tokens, s, lower, lcs, d_currency) && !tokenize(tokens, s, lower, lcs, d_unit) && !tokenizeDigit(tokens, s, lcs) && !tokenizeWordsMore(tokens, s, lower, lcs)) {
            tokens.add(s);
        }
    }

    /**
     * Adds the tokens produced by the dictionary tokenizer, if any.
     * Called by {@link #addMorphemes(List, String)}.
     *
     * @return {@code true} if the dictionary tokenizer returned a non-null result.
     */
    protected boolean tokenize(List<String> tokens, String original, String lower, char[] lcs, AbstractDTTokenizer tokenizer) {
        String[] t = tokenizer.tokenize(original, lower, lcs);

        if (t != null) {
            DSUtils.addAll(tokens, t);
            return true;
        }

        return false;
    }

    /**
     * Splits off a leading or trailing character accepted by {@link #tokenizeDigitAux(char)}
     * when {@link CharUtils#containsDigitPunctuationOnly} holds for the rest.
     * Called by {@link #addMorphemes(List, String)}.
     *
     * @return {@code true} if the string was split and added to the tokens.
     */
    private boolean tokenizeDigit(List<String> tokens, String original, char[] lcs) {
        int len = lcs.length;

        if (len < 2) {
            return false;
        }

        if (tokenizeDigitAux(lcs[0]) && CharUtils.containsDigitPunctuationOnly(lcs, 1, len)) {
            tokens.add(original.substring(0, 1));
            tokens.add(original.substring(1));
            return true;
        }

        // From here on, len is the index of the last character.
        len--;

        if (tokenizeDigitAux(lcs[len]) && CharUtils.containsDigitPunctuationOnly(lcs, 0, len)) {
            tokens.add(original.substring(0, len));
            tokens.add(original.substring(len));
            return true;
        }

        return false;
    }

    /** Called by {@link #tokenizeDigit(List, String, char[])}. */
    private boolean tokenizeDigitAux(char c) {
        return c == CharConst.POUND || c == CharConst.DOLLAR || c == CharConst.PERCENT || c == CharConst.ASTERISK || c == CharConst.EQUAL;
    }

    /**
     * @return {@code true} if the span was tokenized and added to the tokens.
     * Called by {@link #addMorphemes(List, String)}.
     */
    protected abstract boolean tokenizeWordsMore(List<String> tokens, String original, String lower, char[] lcs);

//  ----------------------------------- Finalize -----------------------------------

    /**
     * Post-processes the token list in place.
     * Each helper returns the offset by which the loop index must move after it has changed
     * the list ({@code 0} if it changed nothing).
     * Called by {@link #tokenizeWhiteSpaces(String)}.
     */
    private void finalize(List<String> tokens) {
        int i, j, size = tokens.size();
        String token, lower;

        for (i = 0; i < size; i++) {
            token = tokens.get(i);
            lower = StringUtils.toLowerCase(token);

            // Both return values must land in j: dropping the -1 from mergeParenthesis
            // would skip the token that follows the merged parenthesis.
            if ((j = tokenizeNo(tokens, token, lower, i)) != 0 || (j = mergeParenthesis(tokens, token, i)) != 0) {
                size = tokens.size();
                i += j;
            }
        }

        if (tokens.size() == 1) {
            tokenizeLastPeriod(tokens);
        }
    }

    /**
     * Splits the period off a token equal to "no." (ignoring case) unless the next token
     * starts with a digit.
     * Called by {@link #finalize(List)}.
     *
     * @return {@code 1} if the token was split (one token was inserted), otherwise {@code 0}.
     */
    private int tokenizeNo(List<String> tokens, String token, String lower, int index) {
        if (lower.equals("no.") && (index + 1 == tokens.size() || !CharUtils.isDigit(tokens.get(index + 1).charAt(0)))) {
            tokens.set(index, StringUtils.trim(token, 1));
            tokens.add(index + 1, StringConst.PERIOD);
            return 1;
        }

        return 0;
    }

    /**
     * Merges a single-character token with its neighbors when they are
     * {@link StringConst#LRB} and {@link StringConst#RRB}.
     * Called by {@link #finalize(List)}.
     *
     * @return {@code -1} if the three tokens were merged into the previous position, otherwise {@code 0}.
     */
    private int mergeParenthesis(List<String> tokens, String token, int index) {
        if (token.length() == 1 && 0 <= index - 1 && index + 1 < tokens.size()) {
            String prev = tokens.get(index - 1);
            String next = tokens.get(index + 1);

            if (prev.equals(StringConst.LRB) && next.equals(StringConst.RRB)) {
                tokens.set(index - 1, prev + token + next);
                // Remove the current token, then the closing one that slid into its place.
                tokens.remove(index);
                tokens.remove(index);
                return -1;
            }
        }

        return 0;
    }

    /**
     * Splits a trailing period off the last token unless the character before it is a final mark.
     * Called by {@link #finalize(List)}.
     */
    private void tokenizeLastPeriod(List<String> tokens) {
        int last = tokens.size() - 1;
        String token = tokens.get(last);
        char[] cs = token.toCharArray();
        int len = token.length();

        if (1 < len && cs[len - 1] == CharConst.PERIOD && !CharUtils.isFinalMark(cs[len - 2])) {
            tokens.set(last, StringUtils.trim(token, 1));
            tokens.add(StringConst.PERIOD);
        }
    }

//  ----------------------------------- Preserve -----------------------------------

    /**
     * @return {@code true} if the symbol at {@code index} must not be split off.
     * Called by {@link #addNextSymbolSequenceIndices(List, char[], int, int)}.
     */
    protected abstract boolean preserveSymbolInBetween(char[] cs, int index);

    /**
     * @return {@code true} for a hyphen between an alphanumeric and a digit, a forward slash
     * between two digits, or a comma preceded by a digit and followed by exactly three digits.
     * Called by {@link #addNextSymbolSequenceIndices(List, char[], int, int)}.
     */
    private boolean preserveSymbolInDigits(char[] cs, int index) {
        char c = cs[index];

        if (CharUtils.isHyphen(c)) {
            return (0 <= index - 1 && index + 1 < cs.length) && CharUtils.isAlnum(cs[index - 1]) && CharUtils.isDigit(cs[index + 1]);
        } else if (c == CharConst.FW_SLASH) {
            return (0 <= index - 1 && index + 1 < cs.length) && CharUtils.isDigit(cs[index - 1]) && CharUtils.isDigit(cs[index + 1]);
        } else if (cs[index] == CharConst.COMMA) {
            return (0 <= index - 1 && index + 3 < cs.length) && (index + 4 == cs.length || !CharUtils.isDigit(cs[index + 4])) && CharUtils.isDigit(cs[index - 1]) && CharUtils.isDigit(cs[index + 1]) && CharUtils.isDigit(cs[index + 2]) && CharUtils.isDigit(cs[index + 3]);
        }

        return false;
    }

    /**
     * @return {@code true} for an ampersand between two alphabet characters.
     * Called by {@link #addNextSymbolSequenceIndices(List, char[], int, int)}.
     */
    private boolean preserveSymbolInAlphabets(char[] cs, int index) {
        char c = cs[index];

        if (c == CharConst.AMPERSAND) {
            return (0 <= index - 1 && index + 1 < cs.length) && CharUtils.isAlphabet(cs[index - 1]) && CharUtils.isAlphabet(cs[index + 1]);
        }

        return false;
    }

    /**
     * @return {@code true} if the period at {@code endIndex} is to stay attached to {@code t}.
     * Called by {@link #adjustLastSymbolSequenceIndex(char[], int, String)}.
     */
    private boolean preservePeriod(char[] cs, int endIndex, String t) {
        // The character following the period, if any, decides first.
        if (endIndex + 1 < cs.length) {
            char c = cs[endIndex + 1];

            if (CharUtils.isSeparatorMark(c)) {
                return true;
            }

            if (CharUtils.isFinalMark(c) || CharUtils.isQuotationMark(c)) {
                return false;
            }
        }

        if (P_ABBREVIATION.matcher(t).find()) {
            return true;
        }

        int len = t.length();
        return (2 <= len && len <= 5) && CharUtils.containsOnlyConsonants(t);
    }

//  ----------------------------------- Boolean -----------------------------------

    /** Called by {@link #getFirstNonSymbolIndex(char[])} and {@link #getLastSymbolSequenceIndex(char[])}. */
    private boolean isSymbol(char c) {
        return CharUtils.isPunctuation(c) || CharUtils.isGeneralPunctuation(c) || CharUtils.isCurrency(c) || CharUtils.isArrow(c);
    }

    /**
     * @return {@code true} if the character at {@code index} is a period that is followed by
     * a final mark, a separator mark, or a quotation mark.
     * Called by {@link #addNextSymbolSequenceIndices(List, char[], int, int)}.
     */
    private boolean isEllipsis(char[] cs, int index) {
        if (cs[index] == CharConst.PERIOD && index + 1 < cs.length) {
            char c = cs[index + 1];
            return CharUtils.isFinalMark(c) || CharUtils.isSeparatorMark(c) || CharUtils.isQuotationMark(c);
        }

        return false;
    }

    /** Called by {@link #addNextSymbolSequenceIndices(List, char[], int, int)}. */
    private boolean isSymbolInBetween(char c) {
        return CharUtils.isBracket(c) || CharUtils.isArrow(c) || CharUtils.isDoubleQuotationMark(c) || CharUtils.isHyphen(c) || S_SYMBOL_IN_BETWEEN.contains(c);
    }

    /**
     * @return if {@code finalMark} is set, whether the character at {@code index} is a final
     * mark; otherwise, whether it equals {@code c}.
     * Called by {@link #getSpanIndex(char[], int, int, boolean)}.
     */
    private boolean isConsecutive(char[] cs, int index, char c, boolean finalMark) {
        return finalMark ? CharUtils.isFinalMark(cs[index]) : c == cs[index];
    }

    /**
     * @return {@code 1} for a final mark; {@code 2} for a bracket, separator mark, quotation
     * mark, or prime; otherwise {@code 0}.
     * Called by {@link #addSymbols(List, String)}.
     */
    private int getSymbolFlag(char c) {
        if (CharUtils.isFinalMark(c)) {
            return 1;
        } else if (CharUtils.isBracket(c) || CharUtils.isSeparatorMark(c) || CharUtils.isQuotationMark(c) || c == CharConst.PRIME) {
            return 2;
        } else {
            return 0;
        }
    }

    /** @return {@code true} if every character in the string is a final mark (also for the empty string). */
    protected boolean isFinalMarksOnly(String s) {
        for (char c : s.toCharArray()) {
            if (!CharUtils.isFinalMark(c)) {
                return false;
            }
        }

        return true;
    }
}