/** * Copyright 2014, Emory University * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package edu.emory.clir.clearnlp.tokenization.english; import edu.emory.clir.clearnlp.dictionary.AbstractDTTokenizer; import edu.emory.clir.clearnlp.util.CharUtils; import edu.emory.clir.clearnlp.util.Splitter; /** * @since 3.0.0 * @author Jinho D. Choi ({@code jinho.choi@emory.edu}) */ public class ApostropheEnglishTokenizer extends AbstractDTTokenizer { private final String[] APOSTROPHE_SUFFIXES = {"d","m","s","t","z","ll","nt","re","ve"}; public String[] tokenize(String original, String lower, char[] lcs) { int i; for (String suffix : APOSTROPHE_SUFFIXES) { i = isApostropheSuffix(lower, lcs, suffix); if (i > 0) return Splitter.split(original, i); } return null; } private int isApostropheSuffix(String lower, char[] lcs, String suffix) { if (lower.endsWith(suffix)) { if (suffix.equals("t")) // n't { int i = lower.length() - suffix.length() - 2; if (0 < i && lcs[i] == 'n' && CharUtils.isApostrophe(lcs[i+1])) return i; } else { int i = lower.length() - suffix.length() - 1; if (0 < i && CharUtils.isApostrophe(lcs[i])) return (suffix.equals("s") && CharUtils.isDigit(lcs[i-1])) ? -1 : i; } } return -1; } }