/** * Copyright (C) 2012 cogroo <cogroo@cogroo.org> * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.cogroo.tools.sentdetect; import java.util.ArrayList; import java.util.Collections; import java.util.List; import java.util.Set; import opennlp.tools.sentdetect.SDContextGenerator; import opennlp.tools.util.StringUtil; /** * Generate event contexts for maxent decisions for sentence detection. * */ public class PortugueseSDContextGenerator implements SDContextGenerator { /** * String buffer for generating features. */ protected StringBuffer buf; /** * List for holding features as they are generated. */ protected List<String> collectFeats; private Set<String> inducedAbbreviations; private char[] eosCharacters; /** * Creates a new <code>SDContextGenerator</code> instance with no induced * abbreviations. * * @param eosCharacters */ public PortugueseSDContextGenerator(char[] eosCharacters) { this(Collections.<String> emptySet(), eosCharacters); } /** * Creates a new <code>SDContextGenerator</code> instance which uses the set * of induced abbreviations. * * @param inducedAbbreviations * a <code>Set</code> of Strings representing induced abbreviations * in the training data. Example: "Mr." * * @param eosCharacters */ public PortugueseSDContextGenerator(Set<String> inducedAbbreviations, char[] eosCharacters) { this.inducedAbbreviations = inducedAbbreviations; this.eosCharacters = eosCharacters; buf = new StringBuffer(); collectFeats = new ArrayList<String>(); } /* * (non-Javadoc) * * @see * opennlp.tools.sentdetect.SDContextGenerator#getContext(java.lang.StringBuffer * , int) */ public String[] getContext(CharSequence sb, int position) { /** * String preceding the eos character in the eos token. */ String prefix; /** * Space delimited token preceding token containing eos character. */ String previous; /** * String following the eos character in the eos token. */ String suffix; /** * Space delimited token following token containing eos character. */ String next; int lastIndex = sb.length() - 1; { // compute space previous and space next features. if (position > 0 && StringUtil.isWhitespace(sb.charAt(position - 1))) collectFeats.add("sp"); if (position < lastIndex && StringUtil.isWhitespace(sb.charAt(position + 1))) collectFeats.add("sn"); collectFeats.add("eos=" + sb.charAt(position)); } int prefixStart = previousSpaceIndex(sb, position); int c = position; { // /assign prefix, stop if you run into a period though otherwise stop at // space while (--c > prefixStart) { for (int eci = 0, ecl = eosCharacters.length; eci < ecl; eci++) { if (sb.charAt(c) == eosCharacters[eci]) { prefixStart = c; c++; // this gets us out of while loop. break; } } } prefix = new StringBuffer(sb.subSequence(prefixStart, position)) .toString().trim(); } int prevStart = previousSpaceIndex(sb, prefixStart); previous = new StringBuffer(sb.subSequence(prevStart, prefixStart)) .toString().trim(); int suffixEnd = nextSpaceIndex(sb, position, lastIndex); { c = position; while (++c < suffixEnd) { for (int eci = 0, ecl = eosCharacters.length; eci < ecl; eci++) { if (sb.charAt(c) == eosCharacters[eci]) { suffixEnd = c; c--; // this gets us out of while loop. break; } } } } int nextEnd = nextSpaceIndex(sb, suffixEnd + 1, lastIndex + 1); if (position == lastIndex) { suffix = ""; next = ""; } else { suffix = new StringBuilder(sb.subSequence(position + 1, suffixEnd)) .toString().trim(); next = new StringBuilder(sb.subSequence(suffixEnd + 1, nextEnd)) .toString().trim(); } collectFeatures(prefix, suffix, previous, next, sb.charAt(position)); int sentEnd = Math.max(position + 1, suffixEnd); collectFeats.addAll(getSentenceContext(sb.subSequence(prefixStart, sentEnd) .toString(), position - prefixStart)); String[] context = new String[collectFeats.size()]; context = collectFeats.toArray(context); collectFeats.clear(); return context; } /** * Determines some of the features for the sentence detector and adds them to * list features. * * @param prefix * String preceeding the eos character in the eos token. * @param suffix * String following the eos character in the eos token. * @param previous * Space delimited token preceeding token containing eos character. * @param next * Space delimited token following token containsing eos character. */ protected void collectFeatures(String prefix, String suffix, String previous, String next, char eosChar) { buf.append("x="); buf.append(prefix); collectFeats.add(buf.toString()); buf.setLength(0); if (!prefix.equals("")) { collectFeats.add(Integer.toString(prefix.length())); if (isFirstUpper(prefix)) { collectFeats.add("xcap"); } if (inducedAbbreviations.contains(prefix + eosChar)) { collectFeats.add("xabbrev"); } char c = prefix.charAt(0); if (prefix.length() == 1 && Character.isLetter(c) && Character.isUpperCase(c) && eosChar == '.') { // looks like name abb collectFeats.add("xnabb"); } } buf.append("v="); buf.append(previous); collectFeats.add(buf.toString()); buf.setLength(0); if (!previous.equals("")) { if (isFirstUpper(previous)) { collectFeats.add("vcap"); } if (inducedAbbreviations.contains(previous)) { collectFeats.add("vabbrev"); } } buf.append("s="); buf.append(suffix); collectFeats.add(buf.toString()); buf.setLength(0); if (!suffix.equals("")) { if (isFirstUpper(suffix)) { collectFeats.add("scap"); } if (inducedAbbreviations.contains(suffix)) { collectFeats.add("sabbrev"); } } buf.append("n="); buf.append(next); collectFeats.add(buf.toString()); buf.setLength(0); if (!next.equals("")) { if (isFirstUpper(next)) { collectFeats.add("ncap"); } if (inducedAbbreviations.contains(next)) { collectFeats.add("nabbrev"); } } } private static final boolean isFirstUpper(String s) { return Character.isUpperCase(s.charAt(0)); } /** * Finds the index of the nearest space before a specified index which is not * itself preceded by a space. * * @param sb * The string buffer which contains the text being examined. * @param seek * The index to begin searching from. * @return The index which contains the nearest space. */ private static final int previousSpaceIndex(CharSequence sb, int seek) { seek--; while (seek > 0 && !StringUtil.isWhitespace(sb.charAt(seek))) { seek--; } if (seek > 0 && StringUtil.isWhitespace(sb.charAt(seek))) { while (seek > 0 && StringUtil.isWhitespace(sb.charAt(seek - 1))) seek--; return seek; } return 0; } /** * Finds the index of the nearest space after a specified index. * * @param sb * The string buffer which contains the text being examined. * @param seek * The index to begin searching from. * @param lastIndex * The highest index of the StringBuffer sb. * @return The index which contains the nearest space. */ private static final int nextSpaceIndex(CharSequence sb, int seek, int lastIndex) { seek++; char c; while (seek < lastIndex) { c = sb.charAt(seek); if (StringUtil.isWhitespace(c)) { while (sb.length() > seek + 1 && StringUtil.isWhitespace(sb.charAt(seek + 1))) seek++; return seek; } seek++; } return lastIndex; } public List<String> getSentenceContext(String sentence, int index) { List<String> preds = new ArrayList<String>(); if (index > 0) { addCharPreds("p1", sentence.charAt(index - 1), preds); if (index > 1) { addCharPreds("p2", sentence.charAt(index - 2), preds); preds.add("p21=" + sentence.charAt(index - 2) + sentence.charAt(index - 1)); } else { preds.add("p2=bok"); } preds.add("p1f1=" + sentence.charAt(index - 1) + sentence.charAt(index)); } else { preds.add("p1=bok"); } // addCharPreds("f1", sentence.charAt(index), preds); if (index + 1 < sentence.length()) { addCharPreds("f2", sentence.charAt(index + 1), preds); preds.add("f12=" + sentence.charAt(index) + sentence.charAt(index + 1)); } else { preds.add("f2=bok"); } if (sentence.charAt(0) == '&' && sentence.charAt(sentence.length() - 1) == ';') { preds.add("cc");// character code } return preds; } /** * Helper function for getContext. */ private void addCharPreds(String key, char c, List<String> preds) { preds.add(key + "=" + c); if (Character.isLetter(c)) { preds.add(key + "_alpha"); if (Character.isUpperCase(c)) { preds.add(key + "_caps"); } } else if (Character.isDigit(c)) { preds.add(key + "_num"); } else if (StringUtil.isWhitespace(c)) { preds.add(key + "_ws"); } else { if (c == '.' || c == '?' || c == '!') { preds.add(key + "_eos"); } else if (c == ',' || c == ';' || c == ':') { preds.add(key + "_reos"); } else if (c == '`' || c == '"' || c == '\'') { preds.add(key + "_quote"); } else if (c == '[' || c == '{' || c == '(') { preds.add(key + "_lp"); } else if (c == ']' || c == '}' || c == ')') { preds.add(key + "_rp"); } } } }