package com.knowledgebooks.nlp.util;
import java.util.*;
import java.util.ArrayList;
/**
* Utilities for finding sentence breaks in documents.
*/
/**
* Copyright Mark Watson 2008-2010. All Rights Reserved.
* License: LGPL version 3 (http://www.gnu.org/licenses/lgpl-3.0.txt)
*/
/**
 * A tokenized document with pre-computed sentence boundaries.
 *
 * <p>Sentences end at the tokens {@code "."}, {@code "!"} and {@code "?"}; a
 * {@code "."} that looks like part of an abbreviation (for example the period in
 * "Procter & Gamble Co. saves ...") is not treated as a sentence end. Any tokens
 * left after the last terminator form a final, unterminated sentence.
 */
public class Document {

  /** Tokens that are glued to the preceding token: no space is emitted before them. */
  private static final Set<String> NO_SPACE_BEFORE = new HashSet<String>(
      Arrays.asList("nbsp", "t", "s", "'", ",", ".", "!", "?"));

  /** Index of the first token of each sentence; parallel to {@link #endSentenceBoundary}. */
  public int[] startSentenceBoundary = new int[0];

  /** Index (inclusive) of the last token of each sentence; parallel to {@link #startSentenceBoundary}. */
  public int[] endSentenceBoundary = new int[0];

  /** The tokens of this document, in order. This is the list handed to the constructor, not a copy. */
  private List<String> tokens = new ArrayList<String>(0);

  /**
   * @param words a string containing plain text
   */
  public Document(String words) {
    List<String> tokens = Tokenizer.wordsToList(words);
    init(tokens);
  }

  /**
   * @param words a list of string tokens; {@code null} is treated as an empty document
   */
  public Document(List<String> words) {
    init(words);
  }

  /**
   * @return a multi-line dump of this document, one line per sentence prefixed by its index
   */
  @Override
  public String toString() {
    StringBuilder sb = new StringBuilder("[Document " + this.hashCode() + ":\n");
    for (int i = 0, size = getNumSentences(); i < size; i++) {
      sb.append(" ").append(i).append(": ").append(getSentence(i)).append("\n");
    }
    sb.append("]\n");
    return sb.toString();
  }

  /**
   * @return a list of string tokens in this document
   */
  public List<String> getTokens() {
    return tokens;
  }

  /**
   * Stores the tokens and pre-calculates the sentence boundaries.
   *
   * @param words the document tokens; {@code null} is treated as an empty list
   */
  private void init(List<String> words) {
    if (words == null) {
      words = new ArrayList<String>(0);
    }
    this.tokens = words;

    List<IPair> sentenceBoundaries = new ArrayList<IPair>();
    final int count = words.size();
    int start = 0;
    for (int i = 0; i < count; i++) {
      String w = words.get(i);
      boolean terminator = w.equals("!") || w.equals("?")
          || (w.equals(".") && !isAbbreviationPeriod(words, i));
      if (terminator) {
        sentenceBoundaries.add(new IPair(start, i));
        start = i + 1;
      }
    }
    // Tokens left after the last terminator (or a document with no terminator at
    // all) form a final sentence. Nothing is added when the text ends exactly on
    // a terminator, so no empty trailing range is ever recorded.
    if (start < count) {
      sentenceBoundaries.add(new IPair(start, count - 1));
    }

    int size = sentenceBoundaries.size();
    startSentenceBoundary = new int[size];
    endSentenceBoundary = new int[size];
    for (int i = 0; i < size; i++) {
      IPair ip = sentenceBoundaries.get(i);
      startSentenceBoundary[i] = ip.getFirst();
      endSentenceBoundary[i] = ip.getSecond();
    }
  }

  /**
   * Heuristic for abbreviations: decides whether the {@code "."} at index {@code i}
   * should NOT end a sentence (e.g. the period after "Co" in
   * "Procter & Gamble Co. saves $300 million annually").
   *
   * <p>The rules only apply when the period is neither the first nor the last token,
   * the preceding token has 1 to 4 characters and the following token is non-empty.
   *
   * @param words the document tokens
   * @param i     index of a {@code "."} token
   * @return {@code true} if the period is judged to belong to an abbreviation
   */
  private static boolean isAbbreviationPeriod(List<String> words, int i) {
    if (i <= 0 || i >= words.size() - 1) {
      return false;
    }
    String prev = words.get(i - 1);
    String next = words.get(i + 1);
    if (prev.length() == 0 || prev.length() >= 5 || next.length() == 0) {
      return false;
    }
    boolean prevCapitalized = Character.isUpperCase(prev.charAt(0));
    char nextFirst = next.charAt(0);

    // Short capitalized word followed by a lower-case word: "Co. saves".
    if (prevCapitalized && Character.isLowerCase(nextFirst)) {
      return true;
    }
    // Single capital letter: an initial such as "J.".
    if (prevCapitalized && prev.length() == 1) {
      return true;
    }
    // Punctuation that continues the sentence directly after the period.
    if (nextFirst == ',' || nextFirst == ';') {
      return true;
    }
    // A second period followed by a comma: ". . ,".
    if (nextFirst == '.' && i < words.size() - 2) {
      String afterNext = words.get(i + 2);
      if (afterNext.length() > 0 && afterNext.charAt(0) == ',') {
        return true;
      }
    }
    return false;
  }

  /**
   * @return the number of tokens in this document
   */
  public int getNumWords() {
    return tokens.size();
  }

  /**
   * @return the number of sentences found in this document (0 for an empty document)
   */
  public int getNumSentences() {
    return startSentenceBoundary.length;
  }

  /**
   * @param wordIndex index of a token in this document
   * @return the token at that index, or an empty string if the index is out of range
   */
  public String getWord(int wordIndex) {
    if (wordIndex < 0 || wordIndex >= tokens.size()) return "";
    return tokens.get(wordIndex);
  }

  /**
   * Finds the sentence that contains a given token.
   *
   * @param wordIndex index of a token in this document
   * @return the (first token, last token) index pair of the containing sentence; if no
   *         sentence contains the index, the boundary of the first sentence is returned
   *         as an error value; {@code null} if the document has no sentences
   */
  public IPair getSentenceBoundaryFromWordIndex(int wordIndex) {
    if (startSentenceBoundary == null || startSentenceBoundary.length == 0) return null;
    for (int i = 0, size = startSentenceBoundary.length; i < size; i++) {
      if (wordIndex >= startSentenceBoundary[i] && wordIndex <= endSentenceBoundary[i]) {
        return new IPair(startSentenceBoundary[i], endSentenceBoundary[i]);
      }
    }
    // the following is, really, an error return:
    return new IPair(startSentenceBoundary[0], endSentenceBoundary[0]);
  }

  /**
   * @param sentenceIndex index of a sentence, from 0 to {@code getNumSentences() - 1}
   * @return the (first token, last token) index pair of that sentence, or {@code null}
   *         if the boundary array has been set to {@code null}
   * @throws ArrayIndexOutOfBoundsException if {@code sentenceIndex} is out of range
   */
  public IPair getSentenceBoundary(int sentenceIndex) {
    if (startSentenceBoundary == null) return null;
    return new IPair(startSentenceBoundary[sentenceIndex], endSentenceBoundary[sentenceIndex]);
  }

  /**
   * Rebuilds the text of one sentence from its tokens. Tokens are separated by a
   * single space, except that no space is placed before punctuation and
   * contraction fragments (see {@code NO_SPACE_BEFORE}). A {@code "nbsp"} token is
   * rendered as {@code "nbsp;"}; the stored token list is not modified, so repeated
   * calls return the same string.
   *
   * @param index sentence index in document
   * @return a string containing the specified sentence, or an empty string if the
   *         index is out of range
   */
  public String getSentence(int index) {
    if (index < 0 || index >= startSentenceBoundary.length) return ""; // error/bogus return
    StringBuilder sb = new StringBuilder();
    int first = startSentenceBoundary[index];
    int last = endSentenceBoundary[index];
    for (int i = first; i <= last; i++) {
      String token = tokens.get(i);
      sb.append(token.equals("nbsp") ? "nbsp;" : token);
      if (i < last && !NO_SPACE_BEFORE.contains(tokens.get(i + 1))) {
        sb.append(" ");
      }
    }
    return sb.toString();
  }
}