package info.ephyra.nlp;
import info.ephyra.util.StringUtils;
import java.util.ArrayList;
import com.aliasi.sentences.MedlineSentenceModel;
import com.aliasi.sentences.SentenceModel;
import com.aliasi.tokenizer.IndoEuropeanTokenizerFactory;
import com.aliasi.tokenizer.Tokenizer;
import com.aliasi.tokenizer.TokenizerFactory;
/**
* <p>This class provides a common interface to the
* <a href="http://www.alias-i.com/lingpipe/">LingPipe</a> toolkit.</p>
*
* <p>It supports the following natural language processing tools:</p>
* <ul>
* <li>Tokenization</li>
* <li>Sentence detection</li>
* </ul>
*
* @author Nico Schlaefer
* @version 2006-11-25
*/
public class LingPipe {
    /**
     * Factory used to build tokenizers; <code>null</code> until
     * {@link #createTokenizer()} or {@link #createSentenceDetector()} has
     * been called.
     */
    private static TokenizerFactory tokenizerFactory;

    /**
     * Model used to locate sentence boundaries; <code>null</code> until
     * {@link #createSentenceDetector()} has been called.
     */
    private static SentenceModel sentenceModel;

    /**
     * Sets up the tokenizer factory. Calling this method again after the
     * factory exists has no effect.
     */
    public static void createTokenizer() {
        if (tokenizerFactory != null) {
            return;
        }
        tokenizerFactory = new IndoEuropeanTokenizerFactory();
    }

    /**
     * Sets up the tokenizer factory and the sentence model. Components that
     * already exist are left untouched.
     */
    public static void createSentenceDetector() {
        // the sentence detector works on tokens, so the tokenizer comes first
        createTokenizer();
        if (sentenceModel != null) {
            return;
        }
        sentenceModel = new MedlineSentenceModel();
    }

    /**
     * Breaks a text into tokens.
     *
     * @param text text to tokenize
     * @return the tokens in order of appearance, or <code>null</code> if the
     *         tokenizer has not been created yet
     */
    public static String[] tokenize(String text) {
        if (tokenizerFactory == null) {
            return null;
        }

        ArrayList<String> tokens = new ArrayList<String>();
        ArrayList<String> whitespaces = new ArrayList<String>();
        runTokenizer(text, tokens, whitespaces);

        // the whitespace list is only needed to satisfy the tokenizer call
        return tokens.toArray(new String[tokens.size()]);
    }

    /**
     * Breaks a text into tokens and joins them again, separated by spaces.
     *
     * @param text text to tokenize
     * @return the space-delimited tokens, or <code>null</code> if the
     *         tokenizer has not been created yet
     */
    public static String tokenizeWithSpaces(String text) {
        String[] tokens = tokenize(text);
        if (tokens == null) {
            return null;
        }
        return StringUtils.concatWithSpaces(tokens);
    }

    /**
     * Breaks a text into sentences.
     *
     * <p>Each sentence is rebuilt from its tokens, so any run of whitespace
     * between two tokens of the same sentence is reduced to a single space.
     * Tokens that come after the last boundary reported by the sentence
     * model are not part of any returned sentence.</p>
     *
     * @param text sequence of sentences
     * @return the sentences found in the text, or <code>null</code> if the
     *         sentence detector has not been created yet
     */
    public static String[] sentDetect(String text) {
        // only the sentence model is checked: the one method of this class
        // that assigns it also makes sure the tokenizer factory exists
        if (sentenceModel == null) {
            return null;
        }

        // split the text into tokens and whitespaces
        ArrayList<String> tokenList = new ArrayList<String>();
        ArrayList<String> whiteList = new ArrayList<String>();
        runTokenizer(text, tokenList, whiteList);
        String[] tokens = tokenList.toArray(new String[tokenList.size()]);
        String[] whites = whiteList.toArray(new String[whiteList.size()]);

        // every boundary is used as the index of the last token of a sentence
        int[] boundaries = sentenceModel.boundaryIndices(tokens, whites);
        String[] sentences = new String[boundaries.length];
        int firstToken = 0;
        for (int s = 0; s < boundaries.length; s++) {
            int lastToken = boundaries[s];
            sentences[s] = joinTokens(tokens, whites, firstToken, lastToken);
            firstToken = lastToken + 1;
        }
        return sentences;
    }

    /**
     * Runs a tokenizer from the shared factory over the whole text and
     * appends the results to the given lists.
     *
     * @param text text to tokenize
     * @param tokens list that receives the tokens
     * @param whitespaces list that receives the whitespaces
     */
    private static void runTokenizer(String text, ArrayList<String> tokens,
            ArrayList<String> whitespaces) {
        char[] chars = text.toCharArray();
        Tokenizer tokenizer =
            tokenizerFactory.tokenizer(chars, 0, text.length());
        tokenizer.tokenize(tokens, whitespaces);
    }

    /**
     * Rebuilds the text of one sentence from a range of tokens.
     *
     * @param tokens all tokens of the text
     * @param whites all whitespaces of the text; the entry at index
     *        <code>t + 1</code> is consulted after token <code>t</code>
     *        (presumably the whitespace that follows that token)
     * @param firstToken index of the first token of the sentence
     * @param lastToken index of the last token of the sentence (inclusive)
     * @return the tokens of the range, with a single space wherever the
     *         consulted whitespace entry is not empty
     */
    private static String joinTokens(String[] tokens, String[] whites,
            int firstToken, int lastToken) {
        StringBuilder sentence = new StringBuilder();
        for (int t = firstToken; t <= lastToken; t++) {
            sentence.append(tokens[t]);
            boolean gapFollows = whites[t + 1].length() > 0;
            // no separator after the token that ends the sentence
            if (gapFollows && t < lastToken) {
                sentence.append(" ");
            }
        }
        return sentence.toString();
    }
}