package edu.fudan.nlp.cn;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import edu.fudan.util.MyStrings;
/**
 * Simple sentence splitter for Chinese text.
 *
 * <p>Text is cut after every occurrence of a sentence-ending punctuation
 * character. The set of characters is held in a static field and can be
 * replaced with {@link #setPuncs(char[])}; the setting is shared by all callers.
 *
 * @author xpqiu
 * @version 1.0
 * @since FudanNLP 1.5
 */
public class Sentenizer {

    /** Characters that terminate a sentence. */
    private static char[] puncs = new char[] { '。', '?', '!',';' };

    /**
     * Replaces the set of sentence-ending punctuation characters.
     *
     * <p>The array is copied, so later modifications of the argument by the
     * caller do not affect splitting.
     *
     * @param puncs the new sentence-ending characters; {@code null} is treated
     *              as an empty set (no character ends a sentence)
     */
    public static void setPuncs(char[] puncs) {
        Sentenizer.puncs = (puncs == null) ? new char[0] : puncs.clone();
    }

    /**
     * Splits text into sentences at the configured punctuation characters.
     *
     * <p>Each returned sentence keeps its terminating punctuation character.
     * Text following the last punctuation character is returned as a final
     * sentence. Consecutive punctuation characters each produce their own
     * one-character sentence.
     *
     * @param sent the text to split
     * @return the sentences in order of appearance; a single-element array
     *         holding {@code sent} itself when it contains no punctuation
     *         (including when it is empty); an empty array when {@code sent}
     *         is {@code null}
     */
    public static String[] split(String sent) {
        if (sent == null) {
            return new String[0];
        }

        // Read the static field once so the whole scan uses one consistent set.
        char[] marks = puncs;
        List<String> sentences = new ArrayList<String>();
        int start = 0;

        // Single left-to-right pass: cut right after every punctuation mark.
        // Scanning the text (instead of searching per mark) also means a mark
        // listed twice in the set cannot produce duplicate cut points.
        for (int i = 0; i < sent.length(); i++) {
            if (isPunctuation(marks, sent.charAt(i))) {
                sentences.add(sent.substring(start, i + 1));
                start = i + 1;
            }
        }

        // Keep the unterminated tail; if nothing was cut at all, the whole
        // input (possibly empty) is returned as the only sentence.
        if (start < sent.length() || sentences.isEmpty()) {
            sentences.add(sent.substring(start));
        }

        return sentences.toArray(new String[sentences.size()]);
    }

    /** Returns whether {@code c} is one of the characters in {@code marks}. */
    private static boolean isPunctuation(char[] marks, char c) {
        for (int i = 0; i < marks.length; i++) {
            if (marks[i] == c) {
                return true;
            }
        }
        return false;
    }
}