package arkref.sent;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.commons.lang.StringUtils;
import arkref.parsestuff.AlignedSub;
import arkref.parsestuff.AnalysisUtilities;
import arkref.parsestuff.U;
import com.aliasi.sentences.SentenceModel;
import com.aliasi.tokenizer.Tokenizer;
import com.aliasi.tokenizer.TokenizerFactory;
import com.aliasi.util.Strings;
/**
 * Breaks a document into sentences in a traceable way: every sentence records
 * its character offsets back into the text it was extracted from, using our
 * hacked-up version of LingPipe's sentence breaker.
 * @author brendano
 */
public class SentenceBreaker {
    static final TokenizerFactory TOKENIZER_FACTORY = MyIndoEuropeanTokenizerFactory.INSTANCE;
    public static class Sentence {
        public int charStart;
        public int charEnd;
        public String rawText;
        public String cleanText;
        public List<String> tokens;

        /** Character-level alignments, parallel to rawText: alignments[i] is
         * the index in the original document of rawText's i-th character. **/
        public int[] alignments = null;

        /** Assume charStart and charEnd are correct relative to the cleaned
         * text that 'map' indexes. Move them into the original document's
         * coordinates and fill in the per-character alignments. Note that
         * charEnd ends up as the mapped index of the sentence's last
         * character (inclusive), whereas before projection it is an
         * exclusive substring bound. **/
        public void setAlignmentProjection(int[] map) {
            alignments = new int[rawText.length()];
            for (int i = 0; i < alignments.length; i++) {
                alignments[i] = map[i + charStart];
            }
            charStart = alignments[0];
            charEnd = alignments[alignments.length - 1];
        }
    }
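    // A worked illustration of the projection (hypothetical values, not taken
    // from any real document): suppose cleanup collapsed the raw text "a,  b."
    // (6 chars) into the cleaned text "a, b." (5 chars), so the cleaned->raw
    // character map is {0, 1, 2, 4, 5}. A Sentence spanning the whole cleaned
    // text starts with charStart=0, charEnd=5 (exclusive). After
    // setAlignmentProjection(map): alignments = {0, 1, 2, 4, 5}, charStart=0,
    // and charEnd=5, which is now the raw index of the final '.' (inclusive).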
    /** Sentence-break the cleaned text, then project each sentence's offsets
     * back to the original text via the AlignedSub's character alignments. **/
    public static List<Sentence> getSentences(AlignedSub cleanText) {
        List<Sentence> sentences = getSentences(cleanText.text);
        for (Sentence s : sentences) {
            s.setAlignmentProjection(cleanText.alignments);
        }
        return sentences;
    }
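    // Minimal usage sketch (hypothetical variable names; assumes, per main()
    // below, that AnalysisUtilities.cleanupDocument returns an AlignedSub
    // whose alignments map cleaned offsets back to raw offsets):
    //
    //   String raw = U.readFile("doc.txt");
    //   AlignedSub cleaned = AnalysisUtilities.cleanupDocument(raw);
    //   for (Sentence s : SentenceBreaker.getSentences(cleaned)) {
    //       // charEnd is inclusive after projection (see note above)
    //       String origSpan = raw.substring(s.charStart, s.charEnd + 1);
    //   }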
    public static List<Sentence> getSentences(String text) {
        ArrayList<Sentence> sentences = new ArrayList<Sentence>();
        List<String> tokenList = new ArrayList<String>();
        List<String> whiteList = new ArrayList<String>();
        Tokenizer tokenizer = TOKENIZER_FACTORY.tokenizer(text.toCharArray(), 0, text.length());
        tokenizer.tokenize(tokenList, whiteList);

        // Pick the sentence model variant based on whether this document
        // reliably capitalizes sentence-initial words.
        SentenceModel sentenceModel = new MyIndoEuropeanSentenceModel(usesCapitalConvention(tokenList));

        String[] tokens = new String[tokenList.size()];
        String[] whites = new String[whiteList.size()];
        tokenList.toArray(tokens);
        whiteList.toArray(whites);

        // boundaryIndices returns, for each sentence, the index of its final token.
        int[] sentenceBoundaries = sentenceModel.boundaryIndices(tokens, whites);
        if (sentenceBoundaries.length < 1) {
            return sentences;
        }

        int charStart = 0;
        int charEnd = 0;
        int sentStartTok = 0;
        int sentEndTok = 0;
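        // LingPipe's tokenizer produces one more whitespace string than tokens:
        // text == whites[0] + tokens[0] + whites[1] + ... + tokens[n-1] + whites[n].
        // So whites[j+1] is the whitespace *after* token j, and each sentence's
        // rawText keeps its trailing whitespace. Caveat: leading whitespace
        // (whites[0]) is never counted, so the offsets below assume the cleaned
        // text does not start with whitespace.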
        for (int i = 0; i < sentenceBoundaries.length; ++i) {
            sentEndTok = sentenceBoundaries[i];
            List<String> sentToks = new ArrayList<String>();
            for (int j = sentStartTok; j <= sentEndTok; j++) {
                charEnd += tokens[j].length() + whites[j + 1].length();
                sentToks.add(tokens[j]);
            }
            Sentence s = new Sentence();
            s.charStart = charStart;
            s.charEnd = charEnd;
            s.rawText = text.substring(charStart, charEnd);
            s.cleanText = Strings.normalizeWhitespace(s.rawText);
            s.tokens = sentToks;
            sentences.add(s);
            sentStartTok = sentEndTok + 1;
            charStart = charEnd;
        }
        // TODO: text after the last boundary is currently dropped. It should
        // probably be added as a degenerate last "sentence"; if it merely
        // lacks final punctuation, it might be a real sentence.
        return sentences;
    }
    /**
     * Heuristic: does this document capitalize sentence-initial words?
     * Computes the ratio of capitalized tokens to all-punctuation tokens;
     * see notes/cap_ratio_experiment for where the 0.3 threshold comes from.
     */
    public static boolean usesCapitalConvention(List<String> documentTokens) {
        // start the counts at 0.1 as a small smoothing constant, so the ratio
        // is defined even when a document has no punctuation tokens at all
        double numCap = 0.1;
        double numPunct = 0.1;
        for (String tok : documentTokens) {
            if (Strings.allPunctuation(tok)) numPunct++;
            if (Strings.capitalized(tok.toCharArray())) numCap++;
        }
        U.pf("CAPRATIO\t%f\n", numCap / numPunct);
        return (numCap / numPunct) > 0.3;
    }
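    // Worked example with made-up counts: a document with 20 capitalized
    // tokens and 30 all-punctuation tokens scores 20.1 / 30.1 ≈ 0.67 > 0.3,
    // so usesCapitalConvention returns true. An all-lowercase document scores
    // roughly 0.1 / numPunct, well under the threshold, and
    // MyIndoEuropeanSentenceModel is constructed with false instead.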
    public static void main(String[] args) throws IOException {
        for (String arg : args) {
            if (args.length > 1) {
                U.pf("\nDOC\t%s\n", arg);
            }
            String text = U.readFile(arg);
            AlignedSub textAS = AnalysisUtilities.cleanupDocument(text);
            for (Sentence s : getSentences(textAS)) {
                // rawText might contain newlines and tabs, so escape it
                U.pf("SENT\t%d\t%d\nNEARRAW\t%s\n", s.charStart, s.charEnd, U.backslashEscape(s.rawText));
                U.pf("TOKS\t%s\n", StringUtils.join(s.tokens, " "));
                U.pf("CLEAN\t%s\n", s.cleanText);
            }
        }
    }
}