package arkref.sent;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.commons.lang.StringUtils;

import arkref.parsestuff.AlignedSub;
import arkref.parsestuff.AnalysisUtilities;
import arkref.parsestuff.U;

import com.aliasi.sentences.SentenceModel;
import com.aliasi.tokenizer.Tokenizer;
import com.aliasi.tokenizer.TokenizerFactory;
import com.aliasi.util.Strings;

/**
 * Breaks a document into sentences in a very re-traceable manner, using our hacked-up
 * version of LingPipe's sentence breaker.
 * @author brendano
 */
public class SentenceBreaker {

    static final TokenizerFactory TOKENIZER_FACTORY = MyIndoEuropeanTokenizerFactory.INSTANCE;

    public static class Sentence {
        public int charStart;
        public int charEnd;
        public String rawText;
        public String cleanText;
        public List<String> tokens;

        /** character-level alignments, parallel to rawText **/
        public int[] alignments = null;

        /**
         * Assumes charStart and charEnd are correct relative to map;
         * moves them and fills in the individual character alignments.
         **/
        public void setAlignmentProjection(int[] map) {
            alignments = new int[rawText.length()];
            for (int i = 0; i < alignments.length; i++) {
                alignments[i] = map[i + charStart];
            }
            charStart = alignments[0];
            charEnd = alignments[alignments.length - 1];
        }
    }

    /** also does the projections back to the original text **/
    public static List<Sentence> getSentences(AlignedSub cleanText) {
        List<Sentence> sentences = getSentences(cleanText.text);
        for (Sentence s : sentences) {
            s.setAlignmentProjection(cleanText.alignments);
        }
        return sentences;
    }

    public static List<Sentence> getSentences(String text) {
        ArrayList<Sentence> sentences = new ArrayList<Sentence>();
        List<String> tokenList = new ArrayList<String>();
        List<String> whiteList = new ArrayList<String>();
        Tokenizer tokenizer = TOKENIZER_FACTORY.tokenizer(text.toCharArray(), 0, text.length());
        tokenizer.tokenize(tokenList, whiteList);

        SentenceModel sentenceModel = new MyIndoEuropeanSentenceModel(usesCapitalConvention(tokenList));

        String[] tokens = new String[tokenList.size()];
        String[] whites = new String[whiteList.size()];
        tokenList.toArray(tokens);
        whiteList.toArray(whites);

        int[] sentenceBoundaries = sentenceModel.boundaryIndices(tokens, whites);
        if (sentenceBoundaries.length < 1) {
            return sentences;
        }

        int charStart = 0;
        int charEnd = 0;
        int sentStartTok = 0;
        int sentEndTok = 0;
        for (int i = 0; i < sentenceBoundaries.length; ++i) {
            sentEndTok = sentenceBoundaries[i];
            List<String> sentToks = new ArrayList<String>();
            // advance charEnd past each token and the whitespace that follows it
            for (int j = sentStartTok; j <= sentEndTok; j++) {
                // U.pf("adding [%s] size %d and %d (ws [%s])\n", tokens[j], tokens[j].length(), tokens[j].length()+whites[j+1].length(), U.backslashEscape(whites[j+1]));
                charEnd += tokens[j].length() + whites[j + 1].length();
                sentToks.add(tokens[j]);
            }
            Sentence s = new Sentence();
            s.charStart = charStart;
            s.charEnd = charEnd;
            s.rawText = text.substring(charStart, charEnd);
            s.cleanText = Strings.normalizeWhitespace(s.rawText);
            s.tokens = sentToks;
            sentences.add(s);
            sentStartTok = sentEndTok + 1;
            charStart = charEnd;
            // charStart += whites[sentEndTok+1].length();
        }
        // Sentence last = sentences.get(sentences.size()-1);
        // String rest = text.substring(last.charEnd, text.length());
        // TODO should add a degenerate last "sentence" for any trailing text;
        // even if it's missing end punctuation, it might be a real sentence.
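        // A possible sketch of that TODO (an untested assumption, not original
        // arkref behavior): wrap any leftover trailing characters in one final
        // Sentence so no input text is silently dropped.
        // if (charEnd < text.length()) {
        //     Sentence rest = new Sentence();
        //     rest.charStart = charEnd;
        //     rest.charEnd = text.length();
        //     rest.rawText = text.substring(rest.charStart, rest.charEnd);
        //     rest.cleanText = Strings.normalizeWhitespace(rest.rawText);
        //     rest.tokens = new ArrayList<String>(tokenList.subList(sentStartTok, tokenList.size()));
        //     if (rest.cleanText.length() > 0) sentences.add(rest);
        // }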
        return sentences;
    }

    /** see notes/cap_ratio_experiment */
    public static boolean usesCapitalConvention(List<String> documentTokens) {
        // counts start at 0.1 as add-a-little smoothing, so the ratio is defined
        // even when the document has no punctuation tokens
        double numCap = 0.1;
        double numPunct = 0.1;
        for (String tok : documentTokens) {
            if (Strings.allPunctuation(tok)) numPunct++;
            if (Strings.capitalized(tok.toCharArray())) numCap++;
        }
        U.pf("CAPRATIO\t%f\n", numCap / numPunct);
        return (numCap / numPunct) > 0.3;
    }

    public static void main(String[] args) throws IOException {
        for (String arg : args) {
            if (args.length > 1) U.pf("\nDOC\t%s\n", arg);
            String text = U.readFile(arg);
            AlignedSub textAS = AnalysisUtilities.cleanupDocument(text);
            for (Sentence s : getSentences(textAS)) {
                // rawText might have newlines, tabs
                U.pf("SENT\t%d\t%d\nNEARRAW\t%s\n", s.charStart, s.charEnd, U.backslashEscape(s.rawText));
                U.pf("TOKS\t%s\n", StringUtils.join(s.tokens, " "));
                U.pf("CLEAN\t%s\n", s.cleanText);
            }
        }
    }
}
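
// Usage note: running this class directly dumps the detected sentences, e.g.
//   java arkref.sent.SentenceBreaker mydoc.txt
// prints SENT (char offsets), NEARRAW (backslash-escaped raw text), TOKS, and
// CLEAN lines per sentence, preceded by a DOC line when several files are given
// ("mydoc.txt" here is just a placeholder filename).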