package edu.harvard.wcfia.yoshikoder.document.tokenizer; import java.io.File; import java.text.BreakIterator; import java.util.ArrayList; import java.util.List; import java.util.Locale; import java.util.logging.Level; import java.util.logging.Logger; import edu.harvard.wcfia.yoshikoder.util.FileUtil; /** * Breaks a text into sentence spans. Note that this will leave trailing newlines (paragraph * breaks etc.) in its current incarnation. These should be fixed. * * @author will * */ public class SentenceTokenizer { private static Logger log = Logger.getLogger("edu.harvard.wcfia.yoshikoder.document.tokenizer.SentenceTokenizer"); protected Locale locale; protected BreakIterator sentenceIterator; public SentenceTokenizer(Locale loc){ if (loc == null){ locale = Locale.getDefault(); log.info("Null handed in as Locale, using default: " + locale.toString()); } else locale = loc; sentenceIterator = BreakIterator.getSentenceInstance(locale); } public int [][] getTokenSpans(String txt){ sentenceIterator.setText(txt); List list = new ArrayList(); int start = sentenceIterator.first(); int end = sentenceIterator.next(); while (end != BreakIterator.DONE) { if (Character.isLetterOrDigit( txt.charAt(start) )) list.add(new int[]{start, end}); start = end; try { end = sentenceIterator.next(); // throws exceptions rarely } catch (Exception e) { log.log(Level.WARNING, "tokenization exception somewhere after character " + end, e); } } sentenceIterator.setText(""); // drop any document references we might be keeping return (int[][])list.toArray(new int[list.size()][2]); } public String[] getTokens(String txt){ int[][] spans = getTokenSpans(txt); String[] s = new String[spans.length]; for (int ii = 0; ii < s.length; ii++) { s[ii] = txt.substring(spans[ii][0], spans[ii][1]); } return s; } public static void main(String[] args) throws Exception { File f = new File("/Users/will/review.txt"); String txt = FileUtil.slurp(f); txt = txt.replace('\r', '\n'); WordTokenizer tok = new WordTokenizer(null); String[] spans = tok.getTokens(txt); for (int ii = 0; ii < spans.length; ii++) { System.out.println(ii + ": " + spans[ii] + "]"); } } }