package com.cognitionis.nlp_segmentation; import java.io.File; import java.util.logging.Level; import java.util.logging.Logger; /** * * @author hector */ public class SentSplit { public static void sentSplitAndTokenize(File in_file, File out_file){ try { // Create a new tokenizer (default) and tokenize with sentence split Tokenizer_PTB_Rulebased toki=new Tokenizer_PTB_Rulebased(); toki.tokenize(in_file, out_file); } catch (Exception ex) { Logger.getLogger(SentSplit.class.getName()).log(Level.SEVERE, null, ex); } } /* public static void printEachForward(BreakIterator boundary, String source) { int start = boundary.first(); for (int end = boundary.next(); end != BreakIterator.DONE; start = end, end = boundary.next()) { System.out.println(source.substring(start, end)); } } public static void printFirst(BreakIterator boundary, String source) { int start = boundary.first(); int end = boundary.next(); System.out.println(source.substring(start, end)); } public static void printLast(BreakIterator boundary, String source) { int end = boundary.last(); int start = boundary.previous(); System.out.println(source.substring(start, end)); } case SEGMENT_SENTENCE_PRUEBAS: for (NLPFile nlpfile : nlp_files) { if (!nlpfile.getClass().getSimpleName().equals("PlainFile")) { throw new Exception("TIMEE requires PlainFile files as input. Found: " + nlpfile.getClass().getSimpleName()); } String stringToExamine = FileUtils.readFileAsString(nlpfile.getFile().getCanonicalPath(), null); //print each word in order BreakIterator boundary = BreakIterator.getWordInstance(); boundary.setText(stringToExamine); printEachForward(boundary, stringToExamine); System.out.println("---------sent-------" + Arrays.toString(BreakIterator.getAvailableLocales())); //print each sentence in reverse order boundary = BreakIterator.getSentenceInstance(new Locale(lang)); boundary.setText(stringToExamine); printEachForward(boundary, stringToExamine); printFirst(boundary, stringToExamine); printLast(boundary, stringToExamine); BreakIterator iterator = BreakIterator.getSentenceInstance(Locale.US); String source = "This is a test. This is a T.L.A. test. Now with a Dr. in it."; iterator.setText(source); int start = iterator.first(); for (int end = iterator.next(); end != BreakIterator.DONE; start = end, end = iterator.next()) { System.out.println(source.substring(start, end)); } } break; */ }