package com.transmem.nlp; import java.util.ArrayList; /** * Implements the ISegmenter interface to break an English sentence into separate words. * It simply breaks the sentence by spaces,punctuation characters. */ public class EnglishSegmenter implements ISegmenter { /** * Break an English sentence to a string array of words and punctuation marks with spaces removed. * The recognised tokens are words (a sequence of letters) or numbers or mixed. * Hyphenated words are separated as two words with the mark '-' reserved as one token in between. * It can be interpreted by higher-level programs for a hyphen or minus mark. * * @param sent - sentence as a string * @return array of strings of words and marks */ public String[] segment(String sent) { ArrayList<String> ar = new ArrayList<String>(); char[] chars = sent.toCharArray(); int n = chars.length; int i = 0; int x = -1; while (i < n) { char c = chars[i]; if (Character.isWhitespace(c)) { if (x < 0) { x = 0; } if (i > x) { //String s = sent.substring(x, i); //String s = new String(chars, x, i-x); //which is faster and robust? ar.add(sent.substring(x, i)); } x = i + 1; } else if (!Character.isLetterOrDigit(c)) { if (x < 0) { x = 0; } if (i > x) { ar.add(sent.substring(x, i)); } String s = String.valueOf(c); ar.add(s); x = i + 1; } i ++; } if (i > x && x >= 0) { ar.add(sent.substring(x, i)); } return (String[])ar.toArray(new String[ar.size()]); } }