/*
* Copyright 2014 Alpha Cephei Inc.
* All Rights Reserved. Use is subject to license terms.
*
* See the file "license.terms" for information on usage and
* redistribution of this file, and for a DISCLAIMER OF ALL
* WARRANTIES.
*/
package edu.cmu.sphinx.alignment;
import java.util.Arrays;
import java.util.List;
import java.util.Locale;
import java.util.regex.Pattern;
/**
 * Text tokenizer that normalizes typographic punctuation, lower-cases the
 * text and cuts it into chunks at sentence and clause punctuation.
 */
public class SimpleTokenizer implements TextTokenizer {

    /** Symbols that are each replaced by a single space. */
    private static final Pattern SYMBOLS = Pattern.compile("[/_*%]");

    /** Punctuation characters at which the text is cut into chunks. */
    private static final Pattern DELIMITERS = Pattern.compile("[.,?:!;()]");

    /**
     * Normalizes {@code text} and splits it at punctuation.
     *
     * <p>Processing order:
     * <ol>
     * <li>the right single quotation mark (U+2019) becomes an ASCII
     * apostrophe; other typographic quotes, the ASCII double quote,
     * guillemets, the em dash and the ellipsis become a space; the en dash
     * becomes an ASCII hyphen;</li>
     * <li>a hyphen surrounded by spaces ({@code " - "}) collapses to one
     * space;</li>
     * <li>each of {@code / _ * %} becomes a space;</li>
     * <li>the text is lower-cased using {@link Locale#ROOT}, so the result
     * does not depend on the JVM default locale;</li>
     * <li>the text is split at each of {@code . , ? : ! ; ( )}.</li>
     * </ol>
     *
     * <p>The chunks are not trimmed and may contain inner whitespace. Empty
     * chunks between adjacent delimiters are kept, trailing empty chunks are
     * dropped.
     *
     * @param text the text to process; must not be {@code null}
     * @return a fixed-size list of the chunks, in order of appearance
     * @throws NullPointerException if {@code text} is {@code null}
     */
    public List<String> expand(String text) {
        // Non-ASCII characters are written as Unicode escapes so that the
        // compiled result does not depend on the source file encoding.
        text = text.replace('\u2019', '\''); // right single quote -> apostrophe
        text = text.replace('\u2018', ' ');  // left single quote
        text = text.replace('\u201D', ' ');  // right double quote
        text = text.replace('\u201C', ' ');  // left double quote
        text = text.replace('"', ' ');       // ASCII double quote
        text = text.replace('\u00BB', ' ');  // right guillemet
        text = text.replace('\u00AB', ' ');  // left guillemet
        text = text.replace('\u2013', '-');  // en dash -> hyphen
        text = text.replace('\u2014', ' ');  // em dash
        text = text.replace('\u2026', ' ');  // horizontal ellipsis

        // Must run after the en dash mapping above so that a spaced en dash
        // is removed as well; hyphens inside words are left untouched.
        text = text.replace(" - ", " ");

        text = SYMBOLS.matcher(text).replaceAll(" ");

        // Locale.ROOT keeps the mapping stable across machines; with the
        // default locale a Turkish JVM would turn 'I' into a dotless 'i'.
        text = text.toLowerCase(Locale.ROOT);

        return Arrays.asList(DELIMITERS.split(text));
    }
}