package clear.helper;

import clear.dep.DepNode;
import clear.dep.DepTree;
import clear.morph.MorphEnLib;

import java.io.FileInputStream;
import java.util.ArrayList;
import java.util.List;
import java.util.StringTokenizer;
import java.util.regex.Pattern;

import opennlp.tools.tokenize.TokenizerME;
import opennlp.tools.tokenize.TokenizerModel;

/**
 * Wraps an OpenNLP {@link TokenizerME} to turn raw text into a {@link DepTree}, and provides a
 * static helper that splits tokens around {@code '-'} and {@code '/'}.
 */
public class Tokenizer {
    /** Matches a non-empty run of digits; compiled once instead of on every loop iteration. */
    private static final Pattern DIGITS = Pattern.compile("\\d+");

    /** Underlying tokenizer; {@code null} when the model could not be loaded. */
    private final TokenizerME me_tokenizer;

    /**
     * Loads the tokenizer model from {@code modelFile}.
     *
     * <p>Loading is best-effort: on failure the stack trace is printed and the instance is still
     * constructed, but {@link #tokenize(String)} will then throw.
     *
     * @param modelFile path of the OpenNLP tokenizer model file
     */
    public Tokenizer(String modelFile) {
        TokenizerME loaded = null;

        // try-with-resources closes the model stream on both success and failure.
        try (FileInputStream in = new FileInputStream(modelFile)) {
            loaded = new TokenizerME(new TokenizerModel(in));
        } catch (Exception e) {
            e.printStackTrace();
        }

        me_tokenizer = loaded;
    }

    /**
     * Tokenizes {@code line} and stores each token in a new tree as a node whose {@code form} is
     * the token and whose {@code id} is its 1-based position.
     *
     * @param line text to tokenize
     * @return a tree holding one node per token, in token order
     * @throws IllegalStateException if the model failed to load in the constructor
     */
    public DepTree tokenize(String line) {
        if (me_tokenizer == null) {
            throw new IllegalStateException("Tokenizer model was not loaded; cannot tokenize");
        }

        String[] tokens = me_tokenizer.tokenize(line);
        DepTree tree = new DepTree();

        for (int i = 0; i < tokens.length; i++) {
            DepNode node = new DepNode();
            node.id = i + 1;
            node.form = tokens[i];
            tree.add(node);
        }

        return tree;
    }

    /**
     * Splits every token around {@code '-'} and {@code '/'} (keeping the separators as their own
     * pieces), then re-joins a separator with its neighbours in two cases:
     * <ul>
     *   <li>the piece before the separator is reported as punctuation by
     *       {@code MorphEnLib.isPunctuation}: the separator is appended to the last output
     *       piece;</li>
     *   <li>the pieces immediately before and after the separator both consist only of digits:
     *       the separator and the following piece are appended to the last output piece
     *       (e.g. {@code 1}, {@code -}, {@code 2} become {@code 1-2}).</li>
     * </ul>
     * Pieces are taken from the flattened sequence of all input tokens, so neighbours may come
     * from adjacent input tokens.
     *
     * @param tokens tokens to split; must not be {@code null} or contain {@code null}
     * @return the resulting pieces, in order
     */
    public static String[] hyphenate(String[] tokens) {
        // Pass 1: split on the separators, returning the separators themselves as pieces.
        List<String> pieces = new ArrayList<>();

        for (String token : tokens) {
            StringTokenizer splitter = new StringTokenizer(token, "-/", true);

            while (splitter.hasMoreTokens()) {
                pieces.add(splitter.nextToken());
            }
        }

        // Pass 2: glue separators back where the rules above apply.
        int size = pieces.size();
        List<String> merged = new ArrayList<>(size);

        for (int i = 0; i < size; i++) {
            String curr = pieces.get(i);

            // i > 0 guarantees 'merged' is non-empty: index 0 is always added, and every later
            // iteration either adds a piece or extends the last one.
            if (i > 0 && (curr.equals("-") || curr.equals("/"))) {
                String prev = pieces.get(i - 1);
                int last = merged.size() - 1;

                if (MorphEnLib.isPunctuation(prev)) {
                    merged.set(last, merged.get(last) + curr);
                    continue;
                }

                if (i + 1 < size && DIGITS.matcher(prev).matches()) {
                    String next = pieces.get(i + 1);

                    if (DIGITS.matcher(next).matches()) {
                        merged.set(last, merged.get(last) + curr + next);
                        i++; // the following piece has been consumed
                        continue;
                    }
                }
            }

            merged.add(curr);
        }

        return merged.toArray(new String[0]);
    }
}