// Copyright 2015 Thomas Müller // This file is part of MarMoT, which is licensed under GPLv3. package marmot.tokenize.openlp; import java.io.Writer; import java.util.List; import marmot.tokenize.preprocess.Pair; import marmot.tokenize.rules.RuleProvider; import marmot.tokenize.rules.RulebasedTransformator; public class OpenNlpConverter { private RulebasedTransformator untok_transformator_; private RulebasedTransformator tok_transformator_; public OpenNlpConverter(RuleProvider provider) { if (provider == null) { tok_transformator_ = null; untok_transformator_ = null; } else { tok_transformator_ = provider.getTokTransformator(); untok_transformator_ = provider.getUnTokTransformator(); } } public void convert(Iterable<Pair> pairs, Writer writer, int verbose) { Aligner a = new LevenshteinAligner(); int total = 0; int error = 0; for (Pair pair : pairs) { total ++; String tokenized = pair.tokenized; String untokenized = pair.untokenized; if (tok_transformator_ != null) { tokenized = tok_transformator_.applyRules(tokenized); } if (untok_transformator_ != null) { untokenized = untok_transformator_.applyRules(untokenized); } try { List<Aligner.Pair> alignment = a.align(tokenized, untokenized).pairs; if(alignment == null) { throw new RuntimeException(); // java style goto } writer.write(insertSplit(untokenized, alignment)); writer.write("\n"); // if(verbose == 1 || verbose == 2) { // System.err.println(tokenized); // System.err.println(untokenized); // } } catch (Exception e) { // catches unforeseen alignment errors as well error++; if(verbose == 2 || verbose == 3) { if(!e.getClass().toString().split("\\.")[2].equals("RuntimeException")) { System.err.println("GRAVE ERROR!"); } System.err.println("<Tok> " + tokenized + "</Tok>"); System.err.println("<UnT> " + untokenized + "</UnT>" ); } } } if(verbose > 0) { System.err.format("Conversion Error rate: %d / %d = %g\n", error, total, error * 100. / total); } } private String insertSplit(String untokenized, List<Aligner.Pair> alignment){ StringBuilder sb = new StringBuilder(); int index = 0; for(Aligner.Pair p : alignment) { if(p.b == -1){ sb.append("<SPLIT>"); } else { sb.append(untokenized.charAt(index)); index ++; } } return sb.toString(); } }