/* LanguageTool, a natural language style checker
 * Copyright (C) 2015 Daniel Naber (http://www.danielnaber.de)
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301
 * USA
 */
package org.languagetool.dev.bigdata;

import org.languagetool.languagemodel.LuceneLanguageModel;

import java.io.File;
import java.io.IOException;
import java.util.*;

/**
 * Take simple confusion set file (one set per line, separated by semicolons) and print out
 * occurrence information for all items.
 *
 * <p>For every non-blank input line, one output line is printed to standard output:
 * {@code <total> <original line> <word>:<count> ... factor:<max/min>}, where the counts
 * are obtained from {@link LuceneLanguageModel#getCount(String)} and {@code factor} is the
 * largest count divided by the smallest count of that line.
 *
 * @since 3.1
 */
final class ConfusionSetOccurrenceLookup {

  private ConfusionSetOccurrenceLookup() {
  }

  /**
   * Command line entry point.
   *
   * @param args exactly two arguments: the confusion set file and the ngram data directory;
   *             with any other number of arguments a usage message is printed and the JVM
   *             exits with status 1
   * @throws IOException if the confusion set file cannot be opened
   */
  public static void main(String[] args) throws IOException {
    if (args.length != 2) {
      System.out.println("Usage: " + ConfusionSetOccurrenceLookup.class.getName() + " <confusion-file> <ngram-data-dir>");
      System.exit(1);
    }
    // Read the input as UTF-8 instead of the platform default charset so that non-ASCII
    // words are decoded the same way on every machine.
    // NOTE(review): assumes the confusion set file is UTF-8 encoded - confirm.
    try (Scanner sc = new Scanner(new File(args[0]), "UTF-8");
         LuceneLanguageModel lm = new LuceneLanguageModel(new File(args[1]))) {
      while (sc.hasNextLine()) {
        String line = sc.nextLine();
        if (line.trim().isEmpty()) {
          // nothing to look up on a blank line
          continue;
        }
        String[] words = line.split(";\\s*");
        if (words.length == 0) {
          // the line consisted of separators only (e.g. ";"), so there is no min/max to compute
          continue;
        }
        long total = 0;
        long minCount = Long.MAX_VALUE;
        long maxCount = Long.MIN_VALUE;
        StringBuilder sb = new StringBuilder();
        for (String word : words) {
          long count = lm.getCount(word);
          total += count;
          minCount = Math.min(minCount, count);
          maxCount = Math.max(maxCount, count);
          sb.append(word).append(":").append(count).append(" ");
        }
        // Floating point division on purpose: a smallest count of 0 yields "Infinity"
        // (or "NaN" if all counts are 0) instead of an ArithmeticException.
        // double keeps large counts exact where float would round them.
        double factor = (double) maxCount / minCount;
        // The input text is passed as an argument, never as part of the format string,
        // so a '%' inside the confusion set file cannot break the formatter.
        System.out.printf(Locale.ENGLISH, "%d %s %s factor:%.1f\n", total, line, sb.toString().trim(), factor);
      }
    }
  }

}