/* LanguageTool, a natural language style checker
 * Copyright (C) 2016 Daniel Naber (http://www.danielnaber.de)
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301
 * USA
 */
package org.languagetool.dev.bigdata;

import org.languagetool.AnalyzedToken;
import org.languagetool.language.GermanyGerman;
import org.languagetool.languagemodel.LuceneLanguageModel;
import org.languagetool.synthesis.Synthesizer;

import java.io.File;
import java.io.IOException;
import java.nio.file.Files;
import java.util.List;

import static java.util.Arrays.asList;

/**
 * Hackish attempt to have a corpus-based guess on whether a German verb is reflexive.
 *
 * <p>For every lemma read from a file, n-gram counts of "reflexive pronoun + verb form"
 * and "verb form + reflexive pronoun" are looked up in an n-gram index and printed as a
 * percentage of the lemma's own occurrence count.
 */
final class GermanReflexiveVerbGuesser {

  /** POS tag attached to each lemma before it is handed to the synthesizer. */
  private static final String INFINITIVE_TAG = "VER:INF:NON";
  /** POS tag regex used to synthesize the first person singular present form. */
  private static final String FIRST_PERSON_SINGULAR_TAG = "VER:1:SIN:PRÄ.*";
  /** POS tag regex used to synthesize the third person singular present form. */
  private static final String THIRD_PERSON_SINGULAR_TAG = "VER:3:SIN:PRÄ.*";

  private final Synthesizer synthesizer;

  private GermanReflexiveVerbGuesser() {
    synthesizer = new GermanyGerman().getSynthesizer();
  }

  /**
   * Prints one statistics row per lemma found in {@code lemmaListFile}.
   *
   * @param indexTopDir directory passed to the {@link LuceneLanguageModel} constructor
   * @param lemmaListFile text file with one verb lemma per line; blank lines are skipped
   * @throws IOException if the lemma file or the index cannot be read, or synthesis fails
   */
  private void run(File indexTopDir, File lemmaListFile) throws IOException {
    List<String> lines = Files.readAllLines(lemmaListFile.toPath());
    System.out.println("Durchschnitt Prozent | Anzahl Lemma | mich/uns/euch ... | ... mich/uns/euch | Lemma");
    try (LuceneLanguageModel lm = new LuceneLanguageModel(indexTopDir)) {
      for (String line : lines) {
        String lemma = line.trim();
        if (lemma.isEmpty()) {
          // a blank line is not a lemma; looking it up would only produce a meaningless row
          continue;
        }
        // build the token once and reuse it for both synthesizer calls
        AnalyzedToken infinitive = new AnalyzedToken(lemma, INFINITIVE_TAG, lemma);
        String firstPsSin = firstOrNull(synthesizer.synthesize(infinitive, FIRST_PERSON_SINGULAR_TAG, true));
        String thirdPsSin = firstOrNull(synthesizer.synthesize(infinitive, THIRD_PERSON_SINGULAR_TAG, true));
        // "pronoun + verb" hits, minus those preceded by a preposition ("für mich reklamieren")
        long reflexiveCount1 = count1(lm, lemma, firstPsSin, thirdPsSin)
            - counterExamples("für", lm, lemma, firstPsSin, thirdPsSin)
            - counterExamples("vor", lm, lemma, firstPsSin, thirdPsSin);
        long reflexiveCount2 = count2(lm, lemma, firstPsSin, thirdPsSin);
        long lemmaCount = lm.getCount(lemma);
        float factor1 = percent(reflexiveCount1, lemmaCount);
        float factor2 = percent(reflexiveCount2, lemmaCount);
        float avgFactor = (factor1 + factor2) / 2;
        System.out.printf("%.2f %d %.2f%% %.2f%% %s\n", avgFactor, lemmaCount, factor1, factor2, lemma);
      }
    }
  }

  /**
   * Sums the counts of "reflexive pronoun + verb form" bigrams.
   *
   * @param firstPsSin first person singular form, or {@code null} if none could be synthesized
   * @param thirdPsSin third person singular form, or {@code null} if none could be synthesized
   */
  private long count1(LuceneLanguageModel lm, String lemma, String firstPsSin, String thirdPsSin) {
    return ngramCount(lm, "mich", firstPsSin)  // e.g. "wenn ich mich schäme"
        + ngramCount(lm, "mich", lemma)        // e.g. "ich muss mich schämen"
        + ngramCount(lm, "sich", thirdPsSin)
        + ngramCount(lm, "uns", lemma)
        + ngramCount(lm, "euch", lemma)
        + ngramCount(lm, "sich", lemma);
  }

  /**
   * Sums the counts of the same bigrams as {@link #count1} when they are preceded by
   * {@code term}; the caller subtracts these from the {@link #count1} result.
   *
   * @param term the word that must precede the pronoun (the caller passes "für" and "vor")
   * @param firstPsSin first person singular form, or {@code null} if none could be synthesized
   * @param thirdPsSin third person singular form, or {@code null} if none could be synthesized
   */
  private long counterExamples(String term, LuceneLanguageModel lm, String lemma, String firstPsSin, String thirdPsSin) {
    return ngramCount(lm, term, "mich", firstPsSin)  // e.g. "für mich reklamiere"
        + ngramCount(lm, term, "mich", lemma)        // e.g. "... für mich reklamieren"
        + ngramCount(lm, term, "sich", thirdPsSin)
        + ngramCount(lm, term, "uns", lemma)
        + ngramCount(lm, term, "euch", lemma)
        + ngramCount(lm, term, "sich", lemma);
  }

  /**
   * Sums the counts of "verb form + reflexive pronoun" bigrams.
   *
   * @param firstPsSin first person singular form, or {@code null} if none could be synthesized
   * @param thirdPsSin third person singular form, or {@code null} if none could be synthesized
   */
  private long count2(LuceneLanguageModel lm, String lemma, String firstPsSin, String thirdPsSin) {
    return ngramCount(lm, firstPsSin, "mich")  // e.g. "schäme mich"
        + ngramCount(lm, thirdPsSin, "sich")
        + ngramCount(lm, lemma, "uns")
        + ngramCount(lm, lemma, "sich");
  }

  /**
   * Looks up the count of the n-gram made of {@code tokens}. Returns 0 without querying the
   * index when any token is {@code null}, i.e. when a verb form could not be synthesized, so
   * that a {@code null} never ends up inside the token list handed to the language model.
   */
  private static long ngramCount(LuceneLanguageModel lm, String... tokens) {
    for (String token : tokens) {
      if (token == null) {
        return 0;
      }
    }
    return lm.getCount(asList(tokens));
  }

  /** Returns the first element of {@code forms}, or {@code null} if there is none. */
  private static String firstOrNull(String[] forms) {
    return forms != null && forms.length > 0 ? forms[0] : null;
  }

  /**
   * Returns {@code count} as a percentage of {@code total}, or 0 when {@code total} is 0
   * (a plain float division would yield NaN or Infinity in that case).
   */
  private static float percent(long count, long total) {
    if (total == 0) {
      return 0.0f;
    }
    return ((float) count / total) * 100.0f;
  }

  /**
   * Command line entry point.
   *
   * @param args exactly two values: the n-gram index directory and the verb lemma file
   * @throws IOException if reading the lemma file or the index fails
   */
  public static void main(String[] args) throws IOException {
    if (args.length != 2) {
      System.out.println("Usage: " + GermanReflexiveVerbGuesser.class.getName() + " <ngramDataIndex> <verbLemmaFile>");
      System.exit(1);
    }
    String indexTopDir = args[0];
    String lemmaListFile = args[1];
    new GermanReflexiveVerbGuesser().run(new File(indexTopDir), new File(lemmaListFile));
  }

}