/* LanguageTool, a natural language style checker
 * Copyright (C) 2015 Daniel Naber (http://www.danielnaber.de)
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
 * USA
 */
package org.languagetool.dev;

import morfologik.fsa.FSA;
import org.apache.commons.io.FileUtils;
import org.languagetool.AnalyzedTokenReadings;
import org.languagetool.JLanguageTool;
import org.languagetool.tagging.de.GermanTagger;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.ByteBuffer;
import java.nio.charset.StandardCharsets;
import java.util.*;

/**
 * One-time script: find missing "-es" forms in the German tagger dictionary,
 * e.g. Morphy knows "Antrag" and "Antrags", but not "Antrages".
 * Uses Google n-gram data as a filter, but may nonetheless create
 * forms that aren't common anymore (e.g. Verb -> Verbes).
 *
 * <p>For every candidate one line is written to standard output:
 * the "-es" form, the word without its final "s", and the third
 * '+'-separated field of the dictionary entry (presumably the POS tag),
 * separated by tabs. Progress and warnings go to standard error.
 *
 * <p>Usage: {@code MissingGenitiveFinder [ngramFile]}; without an argument
 * the built-in default path is used.
 *
 * @author Daniel Naber
 */
public final class MissingGenitiveFinder {

  private static final String DICT_FILENAME = "/de/german.dict";

  /** N-gram occurrence file used when no path is given on the command line. */
  private static final String DEFAULT_NGRAM_FILE =
      "/media/Data/google-ngram/de/1gram-aggregated/all_without_underscore";

  /** Words with one of these endings are not considered as candidates. */
  private static final String[] IRRELEVANT_ENDINGS = {
      "es", "ens", "ems", "els", "ers", "lings", "leins", "chens",
      "erns", "elns", "os", "us", "is", "as", "ols"
  };

  /** Maps a word form to the count read from the n-gram file. */
  private final Map<String, Integer> occurrences;

  private MissingGenitiveFinder() throws IOException {
    this(DEFAULT_NGRAM_FILE);
  }

  private MissingGenitiveFinder(String ngramFile) throws IOException {
    occurrences = loadOccurrences(ngramFile);
  }

  /**
   * Reads the occurrence file line by line. Each non-empty line must consist
   * of a word and an integer count separated by a single space.
   *
   * @param filename path of the occurrence file
   * @return map from word to its count
   * @throws IOException if the file cannot be read or contains a malformed line
   */
  private static Map<String, Integer> loadOccurrences(String filename) throws IOException {
    System.err.println("Loading " + filename);
    Map<String, Integer> map = new HashMap<>();
    // NOTE(review): UTF-8 is assumed for the n-gram file; the previous code used the
    // platform default charset, which made the result machine-dependent.
    try (BufferedReader reader = new BufferedReader(
        new InputStreamReader(new FileInputStream(new File(filename)), StandardCharsets.UTF_8))) {
      int lineNo = 0;
      String line;
      while ((line = reader.readLine()) != null) {
        lineNo++;
        if (line.isEmpty()) {
          continue;  // tolerate blank lines, e.g. at the end of the file
        }
        String[] parts = line.split(" ");
        if (parts.length < 2) {
          throw new IOException("Malformed line " + lineNo + " in " + filename + ": '" + line + "'");
        }
        try {
          map.put(parts[0], Integer.valueOf(parts[1]));
        } catch (NumberFormatException e) {
          throw new IOException("Invalid count in line " + lineNo + " of " + filename + ": '" + line + "'", e);
        }
      }
    }
    System.err.println("Loaded " + map.size() + " occurrence items");
    return map;
  }

  /**
   * Iterates over all entries of the German tagger dictionary and prints an
   * "-es" candidate for every genitive noun form ending in "s" whose "-es"
   * variant occurs in the n-gram data but is not yet known to the tagger.
   *
   * @throws IOException if the dictionary cannot be read or tagging fails
   */
  private void run() throws IOException {
    GermanTagger tagger = new GermanTagger();
    FSA fsa;
    // Close the resource stream once the automaton has been read.
    try (InputStream dictStream = JLanguageTool.getDataBroker().getFromResourceDirAsStream(DICT_FILENAME)) {
      fsa = FSA.read(dictStream);
    }
    for (ByteBuffer buffer : fsa) {
      byte[] sequence = new byte[buffer.remaining()];
      buffer.get(sequence);
      String entry = new String(sequence, StandardCharsets.ISO_8859_1);
      if (!isGenitiveNoun(entry)) {
        continue;
      }
      String[] parts = entry.split("\\+");
      String word = parts[0];
      if (!isRelevantWord(word) || isIgnoredElsWord(word)) {
        continue;
      }
      String esWord = toEsForm(word);
      // Cheap map lookup first, the tagger is only consulted for forms seen in the n-gram data.
      if (!occurrences.containsKey(esWord) || hasEsGenitive(tagger, word)) {
        continue;
      }
      if (parts.length < 3) {
        System.err.println("Skipping dictionary entry without third field: " + entry);
        continue;
      }
      System.out.println(esWord + "\t" + stripFinalS(word) + "\t" + parts[2]);
    }
  }

  /** Returns true for entries tagged as noun (or country name) in the genitive case. */
  private static boolean isGenitiveNoun(String entry) {
    boolean isNoun = entry.contains("+SUB:")
        || (entry.contains("+EIG:") && entry.contains("COU"));  // COU = Country
    return isNoun && entry.contains(":GEN:");
  }

  /**
   * Returns true if the word ends in "s" but in none of the endings listed in
   * {@link #IRRELEVANT_ENDINGS}.
   */
  private boolean isRelevantWord(String word) {
    if (!word.endsWith("s")) {
      return false;
    }
    for (String ending : IRRELEVANT_ENDINGS) {
      if (word.endsWith(ending)) {
        return false;
      }
    }
    return true;
  }

  /**
   * Returns true for words ending in "els" but not in "iels".
   *
   * <p>NOTE(review): as {@link #isRelevantWord(String)} already rejects every word
   * ending in "els", this can currently never be true for a word that reaches it.
   * The "iels" exemption suggests such words were meant to be processed - confirm
   * and adapt the ending list if so.
   */
  private static boolean isIgnoredElsWord(String word) {
    return word.endsWith("els") && !word.endsWith("iels");
  }

  /** Replaces a final "s" with "es"; words not ending in "s" are returned unchanged. */
  private static String toEsForm(String word) {
    return word.endsWith("s") ? stripFinalS(word) + "es" : word;
  }

  /** Removes a final "s"; words not ending in "s" are returned unchanged. */
  private static String stripFinalS(String word) {
    return word.endsWith("s") ? word.substring(0, word.length() - 1) : word;
  }

  /**
   * Checks whether the tagger already knows the "-es" variant of the given word.
   *
   * @param tagger tagger used for the lookup
   * @param word word form ending in "s"
   * @return true if any reading returned for the "-es" form is tagged
   * @throws IOException if tagging fails
   */
  private boolean hasEsGenitive(GermanTagger tagger, String word) throws IOException {
    List<AnalyzedTokenReadings> readings = tagger.tag(Collections.singletonList(toEsForm(word)));
    for (AnalyzedTokenReadings reading : readings) {
      if (reading.isTagged()) {
        return true;
      }
    }
    return false;
  }

  /**
   * Entry point.
   *
   * @param args optional: path of the n-gram occurrence file as first argument
   * @throws IOException if the occurrence file or the dictionary cannot be read
   */
  public static void main(String[] args) throws IOException {
    MissingGenitiveFinder prg = args.length > 0
        ? new MissingGenitiveFinder(args[0])
        : new MissingGenitiveFinder();
    prg.run();
  }

}