/* LanguageTool, a natural language style checker * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de) * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 * USA */ package org.languagetool.dev; import com.google.common.base.Charsets; import morfologik.fsa.FSA; import org.languagetool.JLanguageTool; import org.languagetool.tools.StringTools; import java.io.IOException; import java.nio.ByteBuffer; import java.nio.file.FileSystems; import java.nio.file.Files; import java.util.*; /** * Export German nouns, to be used by jWordSplitter. * * @author Daniel Naber */ public class ExportGermanNouns { private static final String DICT_FILENAME = "/de/german.dict"; private static final String ADDED_DICT_FILENAME = "languagetool-language-modules/de/src/main/resources/org/languagetool/resource/de/added.txt"; private ExportGermanNouns() { } private List<String> getSortedWords() throws IOException { Set<String> words1 = getBinaryDictWords(); Set<String> words2 = getAddedDictWords(); List<String> sortedWords = new ArrayList<>(); sortedWords.addAll(words1); sortedWords.addAll(words2); Collections.sort(sortedWords); return sortedWords; } private Set<String> getBinaryDictWords() throws IOException { final FSA fsa = FSA.read(JLanguageTool.getDataBroker().getFromResourceDirAsStream(DICT_FILENAME)); final Set<String> set = new HashSet<>(); for (ByteBuffer buffer : fsa) { final byte [] sequence = new byte [buffer.remaining()]; buffer.get(sequence); final String output = new String(sequence, "iso-8859-1"); if (isRelevantNoun(output)) { final String[] parts = output.split("\\+"); final String term = parts[0].toLowerCase(); set.add(term); } } return set; } private Set<String> getAddedDictWords() throws IOException { final Set<String> set = new HashSet<>(); List<String> lines = Files.readAllLines(FileSystems.getDefault().getPath(ADDED_DICT_FILENAME), Charsets.UTF_8); for (String line : lines) { if (isRelevantNoun(line)) { final String[] parts = line.split("\t"); final String term = parts[0].toLowerCase(); set.add(term); } } return set; } private boolean isRelevantNoun(String output) { boolean isNoun = output.contains("SUB:") || (output.contains("EIG:") && output.contains("COU")); return isNoun && !output.contains(":ADJ") && !StringTools.isAllUppercase(output); } public static void main(String[] args) throws IOException { ExportGermanNouns prg = new ExportGermanNouns(); List<String> words = prg.getSortedWords(); System.out.println("# DO NOT MODIFY - automatically exported"); System.out.println("# Exporting class: " + ExportGermanNouns.class.getName()); System.out.println("# Export date: " + new Date()); System.out.println("# LanguageTool: " + JLanguageTool.VERSION + " (" + JLanguageTool.BUILD_DATE + ")"); System.out.println("# Potential German compound parts."); System.out.println("# Data from Morphy (http://www.wolfganglezius.de/doku.php?id=cl:morphy)"); System.out.println("# with extensions by LanguageTool (https://languagetool.org)"); System.out.println("# License: Creative Commons Attribution-Share Alike 4.0, http://creativecommons.org/licenses/by-sa/4.0/"); for (String word : words) { System.out.println(word); } //System.err.println("Done. Printed " + words.size() + " words."); } }