/* LanguageTool, a natural language style checker
* Copyright (C) 2013 Daniel Naber (http://www.danielnaber.de)
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free Software
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
* USA
*/
package org.languagetool.tools;
import morfologik.tools.DictDecompile;
import morfologik.tools.FSADecompile;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.util.Arrays;
import java.util.Scanner;
import java.util.regex.Pattern;
import org.apache.commons.cli.CommandLine;
/**
* Print the contents of a Morfologik binary dictionary to STDOUT.
*/
final class DictionaryExporter extends DictionaryBuilder {
protected DictionaryExporter(File infoFile) throws IOException {
super(infoFile);
}
public static void main(String[] args) throws Exception {
BuilderOptions builderOptions = new BuilderOptions();
builderOptions.addOption(BuilderOptions.INPUT_OPTION, true,
"binary Morfologik dictionary file (.dict)", true);
builderOptions.addOption(BuilderOptions.INFO_OPTION, true,
BuilderOptions.INFO_HELP, true);
CommandLine cmdLine = builderOptions.parseArguments(args, DictionaryExporter.class);
File binaryDictFile = new File(cmdLine.getOptionValue(BuilderOptions.INPUT_OPTION));
File infoFile = new File(cmdLine.getOptionValue(BuilderOptions.INFO_OPTION));
DictionaryExporter builder = new DictionaryExporter(infoFile);
builder.setOutputFilename(cmdLine.getOptionValue(BuilderOptions.OUTPUT_OPTION));
builder.build(binaryDictFile);
}
private void build(File binaryDictFile) throws RuntimeException, IOException {
String inputPath = binaryDictFile.toString();
File tmpOutputFile = File.createTempFile(
DictionaryExporter.class.getSimpleName() + "_separator", ".txt");
if (inputPath.contains("hunspell") || inputPath.contains("spelling")) {
String[] buildOptions = {
"--exit", "false",
"-i", binaryDictFile.toString(),
"-o", tmpOutputFile.toString()
};
System.out.println("Running Morfologik FSADecompile.main with these options: " + Arrays.toString(buildOptions));
FSADecompile.main(buildOptions);
} else {
String[] buildOptions = {"--exit", "false",
"-i", binaryDictFile.toString(),
"-o", tmpOutputFile.toString()
};
System.out.println("Running Morfologik DictDecompile.main with these options: " + Arrays.toString(buildOptions));
DictDecompile.main(buildOptions);
}
outputSeparatorToTab(tmpOutputFile);
System.out.println("Done. The dictionary export has been written to " + getOutputFilename());
}
protected void outputSeparatorToTab(File inputFile) throws RuntimeException, IOException {
File outputFile = new File(getOutputFilename());
String separator = getOption("fsa.dict.separator");
if (separator == null || separator.trim().isEmpty()) {
throw new IOException(
"A separator character (fsa.dict.separator) must be defined in the dictionary info file.");
}
boolean hasFrequency = isOptionTrue("fsa.dict.frequency-included");
String encoding = getOption("fsa.dict.encoding");
try (Scanner scanner = new Scanner(inputFile, encoding);
Writer out = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(outputFile), encoding))) {
while (scanner.hasNextLine()) {
String line = scanner.nextLine();
String[] parts = line.split(Pattern.quote(separator));
if (parts.length == 3) {
if (hasFrequency) { // remove frequency data in the last byte
parts[2] = parts[2].substring(0, parts[2].length() - 1);
}
out.write(parts[1] + "\t" + parts[0] + "\t" + parts[2] + "\n");
} else if (parts.length == 2) {
if (hasFrequency) {
out.write(parts[1] + "\n");
}
out.write(parts[1] + "\t" + parts[0] + "\n");
} else if (parts.length == 1) {
out.write(parts[0]);
} else {
System.err
.println("Invalid input, expected one, two or three columns separated with "
+ separator + " in " + inputFile + ": " + line + " => ignoring");
}
}
}
}
}