/* LanguageTool, a natural language style checker
* Copyright (C) 2013 Daniel Naber (http://www.danielnaber.de)
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free Software
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
* USA
*/
package org.languagetool.tools;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.util.Arrays;
import java.util.List;
import java.util.Scanner;
import org.apache.commons.cli.CommandLine;
/**
* Create a Morfologik spelling binary dictionary from plain text data.
*/
final class SpellDictionaryBuilder extends DictionaryBuilder {
SpellDictionaryBuilder(File infoFile) throws IOException {
super(infoFile);
}
public static void main(String[] args) throws Exception {
BuilderOptions builderOptions = new BuilderOptions();
builderOptions.addOption(BuilderOptions.INPUT_OPTION, true,
"plain text dictionary file, e.g. created from a Hunspell dictionary by 'unmunch'", true);
builderOptions.addOption(BuilderOptions.INFO_OPTION, true,
BuilderOptions.INFO_HELP, true);
builderOptions.addOption(BuilderOptions.FREQ_OPTION, true,
BuilderOptions.FREQ_HELP, false);
CommandLine cmdLine = builderOptions.parseArguments(args, SpellDictionaryBuilder.class);
String plainTextFile = cmdLine.getOptionValue(BuilderOptions.INPUT_OPTION);
String infoFile = cmdLine.getOptionValue(BuilderOptions.INFO_OPTION);
SpellDictionaryBuilder builder = new SpellDictionaryBuilder(new File(infoFile));
builder.setOutputFilename(cmdLine.getOptionValue(BuilderOptions.OUTPUT_OPTION));
File inputFile = new File(plainTextFile);
if (cmdLine.hasOption(BuilderOptions.FREQ_OPTION)) {
builder.readFreqList(new File(cmdLine.getOptionValue(BuilderOptions.FREQ_OPTION)));
inputFile = builder.addFreqData(inputFile, true);
}
builder.build(inputFile);
}
private File build(File plainTextDictFile) throws Exception {
File tempFile = null;
try {
tempFile = tokenizeInput(plainTextDictFile);
return buildFSA(tempFile);
} finally {
if (tempFile != null) {
tempFile.delete();
}
}
}
private File tokenizeInput(File plainTextDictFile) throws IOException {
// Tokenizer wordTokenizer = language.getWordTokenizer();
String encoding = getOption("fsa.dict.encoding");
String separatorChar = hasOption("fsa.dict.separator") ? getOption("fsa.dict.separator") : "";
File tempFile = File.createTempFile(SpellDictionaryBuilder.class.getSimpleName(), ".txt");
try (Scanner scanner = new Scanner(plainTextDictFile, encoding)) {
try (Writer out = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(tempFile), encoding))) {
while (scanner.hasNextLine()) {
String line = scanner.nextLine();
int sepPos = separatorChar.isEmpty() ? -1 : line.indexOf(separatorChar);
String occurrences = sepPos != -1 ? line.substring(sepPos + separatorChar.length()) : "";
String lineWithoutOcc = sepPos != -1 ? line.substring(0, sepPos) : line;
// List<String> tokens = wordTokenizer.tokenize(lineWithoutOcc);
List<String> tokens = Arrays.asList(lineWithoutOcc);
for (String token : tokens) {
if (token.length() > 0) {
out.write(token);
if (sepPos != -1) {
out.write(separatorChar);
if (tokens.size() == 1) {
out.write(occurrences);
} else {
// TODO: as the word occurrence data from
// https://github.com/mozilla-b2g/gaia/tree/master/apps/keyboard/js/imes/latin/dictionaries
// has already been assigned in a previous step, we now cannot just use
// that value after having changed the tokenization...
out.write("A"); // assume least frequent
}
}
out.write("\n");
}
}
}
}
}
return tempFile;
}
}