/* LanguageTool, a natural language style checker
* Copyright (C) 2014 Daniel Naber (http://www.danielnaber.de)
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free Software
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
* USA
*/
package org.languagetool.dev.eval;
import com.optimaize.langdetect.ngram.NgramExtractors;
import com.optimaize.langdetect.profiles.LanguageProfile;
import com.optimaize.langdetect.profiles.LanguageProfileBuilder;
import com.optimaize.langdetect.profiles.LanguageProfileWriter;
import com.optimaize.langdetect.text.CommonTextObjectFactories;
import com.optimaize.langdetect.text.TextObject;
import com.optimaize.langdetect.text.TextObjectFactory;
import org.apache.commons.io.IOUtils;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.Reader;
import java.nio.charset.StandardCharsets;
/**
* Train the language detector for a language not known yet.
* @since 2.9
*/
final class LanguageDetectionTrainer {

  /**
   * Command-line entry point. Reads a plain text file, builds a language
   * profile for the given language code from it and writes that profile to
   * the current working directory.
   *
   * @param args exactly three values: the language code, the path of the
   *             plain text training file (read as UTF-8), and the minimal
   *             frequency handed to the profile builder (an integer)
   * @throws IOException if the training file cannot be read or the profile
   *                     cannot be written
   */
  public static void main(String[] args) throws IOException {
    if (args.length != 3) {
      System.out.println("Usage: " + LanguageDetectionTrainer.class.getName() + " <languageCode> <plainTextFile> <minimalFrequency>");
      System.exit(1);
    }
    String langCode = args[0];
    String fileName = args[1];
    int minimalFrequency;
    try {
      minimalFrequency = Integer.parseInt(args[2]);
    } catch (NumberFormatException e) {
      // Report bad input the same way as a wrong argument count instead of
      // letting a raw stack trace escape.
      System.out.println("<minimalFrequency> must be an integer, but was: '" + args[2] + "'");
      System.exit(1);
      return;  // not reached; satisfies definite assignment of minimalFrequency
    }
    String text = readText(fileName);
    TextObjectFactory textObjectFactory = CommonTextObjectFactories.forIndexingCleanText();
    TextObject inputText = textObjectFactory.create().append(text);
    LanguageProfile languageProfile = new LanguageProfileBuilder(langCode)
            .ngramExtractor(NgramExtractors.standard())
            .minimalFrequency(minimalFrequency)
            .addText(inputText)
            .build();
    File outputDir = new File(System.getProperty("user.dir"));  // current dir
    new LanguageProfileWriter().writeToDirectory(languageProfile, outputDir);
    System.out.println("Language profile written to " + new File(outputDir, langCode).getAbsolutePath());
  }

  /**
   * Reads the whole file into a string, decoding it as UTF-8.
   * An explicit charset keeps the result independent of the platform default,
   * and try-with-resources guarantees the file handle is released even if
   * reading fails.
   */
  private static String readText(String fileName) throws IOException {
    try (Reader reader = new InputStreamReader(new FileInputStream(fileName), StandardCharsets.UTF_8)) {
      return IOUtils.toString(reader);
    }
  }

}