/* LanguageTool, a natural language style checker
* Copyright (C) 2016 Daniel Naber (http://www.danielnaber.de)
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free Software
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
* USA
*/
package org.languagetool.dev;
import org.languagetool.rules.spelling.hunspell.Hunspell;
import org.languagetool.rules.spelling.morfologik.MorfologikSpeller;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.nio.charset.CharacterCodingException;
import java.util.List;
import java.util.Scanner;
import java.util.regex.Pattern;
/**
 * A hacky attempt to find rare words which are considered correct
 * by the spell checker, but which might actually be too rare,
 * so a rule with a warning might be advisable.
 *
 * <p>The input is a text file with one entry per line, each entry being a word
 * and its occurrence count separated by a tab. For every all-lowercase ASCII word
 * whose count does not exceed the given limit and which the Morfologik speller
 * accepts, the Hunspell suggestions are looked up and, if they look useful,
 * printed to standard output together with the word and its count.
 *
 * @since 3.6
 */
final class RareWordsFinder {

  /** Class path location of the Morfologik dictionary used to decide whether a word is accepted. */
  private static final String DICT_IN_CLASS_PATH = "/en/hunspell/en_US.dict";

  /**
   * Words worth looking at: ASCII letters only, no uppercase letter anywhere
   * (so no proper nouns, acronyms or camelCase tokens).
   */
  private static final Pattern LOWERCASE_WORD = Pattern.compile("[a-z]+");

  /**
   * Encoding used to read the word file, so that the result does not depend on the
   * platform default charset. NOTE(review): assumes the word file is UTF-8 - confirm.
   */
  private static final String INPUT_ENCODING = "UTF-8";

  /** A progress message is printed every time this many lines have been read. */
  private static final int PROGRESS_INTERVAL = 1_000_000;

  private final Hunspell.Dictionary hunspellDict;

  /**
   * @param hunspellBase path of the hunspell dictionary without suffix, e.g. {@code /path/to/en_US}
   */
  private RareWordsFinder(String hunspellBase) throws IOException {
    Hunspell hunspell = Hunspell.getInstance();
    hunspellDict = hunspell.getDictionary(hunspellBase);
  }

  /**
   * Scans the word file and prints every rare-but-accepted word for which
   * Hunspell offers a potentially useful alternative.
   *
   * @param input file with one {@code word<TAB>count} entry per line
   * @param limit only words with this many or fewer occurrences are considered
   */
  private void run(File input, int limit) throws FileNotFoundException, CharacterCodingException {
    MorfologikSpeller speller = new MorfologikSpeller(DICT_IN_CLASS_PATH, 1);
    int lineCount = 0;
    int wordCount = 0;
    try (Scanner s = new Scanner(input, INPUT_ENCODING)) {
      while (s.hasNextLine()) {
        String line = s.nextLine();
        String[] parts = line.split("\t");
        Long count = parseCount(parts);
        if (count == null) {
          // A single broken line must not abort a scan over a potentially huge file.
          System.err.println("Ignoring malformed line " + (lineCount + 1) + ": '" + line + "'");
        } else if (count <= limit
                && LOWERCASE_WORD.matcher(parts[0]).matches()
                && !speller.isMisspelled(parts[0])) {
          String word = parts[0];
          // Hunspell is used for the suggestions because the Morfologik speller's
          // getSuggestions() seems to work only for words that are actually misspellings.
          List<String> suggestions = hunspellDict.suggest(word);
          suggestions.remove(word);
          if (suggestionsMightBeUseful(word, suggestions)) {
            System.out.println(word + "\t" + count + " -> " + String.join(", ", suggestions));
            wordCount++;
          }
        }
        lineCount++;
        if (lineCount % PROGRESS_INTERVAL == 0) {
          System.out.println("lineCount: " + lineCount + ", words found: " + wordCount);
        }
      }
      System.out.println("Done. lineCount: " + lineCount + ", words found: " + wordCount);
    }
  }

  /**
   * Extracts the occurrence count from the tab-separated columns of an input line.
   *
   * @param parts the columns of one line; the count is expected in the second column
   * @return the count, or {@code null} if there is no second column or it is not a number
   */
  private static Long parseCount(String[] parts) {
    if (parts.length < 2) {
      return null;
    }
    try {
      return Long.valueOf(parts[1].trim());
    } catch (NumberFormatException ignored) {
      // Not a number: the caller reports the whole line as malformed.
      return null;
    }
  }

  /**
   * Decides whether the suggestions for a word are worth reporting, looking only
   * at the first suggestion.
   *
   * @param word the word the suggestions were requested for
   * @param suggestions the suggestions for {@code word}, with {@code word} itself already removed
   * @return {@code true} if there is at least one suggestion and the first one contains no
   *         space, is not {@code word} with an "s" appended, and is not {@code word} with
   *         its trailing "s" removed
   */
  private boolean suggestionsMightBeUseful(String word, List<String> suggestions) {
    if (suggestions.isEmpty()) {
      return false;
    }
    String first = suggestions.get(0);
    String withoutTrailingS = word.endsWith("s") ? word.substring(0, word.length() - 1) : word;
    return !first.contains(" ")
        && !first.equals(word + "s")
        && !first.equals(withoutTrailingS);
  }

  /**
   * Command line entry point, expecting {@code <wordFile> <hunspellBase> <limit>}.
   * Prints a usage message and exits with status 1 if the number of arguments is wrong.
   */
  public static void main(String[] args) throws IOException {
    if (args.length != 3) {
      System.out.println("Usage: " + RareWordsFinder.class.getSimpleName() + " <wordFile> <hunspellBase> <limit>");
      System.out.println(" <wordFile> is a word file with occurrence counts, separated by tabs");
      System.out.println(" <hunspellBase> is the hunspell file without suffix, e.g. '/path/to/en_US'");
      System.out.println(" <limit> only words with this many or less occurrences are considered");
      System.exit(1);
    }
    // Validate the cheap arguments first so that a typo fails before the dictionary is loaded.
    int limit = Integer.parseInt(args[2]);
    File input = new File(args[0]);
    RareWordsFinder finder = new RareWordsFinder(args[1]);
    finder.run(input, limit);
  }

}