/*
* Copyright 2010
* Ubiquitous Knowledge Processing (UKP) Lab
* Technische Universität Darmstadt
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package de.tudarmstadt.ukp.dkpro.core.norvig;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.net.URL;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import static java.lang.Character.isUpperCase;
import static java.lang.Character.toUpperCase;
import static java.lang.Double.parseDouble;
/**
 * Spelling corrector based on Norvig's algorithm.
 * <p>
 * The corrector is trained on plain text: every {@code \w+} token is lower-cased and counted.
 * A word is corrected to the most frequent known word within one edit, or, failing that, within
 * two edits. Lower-casing is done with {@link Locale#ROOT} so that the dictionary and the
 * look-ups do not depend on the JVM default locale (e.g. the Turkish dotless i).
 * <p>
 * Instances keep mutable, unsynchronized state (word counts, correction cache, current best
 * candidate) and are therefore not safe for concurrent use.
 *
 * @see <a href="http://norvig.com/spell-correct.html">Norvig's algorithm</a>
 */
public class NorvigSpellingAlgorithm
{
    private static final Pattern WORD_PATTERN = Pattern.compile("\\w+");

    /** Number of letters tried for replacements and insertions ('a' to 'z'). */
    private static final int ALPHABET_SIZE = 'z' - 'a' + 1;

    /** Occurrence count per lower-cased word seen during training. */
    private final Map<String, AtomicInteger> nWords = new HashMap<>();

    /** Lower-cased input word mapped to its (lower-cased) correction. */
    private final Map<String, String> cachedCorrections = new HashMap<>();

    /** Score of the best candidate found so far; {@code -1} means "none yet". */
    private int bestScore = -1;
    private String bestCandidate;

    /**
     * Forget the best candidate found so far.
     */
    protected void resetScore()
    {
        bestScore = -1;
        bestCandidate = null;
    }

    /**
     * Forget the best candidate and drop all cached corrections. The trained word counts are
     * kept.
     */
    public void reset()
    {
        resetScore();
        cachedCorrections.clear();
    }

    /**
     * Read words from the given reader and count their occurrences. The reader is wrapped in a
     * {@link BufferedReader} and read to the end; it is not closed by this method.
     *
     * @param aReader
     *            the reader.
     * @throws IOException
     *             if the words cannot be read.
     */
    public void train(Reader aReader)
        throws IOException
    {
        BufferedReader in = new BufferedReader(aReader);
        String line;
        while ((line = in.readLine()) != null) {
            // Locale.ROOT: with a Turkish/Azeri default locale "I" would become a dotless i,
            // which \w does not match, so words would be split into fragments.
            Matcher m = WORD_PATTERN.matcher(line.toLowerCase(Locale.ROOT));
            while (m.find()) {
                String word = m.group();
                AtomicInteger count = nWords.get(word);
                if (count == null) {
                    count = new AtomicInteger(0);
                    nWords.put(word, count);
                }
                count.incrementAndGet();
            }
        }
    }

    /**
     * Read words from the given URL and count their occurrences. The stream opened on the URL is
     * closed before this method returns.
     *
     * @param aUrl
     *            the URL to load the words from.
     * @param aEncoding
     *            the encoding.
     * @throws IOException
     *             if the words cannot be read.
     */
    public void train(URL aUrl, String aEncoding)
        throws IOException
    {
        try (InputStream is = aUrl.openStream()) {
            train(new InputStreamReader(is, aEncoding));
        }
    }

    /**
     * Get a list for all possible variants of the given word containing an insertion, deletion,
     * replacement or transposition. Only the letters 'a' to 'z' are used for insertions and
     * replacements. The list may contain duplicates and the word itself.
     *
     * @param word
     *            the word.
     * @return the list of variants.
     */
    protected List<String> edits(String word)
    {
        int length = word.length();
        // length deletes + 2 * 26 * length replaces/inserts + 26 trailing inserts
        // + (length - 1) transposes; the bound below is exact or one too large.
        List<String> candidates = new ArrayList<>((2 * ALPHABET_SIZE + 2) * length
                + ALPHABET_SIZE);

        // The order in which candidates are produced matters: consider() keeps the first
        // candidate among those with the same score.
        for (int i = 0; i < length; i++) {
            String head = word.substring(0, i);
            String tailFromI = word.substring(i);
            String tailAfterI = word.substring(i + 1);

            // deletes
            candidates.add(head + tailAfterI);
            for (char c = 'a'; c <= 'z'; c++) {
                // replaces
                candidates.add(head + c + tailAfterI);
                // inserts
                candidates.add(head + c + tailFromI);
            }
        }

        // inserts at the end
        for (char c = 'a'; c <= 'z'; c++) {
            candidates.add(word + c);
        }

        // transposes
        for (int i = 0; i < length - 1; i++) {
            candidates.add(word.substring(0, i) + word.charAt(i + 1) + word.charAt(i)
                    + word.substring(i + 2));
        }

        return candidates;
    }

    /**
     * Try to find a correction for the given word. The word may contain up to two edits. If no
     * better alternative is found, the word is returned (lower-cased, except for a leading
     * upper-case letter, which is restored). Words shorter than two characters and anything
     * {@link Double#parseDouble(String)} accepts are returned verbatim. For performance reasons
     * corrections are cached.
     *
     * @param aWord
     *            the word to correct; it is lower-cased internally, only the case of the first
     *            character is preserved in the result.
     * @return the possible correction.
     */
    public String correct(String aWord)
    {
        // Too short words and numbers cannot be corrected.
        if ((aWord.length() < 2) || isNumber(aWord)) {
            return aWord;
        }

        // Remember case
        boolean isUpper = isUpperCase(aWord.charAt(0));

        // Correct if not cached. Locale.ROOT keeps the look-up key consistent with the keys
        // produced by train(), independent of the default locale.
        String word = aWord.toLowerCase(Locale.ROOT);
        String correction = cachedCorrections.get(word);
        if (correction == null) {
            correction = getBestCandidate(word);
            cachedCorrections.put(word, correction);
        }

        // Restore case
        if (!isUpper) {
            return correction;
        }
        char[] buffer = correction.toCharArray();
        buffer[0] = toUpperCase(buffer[0]);
        return new String(buffer);
    }

    /**
     * Check whether the given word is a number, i.e. whether
     * {@link Double#parseDouble(String)} can parse it.
     *
     * @param aWord
     *            the word to check.
     * @return {@code true} if the word parses as a double.
     */
    protected boolean isNumber(String aWord)
    {
        try {
            parseDouble(aWord);
            return true;
        }
        catch (NumberFormatException nfe) {
            return false;
        }
    }

    /**
     * Try to find a correction for the given word. The word may contain up to two edits. If no
     * better alternative is found, the word is returned verbatim. Candidates one edit away are
     * always preferred over candidates two edits away, regardless of their frequency.
     *
     * @param word
     *            the word to correct (has to be lower-case)
     * @return the possible correction.
     */
    protected String getBestCandidate(String word)
    {
        // If the word is in the dictionary, it is probably correct
        if (nWords.containsKey(word)) {
            return word;
        }

        // Reset score
        resetScore();

        // Look up the potential correct words in the dictionary
        List<String> candidates1 = edits(word);
        for (String candidate : candidates1) {
            consider(candidate);
        }

        // Found possible correction for one mistake
        if (bestScore != -1) {
            return bestCandidate;
        }

        // Repeat the process for a potential second mistake
        for (String candidate1 : candidates1) {
            for (String candidate2 : edits(candidate1)) {
                consider(candidate2);
            }
        }

        return (bestScore != -1) ? bestCandidate : word;
    }

    /**
     * Consider the given candidate. If it is better than a previously found candidate, then
     * remember it, otherwise forget it. Candidates that were not seen during training are
     * ignored; on equal scores the earlier candidate is kept.
     *
     * @param candidate
     *            the candidate to consider.
     */
    protected void consider(String candidate)
    {
        AtomicInteger score = nWords.get(candidate);
        if (score == null) {
            return;
        }
        int value = score.get();
        if (value > bestScore) {
            bestScore = value;
            bestCandidate = candidate;
        }
    }
}