SpellingSuggestions.java example

Explorer

Java-AI-Book-Code-master
- mr_temp
  - nlp
    - com
      - knowledgebooks
        - mapreduce
          - NameFinder.java
        - nlp
          - ExtractNames.java
          - util
            - ScoredList.java
            - Tokenizer.java
- src
  - database
    - CreateSampleDatabases.java
    - DumpMetaData.java
  - geneticalgorithm
    - Genetic.java
    - TestGenetic.java
  - markov
    - Markov.java
  - neuralnetworks
  - nlp
    - com
      - knowledgebooks
        - mapreduce
          - NameFinder.java
        - nlp
          - ASpellWrapper.java
          - AutoTagger.java
          - ComparableDocument.java
          - ExtractNames.java
          - FastTag.java
          - util
            - NameValue.java
            - NoiseWords.java
            - RunExternal.java
            - ScoredList.java
            - Tokenizer.java
    - public_domain
      - Stemmer.java
  - opencalais
    - OpenCalaisClient.java
  - powerloom
  - search
  - semanticweb
  - spelling
    - jazzy
      - SpellingJazzyTester.java
    - norvig
      - SpellingSuggestions.java
    - norvigwordpairs
      - SpellingSuggestionsWordPairs.java
  - textsearch
  - weka
    - WekaStocks.java
  - wordnet
    - WordNetTest.java

package spelling.norvig;

import nlp.com.knowledgebooks.nlp.util.Tokenizer;

import java.io.BufferedReader;
import java.io.DataInputStream;
import java.io.FileInputStream;
import java.io.InputStreamReader;
import java.util.*;

/**
 * A spelling correction suggestion utility based on Peter Norvig's
 * Python spelling program: http://norvig.com/spell-correct.html
 *
 * <p>Word frequencies are loaded once, when the class is initialized, from
 * the training file {@code /tmp/big.txt}. If that file cannot be read, an
 * error is printed to standard error and the frequency table keeps whatever
 * was loaded up to that point (possibly nothing), in which case
 * {@link #correct(String)} finds no candidates and returns its argument
 * unchanged.
 */
public class SpellingSuggestions {

	/** Location of the training text used to build the word-frequency table. */
	private static final String TRAINING_FILE = "/tmp/big.txt";

	/** Number of occurrences in the training text, keyed by token. */
	private static final Map<String, Integer> wordCounts = loadWordCounts(TRAINING_FILE);

	private SpellingSuggestions() { } // static utility class: not instantiable

	/**
	 * Generates every string that is one simple edit away from {@code word}:
	 * one character deleted, two adjacent characters swapped, one character
	 * replaced by a letter 'a'..'z', or one letter 'a'..'z' inserted.
	 * The result may contain duplicates and may contain {@code word} itself.
	 *
	 * @param word the word to edit; must not be null
	 * @return the list of edited strings, in the order described above
	 */
	private static List<String> edits(String word) {
		int wordL = word.length(), wordLm1 = wordL - 1;
		// deletions (n) + transpositions (n-1) + replacements (26n)
		// + insertions (26(n+1)) is at most 54n + 26 entries.
		List<String> possible = new ArrayList<String>(54 * wordL + 26);
		// drop a character:
		for (int i = 0; i < wordL; ++i) {
			possible.add(word.substring(0, i) + word.substring(i + 1));
		}
		// reverse order of 2 adjacent characters:
		for (int i = 0; i < wordLm1; ++i) {
			possible.add(word.substring(0, i) + word.substring(i + 1, i + 2) +
					     word.substring(i, i + 1) + word.substring(i + 2));
		}
		// replace a character in each location in the word:
		for (int i = 0; i < wordL; ++i) {
			for (char ch = 'a'; ch <= 'z'; ++ch) {
				possible.add(word.substring(0, i) + ch +
						     word.substring(i + 1));
			}
		}
		// add in a character in each location in the word
		// (including before the first and after the last character):
		for (int i = 0; i <= wordL; ++i) {
			for (char ch = 'a'; ch <= 'z'; ++ch) {
				possible.add(word.substring(0, i) + ch +
						     word.substring(i));
			}
		}
		return possible;
	}

	/**
	 * Scans {@code words} for entries present in the frequency table and
	 * returns the most frequent one seen so far.
	 *
	 * @param words     candidate spellings to look up
	 * @param bestSoFar the best known word found by earlier scans, or null if
	 *                  none has been found yet
	 * @return the known word with the highest count among {@code bestSoFar}
	 *         and {@code words}; null if there is none. When several words
	 *         share the highest count, the one encountered last wins.
	 */
	private static String mostFrequentKnown(List<String> words, String bestSoFar) {
		String best = bestSoFar;
		// Stored counts are always >= 1, so 0 means "nothing found yet".
		int bestCount = (best == null) ? 0 : wordCounts.get(best);
		for (String candidate : words) {
			Integer count = wordCounts.get(candidate);
			// ">=" (not ">") so that a later word with an equal count
			// replaces an earlier one.
			if (count != null && count >= bestCount) {
				best = candidate;
				bestCount = count;
			}
		}
		return best;
	}

	/**
	 * Suggests a correction for {@code word}.
	 *
	 * <ol>
	 *   <li>If {@code word} is in the frequency table it is returned as is.</li>
	 *   <li>Otherwise the most frequent known word one edit away is returned.</li>
	 *   <li>Otherwise the most frequent known word two edits away is returned.</li>
	 *   <li>If nothing matches, {@code word} is returned unchanged.</li>
	 * </ol>
	 *
	 * @param word the possibly misspelled word; must not be null
	 * @return the suggested spelling, or {@code word} itself when no better
	 *         candidate is known
	 */
	public static String correct(String word) {
		if (wordCounts.containsKey(word)) {
			return word;
		}
		List<String> oneEditAway = edits(word);
		String best = mostFrequentKnown(oneEditAway, null);
		if (best != null) {
			return best;
		}
		// No single edit produced a known word, so apply the edits a second
		// time to every first-level result. This is considerably more
		// expensive, which is why it is only a fallback.
		for (String edited : oneEditAway) {
			best = mostFrequentKnown(edits(edited), best);
		}
		return (best != null) ? best : word;
	}

	/**
	 * main test method
	 */
	public static void main(String[] args) {
		System.out.println(SpellingSuggestions.correct("baank"));
		System.out.println(SpellingSuggestions.correct("hown"));
		System.out.println(SpellingSuggestions.correct("watr"));
		System.out.println(SpellingSuggestions.correct("thee"));
		System.out.println(SpellingSuggestions.correct("thhe"));
		System.out.println(SpellingSuggestions.correct("smth"));
		System.out.println(SpellingSuggestions.correct("joonees"));
		System.out.println(SpellingSuggestions.correct("waateer"));
	}

	/**
	 * Builds the word-frequency table from a text file, using Peter Norvig's
	 * training file (http://www.norvig.com/spell-correct.html) by default.
	 * Each line is split with {@code Tokenizer.wordsToList} and every
	 * resulting token is counted.
	 *
	 * <p>Loading is best effort: any failure is reported on standard error
	 * and the counts gathered so far are returned, so class initialization
	 * never fails because of a missing or unreadable file.
	 *
	 * @param path location of the training text
	 * @return a map from token to number of occurrences; never null
	 */
	private static Map<String, Integer> loadWordCounts(String path) {
		Map<String, Integer> counts = new HashMap<String, Integer>();
		BufferedReader reader = null;
		try {
			reader = new BufferedReader(new InputStreamReader(new FileInputStream(path)));
			String line;
			while ((line = reader.readLine()) != null) {
				for (String word : Tokenizer.wordsToList(line)) {
					Integer seen = counts.get(word);
					counts.put(word, (seen == null) ? 1 : seen + 1);
				}
			}
		} catch (Exception e) {
			// Deliberately broad: a failure here must not abort class loading.
			System.err.println("Error: " + e.getMessage());
		} finally {
			// Close on every path so the file handle is not leaked when
			// reading or tokenizing fails part-way through.
			if (reader != null) {
				try {
					reader.close();
				} catch (java.io.IOException ignored) {
					// Nothing useful can be done if close itself fails.
				}
			}
		}
		return counts;
	}
}