/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.nutch.spell;
import org.apache.lucene.analysis.*;
import org.apache.lucene.document.*;
import org.apache.lucene.index.*;
import org.apache.lucene.search.*;
import org.apache.lucene.store.*;
import java.io.*;
import java.text.*;
import java.util.*;
/**
* Do spelling correction based on ngram frequency of terms in an index.
*
* Developed based on <a
* href="http://marc.theaimsgroup.com/?l=lucene-user&m=109474652805339&w=2">this
* message</a> in the lucene-user list.
*
* <p>
* There are two parts to this algorithm. First a ngram lookup table is formed
* for all terms in an index. Then suggested spelling corrections can be done
* based on this table.
* <p>
* The "lookup table" is actually another Lucene index. It is built by going
* through all terms in your original index and storing the term in a Document
* with all ngrams that make it up. Ngrams of length 3 and 4 are suggested.
* <p>
*
* In addition, the prefix and suffix ngrams are stored, in case you want to use
* the heuristic that people usually know the first few characters of a word.
*
* <p>
* The entry's boost is set by default to log(word_freq)/log(num_docs).
*
* <p>
*
* For a word like "kings" a {@link Document} with the following fields is made
* in the ngram index:
*
* <pre>
* word:kings
* gram3:kin
* gram3:ing
* gram3:ngs
* gram4:king
* gram4:ings
* start3:kin
* start4:king
* end3:ngs
* end4:ings
*
* boost: log(freq('kings'))/log(num_docs).
* </pre>
*
*
* When a lookup is done a query is formed with all ngrams in the misspelled
* word.
*
* <p>
* For a word like <code>"kingz"</code> a query is formed like this.
*
* Query: <br>
* <code>
* gram3:kin gram3:ing gram3:ngz gram4:king gram4:ingz start3:kin^B1 end3:ngz^B2 start4:king^B1 end4:ingz^B2
* </code>
* <br>
*
* Above B1 and B2 are the prefix and suffix boosts. The prefix boost should
* probably be >= 2.0 and the suffix boost should probably be just a little
* above 1.
*
* <p>
* <b>To build</b> the ngram index based on the "contents" field in an existing
* index 'orig_index' you run the main() driver like this:<br>
* <code>
* java org.apache.nutch.spell.NGramSpeller -f contents -i orig_index -o ngram_index
* </code>
*
* <p>
* Once you build an index you can <b>perform spelling corrections using</b>
* {@link #suggestUsingNGrams suggestUsingNGrams(...)}.
*
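* <p>
* A minimal lookup sketch (the index path, misspelled word, and parameter
* values below are illustrative placeholders, not project defaults; the call
* declares <code>throws Throwable</code>):
*
* <pre>
* Searcher searcher = new IndexSearcher("ngram_index"); // path to the ngram index
* List details = new ArrayList();                       // optional per-match details
* String[] suggestions = NGramSpeller.suggestUsingNGrams(searcher, "kingz",
*     3, 4,       // min/max ngram lengths
*     5,          // max results
*     2.0f, 1.0f, // prefix and suffix boosts
*     0.0f,       // transposition boost (0 disables)
*     5,          // max Levenshtein distance
*     details, true);
* searcher.close();
* </pre>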
*
* <p>
*
* To play around with the code against an index of approximately 100k
* javadoc-generated web pages (circa Sept. 2004), go here: <a
* href='http://www.searchmorph.com/kat/spell.jsp'>http://www.searchmorph.com/kat/spell.jsp</a>.
*
* <p>
* Of interest might be the <a
* href="http://secondstring.sourceforge.net/">secondstring</a> string matching
* package and <a
* href="http://specialist.nlm.nih.gov/nls/gspell/doc/apiDoc/overview-summary.html">gspell</a>.
*
* @author <a href="mailto:dave@tropo.com?subject=NGramSpeller">David
* Spencer</a>
*
* Slightly modified from the original version for use in the Nutch project.
*
*/
public final class NGramSpeller {
/**
* Field name for each word in the ngram index.
*/
public static final String F_WORD = "word";
/**
* Frequency, for the popularity cutoff option which says to only return
* suggestions that occur more frequently than the misspelled word.
*/
public static final String F_FREQ = "freq";
/**
* Field name for simple transpositions of each word, stored so they can be
* matched (and boosted) at lookup time.
*/
public static final String F_TRANSPOSITION = "transposition";
/**
* Shorthand for System.out, used for status and debug output.
*/
private static final PrintStream o = System.out;
/**
* Number formatter used when printing progress counts.
*/
private static final NumberFormat nf = NumberFormat.getInstance();
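/**
* The query built by the most recent call to
* {@link #suggestUsingNGrams suggestUsingNGrams(...)}, exposed for debugging
* and display.
*/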
public static Query lastQuery;
/**
* Utility class; the constructor is private to prevent instantiation.
*/
private NGramSpeller() {
}
/**
* Main driver, used to build an index. You probably want to invoke it like this:
* <br>
* <code>
* java org.apache.nutch.spell.NGramSpeller -f contents -i orig_index -o ngram_index
* </code>
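* <p>
* A lookup against the resulting index can then be tested with the
* <code>-q</code> option (a sketch; the index path and test word are
* placeholders):<br>
* <code>
* java org.apache.nutch.spell.NGramSpeller -gi ngram_index -q recursivve
* </code>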
*/
public static void main(String[] args) throws Throwable {
int minThreshold = 5;
int ng1 = 3;
int ng2 = 4;
int maxr = 10;
int maxd = 5;
String out = "gram_index";
String gi = "gram_index";
String name = null;
String field = "contents";
for (int i = 0; i < args.length; i++) {
if (args[i].equals("-i")) {
name = args[++i];
} else if (args[i].equals("-minThreshold")) {
minThreshold = Integer.parseInt(args[++i]);
} else if (args[i].equals("-gi")) {
gi = args[++i];
} else if (args[i].equals("-o")) {
out = args[++i];
} else if (args[i].equals("-t")) { // test transpositions
String s = args[++i];
o.println("TRANS: " + s);
String[] ar = formTranspositions(s);
for (int j = 0; j < ar.length; j++)
o.println("\t" + ar[j]);
System.exit(0);
} else if (args[i].equals("-ng1")) {
ng1 = Integer.parseInt(args[++i]);
} else if (args[i].equals("-ng2")) {
ng2 = Integer.parseInt(args[++i]);
} else if (args[i].equals("-help") || args[i].equals("--help")
|| args[i].equals("-h")) {
o.println("To form an ngram index:");
o.println("NGramSpeller -i ORIG_INDEX -o NGRAM_INDEX [-ng1 MIN] [-ng2 MAX] [-f FIELD]");
o.println("Defaults are ng1=3, ng2=4, field='contents'");
System.exit(100);
} else if (args[i].equals("-q")) {
String goal = args[++i];
o.println("[NGrams] for " + goal + " from " + gi);
float bStart = 2.0f;
float bEnd = 1.0f;
float bTransposition = 0f;
o.println("bStart: " + bStart);
o.println("bEnd: " + bEnd);
o.println("bTrans: " + bTransposition);
o.println("ng1: " + ng1);
o.println("ng2: " + ng2);
IndexReader ir = IndexReader.open(gi);
IndexSearcher searcher = new IndexSearcher(gi);
List lis = new ArrayList(maxr);
String[] res = suggestUsingNGrams(searcher, goal, ng1, ng2, maxr,
bStart, bEnd, bTransposition, maxd, lis, true); // more popular
o.println("Returned " + res.length + " from " + gi + " which has "
+ ir.numDocs() + " words in it");
Iterator it = lis.iterator();
while (it.hasNext()) {
o.println(it.next().toString());
}
o.println();
o.println("query: " + lastQuery.toString("contents"));
Hits ghits = searcher.search(new TermQuery(
new Term(F_WORD, "recursive")));
if (ghits.length() >= 1) // umm, should only be 0 or 1
{
Document doc = ghits.doc(0);
o.println("TEST DOC: " + doc);
}
searcher.close();
ir.close();
return;
} else if (args[i].equals("-f")) {
field = args[++i];
} else {
o.println("hmm? " + args[i]);
System.exit(1);
}
}
if (name == null) {
o.println("opps, you need to specify the input index w/ -i");
System.exit(1);
}
o.println("Opening " + name);
IndexReader.unlock(FSDirectory.getDirectory(name, false));
final IndexReader r = IndexReader.open(name);
o.println("Docs: " + nf.format(r.numDocs()));
o.println("Using field: " + field);
IndexWriter writer = new IndexWriter(out, new WhitespaceAnalyzer(), true);
writer.setMergeFactor(writer.getMergeFactor()*50);
writer.setMaxBufferedDocs(writer.getMaxBufferedDocs()*50);
o.println("Forming index from " + name + " to " + out);
int res = formNGramIndex(r, writer, ng1, ng2, field, minThreshold);
o.println("done, did " + res + " ngrams");
writer.optimize();
writer.close();
r.close();
}
/**
* Using an NGram algorithm try to find alternate spellings for a "goal" word
* based on the ngrams in it.
*
* @param searcher
* the searcher for the "ngram" index
*
* @param goal
* the word you want a spell check done on
*
* @param ng1
* the min ngram length to use, probably 3; it defaults to 3 if you
* pass in a value &lt;= 0
*
* @param ng2
* the max ngram length to use, probably 3 or 4
*
* @param maxr
* max results to return, probably a small number like 5 for normal
* use or 10-100 for testing
*
* @param bStart
* how to boost matches that start the same way as the goal word,
* probably greater than 2
*
* @param bEnd
* how to boost matches that end the same way as the goal word,
* probably greater than or equal to 1
*
* @param bTransposition
* how to boost matches that are also simple transpositions, or 0 to
* disable
*
* @param maxd
* filter for the max Levenshtein string distance for matches,
* probably a number like 3, 4, or 5, or use 0 for it to be ignored.
* This prevents words radically longer but similar to the goal word
* from being returned.
*
* @param details
* if non-null, a list that will receive one {@link SpellSuggestionDetails}
* entry per match (word, score, Levenshtein string distance, word
* frequency, and per-length ngram match counts).
*
* @param morePopular
* if true says to only return suggestions more popular than the
* misspelled word. This prevents rare words from being suggested.
* Note that for words that don't appear in the index at all this has
* no effect as those words will have a frequency of 0 anyway.
*
* @return the strings suggested with the best one first
*/
public static String[] suggestUsingNGrams(Searcher searcher, String goal,
int ng1, int ng2, int maxr, float bStart, float bEnd,
float bTransposition, int maxd, List details, boolean morePopular)
throws Throwable {
List res = new ArrayList(maxr);
BooleanQuery query = new BooleanQuery();
if (ng1 <= 0) {
ng1 = 3; // guess
}
if (ng2 < ng1) {
ng2 = ng1;
}
if (bStart < 0) {
bStart = 0;
}
if (bEnd < 0) {
bEnd = 0;
}
if (bTransposition < 0) {
bTransposition = 0;
}
// calculate table of all ngrams for goal word
String[][] gramt = new String[ng2 + 1][];
for (int ng = ng1; ng <= ng2; ng++)
gramt[ng] = formGrams(goal, ng);
int goalFreq = 0;
if (morePopular) {
Hits ghits = searcher.search(new TermQuery(new Term(F_WORD, goal)));
if (ghits.length() >= 1) // umm, should only be 0 or 1
{
Document doc = ghits.doc(0);
goalFreq = Integer.parseInt(doc.get(F_FREQ));
}
}
if (bTransposition > 0) {
add(query, F_TRANSPOSITION, goal, bTransposition);
}
TRStringDistance sd = new TRStringDistance(goal);
for (int ng = ng1; ng <= ng2; ng++) // for every ngram in range
{
String[] grams = gramt[ng]; // form word into ngrams (allow dups too)
if (grams.length == 0) {
continue; // hmm
}
String key = "gram" + ng; // form key
if (bStart > 0) { // should we boost prefixes?
add(query, "start" + ng, grams[0], bStart); // matches start of word
}
if (bEnd > 0) { // should we boost suffixes
add(query, "end" + ng, grams[grams.length - 1], bEnd); // matches end
// of word
}
// match ngrams anywhere, w/o a boost
for (int i = 0; i < grams.length; i++) {
add(query, key, grams[i]);
}
}
Hits hits = searcher.search(query);
int len = hits.length();
int remain = maxr;
int stop = Math.min(len, 100 * maxr); // go thru more than 'maxr' matches in
// case the distance filter triggers
for (int i = 0; (i < stop) && (remain > 0); i++) {
Document d = hits.doc(i);
String word = d.get(F_WORD); // get orig word
if (word.equals(goal)) {
continue; // don't suggest a word for itself, that would be silly
}
int dist = sd.getDistance(word); // use distance filter
if ((maxd > 0) && (dist > maxd)) {
continue;
}
int suggestionFreq = Integer.parseInt(d.get(F_FREQ));
if (morePopular && (goalFreq > suggestionFreq)) {
continue; // don't suggest a rarer word
}
remain--;
res.add(word);
if (details != null) // only non-null for testing probably
{
int[] matches = new int[ng2 + 1];
for (int ng = ng1; ng <= ng2; ng++) {
String[] have = formGrams(word, ng);
int match = 0;
String[] cur = gramt[ng];
for (int k = 0; k < have.length; k++) {
boolean looking = true;
for (int j = 0; (j < cur.length) && looking; j++) {
if (have[k].equals(cur[j])) {
// o.println( "\t\tmatch: " + have[ k] + " on " + word);
match++;
looking = false;
}
}
/*
* if ( looking) o.println( "\t\tNO MATCH: " + have[ k] + " on " +
* word);
*/
}
matches[ng] = match;
}
details.add(new SpellSuggestionDetails(word, hits.score(i), dist,
suggestionFreq, matches, ng1));
}
}
lastQuery = query; // hack for now
return (String[]) res.toArray(new String[0]);
}
/**
* Go thru all terms and form an index of the "ngrams" of length 'ng1' to
* 'ng2' in each term. The ngrams have field names like "gram3" for a 3 char
* ngram, and "gram4" for a 4 char one. The starting and ending (or prefix and
* suffix) "n" characters are also stored for each word with field names
* "start3" and "end3".
*
*
* @param r
* the index to read terms from
*
* @param _w
* the writer to write the ngrams to, or if null an index named
* "gram_index" will be created. If you pass in non-null then you
* should optimize and close the index.
*
* @param ng1
* the min number of chars to form ngrams with (3 is suggested)
*
* @param ng2
* the max number of chars to form ngrams with, can be equal to ng1
*
* @param field
* the field name to process ngrams from.
*
* @param minThreshold
* terms must appear in at least this many docs else they're ignored,
* the assumption being that they're too rare to be worth suggesting
*
* @return the number of ngrams added
*
*/
private static int formNGramIndex(IndexReader r, IndexWriter _w, int ng1,
int ng2, String field, int minThreshold) throws IOException {
int mins = 0;
float nudge = 0.01f; // don't allow boosts to be too small
IndexWriter w;
if (_w == null) {
w = new IndexWriter("gram_index", new WhitespaceAnalyzer(), true); // analyzer should have no effect
} else {
w = _w;
}
int mod = 1000; // for status
int nd = r.numDocs();
final float base = (float) Math.log(1.0d / ((double) nd));
if (field == null) {
field = "contents"; // def field
}
field = field.intern(); // is it documented that you can use == on interned field names?
int grams = 0; // # of ngrams added
final TermEnum te = r.terms(new Term(field, ""));
int n = 0;
int skips = 0;
while (te.next()) {
boolean show = false; // for debugging
Term t = te.term();
String have = t.field();
if ((have != field) && !have.equals(field)) // wrong field
{
break;
}
if (t.text().indexOf("-") >= 0) {
continue;
}
int df = te.docFreq();
if ((++n % mod) == 0) {
show = true;
o.println("term: " + t + " n=" + nf.format(n) + " grams="
+ nf.format(grams) + " mins=" + nf.format(mins) + " skip="
+ nf.format(skips) + " docFreq=" + df);
}
if (df < minThreshold) // not freq enough, too rare to consider
{
mins++;
continue;
}
String text = t.text();
int len = text.length();
if (len < ng1) {
continue; // too short, so we bail; "too long" is fine though...
}
// Note that long tokens that are rare probably won't get here anyway, as they
// won't pass the 'minThreshold' check above.
Document doc = new Document();
doc.add(new Field(F_WORD, text, Field.Store.YES, Field.Index.UN_TOKENIZED)); // orig term
doc.add(new Field(F_FREQ, "" + df, Field.Store.YES, Field.Index.UN_TOKENIZED)); // for popularity cutoff option
String[] trans = formTranspositions(text);
for (int i = 0; i < trans.length; i++)
doc.add(new Field(F_TRANSPOSITION, trans[i], Field.Store.YES, Field.Index.UN_TOKENIZED));
// now loop thru all ngrams of lengths 'ng1' to 'ng2'
for (int ng = ng1; ng <= ng2; ng++) {
String key = "gram" + ng;
String end = null;
for (int i = 0; i < (len - ng + 1); i++) {
String gram = text.substring(i, i + ng);
doc.add(new Field(key, gram, Field.Store.YES, Field.Index.UN_TOKENIZED));
if (i == 0) {
doc.add(new Field("start" + ng, gram, Field.Store.YES, Field.Index.UN_TOKENIZED));
}
end = gram;
grams++;
}
if (end != null) { // may not be present if len==ng1
doc.add(new Field("end" + ng, end, Field.Store.YES, Field.Index.UN_TOKENIZED));
}
}
float f1 = te.docFreq();
float f2 = nd;
float bo = (float) ((Math.log(f1) / Math.log(f2)) + nudge);
doc.setBoost(bo);
if (show) {
o.println("f1=" + f1 + " nd=" + nd + " boost=" + bo + " base=" + base
+ " word=" + text);
}
w.addDocument(doc);
}
if (_w == null) // else you have to optimize/close
{
w.optimize();
w.close();
}
return grams;
}
/**
* Add a clause to a boolean query.
*/
private static void add(BooleanQuery q, String k, String v, float boost) {
Query tq = new TermQuery(new Term(k, v));
tq.setBoost(boost);
q.add(new BooleanClause(tq, BooleanClause.Occur.SHOULD));
}
/**
* Form all transpositions of adjacent characters in a word (pairs of identical
* adjacent characters are skipped, since swapping them changes nothing). Used
* to index simple "swapped letter" variants of each term.
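* <p>
* For example, <code>formTranspositions("king")</code> yields
* <code>{"ikng", "knig", "kign"}</code>.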
*/
public static String[] formTranspositions(String s) {
int len = s.length();
List res = new ArrayList(Math.max(0, len - 1)); // guard against zero-length input
for (int i = 0; i < (len - 1); i++) {
char c1 = s.charAt(i);
char c2 = s.charAt(i + 1);
if (c1 == c2) {
continue;
}
res.add(s.substring(0, i) + c2 + c1 + s.substring(i + 2));
}
return (String[]) res.toArray(new String[0]);
}
/**
* Form all ngrams for a given word.
*
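* <p>
* For example, <code>formGrams("kings", 3)</code> returns
* <code>{"kin", "ing", "ngs"}</code>.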
* @param text
* the word to parse
* @param ng
* the ngram length e.g. 3
* @return an array of all ngrams in the word and note that duplicates are not
* removed
*/
public static String[] formGrams(String text, int ng) {
int len = text.length();
List res = new ArrayList(Math.max(0, len - ng + 1)); // guard: text may be shorter than ng
for (int i = 0; i < (len - ng + 1); i++) {
res.add(text.substring(i, i + ng));
}
return (String[]) res.toArray(new String[0]);
}
/**
* Add a clause to a boolean query.
*/
private static void add(BooleanQuery q, String k, String v) {
q.add(new BooleanClause(new TermQuery(new Term(k, v)), BooleanClause.Occur.SHOULD));
}
/**
* Computes the Levenshtein (edit) distance from a fixed target word to other
* words. Presumably this is implemented somewhere in the apache/jakarta/commons
* area but I couldn't find it.
*
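* <p>
* For example, <code>new TRStringDistance("kings").getDistance("kingz")</code>
* returns 1 (a single substitution).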
* @see <a href="http://www.merriampark.com/ld.htm">http://www.merriampark.com/ld.htm</a>
*
*/
private static class TRStringDistance {
final char[] sa;
final int n;
final int[][][] cache = new int[30][][];
/**
* Optimized to run a bit faster than the static getDistance(). In one
* benchmark, times were 5.3 sec using the constructor form vs 8.5 sec with the
* static method, i.e. roughly 37% faster.
*/
private TRStringDistance(String target) {
sa = target.toCharArray();
n = sa.length;
}
// *****************************
// Compute Levenshtein distance
// *****************************
public int getDistance(String other) {
int[][] d; // matrix
int cost; // cost
// Step 1
final char[] ta = other.toCharArray();
final int m = ta.length;
if (n == 0) {
return m;
}
if (m == 0) {
return n;
}
if (m >= cache.length) {
d = form(n, m);
} else if (cache[m] != null) {
d = cache[m];
} else {
d = cache[m] = form(n, m);
}
// Step 3
for (int i = 1; i <= n; i++) {
final char s_i = sa[i - 1];
// Step 4
for (int j = 1; j <= m; j++) {
final char t_j = ta[j - 1];
// Step 5
if (s_i == t_j) { // same
cost = 0;
} else { // not a match
cost = 1;
}
// Step 6
d[i][j] = min3(d[i - 1][j] + 1, d[i][j - 1] + 1, d[i - 1][j - 1]
+ cost);
}
}
// Step 7
return d[n][m];
}
/**
* Allocate the distance matrix for a target word of length n and another word
* of length m, initializing the first row and column (step 2 of the algorithm).
*/
private static int[][] form(int n, int m) {
int[][] d = new int[n + 1][m + 1];
// Step 2
for (int i = 0; i <= n; i++)
d[i][0] = i;
for (int j = 0; j <= m; j++)
d[0][j] = j;
return d;
}
// ****************************
// Get minimum of three values
// ****************************
private static int min3(int a, int b, int c) {
int mi;
mi = a;
if (b < mi) {
mi = b;
}
if (c < mi) {
mi = c;
}
return mi;
}
// *****************************
// Compute Levenshtein distance
// *****************************
public static int getDistance(String s, String t) {
return getDistance(s.toCharArray(), t.toCharArray());
}
// *****************************
// Compute Levenshtein distance
// *****************************
public static int getDistance(final char[] sa, final char[] ta) {
int[][] d; // matrix
int i; // iterates through s
int j; // iterates through t
char s_i; // ith character of s
char t_j; // jth character of t
int cost; // cost
// Step 1
final int n = sa.length;
final int m = ta.length;
if (n == 0) {
return m;
}
if (m == 0) {
return n;
}
d = new int[n + 1][m + 1];
// Step 2
for (i = 0; i <= n; i++) {
d[i][0] = i;
}
for (j = 0; j <= m; j++) {
d[0][j] = j;
}
// Step 3
for (i = 1; i <= n; i++) {
s_i = sa[i - 1];
// Step 4
for (j = 1; j <= m; j++) {
t_j = ta[j - 1];
// Step 5
if (s_i == t_j) {
cost = 0;
} else {
cost = 1;
}
// Step 6
d[i][j] = min3(d[i - 1][j] + 1, d[i][j - 1] + 1, d[i - 1][j - 1]
+ cost);
}
}
// Step 7
return d[n][m];
}
}
/* Added by Andy Liu for Nutch */
public static class SpellSuggestionDetails {
public String word;
public double score;
public int dist;
public int docFreq;
public int[] matches;
public int ng1;
public SpellSuggestionDetails(String word, double score, int dist,
int docFreq, int[] matches, int ng1) {
super();
this.word = word;
this.score = score;
this.dist = dist;
this.docFreq = docFreq;
this.matches = matches;
this.ng1 = ng1;
}
public String toString() {
StringBuffer buf = new StringBuffer("word=" + word + " score=" + score
+ " dist=" + dist + " freq=" + docFreq + "\n");
for (int j = ng1; j < matches.length; j++)
buf.append("\tmm[ " + j + " ] = " + matches[j]);
return buf.toString();
}
}
}