/**
* Copyright 2004 The Apache Software Foundation.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.search.similar;
import java.io.IOException;
import java.io.StringReader;
import java.util.HashSet;
import java.util.Set;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;
/**
* Simple similarity measures.
*
* @see MoreLikeThis
*/
public final class SimilarityQueries
{
    /**
     * Not instantiable: this class only exposes static helper methods.
     */
    private SimilarityQueries()
    {
    }

    /**
     * Simple similarity query generator.
     * Takes every unique word and forms a boolean query where all words are optional.
     * After you get this you'll use it to query your {@link IndexSearcher} for similar docs.
     * The only caveat is the first hit returned <b>should be</b> your source document - you'll
     * need to then ignore that.
     *
     * <p>
     * So, if you have a code fragment like this:
     * <br>
     * <code>
     * Query q = formSimilarQuery( "I use Lucene to search fast. Fast searchers are good", new StandardAnalyzer(), "contents", null);
     * </code>
     *
     * <p>
     * The query returned, in string form, will be <code>'(i use lucene to search fast searchers are good)'</code>.
     *
     * <p>
     * The philosophy behind this method is "two documents are similar if they share lots of words".
     * Note that behind the scenes, Lucene's scoring algorithm will tend to give two documents a
     * higher similarity score if they share more uncommon words.
     *
     * <p>
     * This method is fail-safe in that if a long 'body' is passed in and
     * {@link BooleanQuery#add BooleanQuery.add()} (used internally)
     * throws
     * {@link org.apache.lucene.search.BooleanQuery.TooManyClauses BooleanQuery.TooManyClauses}, the
     * query as it is will be returned.
     *
     * <p>
     * The token stream obtained from the analyzer is reset before it is consumed and is
     * always ended and closed before this method returns, even if tokenization fails.
     *
     * @param body the body of the document you want to find similar documents to
     * @param a the analyzer to use to parse the body
     * @param field the field you want to search on, probably something like "contents" or "body"
     * @param stop optional set of stop words to ignore, may be <code>null</code>
     * @return a query with all unique words in 'body'
     * @throws IOException if the analyzer's token stream fails while being reset, read, ended or closed
     */
    public static Query formSimilarQuery( String body,
                                          Analyzer a,
                                          String field,
                                          Set<?> stop)
        throws IOException
    {
        BooleanQuery tmp = new BooleanQuery();
        Set<String> already = new HashSet<String>(); // ignore dups

        TokenStream ts = a.tokenStream( field, new StringReader( body));
        try
        {
            CharTermAttribute termAtt = ts.addAttribute( CharTermAttribute.class);

            // TokenStream consumer contract: reset() must precede the first incrementToken()
            ts.reset();
            while ( ts.incrementToken())
            {
                String word = termAtt.toString();

                // ignore opt stop words
                if ( stop != null && stop.contains( word))
                {
                    continue;
                }

                // ignore dups
                if ( ! already.add( word))
                {
                    continue;
                }

                // add to query
                TermQuery tq = new TermQuery( new Term( field, word));
                try
                {
                    tmp.add( tq, BooleanClause.Occur.SHOULD);
                }
                catch( BooleanQuery.TooManyClauses too)
                {
                    // fail-safe, just return what we have, not the end of the world
                    break;
                }
            }

            // signal end-of-stream so the stream can perform its end-of-stream operations
            ts.end();
        }
        finally
        {
            // always release the stream (and the reader it wraps), even on failure
            ts.close();
        }
        return tmp;
    }
}