package edu.wiki.search; import java.io.File; import java.io.IOException; import org.apache.lucene.queryParser.ParseException; import org.apache.lucene.queryParser.QueryParser; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.Query; import org.apache.lucene.search.TopDocs; import org.apache.lucene.store.Directory; import org.apache.lucene.store.FSDirectory; import org.apache.lucene.util.Version; import edu.wiki.index.WikipediaAnalyzer; public class NormalizedWikipediaDistance { private IndexSearcher searcher; private QueryParser qparser; private Query wQuery; private TopDocs wResults; int numWikiDocs; public class NumRes { public int res1; public int res2; public int resCommon; public NumRes() { res1 = res2 = resCommon = 0; } public void reset(){ res1 = res2 = resCommon = 0; } } NumRes nres = new NumRes(); public NormalizedWikipediaDistance(String indexPath){ Directory fsDir = null; try { fsDir = FSDirectory.open(new File(indexPath)); searcher = new IndexSearcher(fsDir); numWikiDocs = searcher.maxDoc(); qparser = new QueryParser(Version.LUCENE_CURRENT, "contents", new WikipediaAnalyzer()); } catch (IOException e) { e.printStackTrace(); } } private int freqSearch(String phrase) throws ParseException, IOException{ wQuery = qparser.parse("\""+QueryParser.escape(phrase)+"\""); // wQuery = qparser.parse(QueryParser.escape(phrase)); wResults = searcher.search(wQuery,1); return wResults.totalHits; } /** * Search to find the probability of occurrence for two phrases * @param queryString * @param exactPhrase * @return * @throws ParseException * @throws IOException */ private int occurSearch(String phrase1, String phrase2) throws ParseException, IOException{ wQuery = qparser.parse("\""+QueryParser.escape(phrase1)+"\" AND " + "\""+QueryParser.escape(phrase2)+"\""); // wQuery = qparser.parse("(" + QueryParser.escape(phrase1)+") AND (" + QueryParser.escape(phrase2) + ")"); wResults = searcher.search(wQuery,1); return wResults.totalHits; } public double getDistance(String label1, String label2){ float f1 = 0.0f, f2 = 0.0f; float fCommon = 0.0f; nres.reset(); try { nres.res1 = freqSearch(label1); f1 = nres.res1; nres.res2 = freqSearch(label2); f2 = nres.res2; nres.resCommon = occurSearch(label1, label2); fCommon = nres.resCommon; } catch (ParseException e) { e.printStackTrace(); } catch (IOException e) { e.printStackTrace(); } if(f1 == 0 || f2 == 0){ return -1f; // undefined // return 10000.0f; // no information, assume inf distance } // if((fCommon == 0) && (f1 > 0 || f2 > 0) ){ if(fCommon == 0){ return 10000.0f; // infinite distance } f1 *= 2; f2 *= 2; fCommon *= 2; // just generalize double log1, log2 , logCommon, maxlog, minlog; log1 = Math.log(f1); log2 = Math.log(f2); logCommon = Math.log(fCommon); maxlog = Math.max(log1, log2); minlog = Math.min(log1, log2); return (maxlog - logCommon) / (Math.log(numWikiDocs) - minlog); } public NumRes getMatches(){ return nres; } }