Searcher.java example

Explorer

NewSumServer-master
- src
  - org
    - scify
      - NewSumServer
        Server
        Comms
        Communicator.java
        MachineLearning
        INSECTDBWithDir.java
        classificationModule.java
        dataSets.java
        labelTagging.java
        util.java
        vector.java
        writeToFile.java
        OCR
        TextHandler.java
        Searching
        Indexer.java
        Searcher.java
        Sources
        BlogParser.java
        ISourceParser.java
        RSSSources.java
        RssParser.java
        Storage
        IDataStorage.java
        InsectFileIO.java
        SimpleFileIO.java
        Structures
        Article.java
        Sentence.java
        Topic.java
        UnlabeledArticle.java
        User.java
        Summarisation
        ArticleClusterer.java
        RedundancyRemover.java
        Summariser.java
        dumpClusterer.java
        Utils
        Main.java
        Utilities.java

/*
 * Copyright 2013 SciFY NPO <info@scify.org>.
 *
 * This product is part of the NewSum Free Software.
 * For more information about NewSum visit
 * 
 * 	http://www.scify.gr/site/en/our-projects/completed-projects/newsum-menu-en
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 * 
 * If this code or its output is used, extended, re-engineered, integrated, 
 * or embedded to any extent in another software or hardware, there MUST be 
 * an explicit attribution to this work in the resulting source code, 
 * the packaging (where such packaging exists), or user interface 
 * (where such an interface exists). 
 * The attribution must be of the form "Powered by NewSum, SciFY"
 */ 

package org.scify.NewSumServer.Server.Searching;

import gr.demokritos.iit.jinsect.utils;
import java.io.File;
import java.io.IOException;
import java.util.Arrays;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.logging.Level;
import java.util.logging.Logger;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.LimitTokenCountAnalyzer;
import org.apache.lucene.analysis.el.GreekAnalyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;
import org.scify.NewSumServer.Server.Utils.Main;

/**
 * The Class used for searching in an indexed directory.
 * @author George K. <gkiom@scify.org>
 */
public class Searcher {

    private final static Logger LOGGER = Main.getLogger();

    private Analyzer anal;

    /**
     * The map containing the (docId, filename) data
     */
    private HashMap<Integer, String> docFiles = new HashMap<Integer, String>();

    /**
     * Searches the index directory for the specified query.
     * @param fIndexDir The directory where the indexed files are stored
     * @param lLoc The locale that the indexed is created in
     * @param sQuery The search term
     * @param iMaxHits The max number of results to be returned.
     * @return A list of scoredocs which correspond to the search entry
     */
    public List<ScoreDoc> searchIndex(File fIndexDir, Locale lLoc, String sQuery, int iMaxHits) {
        try {
            // Open the Directory of the Indexed Files, using
            // the FSDirectory class
            Directory FSDir = FSDirectory.open(fIndexDir);
            // Create the reader class on the Dir
            IndexReader reader = IndexReader.open(FSDir);
            IndexSearcher searcher = new IndexSearcher(reader);
            String dField = "text"; // Pass this from the Indexer Class?
            // Must Use the Same Analyzer as the index Class, otherwise
            // results will be awkward. So it get's analyzer from Indexer class
            // Create the query Parser on the Field that we want to parse
            if (lLoc.toString().equals("el")) {
                anal = new GreekAnalyzer(Version.LUCENE_36);
            } else if (lLoc.toString().equals("en")) {
                // The standard analyzer
                Analyzer stdAnal = new StandardAnalyzer(Version.LUCENE_36);
                anal = new LimitTokenCountAnalyzer(stdAnal, Integer.MAX_VALUE);
            }
            QueryParser parser =
                    new QueryParser(Version.LUCENE_36, dField, anal);
            try {
                Query q = parser.parse(sQuery);
                // Search the Index with the Query
                TopDocs hits = searcher.search(q, iMaxHits);
                ScoreDoc[] scoreDocs = hits.scoreDocs;
                //debug start
                System.out.println("files found: " + scoreDocs.length);
                //debug end
                // Iterate over the scoredocs
                for (int n = 0; n < scoreDocs.length; n++) {
                    ScoreDoc sd = scoreDocs[n];
                    float score = sd.score;
                    int docId = sd.doc;
                    Document d = searcher.doc(docId);
                    String filename = d.get("file");
//                    System.out.println //debug
//                    (filename+": "+"Score: "+score+" - "+ "DocID: "+ docId);
                    //Save the <docID, filename> data to the map
                    this.docFiles.put(docId, filename);
                }
                // Sort the Docs according to their scores
                List<ScoreDoc> returnList
                        = sortScoreDocs(scoreDocs);
                Collections.reverse(returnList);
                return returnList;
            } catch (ParseException ex) {
                LOGGER.log(Level.SEVERE, "Could not parse query {0}", sQuery);
            } catch (NullPointerException ex) {
                LOGGER.log(Level.WARNING, ex.getMessage());
                return null;
            }
        } catch (IOException ex) {
            LOGGER.log(Level.SEVERE, "Could not open Directory {0}", fIndexDir.getPath());
        } return null;
    }
    /**
     * Sorts The ScoreDocs according to their score values.
     * @param scoreDocs The Documents returned by the searcher
     * @return A sorted List of the scoreDocs
     * @throws CorruptIndexException
     * @throws IOException
     */
    private List<ScoreDoc> sortScoreDocs(ScoreDoc[] scoreDocs)
            throws CorruptIndexException, IOException {

        List<ScoreDoc> scoreDocsList = Arrays.asList(scoreDocs);
        Collections.sort(scoreDocsList, new Comparator<ScoreDoc>() {

            @Override
            public int compare(ScoreDoc o1, ScoreDoc o2) {
                return
            (o1.score >= o2.score) ? ((o1.score > o2.score) ? 1:0 ) : -1;
            }
        });
        // debug
//        for (ScoreDoc i : scoreDocsList ) {
//            System.out.println(i.toString() + " :: ");
//        }
        // debug end
        return scoreDocsList;
        }
    /**
     *
     * @return The Map containing the (docId, filename) info
     */
    public HashMap<Integer, String> getDocFiles() {
        return this.docFiles;
    }

}