/*
* To change this license header, choose License Headers in Project Properties.
* To change this template file, choose Tools | Templates
* and open the template in the editor.
*/
package di.uniba.it.tri.ir;
import di.uniba.it.tri.occ.OccUtils;
import di.uniba.it.tri.vectors.Vector;
import di.uniba.it.tri.vectors.VectorFactory;
import di.uniba.it.tri.vectors.VectorReader;
import di.uniba.it.tri.vectors.VectorType;
import java.io.Closeable;
import java.io.File;
import java.io.IOException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Date;
import java.util.List;
import java.util.Locale;
import java.util.Set;
import java.util.logging.Level;
import java.util.logging.Logger;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.en.EnglishAnalyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.queryparser.classic.MultiFieldQueryParser;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.NumericRangeQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.FSDirectory;
/**
 * Full-text search over a Lucene index, optionally restricted to a time range
 * and optionally re-ranked with word vectors.
 *
 * <p>The names of the stored fields read from each hit (id, text fields,
 * source, publication date) and of the numeric field used for time filtering
 * are configurable through the setters; the defaults are {@code "id"},
 * {@code "title"}/{@code "content"}, {@code "source"}, {@code "published"} and
 * {@code "time"}.</p>
 *
 * <p>An instance holds an open index reader; call {@link #close()} when it is
 * no longer needed. Instances should not be shared between threads without
 * external synchronization, because the configured {@link SimpleDateFormat} is
 * not thread-safe.</p>
 *
 * @author pierpaolo
 */
public class Searcher implements Closeable {

    private static final Logger LOG = Logger.getLogger(Searcher.class.getName());

    /** Reader backing {@link #searcher}; kept so that it can be closed. */
    private final DirectoryReader reader;

    private final IndexSearcher searcher;

    private final Analyzer analyzer;

    private SimpleDateFormat dateFormat = new SimpleDateFormat("EEE MMM dd HH:mm:ss 'CEST' yyyy", Locale.ENGLISH);

    private String idFieldname = "id";

    private String[] textFieldname = new String[]{"title", "content"};

    private String sourceFieldname = "source";

    private String dateFieldname = "published";

    private String timeFieldname = "time";

    /**
     * Opens the index stored in the given directory and analyzes queries with
     * a default {@link StandardAnalyzer}.
     *
     * @param inputIndex directory containing the Lucene index
     * @throws IOException if the index cannot be opened
     */
    public Searcher(File inputIndex) throws IOException {
        analyzer = new StandardAnalyzer();
        reader = openReader(inputIndex);
        searcher = new IndexSearcher(reader);
    }

    /**
     * Opens the index stored in the given directory using the requested
     * analyzer.
     *
     * @param inputIndex directory containing the Lucene index
     * @param analyzerType {@code "st"} for {@link StandardAnalyzer} or
     * {@code "en"} for {@link EnglishAnalyzer}
     * @param stopWordFilename file from which the stop-word set is loaded, or
     * {@code null} to use the analyzer's default stop words
     * @throws IllegalArgumentException if {@code analyzerType} is not one of
     * the supported values (including {@code null})
     * @throws Exception if the stop words cannot be loaded or the index cannot
     * be opened
     */
    public Searcher(File inputIndex, String analyzerType, String stopWordFilename) throws Exception {
        // The analyzer is built before the index is opened so that an invalid
        // analyzer type or an unreadable stop-word file cannot leave an open
        // reader behind.
        analyzer = createAnalyzer(analyzerType, stopWordFilename);
        reader = openReader(inputIndex);
        searcher = new IndexSearcher(reader);
    }

    /** Opens a reader on the index stored in the given directory. */
    private static DirectoryReader openReader(File inputIndex) throws IOException {
        return DirectoryReader.open(FSDirectory.open(inputIndex));
    }

    /** Creates the analyzer selected by {@code analyzerType}, with optional custom stop words. */
    private static Analyzer createAnalyzer(String analyzerType, String stopWordFilename) throws Exception {
        // Constant-first equals: a null type is reported as an invalid
        // argument instead of failing with a NullPointerException.
        if ("st".equals(analyzerType)) {
            if (stopWordFilename != null) {
                return new StandardAnalyzer(loadStopWords(stopWordFilename));
            }
            return new StandardAnalyzer();
        }
        if ("en".equals(analyzerType)) {
            if (stopWordFilename != null) {
                return new EnglishAnalyzer(loadStopWords(stopWordFilename));
            }
            return new EnglishAnalyzer();
        }
        throw new IllegalArgumentException("Not valid analyzer type: " + analyzerType);
    }

    /** Loads the stop-word set from the given file. */
    private static CharArraySet loadStopWords(String stopWordFilename) throws Exception {
        Set<String> stopWords = OccUtils.loadSet(new File(stopWordFilename));
        return CharArraySet.copy(stopWords);
    }

    /**
     * Closes the underlying index reader and its directory. The searcher must
     * not be used afterwards.
     *
     * @throws IOException if closing the reader or the directory fails
     */
    @Override
    public void close() throws IOException {
        try {
            reader.close();
        } finally {
            reader.directory().close();
        }
    }

    /**
     * @return the format used to parse the stored publication date
     */
    public SimpleDateFormat getDateFormat() {
        return dateFormat;
    }

    /**
     * @param dateFormat the format used to parse the stored publication date
     */
    public void setDateFormat(SimpleDateFormat dateFormat) {
        this.dateFormat = dateFormat;
    }

    /**
     * @return the name of the stored field holding the document id
     */
    public String getIdFieldname() {
        return idFieldname;
    }

    /**
     * @param idFieldname the name of the stored field holding the document id
     */
    public void setIdFieldname(String idFieldname) {
        this.idFieldname = idFieldname;
    }

    /**
     * @return the names of the fields that are queried and concatenated into
     * the result text
     */
    public String[] getTextFieldname() {
        return textFieldname;
    }

    /**
     * @param textFieldname the names of the fields that are queried and
     * concatenated into the result text
     */
    public void setTextFieldname(String[] textFieldname) {
        this.textFieldname = textFieldname;
    }

    /**
     * @return the name of the stored field holding the document source
     */
    public String getSourceFieldname() {
        return sourceFieldname;
    }

    /**
     * @param sourceFieldname the name of the stored field holding the document
     * source
     */
    public void setSourceFieldname(String sourceFieldname) {
        this.sourceFieldname = sourceFieldname;
    }

    /**
     * @return the name of the stored field holding the textual publication
     * date
     */
    public String getDateFieldname() {
        return dateFieldname;
    }

    /**
     * @param dateFieldname the name of the stored field holding the textual
     * publication date
     */
    public void setDateFieldname(String dateFieldname) {
        this.dateFieldname = dateFieldname;
    }

    /**
     * @return the name of the numeric (long) field used by the time-range
     * filter; {@code "time"} by default
     */
    public String getTimeFieldname() {
        return timeFieldname;
    }

    /**
     * @param timeFieldname the name of the numeric (long) field used by the
     * time-range filter
     */
    public void setTimeFieldname(String timeFieldname) {
        this.timeFieldname = timeFieldname;
    }

    /**
     * Runs a free-text query against the configured text fields.
     *
     * @param query the user query; query-parser special characters are escaped
     * before parsing
     * @param topn maximum number of hits to return
     * @return the hits in the order returned by Lucene
     * @throws ParseException if the escaped query cannot be parsed
     * @throws IOException if the index cannot be read
     */
    public List<SearchResult> search(String query, int topn) throws ParseException, IOException {
        return collect(searcher.search(parseText(query), topn));
    }

    /**
     * Runs a free-text query restricted to documents whose time field lies in
     * {@code [start, end]} (both bounds inclusive).
     *
     * @param query the user query; query-parser special characters are escaped
     * before parsing
     * @param topn maximum number of hits to return
     * @param start lower bound of the time range, or {@code null} for no lower
     * bound
     * @param end upper bound of the time range, or {@code null} for no upper
     * bound
     * @return the hits in the order returned by Lucene
     * @throws ParseException if the escaped query cannot be parsed
     * @throws IOException if the index cannot be read
     */
    public List<SearchResult> search(String query, int topn, Date start, Date end) throws ParseException, IOException {
        return collect(searcher.search(buildTimedQuery(query, start, end), topn));
    }

    /**
     * Tokenizes the text with the configured analyzer.
     *
     * @param text the text to analyze
     * @return the emitted terms, in order
     * @throws IOException if the analysis fails
     */
    private List<String> getTokens(String text) throws IOException {
        List<String> tokens = new ArrayList<>();
        // try-with-resources guarantees the stream is closed even when the
        // analysis throws; an unclosed stream prevents the analyzer from
        // being reused.
        try (TokenStream tokenStream = analyzer.tokenStream("text", text)) {
            CharTermAttribute cattr = tokenStream.addAttribute(CharTermAttribute.class);
            tokenStream.reset();
            while (tokenStream.incrementToken()) {
                tokens.add(cattr.toString());
            }
            tokenStream.end();
        }
        return tokens;
    }

    /**
     * Builds a vector for the text by summing the vectors of its tokens and
     * normalizing the sum. Tokens without a vector in {@code vr} are ignored;
     * if no token has a vector the zero vector is returned.
     *
     * @param text the text to represent
     * @param vr source of the token vectors
     * @return the (normalized, unless zero) sum of the token vectors
     * @throws IOException if the analysis or the vector lookup fails
     */
    private Vector buildVector(String text, VectorReader vr) throws IOException {
        Vector dv = VectorFactory.createZeroVector(VectorType.REAL, vr.getDimension());
        for (String t : getTokens(text)) {
            Vector v = vr.getVector(t);
            if (v != null) {
                dv.superpose(v, 1, null);
            }
        }
        if (!dv.isZeroVector()) {
            dv.normalize();
        }
        return dv;
    }

    /**
     * Runs a time-restricted free-text query and re-ranks the hits by the
     * overlap between the query vector and each document vector. The Lucene
     * score of every hit is replaced by that overlap and the list is sorted
     * according to the natural ordering of {@code SearchResult}.
     *
     * @param query the user query; query-parser special characters are escaped
     * before parsing, while the query vector is built from the text as given
     * @param topn maximum number of hits retrieved from the index
     * @param start lower bound of the time range, or {@code null} for no lower
     * bound
     * @param end upper bound of the time range, or {@code null} for no upper
     * bound
     * @param vr source of the token vectors used for re-ranking
     * @return the re-ranked hits
     * @throws ParseException if the escaped query cannot be parsed
     * @throws IOException if the index or the vectors cannot be read
     */
    public List<SearchResult> search(String query, int topn, Date start, Date end, VectorReader vr) throws ParseException, IOException {
        TopDocs topDocs = searcher.search(buildTimedQuery(query, start, end), topn);
        List<SearchResult> results = new ArrayList<>(topDocs.scoreDocs.length);
        if (topDocs.scoreDocs.length == 0) {
            return results;
        }
        // The query vector does not depend on the hit, so it is built once,
        // and from the raw query: the backslashes added by query escaping are
        // not part of the user's text and can change how it is tokenized.
        Vector queryVector = buildVector(query, vr);
        for (int i = 0; i < topDocs.scoreDocs.length; i++) {
            SearchResult sr = toResult(topDocs.scoreDocs[i].doc, topDocs.scoreDocs[i].score);
            Vector dv = buildVector(sr.getText(), vr);
            sr.setScore((float) queryVector.measureOverlap(dv));
            results.add(sr);
        }
        Collections.sort(results);
        return results;
    }

    /** Escapes and parses the user query against the configured text fields. */
    private Query parseText(String query) throws ParseException {
        QueryParser qp = new MultiFieldQueryParser(textFieldname, analyzer);
        return qp.parse(QueryParser.escape(query));
    }

    /** Combines the text query with an inclusive range filter on the time field; a null bound is open-ended. */
    private Query buildTimedQuery(String query, Date start, Date end) throws ParseException {
        Query qtext = parseText(query);
        Long min = (start == null) ? null : Long.valueOf(start.getTime());
        Long max = (end == null) ? null : Long.valueOf(end.getTime());
        Query qtime = NumericRangeQuery.newLongRange(timeFieldname, min, max, true, true);
        BooleanQuery q = new BooleanQuery();
        q.add(qtext, BooleanClause.Occur.MUST);
        q.add(qtime, BooleanClause.Occur.MUST);
        return q;
    }

    /** Converts every hit of {@code topDocs} into a {@code SearchResult}, preserving the hit order. */
    private List<SearchResult> collect(TopDocs topDocs) throws IOException {
        List<SearchResult> results = new ArrayList<>(topDocs.scoreDocs.length);
        for (int i = 0; i < topDocs.scoreDocs.length; i++) {
            results.add(toResult(topDocs.scoreDocs[i].doc, topDocs.scoreDocs[i].score));
        }
        return results;
    }

    /** Loads the stored fields of one hit and fills in id, source, text and (when parsable) date. */
    private SearchResult toResult(int docid, float score) throws IOException {
        SearchResult sr = new SearchResult(docid, score);
        Document doc = searcher.doc(sr.getDocid());
        sr.setId(doc.get(idFieldname));
        sr.setSource(doc.get(sourceFieldname));
        StringBuilder sb = new StringBuilder();
        for (String fn : textFieldname) {
            String value = doc.get(fn);
            // A field that is not stored for this document is skipped;
            // appending it would put the literal "null" into the text.
            if (value != null) {
                sb.append(value).append("\n");
            }
        }
        sr.setText(sb.toString());
        String rawDate = doc.get(dateFieldname);
        // A missing date field leaves the result's date unset instead of
        // failing the whole search with a NullPointerException.
        if (rawDate != null) {
            try {
                sr.setDate(dateFormat.parse(rawDate));
            } catch (java.text.ParseException ex) {
                LOG.log(Level.SEVERE, "Unparsable date '" + rawDate + "' in document " + sr.getId(), ex);
            }
        }
        return sr;
    }
}