/* See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* Esri Inc. licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.esri.gpt.catalog.lucene.stats;
import com.esri.gpt.framework.util.Val;
import java.io.IOException;
import java.io.PrintWriter;
import java.net.URLEncoder;
import java.util.List;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermDocs;
import org.apache.lucene.index.TermEnum;
import org.apache.lucene.util.OpenBitSet;
/**
 * Provides statistics associated with a single term.
 * <p/>
 * Statistics are unavailable until collected.
 * <p/>
 * Statistics provided are:
 * <ul>
 * <li>the number of documents considered during statistics collection</li>
 * <li>the number of documents that contain the term</li>
 * <li>the total frequency for this term across all considered fields</li>
 * <li>the list of frequencies per field for this term; each frequency is the
 *     number of considered documents that contain the term within that field</li>
 * </ul>
 */
class SingleTermStats extends Collectable {

  /** instance variables ====================================================== */
  private final FrequencyAccumulator fieldAccumulator = new FrequencyAccumulator();
  private long numberOfDocsWithTerm = 0;
  private final String text;

  /** constructors ============================================================ */

  /**
   * Construct with a supplied text string.
   * @param text the term text
   */
  public SingleTermStats(String text) {
    super();
    this.text = text;
  }

  /** properties  ============================================================= */

  /**
   * Gets the term text.
   * @return the term text
   */
  private String getText() {
    return this.text;
  }

  /**
   * Gets the number of documents containing this term.
   * @return the number of documents
   */
  private long getNumberOfDocsWithTerm() {
    return this.numberOfDocsWithTerm;
  }

  /**
   * Gets the list of frequencies per field for this term.
   * <br/>Each member is named by field and counted by the number of considered
   * documents that contain the term within that field.
   * @return the term frequencies
   */
  private List<NamedFrequency> getFieldFrequencies() {
    return this.fieldAccumulator.getFrequencies();
  }

  /**
   * Gets the total frequency for this term across all considered fields.
   * @return the total frequency
   */
  private long getTotalFrequency() {
    return this.fieldAccumulator.getTotalFrequency();
  }

  /** methods ================================================================= */

  /**
   * Executes the collection of statistics.
   * <p/>
   * For each collectable field, the index is probed for an exact match on the
   * term text; matching documents that pass the optional document filter are
   * counted per field and recorded in a bit set used to compute the number of
   * distinct documents containing the term.
   * <br/>If there are no documents to consider, or no collectable fields,
   * the method returns without collecting or printing anything.
   * @param request the active statistics request
   * @param reader the index reader
   * @throws IOException if an error occurs while communicating with the index
   */
  public void collectStats(StatsRequest request, IndexReader reader) throws IOException {
    long t1 = System.currentTimeMillis();
    TermEnum termEnum = null;
    TermDocs termDocs = null;
    try {
      OpenBitSet documentFilterBitSet = request.getDocumentFilterBitSet();
      boolean isUnfiltered = (documentFilterBitSet == null);

      // return if there are no stats to collect
      String[] fieldNames = request.getCollectableFieldNames(reader);
      if (this.determineNumberOfDocsConsidered(reader,documentFilterBitSet) <= 0) {
        return;
      } else if ((fieldNames == null) || (fieldNames.length == 0)) {
        return;
      }

      // allocated only once we know there is something to collect
      OpenBitSet docsWithTermBitSet = new OpenBitSet(reader.maxDoc());

      // accumulate document counts per field
      termDocs = reader.termDocs();
      for (String fieldName: fieldNames) {

        // the enumeration is positioned at the first term at or after the
        // requested one; only an exact field/text match is of interest, and
        // there can be at most one such term per field
        termEnum = reader.terms(new Term(fieldName,this.text));
        Term term = termEnum.term();
        boolean isExactMatch = (term != null)
            && term.field().equals(fieldName)
            && term.text().equals(this.text);
        if (isExactMatch) {
          termDocs.seek(term);
          long count = 0;
          while (termDocs.next()) {
            int docId = termDocs.doc();
            if (isUnfiltered || documentFilterBitSet.fastGet(docId)) {
              docsWithTermBitSet.fastSet(docId);
              count++;
            }
          }
          this.fieldAccumulator.add(fieldName,count);
        }
        termEnum.close();
        termEnum = null;
      }

      // a document containing the term in several fields is counted once
      this.numberOfDocsWithTerm = docsWithTermBitSet.cardinality();

      // sort
      if (Val.chkStr(request.getSortBy()).equalsIgnoreCase("name")) {
        this.fieldAccumulator.sortByName();
      } else {
        this.fieldAccumulator.sortByFrequency();
      }

    } finally {
      // best-effort cleanup: a failure to close must not mask the primary outcome
      try {if (termEnum != null) termEnum.close();} catch (Exception ignored) {}
      try {if (termDocs != null) termDocs.close();} catch (Exception ignored) {}
      this.setTimeMillis(System.currentTimeMillis() - t1);
    }

    // print
    if (request.getResponseWriter() != null) {
      this.print(request);
    }
  }

  /**
   * Prints collected statistics, as JSON when the request's response format
   * is "json" (case-insensitive), otherwise as an HTML fragment.
   * @param request the active statistics request
   * @throws IOException if the term or a field name cannot be URL encoded
   */
  private void print(StatsRequest request) throws IOException {
    PrintWriter writer = request.getResponseWriter();
    // constant-first comparison: a null response format falls through to HTML
    if ("json".equalsIgnoreCase(request.getResponseFormat())) {
      this.printJson(writer);
    } else {
      this.printHtml(request,writer);
    }
  }

  /**
   * Writes the collected statistics as a JSON object.
   * @param writer the response writer
   */
  private void printJson(PrintWriter writer) {
    writer.println("{");
    writer.println("  \"term\": \""+Val.escapeStrForJson(this.getText())+"\",");
    writer.println("  \"documentsIndexed\": "+this.getNumberOfDocsConsidered()+",");
    writer.println("  \"numberOfDocsWithTerm\": "+this.getNumberOfDocsWithTerm()+",");
    writer.println("  \"fields\": [");

    List<NamedFrequency> frequencies = this.getFieldFrequencies();
    int count = 0;
    int size = frequencies.size();
    for (NamedFrequency frequency: frequencies) {
      count++;
      writer.print("    {");
      writer.print("\"name\": \""+Val.escapeStrForJson(frequency.getName())+"\"");
      writer.print(", \"documents\": "+frequency.getFrequency());
      // every element except the last is followed by a comma
      if (count < size) {
        writer.println("},");
      } else {
        writer.println("}");
      }
    }

    writer.println("  ]");
    writer.println("}");
    writer.flush();
  }

  /**
   * Writes the collected statistics as an HTML fragment: navigation links,
   * a summary paragraph and a table with one row per field.
   * @param request the active statistics request (supplies the base URLs)
   * @param writer the response writer
   * @throws IOException if the term or a field name cannot be URL encoded
   */
  private void printHtml(StatsRequest request, PrintWriter writer) throws IOException {
    String baseUrl = request.getBaseStatsUrl();
    String baseQueryUrl = request.getBaseQueryUrl();
    String encTerm = URLEncoder.encode(this.getText(),"UTF-8");
    // the term is placed inside a URL within an HTML attribute: URL encode
    // first, then escape for the browser (same treatment as field names)
    String escEncTerm = Val.escapeXmlForBrowser(encTerm);
    String callbackUrl = baseUrl+"/terms?term="+encTerm;

    // navigation
    writer.println("<p><a href=\""+baseUrl+"/fields\">Fields</a>");
    writer.println("&nbsp;&nbsp;<a href=\""+baseUrl+"/terms\">Terms</a></p>");

    // summary
    writer.println("<p>");
    writer.println("<strong>Term:</strong>&nbsp;"+Val.escapeXmlForBrowser(this.getText()));
    writer.println("<br/><strong>Documents containing term:</strong>&nbsp;"+this.getNumberOfDocsWithTerm());
    writer.println("<br/><strong>Documents indexed:</strong>&nbsp;"+this.getNumberOfDocsConsidered());
    writer.println("<br/><strong>Number of fields listed:</strong>&nbsp;"+this.getFieldFrequencies().size());
    writer.println("</p>");

    // statistics table
    writer.println("<table border='1'>");
    writer.println("<thead><tr>");
    writer.println("<th><a href=\""+callbackUrl+"&sortBy=name\">Field</a></th>");
    writer.println("<th><a href=\""+callbackUrl+"\">Documents</a></th>");
    writer.println("</tr></thead>");
    writer.println("<tbody>");

    List<NamedFrequency> frequencies = this.getFieldFrequencies();
    for (NamedFrequency frequency: frequencies) {
      writer.print("<tr>");

      String encField = URLEncoder.encode(frequency.getName(),"UTF-8");
      String escField = Val.escapeXmlForBrowser(encField);

      // link to the statistics for the field
      String href1 = baseUrl+"/fields?field="+escField;
      writer.print("<td>");
      writer.print("<a href=\""+href1+"\">");
      writer.print(Val.escapeXmlForBrowser(frequency.getName()));
      writer.print("</a></td>");

      // link to a field:term search
      String href2 = baseQueryUrl+"?f=html&searchText="+escField+":"+escEncTerm;
      writer.print("<td>");
      writer.print("<a href=\""+href2+"\">");
      writer.print(frequency.getFrequency());
      writer.print("</a></td>");

      writer.println("</tr>");
      writer.flush();
    }

    writer.println("</tbody>");
    writer.println("</table>");
    writer.flush();
  }

}