/* See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * Esri Inc. licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.esri.gpt.catalog.lucene.stats; import com.esri.gpt.framework.security.metadata.MetadataAcl; import com.esri.gpt.framework.util.Val; import java.io.IOException; import java.io.PrintWriter; import java.net.URLEncoder; import java.util.List; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.Term; import org.apache.lucene.index.TermDocs; import org.apache.lucene.index.TermEnum; import org.apache.lucene.queryParser.QueryParser; import org.apache.lucene.util.OpenBitSet; /** * Provides statistics associated with a single field. * <p/> * Statistics are unavailable until collected. * <p/> * Statistics provided are: * <ul> * <li>the number of documents considered during statistics collection</li> * <li>the number of documents that contain the field</li> * <li>the total number of terms indexed for this field across all documents</li> * <li>the list of frequencies per term for this field</li> * </ul> */ public class SingleFieldStats extends Collectable { /** instance variables ====================================================== */ private String fieldName; private int maxRecords = 100; private int maxFrequency = -1; private int minFrequency = 1; private long numberOfDocsWithField = 0; private FrequencyAccumulator termAccumulator = new FrequencyAccumulator(); /** constructors ============================================================ */ /** * Construct with a supplied field name. * @param fieldName the field name */ public SingleFieldStats(String fieldName) { super(); this.fieldName = fieldName; } /** * Constructs with a supplied field name and min/max thresholds. * @param fieldName the field name * @param maxRecords the maximum number of records to return * @param minFrequency the minimum term frequency to consider * @param maxFrequency the maximum term frequency to consider */ public SingleFieldStats(String fieldName, int maxRecords, int minFrequency, int maxFrequency) { super(); this.fieldName = fieldName; if (maxRecords >= 0) { if (maxRecords > 10000) maxRecords = 10000; this.maxRecords = maxRecords; } if (minFrequency > 0) { this.minFrequency = minFrequency; } if (maxFrequency > 0) { this.maxFrequency = maxFrequency; } } /** properties ============================================================= */ /** * Gets the field name. * @return the field name */ private String getFieldName() { return this.fieldName; } /** * Gets the number of documents containing this field. * @return the number of documents */ private long getNumberOfDocsWithField() { return this.numberOfDocsWithField; } /** * Gets the list of frequencies per term for this field. * <br/>Each member will be named by term and counted by term frequency. * @return the term frequencies */ private List<NamedFrequency> getTermFrequencies() { return this.termAccumulator.getFrequencies(); } /** * Gets the total number of terms indexed for this field across all documents. * @return the total number of terms */ private long getTotalNumberOfTerms() { return this.termAccumulator.getTotalFrequency(); } /** methods ================================================================= */ /** * Executes the collection of statistics. * @param request the active statistics request * @param reader the index reader * @throws IOException if an error occurs while communicating with the index */ public void collectStats(StatsRequest request, IndexReader reader) throws IOException { long t1 = System.currentTimeMillis(); TermEnum termEnum = null; TermDocs termDocs = null; try { OpenBitSet documentFilterBitSet = request.getDocumentFilterBitSet(); OpenBitSet docsWithFieldBitSet = new OpenBitSet(reader.maxDoc()); boolean isUnfiltered = (documentFilterBitSet == null); boolean checkMaxFreq = (this.maxFrequency > 0); boolean checkMinFreq = (this.minFrequency > 0); // return if there are no stats to collect if (this.determineNumberOfDocsConsidered(reader,documentFilterBitSet) <= 0) { return; } else if (!request.isFieldCollectable(this.fieldName)){ return; } boolean checkTermDocs = true; if (isUnfiltered) { MetadataAcl acl = new MetadataAcl(request.getRequestContext()); if (acl.isPolicyUnrestricted()) { if (this.getNumberOfDocsConsidered() > 25000) { checkTermDocs = false; } } } // accumulate term frequencies per field termEnum = reader.terms(new Term(this.fieldName)); termDocs = reader.termDocs(); do { Term term = termEnum.term(); if (term != null && term.field().equals(this.fieldName)) { if (checkTermDocs) { termDocs.seek(term); long count = 0; while (termDocs.next()) { int docId = termDocs.doc(); boolean bSet = isUnfiltered || documentFilterBitSet.fastGet(docId); if (bSet) { docsWithFieldBitSet.fastSet(docId); count++; } } if ((!checkMaxFreq || (count <= this.maxFrequency)) && (!checkMinFreq || (count >= this.minFrequency))) { this.termAccumulator.add(term.text(),count); } } else { long count = termEnum.docFreq(); if ((!checkMaxFreq || (count <= this.maxFrequency)) && (!checkMinFreq || (count >= this.minFrequency))) { this.termAccumulator.add(term.text(),count); } } } else { break; } } while (termEnum.next()); // sort this.numberOfDocsWithField = docsWithFieldBitSet.cardinality(); if (Val.chkStr(request.getSortBy()).equalsIgnoreCase("name")) { this.termAccumulator.sortByName(); } else { this.termAccumulator.sortByFrequency(); } } finally { try {if (termEnum != null) termEnum.close();} catch (Exception ef) {} try {if (termDocs != null) termDocs.close();} catch (Exception ef) {} this.setTimeMillis(System.currentTimeMillis() - t1); } // print if (request.getResponseWriter() != null) { this.print(request); } } /** * Prints collected statistics. * @param request the active statistics request */ private void print(StatsRequest request) throws IOException { PrintWriter writer = request.getResponseWriter(); String baseUrl = request.getBaseStatsUrl(); String baseQueryUrl = request.getBaseQueryUrl(); String callbackUrl = baseUrl+"/fields?field="+this.fieldName; int max = this.maxRecords; int numToReturn = this.getTermFrequencies().size(); if ((this.maxRecords >= 0) && (this.maxRecords < numToReturn)) { numToReturn = max; } String sMaxFreq = "none"; if (this.maxFrequency > 0) { sMaxFreq = ""+this.maxFrequency; } /* writer.println("timeMillis="+this.getTimeMillis()); writer.println("numberOfDocsConsidered="+this.getNumberOfDocsConsidered()); writer.println("fieldName="+this.getFieldName()); writer.println("numberOfDocsWithField="+this.getNumberOfDocsWithField()); writer.println("totalNumberOfTerms="+getTotalNumberOfTerms()); writer.println("...................."); List<NamedFrequency> frequencies = this.getTermFrequencies(); for (NamedFrequency frequency: frequencies) { writer.println("frequency="+frequency.getFrequency()+", term="+frequency.getName()); } */ if (request.getResponseFormat().equalsIgnoreCase("json")) { writer.println("{"); writer.println(" \"field\": \""+Val.escapeStrForJson(this.fieldName)+"\","); writer.println(" \"documentsIndexed\": "+this.getNumberOfDocsConsidered()+","); //writer.println(" \"numberOfDocsWithField\": "+this.getNumberOfDocsWithField()+","); writer.println(" \"totalNumberOfTerms\": "+this.getTermFrequencies().size()+","); writer.println(" \"numberOfTermsListed\": "+numToReturn+","); writer.println(" \"minFrequencyConsidered\": "+this.minFrequency+","); writer.println(" \"maxFrequencyConsidered\": "+this.maxFrequency+","); writer.println(" \"terms\": ["); List<NamedFrequency> frequencies = this.getTermFrequencies(); int count = 0; if (numToReturn > 0) { for (NamedFrequency frequency: frequencies) { count++; boolean isLast = (count >= numToReturn); writer.print(" {"); writer.print("\"name\": \""+Val.escapeStrForJson(frequency.getName())+"\""); writer.print(", \"documents\": "+frequency.getFrequency()); if (!isLast) { writer.println("},"); } else { writer.println("}"); break; } } } writer.println(" ]"); writer.println("}"); writer.flush(); } else { // navigation writer.println("<p><a href=\""+baseUrl+"/fields\">Fields</a>"); //writer.println("  <a href=\""+baseUrl+"/terms\">Terms</a></p>"); // summary writer.println("<p>"); writer.println("<strong>Field:</strong> "+Val.escapeXmlForBrowser(this.fieldName)); writer.println("<br/><strong>Documents indexed:</strong> "+this.getNumberOfDocsConsidered()); //writer.println("<br/><strong>Documents containing field:</strong> "+this.getNumberOfDocsWithField()); writer.println("<br/><strong>Total number of terms:</strong> "+this.getTermFrequencies().size()); writer.println("<br/><strong>Number of terms listed:</strong> "+numToReturn); writer.println("<br/><strong>Minimum frequency considered:</strong> "+this.minFrequency); writer.println("<br/><strong>Maximum frequency considered:</strong> "+sMaxFreq); writer.println("</p>"); // statistics table if (numToReturn > 0) { writer.println("<table border='1'>"); writer.println("<thead><tr>"); writer.println("<th><a href=\""+callbackUrl+"&sortBy=name\">Term</a></th>"); writer.println("<th><a href=\""+callbackUrl+"\">Documents</a></th>"); writer.println("</tr></thead>"); writer.println("<tbody>"); List<NamedFrequency> frequencies = this.getTermFrequencies(); int count = 0; for (NamedFrequency frequency: frequencies) { count++; boolean isLast = (count >= numToReturn); writer.print("<tr>"); String encTerm = URLEncoder.encode(frequency.getName(),"UTF-8"); //String escTerm = Val.escapeXmlForBrowser(frequency.getName()); String escTerm = Val.escapeXmlForBrowser(encTerm); String href1 = baseUrl+"/terms?term="+escTerm; writer.print("<td>"); //writer.print("<a href=\""+href1+"\">"); writer.print(Val.escapeXmlForBrowser(frequency.getName())); //writer.print("</a>"); writer.print("</td>"); String q = this.fieldName+":"+QueryParser.escape(frequency.getName()); String href2 = baseQueryUrl+"?f=html&searchText="+URLEncoder.encode(q,"UTF-8"); writer.print("<td style=\"text-align:right;\">"); writer.print("<a href=\""+Val.escapeXmlForBrowser(href2)+"\">"); writer.print(frequency.getFrequency()); writer.print("</a></td>"); writer.println("</tr>"); //writer.flush(); if (isLast) break; } } writer.println("</tbody>"); writer.println("</table>"); writer.flush(); } } }