GlobalTermStats.java example

Explorer
GeoprocessingAppstore-master
- src
/* See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * Esri Inc. licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.esri.gpt.catalog.lucene.stats;
import com.esri.gpt.framework.util.Val;

import java.io.IOException;
import java.io.PrintWriter;
import java.net.URLEncoder;
import java.util.List;

import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermDocs;
import org.apache.lucene.index.TermEnum;
import org.apache.lucene.util.OpenBitSet;

/**
 * Provides statistics associated with all terms.
 * <p/>
 * Statistics are unavailable until collected.
 * <p/>
 * Statistics provided are:
 * <ul>
 *   <li>the number of documents considered during statistics collection</li>
 *   <li>the list of frequencies per term across all considered documents</li>
 * </ul>
 */
class GlobalTermStats extends Collectable {

  /** instance variables ====================================================== */

  /** Maximum number of terms written to a response; a value <= 0 disables the limit. */
  private int                  maxRecords = 100;

  /** Minimum document count a term needs in order to be written to a response. */
  private int                  minFrequency = 10;

  /** Accumulates one named count per term encountered during collection. */
  private FrequencyAccumulator termAccumulator = new FrequencyAccumulator();
  
  /** constructors ============================================================ */
  
  /**
   * Default constructor.
   */
  public GlobalTermStats() {
    super();
  }
  
  /** properties  ============================================================= */
    
  /**
   * Gets the list of frequencies per term across all considered documents.
   * <br/>Each member will be named by term and counted by the number of
   * considered documents containing the term.
   * @return the term frequencies
   */
  private List<NamedFrequency> getTermFrequencies() {
    return this.termAccumulator.getFrequencies();
  }
    
  /** methods ================================================================= */
  
  /**
   * Executes the collection of statistics.
   * <p/>
   * For every collectable field, each term of the field is enumerated and the
   * number of documents containing it (restricted to the request's document
   * filter, if any) is handed to the term accumulator. The accumulated entries
   * are then sorted by name when the request's sortBy is "name", otherwise by
   * frequency. If the request has a response writer, the results are printed.
   * <p/>
   * Nothing is accumulated or printed when no documents are considered or
   * when there are no collectable fields.
   * @param request the active statistics request
   * @param reader the index reader
   * @throws IOException if an error occurs while communicating with the index
   */
  public void collectStats(StatsRequest request, IndexReader reader) throws IOException {
    long t1 = System.currentTimeMillis();
    TermEnum termEnum = null;
    TermDocs termDocs = null;
    try {
      // a null bit set means that no document filter applies
      OpenBitSet documentFilterBitSet = request.getDocumentFilterBitSet();

      // return if there are no stats to collect
      String[] fieldNames = request.getCollectableFieldNames(reader);
      if (this.determineNumberOfDocsConsidered(reader,documentFilterBitSet) <= 0) {
        return;
      } else if ((fieldNames == null) || (fieldNames.length == 0)) {
        return;
      }
      
      // accumulate term frequencies; a single TermDocs is re-positioned per term
      termDocs = reader.termDocs();
      for (String fieldName: fieldNames) {
        // the enumeration is positioned at the first term at or after (fieldName,"")
        termEnum = reader.terms(new Term(fieldName));
        do {
          Term term = termEnum.term();
          if ((term == null) || !term.field().equals(fieldName)) {
            // the enumeration is exhausted or has moved on to another field
            break;
          }
          long count = this.countMatchingDocs(termDocs,term,documentFilterBitSet);
          this.termAccumulator.add(term.text(),count);
        } while (termEnum.next());
        termEnum.close();
        termEnum = null;
      }
      
      // NOTE: entries are not purged here; the minimum frequency and the
      // maximum record count are applied when the results are printed
      
      // sort
      if (Val.chkStr(request.getSortBy()).equalsIgnoreCase("name")) {
        this.termAccumulator.sortByName();
      } else {
        this.termAccumulator.sortByFrequency();
      }
      
    } finally {
      // best-effort cleanup: a failure to close must not mask the primary outcome
      try {if (termEnum != null) termEnum.close();} catch (Exception ignored) {}
      try {if (termDocs != null) termDocs.close();} catch (Exception ignored) {}
      this.setTimeMillis(System.currentTimeMillis() - t1);
    }   
    
    // print
    if (request.getResponseWriter() != null) {
      this.print(request);
    }
  }
  
  /**
   * Counts the documents that contain a term and pass the document filter.
   * @param termDocs the reusable term documents enumerator (re-positioned by this call)
   * @param term the term to count
   * @param documentFilterBitSet the document filter, null if all documents are considered
   * @return the number of matching documents
   * @throws IOException if an error occurs while communicating with the index
   */
  private long countMatchingDocs(TermDocs termDocs, Term term, OpenBitSet documentFilterBitSet) 
    throws IOException {
    termDocs.seek(term);
    long count = 0;
    while (termDocs.next()) {
      if ((documentFilterBitSet == null) || documentFilterBitSet.get(termDocs.doc())) {
        count++;
      }
    }
    return count;
  }
  
  /**
   * Prints collected statistics.
   * <p/>
   * JSON is written when the request's response format is "json"
   * (case-insensitive), otherwise an HTML fragment is written. In both formats
   * only terms with at least the minimum frequency are listed, and no
   * more than the maximum number of records are listed (when that limit is positive).
   * @param request the active statistics request (its response writer must not be null)
   * @throws IOException if a term cannot be URL encoded
   */
  private void print(StatsRequest request) throws IOException {
    PrintWriter writer = request.getResponseWriter();
    // constant-first comparison: a null response format falls through to HTML
    if ("json".equalsIgnoreCase(request.getResponseFormat())) {
      this.printJson(writer);
    } else {
      this.printHtml(request,writer);
    }
  }
  
  /**
   * Prints collected statistics as a JSON object.
   * @param writer the response writer
   */
  private void printJson(PrintWriter writer) {
    writer.println("{");
    writer.println("  \"documentsIndexed\": "+this.getNumberOfDocsConsidered()+",");
    writer.println("  \"minFrequencyConsidered\": "+this.minFrequency+",");
    writer.println("  \"maxTermsConsidered\": "+this.maxRecords+",");      
    writer.println("  \"terms\": [");
    int listed = 0;
    for (NamedFrequency frequency: this.getTermFrequencies()) {
      // honor the minimum frequency reported above (consistent with the HTML output)
      if (!(frequency.getFrequency() >= this.minFrequency)) {
        continue;
      }
      if ((this.maxRecords > 0) && (listed >= this.maxRecords)) {
        break;
      }
      // the separator is written before each subsequent item so that skipped
      // entries can never leave a trailing comma behind
      if (listed > 0) {
        writer.println(",");
      }
      writer.print("    {");
      writer.print("\"name\": \""+Val.escapeStrForJson(frequency.getName())+"\"");
      writer.print(", \"documents\": "+frequency.getFrequency());
      writer.print("}");
      listed++;
    }
    if (listed > 0) {
      writer.println();
    }
    writer.println("  ]");
    writer.println("}");
    writer.flush();
  }
  
  /**
   * Prints collected statistics as an HTML fragment (navigation, summary and table).
   * @param request the active statistics request (supplies the base URLs)
   * @param writer the response writer
   * @throws IOException if a term cannot be URL encoded
   */
  private void printHtml(StatsRequest request, PrintWriter writer) throws IOException {
    String baseUrl = request.getBaseStatsUrl();
    String baseQueryUrl = request.getBaseQueryUrl();
    String callbackUrl = baseUrl+"/terms";
    List<NamedFrequency> frequencies = this.getTermFrequencies();
    
    // navigation
    writer.println("<p><a href=\""+baseUrl+"/fields\">Fields</a>");
    writer.println("  <a href=\""+baseUrl+"/terms\">Terms</a></p>");
    
    // summary
    writer.println("<p>");
    writer.println("<strong>Terms</strong>");
    writer.println("<br/><strong>Documents indexed:</strong> "+this.getNumberOfDocsConsidered());
    writer.println("<br/><strong>Number of terms listed:</strong> "+frequencies.size());
    writer.println("<br/><strong>Minimum frequency considered:</strong> "+this.minFrequency);
    writer.println("<br/><strong>Maximum terms considered:</strong> "+this.maxRecords);
    writer.println("</p>");
    
    // statistics table
    writer.println("<table border='1'>");
    writer.println("<thead><tr>");
    writer.println("<th><a href=\""+callbackUrl+"?sortBy=name\">Term</a></th>");
    writer.println("<th><a href=\""+callbackUrl+"\">Documents</a></th>");
    writer.println("</tr></thead>");
    writer.println("<tbody>");
    long count = 0;
    for (NamedFrequency frequency: frequencies) {
      if (frequency.getFrequency() >= this.minFrequency) {
        
        count++;
        if ((this.maxRecords > 0) && (count > this.maxRecords)) {
          break;
        }
        
        // the URL-encoded form goes into query strings (so that characters such
        // as '&', '#', '+' or spaces survive); the XML-escaped form is display text
        String encTerm = URLEncoder.encode(frequency.getName(),"UTF-8");
        String escTerm = Val.escapeXmlForBrowser(frequency.getName());
        
        writer.print("<tr>");
        
        String href1 = callbackUrl+"?term="+encTerm;
        writer.print("<td>");
        writer.print("<a href=\""+href1+"\">");
        writer.print(escTerm);
        writer.print("</a></td>");
        
        String href2 = baseQueryUrl+"?f=html&searchText="+encTerm;
        writer.print("<td style=\"text-align:right;\">");
        writer.print("<a href=\""+href2+"\">");
        writer.print(frequency.getFrequency());
        writer.print("</a></td>");
            
        writer.println("</tr>");
      }
    }
    writer.println("</tbody>");
    writer.println("</table>");
    writer.flush();
  }
  
}