TermsResultSet.java example

Explorer

mimir-master
- etc
  - generated-header.java
- mimir-client
  - src
    - gate
      - mimir
        index
        MimirConnector.java
        MimirIndexingPR.java
        search
        RemoteQueryRunner.java
        tool
        WebUtils.java
- mimir-cloud
  - archive-unpacker
    - src
      - gate
        mimir
        util
        MultiFileInputStream.java
        UnpackWizard.java
  - src
    - java
      - gate
        mimir
        util
        IndexArchiveState.java
        MultiFileOutputStream.java
- mimir-core
  - src
    - gate
      - mimir
        AbstractSemanticAnnotationHelper.java
        Constraint.java
        ConstraintType.java
        DocumentMetadataHelper.java
        DocumentRenderer.java
        IndexConfig.java
        MimirIndex.java
        SemanticAnnotationHelper.java
        index
        AtomicAnnotationIndex.java
        AtomicIndex.java
        AtomicTokenIndex.java
        DocumentCollection.java
        DocumentData.java
        GATEDocument.java
        GATEDocumentFactory.java
        IndexException.java
        Mention.java
        OriginalMarkupMetadataHelper.java
        package-info.java
        search
        FederatedQueryRunner.java
        IndexReaderPool.java
        QueryEngine.java
        QueryRunner.java
        RankingQueryRunnerImpl.java
        query
        AbstractIntersectionQueryExecutor.java
        AbstractOverlapQuery.java
        AbstractQueryExecutor.java
        AndQuery.java
        AnnotationQuery.java
        Binding.java
        ConstQuery.java
        ContainsQuery.java
        ExecutorsList.java
        GapQuery.java
        MinusQuery.java
        OrQuery.java
        QueryExecutor.java
        QueryNode.java
        RepeatsQuery.java
        SequenceQuery.java
        TermQuery.java
        WithinQuery.java
        parser
        ParseException.java
        Query.java
        QueryParser.java
        QueryParserConstants.java
        QueryParserTokenManager.java
        SimpleCharStream.java
        Token.java
        TokenMgrError.java
        score
        BindingScorer.java
        DelegatingScoringQueryExecutor.java
        MimirScorer.java
        terms
        AbstractCompoundTermsQuery.java
        AbstractDocumentsBasedTermsQuery.java
        AbstractIndexTermsQuery.java
        AndTermsQuery.java
        AnnotationTermsQuery.java
        CompoundTermsQuery.java
        ConstTermsQuery.java
        DocumentTermsQuery.java
        DocumentsAndTermsQuery.java
        DocumentsBasedTermsQuery.java
        DocumentsOrTermsQuery.java
        LimitTermsQuery.java
        OrTermsQuery.java
        SortedTermsQuery.java
        TermTypeTermsQuery.java
        TermsQuery.java
        TermsResultSet.java
        util
        DefaultMentionDescriber.java
        DelegatingSemanticAnnotationHelper.java
        DocumentFeaturesMetadataHelper.java
        IgnoreEmptiesTermProcessor.java
        IndexUpgrader.java
        MG4JTools.java
        NormalizingTermProcessor.java
        OntologyMentionDescriber.java
        TruncateIndex.java
- mimir-test
  - src
    - gate
      - mimir
        test
        QueryTests.java
        RenderZipCollection.java
        Scratch.java
        ScratchConsole.java
        TestQueryParser.java
        TestUtils.java
- mimir-web
  - src
    - gwt
      - gate
        mimir
        web
        client
        UI.java
    - java
      - gate
        mimir
        util
        LogAnalyser.java
        web
        client
        DocumentData.java
        GwtRpcService.java
        GwtRpcServiceAsync.java
        MimirSearchException.java
        ResultsData.java
- plugins
  - db-h2
    - src
      - gate
        mimir
        db
        AnnotationTemplateCache.java
        DBSemanticAnnotationHelper.java
  - measurements
    - src
      - gate
        mimir
        measurements
        MeasurementAnnotationHelper.java
        MeasurementPluginResource.java
  - sparql
    - src
      - gate
        mimir
        sparql
        RequestMethod.java
        SPARQLResultSet.java
        SPARQLSemanticAnnotationHelper.java

/*
 *  TermsResultSet.java
 *
 *  Copyright (c) 2007-2011, The University of Sheffield.
 *
 *  This file is part of GATE Mímir (see http://gate.ac.uk/family/mimir.html), 
 *  and is free software, licenced under the GNU Lesser General Public License,
 *  Version 3, June 2007 (also included with this distribution as file
 *  LICENCE-LGPL3.html).
 *  
 *  Valentin Tablan, 13 Jul 2012
 *
 *  $Id$
 */
package gate.mimir.search.terms;

import gate.mimir.SemanticAnnotationHelper;
import it.unimi.dsi.fastutil.Arrays;
import it.unimi.dsi.fastutil.ints.AbstractIntComparator;
import it.unimi.dsi.fastutil.ints.IntArrayList;
import it.unimi.dsi.fastutil.objects.Object2ObjectMap;
import it.unimi.dsi.fastutil.objects.Object2ObjectOpenHashMap;
import it.unimi.dsi.fastutil.objects.ObjectArrayList;
import it.unimi.dsi.fastutil.objects.ObjectIterator;
import it.unimi.dsi.fastutil.objects.ObjectOpenHashSet;

import java.io.Serializable;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Set;

/**
 * Class representing the results of a {@link TermsQuery}. 
 * A terms result set is a set of terms, represented by their 
 * {@link #termStrings}. Optionally {@link #termCounts}, 
 * {@link #termDescriptions}, and {@link #termLengths} may also be available.
 */
public class TermsResultSet implements Serializable {
  
  /**
   * Serialization ID.
   */
  private static final long serialVersionUID = -7722325563637139625L;

  
  /**
   * The lengths (number of tokens) for the terms. Array parallel with 
   * {@link #termStrings}, {@link #termCounts} and {@link #termDescriptions}.
   * May be <code>null</code> if term lengths are not available.
   */
  public final int[] termLengths;
  
  /**
   * The strings for the terms. Array parallel with {@link #termLengths},
   * {@link #termCounts} and {@link #termDescriptions}.
   */
  public final String[] termStrings;

  /**
   * This field is populated by the 
   * {@link #groupByDescription(TermsResultSet...)} method, and carried over by
   * {@link #filterByDescriptionNot(TermsResultSet, String...)} when the 
   * filtered set had it. It contains term strings from the original result 
   * sets, indexed first by position in this result set, and then by the index
   * of the original result set. For example originalTermStrings[i][j] is a 
   * String[], containing all the term strings associated with 
   * termDescriptions[i] in the j<sup>th</sup> result set.
   * It is <code>null</code> when no such data is available.
   */
  public String[][][] originalTermStrings;
  
  /**
   * For annotation indexes, the term string is simply a URI in whatever format
   * is used by the {@link SemanticAnnotationHelper} that was used to index the
   * annotations. These URIs are not useful outside of the annotation helper 
   * and index, so term descriptions can be requested. If term descriptions were
   * produced during the search, they are stored in this array (which is aligned
   * with {@link #termStrings} and {@link #termCounts}); otherwise this field
   * is <code>null</code>.
   */
  public final String[] termDescriptions;
  
  /**
   * The counts (numbers of occurrences) for the terms. Array parallel with 
   * {@link #termStrings}, {@link #termLengths} and {@link #termDescriptions}.
   * May be <code>null</code> if term counts are not available.
   */
  public final int[] termCounts;
  
  /**
   * Builds a result set around the supplied parallel arrays. The arrays are 
   * stored by reference (no copies are made).
   * @param termStrings the strings for the terms.
   * @param termLengths the lengths of the terms, aligned with the term 
   * strings; may be <code>null</code>.
   * @param termCounts the occurrence counts of the terms, aligned with the 
   * term strings; may be <code>null</code>.
   * @param termDescriptions the descriptions of the terms, aligned with the
   * term strings; may be <code>null</code>.
   */
  public TermsResultSet(String[] termStrings, int[] termLengths, 
                        int[] termCounts, String[] termDescriptions) {
    this.termDescriptions = termDescriptions;
    this.termCounts = termCounts;
    this.termLengths = termLengths;
    this.termStrings = termStrings;
  }
  
  /**
   * Shared instance standing for a result set with no terms: all four 
   * parallel arrays are present, and all have length zero.
   */
  public static final TermsResultSet EMPTY = 
      new TermsResultSet(new String[0], new int[0], new int[0], new String[0]);
  
  
  /**
   * Retrieves, for one entry of this result set, the term strings that each
   * of the original (sub-index) result sets contributed to it.
   * @param termPosition the position (in the parallel arrays of this result 
   * set) of the term for which the original term strings are requested.
   * @return the element of {@link #originalTermStrings} at the given position,
   * i.e. an array in which the element at position i holds the term strings 
   * coming from original result set i; or <code>null</code> if 
   * {@link #originalTermStrings} has not been populated.
   */
  public String[][] getSubIndexTerms(int termPosition) {
    if(originalTermStrings == null) {
      // no original term strings were recorded for this result set
      return null;
    }
    return originalTermStrings[termPosition];
  }
  
  /**
   * Tries to locate the correct term position and calls 
   * {@link #getSubIndexTerms(int)}.
   * 
   * The supplied term string is interpreted as a decimal position into 
   * {@link #termStrings}. The position is only used if it is a valid index 
   * and the term string stored at that position is equal to the supplied 
   * value. In all other cases (value not numeric, position out of range, 
   * stored string different) the term string is returned unchanged, wrapped
   * as the only term of a single sub-index.
   * 
   * @param termString the term string to resolve.
   * @return the value of {@link #getSubIndexTerms(int)} for the located 
   * position (which may be <code>null</code> if no original term strings are
   * available), or <code>{{termString}}</code> if no position could be 
   * located.
   */
  public String[][] getSubIndexTerms(String termString) {
    int termPos = -1;
    try {
      termPos = Integer.parseInt(termString);
    } catch (NumberFormatException ignored) {
      // not a position (this includes a null value): termPos stays at -1 and
      // the fall-back below is used
    }
    // the bounds must be checked before indexing: termPos is -1 for 
    // non-numeric input, and a numeric value may still be out of range
    if(termPos >= 0 && termPos < termStrings.length 
        && termString.equals(termStrings[termPos])) {
      return getSubIndexTerms(termPos);
    }
    // could not convert it: leave it unchanged
    return new String[][]{{termString}};
  }
  
  /**
   * Sorts, in place, all the parallel arrays of a {@link TermsResultSet}, 
   * ordering the entries by the natural order of their term strings.
   * @param trs the result set to be sorted.
   */
  public static void sortTermsResultSetByTermString(final TermsResultSet trs) {
    // the field is final, so the array reference is stable during the sort
    final String[] keys = trs.termStrings;
    AbstractIntComparator byTermString = new AbstractIntComparator() {
      @Override
      public int compare(int left, int right) {
        return keys[left].compareTo(keys[right]);
      }
    };
    // the swapper keeps every populated parallel array aligned with the keys
    Swapper rowSwapper = new Swapper(trs);
    Arrays.quickSort(0, keys.length, byTermString, rowSwapper);
  }

  /**
   * Produces a new result set containing only those entries of the input 
   * whose term description is not among the banned ones. Counts, lengths and
   * original term strings are carried over for the retained entries whenever
   * the input has them.
   * @param setToFilter the terms result set to filter.
   * @param bannedDescriptions the term descriptions that must not appear in
   * the output.
   * @return the filtered result set; if the input has no term descriptions, 
   * the input itself is returned.
   */
  public static TermsResultSet filterByDescriptionNot(TermsResultSet setToFilter, String... bannedDescriptions) {
    if(setToFilter.termDescriptions == null) {
      // nothing to compare the banned values against
      return setToFilter;
    }
    
    final boolean hasCounts = setToFilter.termCounts != null;
    final boolean hasLengths = setToFilter.termLengths != null;
    final boolean hasOrigTerms = setToFilter.originalTermStrings != null;
    
    ObjectArrayList<String> keptStrings = new ObjectArrayList<String>();
    ObjectArrayList<String> keptDescriptions = new ObjectArrayList<String>();
    ObjectArrayList<String[][]> keptOrigTerms = new ObjectArrayList<String[][]>();
    IntArrayList keptLengths = hasLengths ? new IntArrayList() : null;
    IntArrayList keptCounts = hasCounts ? new IntArrayList() : null;
    ObjectOpenHashSet<String> banned = 
        new ObjectOpenHashSet<String>(bannedDescriptions);
    
    for(int i = 0; i < setToFilter.termDescriptions.length; i++) {
      String description = setToFilter.termDescriptions[i];
      if(banned.contains(description)) {
        // banned entry: drop it from every parallel array
        continue;
      }
      keptDescriptions.add(description);
      keptStrings.add(setToFilter.termStrings[i]);
      if(hasCounts) {
        keptCounts.add(setToFilter.termCounts[i]);
      }
      if(hasLengths) {
        keptLengths.add(setToFilter.termLengths[i]);
      }
      if(hasOrigTerms) {
        keptOrigTerms.add(setToFilter.originalTermStrings[i]);
      }
    }
    
    final int size = keptDescriptions.size();
    String[] newStrings = keptStrings.toArray(new String[size]);
    int[] newLengths = hasLengths ? keptLengths.toArray(new int[size]) : null;
    int[] newCounts = hasCounts ? keptCounts.toArray(new int[size]) : null;
    String[] newDescriptions = keptDescriptions.toArray(new String[size]);
    TermsResultSet filtered = new TermsResultSet(newStrings, newLengths, 
        newCounts, newDescriptions);
    if(hasOrigTerms) {
      filtered.originalTermStrings = 
          keptOrigTerms.toArray(new String[size][][]);
    }
    return filtered;
  }
  
  /**
   * Merges one or more {@link TermsResultSet}s into a single one in which 
   * every term description occurs exactly once in 
   * {@link #termDescriptions}.
   * 
   * A result set produced by a terms query may associate the same description
   * with several distinct term strings, and, for a federated index, the 
   * result sets coming from different sub-indexes may share descriptions. 
   * This method collapses all entries that have the same description into 
   * one, and records which term strings each input result set contributed to 
   * it in the {@link #originalTermStrings} array of the output.
   * 
   * Details of the merge:
   * <ul>
   * <li>Descriptions and counts are only used if <b>all</b> inputs have them;
   * lengths are used if <b>any</b> input has them.</li>
   * <li>If descriptions are not available, entries are grouped by term 
   * string instead, and the output has neither descriptions nor original 
   * term strings.</li>
   * <li>Counts of merged entries are added together; the length of a merged
   * entry is the first length encountered for it (-1 if there was none).</li>
   * <li>When descriptions are available, the term string of a merged entry 
   * is the sorted list of its distinct original term strings, joined with 
   * <code>" | "</code> (or the entry position, if there are none).</li>
   * </ul>
   * The order of the entries in the output is the iteration order of the 
   * hash map used for grouping.
   * 
   * @param resSets the result sets to combine, e.g. those produced by the 
   * sub-indexes of a federated index.
   * @return the combined result set.
   */
  public static TermsResultSet groupByDescription(TermsResultSet... resSets) {
    // find out which kinds of data can be produced for the merged output
    boolean haveDescriptions = true;
    boolean haveCounts = true;
    boolean haveLengths = false;
    for(TermsResultSet resSet : resSets) {
      haveDescriptions &= (resSet.termDescriptions != null);
      haveCounts &= (resSet.termCounts != null);
      haveLengths |= (resSet.termLengths != null);
    }

    // accumulate the data for each distinct description
    Object2ObjectOpenHashMap<String, TermData> groups = 
        new Object2ObjectOpenHashMap<String, TermData>();
    for(int setIdx = 0; setIdx < resSets.length; setIdx++) {
      TermsResultSet current = resSets[setIdx];
      for(int t = 0; t < current.termStrings.length; t++) {
        // the grouping key: the description, or the term string as stand-in
        String key = haveDescriptions ? 
            current.termDescriptions[t] : current.termStrings[t];
        // the term strings that the current entry stands for
        String[] memberStrings;
        if(current.originalTermStrings == null) {
          // plain entry: it only stands for its own term string
          memberStrings = haveDescriptions ? 
              new String[]{current.termStrings[t]} : null;
        } else {
          // the entry is itself the outcome of an earlier grouping
          String[][] inherited = current.originalTermStrings[t];
          if(inherited.length == 1) {
            // earlier grouping was over a single (non-federated) result set
            memberStrings = inherited[0];
          } else {
            // earlier grouping was federated: use the slot at our own index
            memberStrings = inherited[setIdx];
          }
        }
        
        TermData group = groups.get(key);
        if(group == null) {
          group = new TermData(key, resSets.length);
          groups.put(key, group);
        }
        if(haveDescriptions && memberStrings != null) {
          for(String member : memberStrings) {
            group.addString(setIdx, member);
          }
        }
        if(haveCounts) {
          group.count += current.termCounts[t];
        }
        // only the first length seen for a group is retained
        if(haveLengths && current.termLengths != null && group.length < 0) {
          group.length = current.termLengths[t];
        }
      }
    }
    
    // lay the accumulated groups out as parallel arrays
    final int groupCount = groups.size();
    String[] newStrings = new String[groupCount];
    String[] newDescriptions = haveDescriptions ? new String[groupCount] : null;
    int[] newCounts = haveCounts ? new int[groupCount] : null;
    int[] newLengths = haveLengths ? new int[groupCount] : null;
    String[][][] newOriginalStrings = haveDescriptions ? 
        new String[groupCount][][] : null;
    ObjectIterator<Object2ObjectMap.Entry<String, TermData>> entries = 
        groups.object2ObjectEntrySet().fastIterator();
    for(int pos = 0; entries.hasNext(); pos++) {
      TermData group = entries.next().getValue();
      if(!haveDescriptions) {
        // grouping was done by term string, which is kept in 'description'
        newStrings[pos] = group.description;
      } else {
        newDescriptions[pos] = group.description;
        newOriginalStrings[pos] = group.getStrings();
        newStrings[pos] = buildGroupTermString(newOriginalStrings[pos], pos);
      }
      if(haveCounts) {
        newCounts[pos] = group.count;
      }
      if(haveLengths) {
        newLengths[pos] = group.length;
      }
    }
    
    TermsResultSet merged = new TermsResultSet(newStrings, newLengths, 
        newCounts, newDescriptions);
    merged.originalTermStrings = newOriginalStrings;
    return merged;
  }
  
  /**
   * Builds the term string shown for a merged entry: its distinct original 
   * term strings, sorted and joined with <code>" | "</code>. If there are no
   * original term strings at all, the entry position is used instead.
   * @param subIndexTerms the original term strings of the entry, one array 
   * for each input result set.
   * @param position the position of the entry in the merged result set.
   * @return the term string for the merged entry.
   */
  private static String buildGroupTermString(String[][] subIndexTerms, 
                                             int position) {
    Set<String> distinct = new HashSet<String>();
    for(String[] terms : subIndexTerms) {
      for(String term : terms) {
        distinct.add(term);
      }
    }
    if(distinct.isEmpty()) {
      return Integer.toString(position);
    }
    List<String> ordered = new ArrayList<String>(distinct);
    Collections.sort(ordered);
    StringBuilder label = new StringBuilder(ordered.get(0));
    for(int i = 1; i < ordered.size(); i++) {
      label.append(" | ").append(ordered.get(i));
    }
    return label.toString();
  }
  
  /**
   * Internal accumulator for the data of one merged entry, used while 
   * grouping terms result sets.
   * See {@link TermsResultSet#groupByDescription(TermsResultSet...)}.
   */
  private static class TermData {
    /** The value the entries were grouped by. */
    private final String description;
    
    /** Running sum of the counts; starts at 0. */
    private int count;
    
    /** The term length; -1 for as long as no length has been set. */
    private int length;
    
    /**
     * The number of result sets being combined 
     */
    private final int arity;
    
    /**
     * An array of size {@link #arity}, element at position i containing the 
     * term strings in the result set at position i, for this term description.
     * Slots are created lazily, so they stay null until a string is added.
     */
    private final ObjectArrayList<String>[] strings;

    public TermData(String description, int arity) {
      this.description = description;
      this.arity = arity;
      this.count = 0;
      this.length = -1;
      this.strings = new ObjectArrayList[arity];
    }
    
    /**
     * Adds a new term string for the sub-index at a given position.
     * @param position the index of the result set the string comes from.
     * @param string the term string to record.
     */
    public void addString(int position, String string) {
      ObjectArrayList<String> bucket = strings[position];
      if(bucket == null) {
        bucket = new ObjectArrayList<String>();
        strings[position] = bucket;
      }
      bucket.add(string);
    }
    
    /**
     * Copies the recorded term strings into plain arrays.
     * @return an array with one element for each result set; result sets for
     * which nothing was recorded get an empty array.
     */
    public String[][] getStrings() {
      String[][] res = new String[strings.length][];
      int slot = 0;
      for(ObjectArrayList<String> bucket : strings) {
        res[slot++] = (bucket == null) ? 
            new String[0] : bucket.toArray(new String[bucket.size()]);
      }
      return res;
    }
  }
  
  /**
   * A {@link it.unimi.dsi.fastutil.Swapper} implementation for 
   * {@link TermsResultSet}s: swapping two positions exchanges the 
   * corresponding elements in the term strings array and in each of the other
   * parallel arrays that is not null, so that the arrays remain aligned.
   */
  public static class Swapper implements it.unimi.dsi.fastutil.Swapper {
    private final TermsResultSet trs;
    
    public Swapper(TermsResultSet trs) {
      this.trs = trs;
    }
    
    @Override
    public void swap(int a, int b) {
      // term strings are always present; the other arrays are optional
      swapRefs(trs.termStrings, a, b);
      if(trs.termCounts != null) {
        swapInts(trs.termCounts, a, b);
      }
      if(trs.termLengths != null) {
        swapInts(trs.termLengths, a, b);
      }
      if(trs.termDescriptions != null) {
        swapRefs(trs.termDescriptions, a, b);
      }
      if(trs.originalTermStrings != null) {
        swapRefs(trs.originalTermStrings, a, b);
      }
    }
    
    /** Exchanges two elements of an object array. */
    private static <T> void swapRefs(T[] values, int first, int second) {
      T held = values[first];
      values[first] = values[second];
      values[second] = held;
    }
    
    /** Exchanges two elements of an int array. */
    private static void swapInts(int[] values, int first, int second) {
      int held = values[first];
      values[first] = values[second];
      values[second] = held;
    }
  }
}