/* * TermsResultSet.java * * Copyright (c) 2007-2011, The University of Sheffield. * * This file is part of GATE Mímir (see http://gate.ac.uk/family/mimir.html), * and is free software, licenced under the GNU Lesser General Public License, * Version 3, June 2007 (also included with this distribution as file * LICENCE-LGPL3.html). * * Valentin Tablan, 13 Jul 2012 * * $Id$ */ package gate.mimir.search.terms; import gate.mimir.SemanticAnnotationHelper; import it.unimi.dsi.fastutil.Arrays; import it.unimi.dsi.fastutil.ints.AbstractIntComparator; import it.unimi.dsi.fastutil.ints.IntArrayList; import it.unimi.dsi.fastutil.objects.Object2ObjectMap; import it.unimi.dsi.fastutil.objects.Object2ObjectOpenHashMap; import it.unimi.dsi.fastutil.objects.ObjectArrayList; import it.unimi.dsi.fastutil.objects.ObjectIterator; import it.unimi.dsi.fastutil.objects.ObjectOpenHashSet; import java.io.Serializable; import java.util.ArrayList; import java.util.Collections; import java.util.HashSet; import java.util.Iterator; import java.util.List; import java.util.Set; /** * Class representing the results of a {@link TermsQuery}. * A terms result set is a set of terms, represented by their * {@link #termStrings}. Optionally {@link #termCounts}, * {@link #termDescriptions}, and {@link #termLengths} may also be available. */ public class TermsResultSet implements Serializable { /** * Serialization ID. */ private static final long serialVersionUID = -7722325563637139625L; /** * The lengths (number of tokens) for the terms. Array parallel with * {@link #termStrings}, and {@link #termDescriptions}. */ public final int[] termLengths; /** * The strings for the terms. Array parallel with * {@link #termCounts} and {@link #termDescriptions}. */ public final String[] termStrings; /** * This field is populated by the * {@link #groupByDescription(TermsResultSet...)} method. It contains term * strings from the original result sets indexed by position in this result * set, and the index of the results set. For example * originalTermStrings[i][j] is a String[], containing all the term strings * associated with termDescriptions[i] in the j<sup>th</sup> result set. */ public String[][][] originalTermStrings; /** * For annotation indexes, the term string is simply a URI in whatever format * is used by the {@link SemanticAnnotationHelper} that was used to index the * annotations. These URIs are not useful outside of the annotation helper * and index, so term descriptions can be requested. If term descriptions were * produced during the search, they are stored in this array (which is aligned * with {@link #termIds} and {@link #termCounts}). */ public final String[] termDescriptions; /** * The counts (numbers of occurrences) for the terms. Array parallel with * {@link #termStrings} and {@link #termIds}. */ public final int[] termCounts; public TermsResultSet(String[] termStrings, int[] termLengths, int[] termCounts, String[] termDescriptions) { super(); this.termStrings = termStrings; this.termLengths = termLengths; this.termCounts = termCounts; this.termDescriptions = termDescriptions; } /** * Constant representing the empty result set. */ public static final TermsResultSet EMPTY = new TermsResultSet( new String[]{}, new int[] {}, new int[]{}, new String[]{}); /** * Given a position in {@link #termDescriptions}, this method computes all * term strings that had that description in each of the sub-indexes of the * federated index that produced this result set. * @param termPosition the term for which the original term strings are being * requested. * @return An array where element at position i is an array containing all the * term strings (in the dictionary of sub-index i) that had the given term * description when the original query was answered by sub-index i, or null * if original terms strings are not available. */ public String[][] getSubIndexTerms(int termPosition) { return (originalTermStrings != null) ? originalTermStrings[termPosition] : null; } /** * Tries to locate the correct term position and calls * {@link #getSubIndexTerms(int)}. * @param termString * @return */ public String[][] getSubIndexTerms(String termString) { int termPos = -1; try { termPos = Integer.parseInt(termString); } catch (Exception e) {} if(termStrings[termPos].equals(termString)) { return getSubIndexTerms(termPos); } else{ // could not convert it: leave it unchanged return new String[][]{{termString}}; } } /** * Sorts the arrays inside a {@link TermsResultSet} using the termString for * comparison. * @param trs */ public static void sortTermsResultSetByTermString(final TermsResultSet trs) { Arrays.quickSort(0, trs.termStrings.length, new AbstractIntComparator() { @Override public int compare(int k1, int k2) { return trs.termStrings[k1].compareTo(trs.termStrings[k2]); } }, new Swapper(trs)); } /** * Enumerates a result set and produces a new one after removing all the terms * with descriptions in the banned list. * @param bannedDescriptions A String array containing all the banned term * descriptions. * @param setToFilter the terms result set to filter * @return the filtered result set. */ public static TermsResultSet filterByDescriptionNot(TermsResultSet setToFilter, String... bannedDescriptions) { final boolean descriptionsAvailable = setToFilter.termDescriptions != null; if(!descriptionsAvailable) return setToFilter; final boolean countsAvailable = setToFilter.termCounts != null; final boolean lengthsAvailable = setToFilter.termLengths != null; final boolean origTermsAvailable = setToFilter.originalTermStrings != null; IntArrayList counts = countsAvailable ? new IntArrayList() : null; IntArrayList lengths = lengthsAvailable ? new IntArrayList() : null; ObjectArrayList<String> strings = new ObjectArrayList<String>(); ObjectArrayList<String> descriptions = new ObjectArrayList<String>(); ObjectArrayList<String[][]> origTerms = new ObjectArrayList<String[][]>(); ObjectOpenHashSet<String> bannedSet = new ObjectOpenHashSet<String>(bannedDescriptions); for(int i = 0; i < setToFilter.termDescriptions.length; i++) { if(!bannedSet.contains(setToFilter.termDescriptions[i])) { descriptions.add(setToFilter.termDescriptions[i]); strings.add(setToFilter.termStrings[i]); if(countsAvailable) counts.add(setToFilter.termCounts[i]); if(lengthsAvailable) lengths.add(setToFilter.termLengths[i]); if(origTermsAvailable)origTerms.add(setToFilter.originalTermStrings[i]); } } int size = descriptions.size(); TermsResultSet res = new TermsResultSet( strings.toArray(new String[size]), lengthsAvailable ? lengths.toArray(new int[size]) : null, countsAvailable ? counts.toArray(new int[size]) : null, descriptions.toArray(new String[size])); if(origTermsAvailable) res.originalTermStrings = origTerms.toArray(new String[size][][]); return res; } /** * This method re-arranges the data included in one or more * {@link TermsResultSet} values so that each term description occurs only * once in the {@link #termDescriptions} array. * * A {@link TermsResultSet} obtained when calling * {@link TermsQuery#execute(gate.mimir.search.QueryEngine)} may include the * same description for multiple term strings: depending on the implementation * used to describe terms, distinct terms may end up with the same * description. This could cause confusion when the output is presented to * the user, as they would have no way to distinguish between the different * terms. * * When executing a terms query against a federated index, each sub-index * returns its own result set. Terms originating in different sub-indexes can * have the same description. * * This method combines these into a unified result set that preserves the * right term ID to term description mappings by populating the * {@link #originalTermStrings} array. * * @param resSets the result sets produced by the sub-indexes of a federated * index. * @return the combined result set. */ public static TermsResultSet groupByDescription(TermsResultSet... resSets) { boolean descriptionsAvaialble = true; boolean countsAvaialble = true; boolean lengthsAvaialble = false; for(TermsResultSet trs : resSets) { if(trs.termDescriptions == null) { descriptionsAvaialble = false; } if(trs.termCounts == null) { countsAvaialble = false; } if(trs.termLengths != null) { lengthsAvaialble = true; } } Object2ObjectOpenHashMap<String, TermData> desc2TermData = new Object2ObjectOpenHashMap<String, TermData>(); for(int subIndexPos = 0; subIndexPos < resSets.length; subIndexPos++) { TermsResultSet trs = resSets[subIndexPos]; for(int i = 0; i < trs.termStrings.length; i++) { String description = descriptionsAvaialble ? trs.termDescriptions[i] : trs.termStrings[i]; // String string = descriptionsAvaialble ? trs.termStrings[i] : null; // get all the strings describing the current term String[] strings = null; if(trs.originalTermStrings != null) { // old TRS already has original term strings if(trs.originalTermStrings[i].length == 1) { // old TRS was not federated strings = trs.originalTermStrings[i][0]; } else { // old TRS was federated: get the term strings from the correct sub-index strings = trs.originalTermStrings[i][subIndexPos]; } } else { // no old original term strings: use the actual term string strings = descriptionsAvaialble ? new String[]{trs.termStrings[i]} : null; } TermData tData = desc2TermData.get(description); if(tData == null) { tData = new TermData(description, resSets.length); desc2TermData.put(description, tData); } if(descriptionsAvaialble && strings != null){ for(String s : strings) tData.addString(subIndexPos, s); // tData.addString(subIndexPos, string); } if(countsAvaialble) { tData.count += trs.termCounts[i]; } if(lengthsAvaialble && trs.termLengths != null && tData.length < 0) { tData.length = trs.termLengths[i]; } } } // produce the compound result set String[] newStrings = new String[desc2TermData.size()]; String[] newDescriptions = descriptionsAvaialble ? new String[desc2TermData.size()] : null; int[] newCounts = countsAvaialble ? new int[desc2TermData.size()] : null; int[] newLenghts = lengthsAvaialble ? new int[desc2TermData.size()] : null; String[][][] originalTermStrings = descriptionsAvaialble ? new String[desc2TermData.size()][][] : null; ObjectIterator<Object2ObjectMap.Entry<String, TermData>> iter = desc2TermData.object2ObjectEntrySet().fastIterator(); int pos = 0; while(iter.hasNext()) { TermData tData = iter.next().getValue(); if(descriptionsAvaialble) { newDescriptions[pos] = tData.description; originalTermStrings[pos] = tData.getStrings(); // term string does not actually mean anything; // we use the term position instead // newStrings[pos] = Integer.toString(pos); Set<String> uniq = new HashSet<String>(); for(String[] terms : originalTermStrings[pos]) { for(String term : terms) { uniq.add(term); } } if(uniq.isEmpty()) { newStrings[pos] = Integer.toString(pos); } else { List<String> termList= new ArrayList<String>(uniq); Collections.sort(termList); StringBuilder strb = new StringBuilder(termList.get(0)); for(int i = 1; i < termList.size(); i++) { strb.append(" | ").append(termList.get(i)); } newStrings[pos] = strb.toString(); } } else { newStrings[pos] = tData.description; } if(countsAvaialble) newCounts[pos] = tData.count; if(lengthsAvaialble) newLenghts[pos] = tData.length; pos++; } TermsResultSet res = new TermsResultSet(newStrings, newLenghts, newCounts, newDescriptions); res.originalTermStrings = originalTermStrings; return res; } /** * Class used internally to store the term data when grouping terms results sets. * See {@link TermsResultSet#groupByDescription(TermsResultSet...)}. */ private static class TermData { private String description; private int count; private int length; /** * The number of result sets being combined */ private int arity; /** * An array of size {@link #arity}, element at position i containing the * term strings in the result set at position i, for this term description. */ private ObjectArrayList<String>[] strings; public TermData(String description, int arity) { super(); this.description = description; this.arity = arity; strings = new ObjectArrayList[arity]; this.count = 0; this.length = -1; } /** * Adds a new term string for the sub-index at a given position. * @param position * @param string */ public void addString(int position, String string) { if(strings[position] == null) { strings[position] = new ObjectArrayList<String>(); } strings[position].add(string); } public String[][] getStrings() { String[][] res = new String[strings.length][]; for(int i = 0; i < strings.length; i++) { if(strings[i] == null) { res[i] = new String[0]; } else { res[i] = strings[i].toArray(new String[strings[i].size()]); } } return res; } } /** * A {@link it.unimi.dsi.fastutil.Swapper} implementation for * {@link TermsResultSet}s. */ public static class Swapper implements it.unimi.dsi.fastutil.Swapper { private TermsResultSet trs; public Swapper(TermsResultSet trs) { this.trs = trs; } @Override public void swap(int a, int b) { String termString = trs.termStrings[a]; trs.termStrings[a] = trs.termStrings[b]; trs.termStrings[b] = termString; if(trs.termCounts != null) { int termCount = trs.termCounts[a]; trs.termCounts[a] = trs.termCounts[b]; trs.termCounts[b] = termCount; } if(trs.termLengths != null) { int termLength = trs.termLengths[a]; trs.termLengths[a] = trs.termLengths[b]; trs.termLengths[b] = termLength; } if(trs.termDescriptions != null) { String termDesc = trs.termDescriptions[a]; trs.termDescriptions[a] = trs.termDescriptions[b]; trs.termDescriptions[b] = termDesc; } if(trs.originalTermStrings != null) { String[][] origTSs = trs.originalTermStrings[a]; trs.originalTermStrings[a] = trs.originalTermStrings[b]; trs.originalTermStrings[b] = origTSs; } } } }