// PossibilityIterator.java example
//
// Explorer
// lucene-solr-master
// - lucene
// - solr
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.solr.spelling;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.NoSuchElementException;
import java.util.PriorityQueue;
import java.util.Set;

import org.apache.lucene.analysis.Token;

/**
 * <p>
 * Given a list of possible spelling corrections for multiple misspelled words
 * in a query, this iterator returns possible correction combinations ordered by
 * the reasonable probability that such a combination will return actual hits if
 * re-queried. This implementation simply ranks the possible combinations by the
 * sum of their component ranks.
 * </p>
 */
public class PossibilityIterator implements
    Iterator<PossibilityIterator.RankedSpellPossibility> {
  private List<List<SpellCheckCorrection>> possibilityList = new ArrayList<>();
  private Iterator<RankedSpellPossibility> rankedPossibilityIterator = null;
  private int correctionIndex[];
  private boolean done = false;
  private Iterator<List<SpellCheckCorrection>> nextOnes = null;
  private int nextOnesRank = 0;
  private int nextOnesIndex = 0;
  private boolean suggestionsMayOverlap = false;
  
  /**
   * No-argument constructor that can never complete: it always throws an
   * {@link AssertionError}. Presumably it exists only to make sure instances
   * are created through the parameterized constructor.
   */
  @SuppressWarnings("unused")
  private PossibilityIterator() {
    throw new AssertionError("You shan't go here.");
  }
  
  /**
   * <p>
   * Builds, up front, the ranked list of correction combinations that this
   * iterator will hand out. We assume here that the passed-in inner
   * LinkedHashMaps are already sorted in order of "Best Possible Correction".
   * </p>
   * <p>
   * Tokens whose suggestion map is empty are ignored. Combinations in which
   * every correction equals its original token text are discarded. When
   * {@code overlap} is true, each kept combination is sorted by token start
   * offset and duplicate combinations are dropped.
   * </p>
   * 
   * @param suggestions
   *          for each original token, its candidate corrections mapped to the
   *          number of occurrences, in best-first order
   * @param maximumRequiredSuggestions
   *          the largest number of combinations to retain; a value of zero or
   *          less yields an iterator with no elements
   * @param maxEvaluations
   *          the largest number of combinations to generate and examine
   * @param overlap
   *          whether the original tokens may overlap one another
   */
  public PossibilityIterator(
      Map<Token,LinkedHashMap<String,Integer>> suggestions,
      int maximumRequiredSuggestions, int maxEvaluations, boolean overlap) {
    this.suggestionsMayOverlap = overlap;
    for (Map.Entry<Token,LinkedHashMap<String,Integer>> entry : suggestions
        .entrySet()) {
      LinkedHashMap<String,Integer> tokenSuggestions = entry.getValue();
      if (tokenSuggestions.isEmpty()) {
        continue;
      }
      Token token = entry.getKey();
      List<SpellCheckCorrection> possibleCorrections = new ArrayList<>(
          tokenSuggestions.size());
      for (Map.Entry<String,Integer> entry1 : tokenSuggestions.entrySet()) {
        SpellCheckCorrection correction = new SpellCheckCorrection();
        correction.setOriginal(token);
        correction.setCorrection(entry1.getKey());
        correction.setNumberOfOccurences(entry1.getValue());
        possibleCorrections.add(correction);
      }
      possibilityList.add(possibleCorrections);
    }
    
    // Every list added above holds at least one correction, so the only
    // degenerate case is having no lists at all.
    if (possibilityList.isEmpty()) {
      done = true;
    } else {
      // A fresh int[] is already zero-filled: every cursor starts at the
      // best correction of its token.
      correctionIndex = new int[possibilityList.size()];
    }
    // The comparator puts the poorest combination at the head of the queue so
    // that it is the one evicted when the queue grows past the limit.
    PriorityQueue<RankedSpellPossibility> rankedPossibilities = new PriorityQueue<>(
        11, new RankComparator());
    Set<RankedSpellPossibility> removeDuplicates = null;
    if (suggestionsMayOverlap) {
      removeDuplicates = new HashSet<>();
    }
    long numEvaluations = 0;
    // With a non-positive limit nothing can ever be retained; skipping the
    // loop also avoids dereferencing peek() on an empty queue.
    while (maximumRequiredSuggestions > 0 && numEvaluations < maxEvaluations
        && internalHasNext()) {
      RankedSpellPossibility rsp = internalNext();
      numEvaluations++;
      // Queue is full and this one is no better than the current worst.
      if (rankedPossibilities.size() >= maximumRequiredSuggestions
          && rsp.rank >= rankedPossibilities.peek().rank) {
        continue;
      }
      if (!isSuggestionForReal(rsp)) {
        continue;
      }
      if (removeDuplicates == null) {
        rankedPossibilities.offer(rsp);
      } else {
        // Needs to be in token-offset order so that the match-and-replace
        // option for collations can work.
        Collections.sort(rsp.corrections, new StartOffsetComparator());
        if (removeDuplicates.add(rsp)) {
          rankedPossibilities.offer(rsp);
        }
      }
      if (rankedPossibilities.size() > maximumRequiredSuggestions) {
        RankedSpellPossibility removed = rankedPossibilities.poll();
        if (removeDuplicates != null) {
          removeDuplicates.remove(removed);
        }
      }
    }
    
    // The queue releases the poorest entry first, so fill the array from the
    // back to end up with the best combination at index 0.
    RankedSpellPossibility[] rpArr = new RankedSpellPossibility[rankedPossibilities
        .size()];
    for (int i = rankedPossibilities.size() - 1; i >= 0; i--) {
      rpArr[i] = rankedPossibilities.remove();
    }
    rankedPossibilityIterator = Arrays.asList(rpArr).iterator();
  }
  
  /**
   * Tells whether a combination actually changes something, i.e. whether at
   * least one of its corrections differs from the text of its original token.
   * 
   * @param rsp
   *          the combination to inspect
   * @return true as soon as one correction differs from its original; false
   *         when every correction equals its original (or there are none)
   */
  private boolean isSuggestionForReal(RankedSpellPossibility rsp) {
    Iterator<SpellCheckCorrection> remaining = rsp.corrections.iterator();
    while (remaining.hasNext()) {
      SpellCheckCorrection candidate = remaining.next();
      boolean unchanged = candidate.getOriginalAsString().equals(
          candidate.getCorrection());
      if (!unchanged) {
        return true;
      }
    }
    return false;
  }
  
  /**
   * Reports whether another raw (not yet ranked) combination can be produced.
   * If the current batch is used up and generation is not finished, the next
   * batch is generated as a side effect.
   * 
   * @return true if {@link #internalNext()} can return another combination
   */
  private boolean internalHasNext() {
    boolean batchHasMore = nextOnes != null && nextOnes.hasNext();
    if (batchHasMore || done) {
      // Either something is ready right now, or nothing ever will be.
      return batchHasMore;
    }
    internalNextAdvance();
    return nextOnes != null && nextOnes.hasNext();
  }
  
  /**
   * <p>
   * Produces the next "holistic query correction" (a "Spell Check
   * Possibility"), combining one suggestion from each of the independent
   * per-word suggestion lists.
   * </p>
   * <p>
   * The rank assigned is the sum of each selected term's position in its
   * respective list; the index is the element's position within the current
   * batch.
   * </p>
   * 
   * @return the next combination
   * @throws NoSuchElementException
   *           if no further combination can be produced
   */
  private RankedSpellPossibility internalNext() {
    boolean batchHasMore = nextOnes != null && nextOnes.hasNext();
    if (!batchHasMore) {
      // Current batch is exhausted: try to generate a new one.
      if (done) {
        throw new NoSuchElementException();
      }
      internalNextAdvance();
      if (nextOnes == null || !nextOnes.hasNext()) {
        throw new NoSuchElementException();
      }
    }
    RankedSpellPossibility possibility = new RankedSpellPossibility();
    possibility.corrections = nextOnes.next();
    possibility.rank = nextOnesRank;
    possibility.index = nextOnesIndex++;
    return possibility;
  }
  
  /**
   * Generates the next batch of combinations and stores it in
   * {@code nextOnes}, together with its rank ({@code nextOnesRank}) and a
   * reset batch index ({@code nextOnesIndex}).
   * <p>
   * {@code correctionIndex} acts like an odometer over the per-token
   * suggestion lists: the last position is incremented on every call and
   * carries into the positions to its left when it wraps around.
   * </p>
   * 
   * @throws NoSuchElementException
   *           if the current batch is exhausted and generation is finished
   */
  private void internalNextAdvance() {
    List<SpellCheckCorrection> possibleCorrection = null;
    if (nextOnes != null && nextOnes.hasNext()) {
      // NOTE(review): this consumes one element of the current batch and
      // drops it. The callers visible in this file only get here once the
      // batch is exhausted, so this branch looks unreachable -- confirm.
      possibleCorrection = nextOnes.next();
    } else {
      if (done) {
        throw new NoSuchElementException();
      }
      possibleCorrection = new ArrayList<>();
      List<List<SpellCheckCorrection>> possibleCorrections = null;
      int rank = 0;
      while (!done
          && (possibleCorrections == null || possibleCorrections.size() == 0)) {
        rank = 0;
        for (int i = 0; i < correctionIndex.length; i++) {
          // Pick, for token i, the suggestion the odometer currently points at.
          List<SpellCheckCorrection> singleWordPossibilities = possibilityList
              .get(i);
          SpellCheckCorrection singleWordPossibility = singleWordPossibilities
              .get(correctionIndex[i]);
          // Rank is the sum of the selected positions (0 = best suggestion).
          rank += correctionIndex[i];
          if (i == correctionIndex.length - 1) {
            // Last token: move the odometer on for the next call.
            correctionIndex[i]++;
            if (correctionIndex[i] == singleWordPossibilities.size()) {
              // Wrapped around: reset and carry to the left.
              correctionIndex[i] = 0;
              if (correctionIndex.length == 1) {
                // Only one token, so wrapping means everything was visited.
                done = true;
              }
              for (int ii = i - 1; ii >= 0; ii--) {
                correctionIndex[ii]++;
                // Position 0 is deliberately never reset: leaving it past the
                // end of its list is what signals completion below.
                if (correctionIndex[ii] >= possibilityList.get(ii).size()
                    && ii > 0) {
                  correctionIndex[ii] = 0;
                } else {
                  break;
                }
              }
            }
          }
          possibleCorrection.add(singleWordPossibility);
        }
        // The leftmost position ran off the end: all combinations were visited.
        if (correctionIndex[0] == possibilityList.get(0).size()) {
          done = true;
        }
        if (suggestionsMayOverlap) {
          // One combination may expand into several non-conflicting subsets.
          possibleCorrections = separateOverlappingTokens(possibleCorrection);
        } else {
          possibleCorrections = new ArrayList<>(1);
          possibleCorrections.add(possibleCorrection);
        }
      }
      nextOnes = possibleCorrections.iterator();
      nextOnesRank = rank;
      nextOnesIndex = 0;
    }
  }
  
  /**
   * Expands one combination into a list of combinations whose tokens do not
   * conflict with each other. A single-element combination is returned as is,
   * wrapped in a list; otherwise one result is computed per element via
   * {@link #compatible(List, int)}, using that element as the starting point.
   * 
   * @param possibleCorrection
   *          the combination to split up
   * @return the resulting combinations, one per element of the input
   */
  private List<List<SpellCheckCorrection>> separateOverlappingTokens(
      List<SpellCheckCorrection> possibleCorrection) {
    final int total = possibleCorrection.size();
    List<List<SpellCheckCorrection>> separated;
    if (total != 1) {
      separated = new ArrayList<>();
      int anchor = 0;
      while (anchor < total) {
        separated.add(compatible(possibleCorrection, anchor));
        anchor++;
      }
    } else {
      // Nothing to conflict with: hand back the lone combination unchanged.
      separated = new ArrayList<>(1);
      separated.add(possibleCorrection);
    }
    return separated;
  }
  
  /**
   * Builds a subset of {@code all} that always contains the element at
   * {@code pos} and from which conflicting elements (see
   * {@link #conflicts(SpellCheckCorrection, SpellCheckCorrection)}) are
   * filtered out in successive passes.
   * <p>
   * The first pass keeps the element at {@code pos} plus every other element
   * that does not conflict with it, visiting the others circularly starting
   * right after {@code pos}. Each later pass keeps the first {@code pos + 1}
   * elements of the previous result and, of the remaining ones, only those
   * that do not conflict with the element at {@code pos}.
   * </p>
   * 
   * @param all
   *          the full combination; it is only read, never modified
   * @param pos
   *          index in {@code all} of the element that must be kept
   * @return the filtered list, starting with the element at {@code pos}
   */
  private List<SpellCheckCorrection> compatible(List<SpellCheckCorrection> all,
      int pos) {
    List<SpellCheckCorrection> priorPassCompatibles = null;
    {
      // First pass: filter everything against the element at pos.
      List<SpellCheckCorrection> firstPassCompatibles = new ArrayList<>(
          all.size());
      SpellCheckCorrection sacred = all.get(pos);
      firstPassCompatibles.add(sacred);
      int index = pos;
      boolean gotOne = false;
      // Visit the other all.size() - 1 elements, wrapping past the end.
      for (int i = 0; i < all.size() - 1; i++) {
        index++;
        if (index == all.size()) {
          index = 0;
        }
        SpellCheckCorrection disposable = all.get(index);
        if (!conflicts(sacred, disposable)) {
          firstPassCompatibles.add(disposable);
          gotOne = true;
        }
      }
      if (!gotOne) {
        // Everything else conflicts: the result is just the sacred element.
        return firstPassCompatibles;
      }
      priorPassCompatibles = firstPassCompatibles;
    }
    
    {
      // Later passes: the parameter is reused as a cursor into the previous
      // result; element 0 was already handled by the first pass.
      pos = 1;
      while (true) {
        if (pos == priorPassCompatibles.size() - 1) {
          // The cursor is on the last element: nothing left to filter.
          return priorPassCompatibles;
        }
        List<SpellCheckCorrection> subsequentPassCompatibles = new ArrayList<>(
            priorPassCompatibles.size());
        SpellCheckCorrection sacred = null;
        // Keep elements 0..pos as they are; after the loop, sacred is the
        // element at pos.
        for (int i = 0; i <= pos; i++) {
          sacred = priorPassCompatibles.get(i);
          subsequentPassCompatibles.add(sacred);
        }
        int index = pos;
        boolean gotOne = false;
        // Filter the elements after pos against sacred; the break ends the
        // scan at the end of the list (no wrap-around here).
        for (int i = 0; i < priorPassCompatibles.size() - 1; i++) {
          index++;
          if (index == priorPassCompatibles.size()) {
            break;
          }
          SpellCheckCorrection disposable = priorPassCompatibles.get(index);
          if (!conflicts(sacred, disposable)) {
            subsequentPassCompatibles.add(disposable);
            gotOne = true;
          }
        }
        // NOTE(review): the second half of this condition repeats the check
        // made at the top of this iteration with unchanged values, so it
        // appears to be always false here -- confirm.
        if (!gotOne || pos == priorPassCompatibles.size() - 1) {
          return subsequentPassCompatibles;
        }
        priorPassCompatibles = subsequentPassCompatibles;
        pos++;
      }
    }
  }
  
  /**
   * Decides whether the original tokens of two corrections conflict, judged
   * by their offsets: they conflict when the start offset of either token lies
   * within the other token's start..end range, both bounds included.
   * 
   * @param c1
   *          the first correction
   * @param c2
   *          the second correction
   * @return true if the two original tokens conflict
   */
  private boolean conflicts(SpellCheckCorrection c1, SpellCheckCorrection c2) {
    final Token first = c1.getOriginal();
    final int firstStart = first.startOffset();
    final int firstEnd = first.endOffset();
    final Token second = c2.getOriginal();
    final int secondStart = second.startOffset();
    final int secondEnd = second.endOffset();
    boolean secondStartsInFirst = firstStart <= secondStart
        && secondStart <= firstEnd;
    boolean firstStartsInSecond = secondStart <= firstStart
        && firstStart <= secondEnd;
    return secondStartsInFirst || firstStartsInSecond;
  }
  
  /**
   * Reports whether another ranked combination is left. All combinations are
   * computed by the constructor; this only delegates to the iterator over
   * that pre-built list.
   * 
   * @return true if {@link #next()} can return another element
   */
  @Override
  public boolean hasNext() {
    return rankedPossibilityIterator.hasNext();
  }
  
  /**
   * Returns the next ranked combination from the list pre-built by the
   * constructor, by delegating to the underlying iterator.
   * 
   * @return the next ranked combination
   */
  @Override
  public PossibilityIterator.RankedSpellPossibility next() {
    return rankedPossibilityIterator.next();
  }
  
  /**
   * Removal is not supported by this iterator.
   * 
   * @throws UnsupportedOperationException
   *           always
   */
  @Override
  public void remove() {
    throw new UnsupportedOperationException();
  }
  
  /**
   * A combination of corrections together with the rank and batch index it
   * was given. Equality and hashing look at the corrections only, never at
   * the rank or the index.
   */
  public static class RankedSpellPossibility {
    /** The corrections that make up this combination. */
    public List<SpellCheckCorrection> corrections;
    /** Rank of the combination. */
    public int rank;
    /** Index used to break ties between combinations of equal rank. */
    public int index;
    
    // hashCode() and equals() only consider the actual correction, not the rank
    // or index.
    @Override
    public int hashCode() {
      int correctionsHash = 0;
      if (corrections != null) {
        correctionsHash = corrections.hashCode();
      }
      return 31 + correctionsHash;
    }
    
    // hashCode() and equals() only consider the actual correction, not the rank
    // or index.
    @Override
    public boolean equals(Object obj) {
      if (obj == this) {
        return true;
      }
      if (obj == null || obj.getClass() != getClass()) {
        return false;
      }
      RankedSpellPossibility that = (RankedSpellPossibility) obj;
      if (corrections == null) {
        return that.corrections == null;
      }
      return corrections.equals(that.corrections);
    }
    
    /**
     * Renders the rank, the index in parentheses and then, for each
     * correction, "original&gt;correction (occurrences)".
     */
    @Override
    public String toString() {
      StringBuilder text = new StringBuilder();
      text.append("rank=");
      text.append(rank);
      text.append(" (");
      text.append(index);
      text.append(")");
      if (corrections == null) {
        return text.toString();
      }
      for (SpellCheckCorrection corr : corrections) {
        text.append("     ");
        text.append(corr.getOriginal());
        text.append(">");
        text.append(corr.getCorrection());
        text.append(" (");
        text.append(corr.getNumberOfOccurences());
        text.append(")");
      }
      return text.toString();
    }
  }
  
  /**
   * Orders corrections by the start offset of their original token, smallest
   * offset first.
   */
  private static class StartOffsetComparator implements
      Comparator<SpellCheckCorrection> {
    @Override
    public int compare(SpellCheckCorrection o1, SpellCheckCorrection o2) {
      // Integer.compare instead of subtraction: same ordering, but it cannot
      // overflow whatever the offsets are.
      return Integer.compare(o1.getOriginal().startOffset(), o2.getOriginal()
          .startOffset());
    }
  }
  
  /**
   * Orders combinations from poorest to best: higher rank first and, for equal
   * ranks, higher index first.
   */
  private static class RankComparator implements Comparator<RankedSpellPossibility> {
    // Rank poorer suggestions ahead of better ones for use with a PriorityQueue
    @Override
    public int compare(RankedSpellPossibility r1, RankedSpellPossibility r2) {
      // Arguments are swapped on purpose to get the descending order;
      // Integer.compare avoids the overflow that plain subtraction can hit.
      int retval = Integer.compare(r2.rank, r1.rank);
      if (retval == 0) {
        retval = Integer.compare(r2.index, r1.index);
      }
      return retval;
    }
  }
  
}