/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.solr.spelling; import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; import java.util.Comparator; import java.util.HashSet; import java.util.Iterator; import java.util.LinkedHashMap; import java.util.List; import java.util.Map; import java.util.NoSuchElementException; import java.util.PriorityQueue; import java.util.Set; import org.apache.lucene.analysis.Token; /** * <p> * Given a list of possible Spelling Corrections for multiple mis-spelled words * in a query, This iterator returns Possible Correction combinations ordered by * reasonable probability that such a combination will return actual hits if * re-queried. This implementation simply ranks the Possible Combinations by the * sum of their component ranks. * </p> * */ public class PossibilityIterator implements Iterator<PossibilityIterator.RankedSpellPossibility> { private List<List<SpellCheckCorrection>> possibilityList = new ArrayList<>(); private Iterator<RankedSpellPossibility> rankedPossibilityIterator = null; private int correctionIndex[]; private boolean done = false; private Iterator<List<SpellCheckCorrection>> nextOnes = null; private int nextOnesRank = 0; private int nextOnesIndex = 0; private boolean suggestionsMayOverlap = false; @SuppressWarnings("unused") private PossibilityIterator() { throw new AssertionError("You shan't go here."); } /** * <p> * We assume here that the passed-in inner LinkedHashMaps are already sorted * in order of "Best Possible Correction". * </p> */ public PossibilityIterator( Map<Token,LinkedHashMap<String,Integer>> suggestions, int maximumRequiredSuggestions, int maxEvaluations, boolean overlap) { this.suggestionsMayOverlap = overlap; for (Map.Entry<Token,LinkedHashMap<String,Integer>> entry : suggestions .entrySet()) { Token token = entry.getKey(); if (entry.getValue().size() == 0) { continue; } List<SpellCheckCorrection> possibleCorrections = new ArrayList<>(); for (Map.Entry<String,Integer> entry1 : entry.getValue().entrySet()) { SpellCheckCorrection correction = new SpellCheckCorrection(); correction.setOriginal(token); correction.setCorrection(entry1.getKey()); correction.setNumberOfOccurences(entry1.getValue()); possibleCorrections.add(correction); } possibilityList.add(possibleCorrections); } int wrapSize = possibilityList.size(); if (wrapSize == 0) { done = true; } else { correctionIndex = new int[wrapSize]; for (int i = 0; i < wrapSize; i++) { int suggestSize = possibilityList.get(i).size(); if (suggestSize == 0) { done = true; break; } correctionIndex[i] = 0; } } PriorityQueue<RankedSpellPossibility> rankedPossibilities = new PriorityQueue<>( 11, new RankComparator()); Set<RankedSpellPossibility> removeDuplicates = null; if (suggestionsMayOverlap) { removeDuplicates = new HashSet<>(); } long numEvaluations = 0; while (numEvaluations < maxEvaluations && internalHasNext()) { RankedSpellPossibility rsp = internalNext(); numEvaluations++; if (rankedPossibilities.size() >= maximumRequiredSuggestions && rsp.rank >= rankedPossibilities.peek().rank) { continue; } if (!isSuggestionForReal(rsp)) { continue; } if (removeDuplicates == null) { rankedPossibilities.offer(rsp); } else { // Needs to be in token-offset order so that the match-and-replace // option for collations can work. Collections.sort(rsp.corrections, new StartOffsetComparator()); if (removeDuplicates.add(rsp)) { rankedPossibilities.offer(rsp); } } if (rankedPossibilities.size() > maximumRequiredSuggestions) { RankedSpellPossibility removed = rankedPossibilities.poll(); if (removeDuplicates != null) { removeDuplicates.remove(removed); } } } RankedSpellPossibility[] rpArr = new RankedSpellPossibility[rankedPossibilities .size()]; for (int i = rankedPossibilities.size() - 1; i >= 0; i--) { rpArr[i] = rankedPossibilities.remove(); } rankedPossibilityIterator = Arrays.asList(rpArr).iterator(); } private boolean isSuggestionForReal(RankedSpellPossibility rsp) { for (SpellCheckCorrection corr : rsp.corrections) { if (!corr.getOriginalAsString().equals(corr.getCorrection())) { return true; } } return false; } private boolean internalHasNext() { if (nextOnes != null && nextOnes.hasNext()) { return true; } if (done) { return false; } internalNextAdvance(); if (nextOnes != null && nextOnes.hasNext()) { return true; } return false; } /** * <p> * This method is converting the independent LinkHashMaps containing various * (silo'ed) suggestions for each mis-spelled word into individual * "holistic query corrections", aka. "Spell Check Possibility" * </p> * <p> * Rank here is the sum of each selected term's position in its respective * LinkedHashMap. * </p> */ private RankedSpellPossibility internalNext() { if (nextOnes != null && nextOnes.hasNext()) { RankedSpellPossibility rsl = new RankedSpellPossibility(); rsl.corrections = nextOnes.next(); rsl.rank = nextOnesRank; rsl.index = nextOnesIndex++; return rsl; } if (done) { throw new NoSuchElementException(); } internalNextAdvance(); if (nextOnes != null && nextOnes.hasNext()) { RankedSpellPossibility rsl = new RankedSpellPossibility(); rsl.corrections = nextOnes.next(); rsl.rank = nextOnesRank; rsl.index = nextOnesIndex++; return rsl; } throw new NoSuchElementException(); } private void internalNextAdvance() { List<SpellCheckCorrection> possibleCorrection = null; if (nextOnes != null && nextOnes.hasNext()) { possibleCorrection = nextOnes.next(); } else { if (done) { throw new NoSuchElementException(); } possibleCorrection = new ArrayList<>(); List<List<SpellCheckCorrection>> possibleCorrections = null; int rank = 0; while (!done && (possibleCorrections == null || possibleCorrections.size() == 0)) { rank = 0; for (int i = 0; i < correctionIndex.length; i++) { List<SpellCheckCorrection> singleWordPossibilities = possibilityList .get(i); SpellCheckCorrection singleWordPossibility = singleWordPossibilities .get(correctionIndex[i]); rank += correctionIndex[i]; if (i == correctionIndex.length - 1) { correctionIndex[i]++; if (correctionIndex[i] == singleWordPossibilities.size()) { correctionIndex[i] = 0; if (correctionIndex.length == 1) { done = true; } for (int ii = i - 1; ii >= 0; ii--) { correctionIndex[ii]++; if (correctionIndex[ii] >= possibilityList.get(ii).size() && ii > 0) { correctionIndex[ii] = 0; } else { break; } } } } possibleCorrection.add(singleWordPossibility); } if (correctionIndex[0] == possibilityList.get(0).size()) { done = true; } if (suggestionsMayOverlap) { possibleCorrections = separateOverlappingTokens(possibleCorrection); } else { possibleCorrections = new ArrayList<>(1); possibleCorrections.add(possibleCorrection); } } nextOnes = possibleCorrections.iterator(); nextOnesRank = rank; nextOnesIndex = 0; } } private List<List<SpellCheckCorrection>> separateOverlappingTokens( List<SpellCheckCorrection> possibleCorrection) { List<List<SpellCheckCorrection>> ret = null; if (possibleCorrection.size() == 1) { ret = new ArrayList<>(1); ret.add(possibleCorrection); return ret; } ret = new ArrayList<>(); for (int i = 0; i < possibleCorrection.size(); i++) { List<SpellCheckCorrection> c = compatible(possibleCorrection, i); ret.add(c); } return ret; } private List<SpellCheckCorrection> compatible(List<SpellCheckCorrection> all, int pos) { List<SpellCheckCorrection> priorPassCompatibles = null; { List<SpellCheckCorrection> firstPassCompatibles = new ArrayList<>( all.size()); SpellCheckCorrection sacred = all.get(pos); firstPassCompatibles.add(sacred); int index = pos; boolean gotOne = false; for (int i = 0; i < all.size() - 1; i++) { index++; if (index == all.size()) { index = 0; } SpellCheckCorrection disposable = all.get(index); if (!conflicts(sacred, disposable)) { firstPassCompatibles.add(disposable); gotOne = true; } } if (!gotOne) { return firstPassCompatibles; } priorPassCompatibles = firstPassCompatibles; } { pos = 1; while (true) { if (pos == priorPassCompatibles.size() - 1) { return priorPassCompatibles; } List<SpellCheckCorrection> subsequentPassCompatibles = new ArrayList<>( priorPassCompatibles.size()); SpellCheckCorrection sacred = null; for (int i = 0; i <= pos; i++) { sacred = priorPassCompatibles.get(i); subsequentPassCompatibles.add(sacred); } int index = pos; boolean gotOne = false; for (int i = 0; i < priorPassCompatibles.size() - 1; i++) { index++; if (index == priorPassCompatibles.size()) { break; } SpellCheckCorrection disposable = priorPassCompatibles.get(index); if (!conflicts(sacred, disposable)) { subsequentPassCompatibles.add(disposable); gotOne = true; } } if (!gotOne || pos == priorPassCompatibles.size() - 1) { return subsequentPassCompatibles; } priorPassCompatibles = subsequentPassCompatibles; pos++; } } } private boolean conflicts(SpellCheckCorrection c1, SpellCheckCorrection c2) { int s1 = c1.getOriginal().startOffset(); int e1 = c1.getOriginal().endOffset(); int s2 = c2.getOriginal().startOffset(); int e2 = c2.getOriginal().endOffset(); if (s2 >= s1 && s2 <= e1) { return true; } if (s1 >= s2 && s1 <= e2) { return true; } return false; } @Override public boolean hasNext() { return rankedPossibilityIterator.hasNext(); } @Override public PossibilityIterator.RankedSpellPossibility next() { return rankedPossibilityIterator.next(); } @Override public void remove() { throw new UnsupportedOperationException(); } public static class RankedSpellPossibility { public List<SpellCheckCorrection> corrections; public int rank; public int index; @Override // hashCode() and equals() only consider the actual correction, not the rank // or index. public int hashCode() { final int prime = 31; int result = 1; result = prime * result + ((corrections == null) ? 0 : corrections.hashCode()); return result; } @Override // hashCode() and equals() only consider the actual correction, not the rank // or index. public boolean equals(Object obj) { if (this == obj) return true; if (obj == null) return false; if (getClass() != obj.getClass()) return false; RankedSpellPossibility other = (RankedSpellPossibility) obj; if (corrections == null) { if (other.corrections != null) return false; } else if (!corrections.equals(other.corrections)) return false; return true; } @Override public String toString() { StringBuilder sb = new StringBuilder(); sb.append("rank=").append(rank).append(" (").append(index).append(")"); if (corrections != null) { for (SpellCheckCorrection corr : corrections) { sb.append(" "); sb.append(corr.getOriginal()).append(">") .append(corr.getCorrection()).append(" (").append( corr.getNumberOfOccurences()).append(")"); } } return sb.toString(); } } private static class StartOffsetComparator implements Comparator<SpellCheckCorrection> { @Override public int compare(SpellCheckCorrection o1, SpellCheckCorrection o2) { return o1.getOriginal().startOffset() - o2.getOriginal().startOffset(); } } private static class RankComparator implements Comparator<RankedSpellPossibility> { // Rank poorer suggestions ahead of better ones for use with a PriorityQueue @Override public int compare(RankedSpellPossibility r1, RankedSpellPossibility r2) { int retval = r2.rank - r1.rank; if (retval == 0) { retval = r2.index - r1.index; } return retval; } } }