/** * Copyright (c) 2015 Lemur Consulting Ltd. * <p/> * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * <p/> * http://www.apache.org/licenses/LICENSE-2.0 * <p/> * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package uk.co.flax.biosolr.pruning; import java.util.Collection; import java.util.Comparator; import java.util.Set; import java.util.TreeSet; import java.util.stream.Collectors; import uk.co.flax.biosolr.TreeFacetField; /** * A simple {@link Pruner} implementation, which attempts to strip off the * least significant parent nodes, returning child nodes which either have * content themselves, or have direct children with content. * * @author mlp */ public class SimplePruner implements Pruner { /** * The default number of child nodes with content required for a parent * node to be considered "relevant". */ public static final int MIN_CHILD_COUNT = 3; /** * The parameter used to pass the child count into the component. */ public static final String CHILD_COUNT_PARAM = "childCount"; private final int minChildCount; public SimplePruner(int minChildCount) { this.minChildCount = minChildCount; } @Override public Collection<TreeFacetField> prune(Collection<TreeFacetField> unprunedTrees) { // Prune the trees Collection<TreeFacetField> pruned = stripNonRelevantTrees(unprunedTrees); // Now loop through the top-level nodes, making sure none of the entries // are included in another entry's children pruned = deduplicateTrees(pruned); return pruned; } /** * De-duplicate a collection of top-level trees by checking whether a top-level * node exists in the children of any of the other nodes, and removing it if so. * @param trees the collection of top-level facet trees. * @return the de-duplicated collection. */ private Collection<TreeFacetField> deduplicateTrees(Collection<TreeFacetField> trees) { return trees.stream().filter(t -> !isFacetInChildren(t, 0, trees)).collect(Collectors.toList()); } /** * Check whether a particular facet exists in the children of any other facets * in a collection. * @param facet the facet to check for. * @param level the current level in the hierarchy, starting from 0. * @param trees the collection of trees to check through. * @return <code>true</code> if the facet is found in the child lists. */ private boolean isFacetInChildren(TreeFacetField facet, int level, Collection<TreeFacetField> trees) { boolean retVal = false; if (trees != null) { for (TreeFacetField tree : trees) { if ((level != 0 && tree.equals(facet)) || (isFacetInChildren(facet, level + 1, tree.getHierarchy()))) { retVal = true; break; } } } return retVal; } /** * Prune a collection of facet trees, in order to remove nodes which are * unlikely to be relevant. "Relevant" is defined here to be either * entries with direct hits, or entries with a pre-defined number of * child nodes with direct hits. This can remove several top-level * layers from the tree which don't have direct hits. * @param unprunedTrees the trees which need pruning. * @return a sorted list of pruned trees. */ private Collection<TreeFacetField> stripNonRelevantTrees(Collection<TreeFacetField> unprunedTrees) { // Use a sorted set so the trees come out in count-descending order Set<TreeFacetField> pruned = new TreeSet<>(Comparator.reverseOrder()); for (TreeFacetField tff : unprunedTrees) { if (tff.getCount() > 0) { // Relevant - entry has direct hits pruned.add(tff); } else if (checkChildCounts(tff)) { // Relevant - entry has a number of children with direct hits pruned.add(tff); } else if (tff.hasChildren()) { // Not relevant at this level - recurse through children pruned.addAll(stripNonRelevantTrees(tff.getHierarchy())); } } return pruned; } /** * Check whether the given tree has enough children with direct hits to * be included in the pruned tree. * @param tree the facet tree. * @return <code>true</code> if the tree has enough children to be * included. */ private boolean checkChildCounts(TreeFacetField tree) { long hitCount = 0; if (tree.hasChildren()) { hitCount = tree.getHierarchy().stream().filter(t -> t.getCount() > 0).count(); } return hitCount >= minChildCount; } }