// Copyright 2011 Google Inc. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package com.google.enterprise.connector.util.filter; import com.google.common.base.Preconditions; import com.google.common.base.Strings; import com.google.common.collect.ImmutableSet; import com.google.common.collect.Sets; import com.google.enterprise.connector.spi.Document; import com.google.enterprise.connector.spi.Property; import com.google.enterprise.connector.spi.RepositoryException; import com.google.enterprise.connector.spi.SkippedDocumentException; import com.google.enterprise.connector.spi.Value; import java.util.Set; import java.util.regex.Pattern; import java.util.regex.PatternSyntaxException; /** * A {@link Document} filter that forces a Document to be skipped (or not) * based upon the presence/abscence of a specific {@link Property}, * or based upon a match on one of the {@link Value Values} of that * property. * <p> * <b>Example {@code documentFilters.xml} Configurations:</b> * <p> * The following example skips documents that have a {@code NoIndex Property}. * <pre><code> <bean id="NoIndex" class="com.google.enterprise.connector.util.filter.SkipDocumentFilter"> <property name="propertyName" value="NoIndex"/> <property name="skipOnMatch" value="true"/> </bean> </code></pre> * The following example skips documnents whose {@code Classification Property} * value is not {@code PUBLIC} or {@code DECLASSIFIED}. * <pre><code> <!-- Filter out all but PUBLIC and DECLASSIFIED documents. --> <bean id="Classified" class="com.google.enterprise.connector.util.filter.SkipDocumentFilter"> <property name="propertyName" value="Classification"/> <property name="pattern" value="(PUBLIC)|(DECLASSIFIED)"/> <property name="skipOnMatch" value="false"/> </bean> </code></pre> * * @since 2.8.4 */ public class SkipDocumentFilter extends AbstractDocumentFilter { /** The name of the {@link Property} to match. */ protected String propertyName; /** The regex pattern to match in the property {@link Value Values}. */ protected Pattern pattern; /** * If {@code true} skip the document on a match. * If {@code false} skip the document on a failed match. */ protected boolean skipOnMatch; /** * Sets the the name of the {@link Property} to match. If no * {@code pattern} is set, then any Document that exposes the * named property is considered a matching document. If a {@code pattern} * is set, then any value of the property that matches the * regular expression is considered a match. * * @param propertyName the name of the {@link Property} to filter * @throws IllegalArgumentException if {@code propertyName} is {@code null} * or empty */ public void setPropertyName(String propertyName) { Preconditions.checkArgument(!Strings.isNullOrEmpty(propertyName), "propertyName may not be null or empty"); this.propertyName = propertyName; } /** * Sets the regular expression pattern to match in the values. * The supplied {@code pattern} must conform to the syntax defined in * {@link java.util.regex.Pattern}. If one of the property's values * matches this regular expression, this is considered a matching * document. * <p> * If no pattern is specified, then the mere presence of the named * property would be considered a match. * * @param pattern the regular expression pattern to match in the values * @throws PatternSyntaxException if {@code pattern}'s syntax is invalid */ public void setPattern(String pattern) throws PatternSyntaxException { if (!Strings.isNullOrEmpty(pattern)) { this.pattern = Pattern.compile(pattern); } } /** * Sets the skip document behaviour flag. * If {@code true} skip the document on a match. * If {@code false} skip the document on a failed match. * * @param skipOnMatch If {@code true} skip the document on a match, * otherwise skip the document if the match fails. */ public void setSkipOnMatch(boolean skipOnMatch) { this.skipOnMatch = skipOnMatch; } @Override public Set<String> getPropertyNames(Document source) throws RepositoryException { Preconditions.checkState(propertyName != null, "must set propertyName"); return Sets.union(source.getPropertyNames(), ImmutableSet.of(propertyName)); } @Override public Property findProperty(Document source, String name) throws RepositoryException { Preconditions.checkState(propertyName != null, "must set propertyName"); Property prop = source.findProperty(name); if (propertyName.equals(name)) { if (pattern == null) { // If there is no pattern, then check presence/absence of the property. if ((prop == null) ^ skipOnMatch) { throw new SkippedDocumentException("Skipping document based upon " + ((prop == null) ? "absence" : "presence") + " of property " + propertyName); } } else if (prop != null) { Set<String> propNames = source.getPropertyNames(); return new SkipProperty(prop, propNames.contains(propertyName)); } } return prop; } @Override public String toString() { return super.toString() + ": (" + propertyName + " , \"" + ((pattern == null) ? "null" : pattern.pattern()) + "\" , " + skipOnMatch + ")"; } /** * Checks for a pattern match on the source property values. */ private class SkipProperty implements Property { private final Property property; private final boolean isPublished; public SkipProperty(Property property, boolean isPublished) { this.property = property; this.isPublished = isPublished; } @Override public Value nextValue() throws RepositoryException { // Look for a pattern match in any of the property values. Value value = property.nextValue(); if (value != null) { // Use matcher.matches() or matcher.find()? I choose the latter // because you can get the behaviour of the former using \A and \Z // in the pattern. if (pattern.matcher(Strings.nullToEmpty(value.toString())).find() ^ !skipOnMatch) { throw new SkippedDocumentException("Skipping document based upon " + "property " + propertyName + " value: " + value.toString()); } if (!isPublished) { value = Value.getStringValue(""); } } return value; } } }