// Copyright 2011 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package com.google.enterprise.connector.util.filter;
import com.google.common.base.Preconditions;
import com.google.common.base.Strings;
import com.google.common.collect.ImmutableSet;
import com.google.common.collect.Sets;
import com.google.enterprise.connector.spi.Document;
import com.google.enterprise.connector.spi.Property;
import com.google.enterprise.connector.spi.RepositoryException;
import com.google.enterprise.connector.spi.SkippedDocumentException;
import com.google.enterprise.connector.spi.Value;
import java.util.Set;
import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;
/**
* A {@link Document} filter that forces a Document to be skipped (or not)
* based upon the presence/absence of a specific {@link Property},
* or based upon a match on one of the {@link Value Values} of that
* property.
* <p>
* <b>Example {@code documentFilters.xml} Configurations:</b>
* <p>
* The following example skips documents that have a {@code NoIndex Property}.
* <pre><code>
&lt;bean id="NoIndex"
class="com.google.enterprise.connector.util.filter.SkipDocumentFilter"&gt;
&lt;property name="propertyName" value="NoIndex"/&gt;
&lt;property name="skipOnMatch" value="true"/&gt;
&lt;/bean&gt;
</code></pre>
* The following example skips documents whose {@code Classification Property}
* value is not {@code PUBLIC} or {@code DECLASSIFIED}.
* <pre><code>
&lt;!-- Filter out all but PUBLIC and DECLASSIFIED documents. --&gt;
&lt;bean id="Classified"
class="com.google.enterprise.connector.util.filter.SkipDocumentFilter"&gt;
&lt;property name="propertyName" value="Classification"/&gt;
&lt;property name="pattern" value="(PUBLIC)|(DECLASSIFIED)"/&gt;
&lt;property name="skipOnMatch" value="false"/&gt;
&lt;/bean&gt;
</code></pre>
*
* @since 2.8.4
*/
public class SkipDocumentFilter extends AbstractDocumentFilter {

  /** The name of the {@link Property} to match. */
  protected String propertyName;

  /**
   * The regex pattern to match in the property {@link Value Values},
   * or {@code null} if only the presence of the property is tested.
   */
  protected Pattern pattern;

  /**
   * If {@code true} skip the document on a match.
   * If {@code false} skip the document on a failed match.
   */
  protected boolean skipOnMatch;

  /**
   * Sets the name of the {@link Property} to match. If no
   * {@code pattern} is set, then any Document that exposes the
   * named property is considered a matching document. If a {@code pattern}
   * is set, then any value of the property that matches the
   * regular expression is considered a match.
   *
   * @param propertyName the name of the {@link Property} to filter
   * @throws IllegalArgumentException if {@code propertyName} is {@code null}
   *         or empty
   */
  public void setPropertyName(String propertyName) {
    Preconditions.checkArgument(!Strings.isNullOrEmpty(propertyName),
        "propertyName may not be null or empty");
    this.propertyName = propertyName;
  }

  /**
   * Sets the regular expression pattern to match in the values.
   * The supplied {@code pattern} must conform to the syntax defined in
   * {@link java.util.regex.Pattern}. If one of the property's values
   * matches this regular expression, this is considered a matching
   * document.
   * <p>
   * If no pattern is specified, then the mere presence of the named
   * property would be considered a match. Supplying {@code null} or an
   * empty string clears any previously configured pattern, restoring
   * that presence-only behaviour.
   *
   * @param pattern the regular expression pattern to match in the values,
   *        or {@code null}/empty to match on presence of the property only
   * @throws PatternSyntaxException if {@code pattern}'s syntax is invalid;
   *         the previously configured pattern (if any) is left unchanged
   */
  public void setPattern(String pattern) throws PatternSyntaxException {
    // The regex is compiled before the assignment takes place, so a syntax
    // error cannot leave the filter with a half-updated configuration.
    this.pattern =
        Strings.isNullOrEmpty(pattern) ? null : Pattern.compile(pattern);
  }

  /**
   * Sets the skip document behaviour flag.
   * If {@code true} skip the document on a match.
   * If {@code false} skip the document on a failed match.
   *
   * @param skipOnMatch If {@code true} skip the document on a match,
   *        otherwise skip the document if the match fails.
   */
  public void setSkipOnMatch(boolean skipOnMatch) {
    this.skipOnMatch = skipOnMatch;
  }

  /**
   * Returns the source document's property names with the filtered
   * {@code propertyName} added, whether or not the source lists it.
   *
   * @param source the document being filtered
   * @return the union of the source's property names and {@code propertyName}
   * @throws IllegalStateException if {@code propertyName} has not been set
   * @throws RepositoryException if the source fails to supply its names
   */
  @Override
  public Set<String> getPropertyNames(Document source)
      throws RepositoryException {
    Preconditions.checkState(propertyName != null, "must set propertyName");
    return Sets.union(source.getPropertyNames(),
        ImmutableSet.of(propertyName));
  }

  /**
   * Looks up {@code name} in the source document, applying the skip rule
   * when {@code name} is the filtered property.
   * <ul>
   * <li>Any other property is returned from the source untouched.</li>
   * <li>With no pattern configured, the presence or absence of the property
   *     decides immediately whether the document is skipped.</li>
   * <li>With a pattern configured and the property present, the property is
   *     wrapped so that each value is tested as it is read.</li>
   * <li>With a pattern configured and the property absent, {@code null} is
   *     returned and the document is not skipped, whatever the value of
   *     {@code skipOnMatch}.</li>
   * </ul>
   *
   * @param source the document being filtered
   * @param name the name of the property to find
   * @return the (possibly wrapped) property, or {@code null} if the source
   *         has no such property
   * @throws IllegalStateException if {@code propertyName} has not been set
   * @throws SkippedDocumentException if the presence/absence test decides
   *         the document must be skipped
   * @throws RepositoryException if the source fails to supply the property
   */
  @Override
  public Property findProperty(Document source, String name)
      throws RepositoryException {
    Preconditions.checkState(propertyName != null, "must set propertyName");
    Property prop = source.findProperty(name);
    if (!propertyName.equals(name)) {
      return prop;
    }

    // Read the field once so the null check and the wrapper agree on which
    // pattern is in force, even if setPattern() is called afterwards.
    Pattern valuePattern = pattern;
    if (valuePattern == null) {
      // If there is no pattern, then check presence/absence of the property.
      if ((prop == null) ^ skipOnMatch) {
        throw new SkippedDocumentException("Skipping document based upon "
            + ((prop == null) ? "absence" : "presence") + " of property "
            + propertyName);
      }
      return prop;
    }

    if (prop == null) {
      return null;
    }
    Set<String> propNames = source.getPropertyNames();
    return new SkipProperty(prop, valuePattern,
        propNames.contains(propertyName));
  }

  @Override
  public String toString() {
    return super.toString() + ": (" + propertyName + " , \""
        + ((pattern == null) ? "null" : pattern.pattern())
        + "\" , " + skipOnMatch + ")";
  }

  /**
   * Checks for a pattern match on the source property values.
   * Each value is tested individually as it is read: the document is
   * skipped on the first value whose match result equals
   * {@code skipOnMatch}. Consequently, when {@code skipOnMatch} is
   * {@code false}, any single value that fails to match causes the skip.
   */
  private class SkipProperty implements Property {
    /** The wrapped source property. */
    private final Property property;

    /** The pattern in force when this wrapper was created; never null. */
    private final Pattern valuePattern;

    /**
     * Whether the source document lists the filtered property among its
     * own property names. When {@code false}, values are replaced by an
     * empty string value after they have been checked.
     * NOTE(review): presumably this avoids exposing the content of a
     * property the source does not advertise - confirm.
     */
    private final boolean isPublished;

    public SkipProperty(Property property, Pattern valuePattern,
        boolean isPublished) {
      this.property = property;
      this.valuePattern = valuePattern;
      this.isPublished = isPublished;
    }

    /**
     * Returns the next value of the wrapped property after testing it
     * against the pattern.
     *
     * @return the next value, an empty string value if the property is not
     *         published by the source, or {@code null} when the wrapped
     *         property has no more values
     * @throws SkippedDocumentException if the value's match result calls
     *         for the document to be skipped
     * @throws RepositoryException if the wrapped property fails
     */
    @Override
    public Value nextValue() throws RepositoryException {
      Value value = property.nextValue();
      if (value == null) {
        return null;
      }
      // Use matcher.matches() or matcher.find()?  find() was chosen
      // because the behaviour of matches() can be had by using \A and \Z
      // in the pattern.
      boolean matched =
          valuePattern.matcher(Strings.nullToEmpty(value.toString())).find();
      // Skip on a match when skipOnMatch is set, on a failed match otherwise.
      if (matched == skipOnMatch) {
        throw new SkippedDocumentException("Skipping document based upon "
            + "property " + propertyName + " value: " + value.toString());
      }
      return isPublished ? value : Value.getStringValue("");
    }
  }
}