// Copyright 2011 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package com.google.enterprise.connector.util.filter;
import com.google.common.base.Preconditions;
import com.google.common.base.Strings;
import com.google.common.io.ByteStreams;
import com.google.enterprise.connector.spi.Document;
import com.google.enterprise.connector.spi.Property;
import com.google.enterprise.connector.spi.RepositoryException;
import com.google.enterprise.connector.spi.SimpleProperty;
import com.google.enterprise.connector.spi.SpiConstants;
import com.google.enterprise.connector.spi.Value;
import com.google.enterprise.connector.spiimpl.BinaryValue;
import java.io.IOException;
import java.io.InputStream;
import java.io.UnsupportedEncodingException;
import java.util.Collections;
import java.util.LinkedList;
import java.util.Set;
import java.util.TreeSet;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;
/**
* A {@link Document} filter that alters the values of the specified
* {@link Property Properties}. The filter will scrutinize the
* {@link Value Values} returned by the supplied {@link Property Properties}.
* If the value (as a string) matches the regular expression {@code pattern},
* then all matching regions of the value will be replaced with the
* {@code replacement} string.
* <p>
* If the {@code overwrite} flag is {@code true}, the modified
* property values replace any matching values of the target property.
* Otherwise, the modified property values supplement any existing values
* of the target property.
* <p>
* <b>Example {@code documentFilters.xml} Configurations:</b>
* <p>
* The following example replaces all instances of the word "Foo" with "Bar"
* in the {@code Category} property.
* <pre><code>
&lt;bean id="FooToBar"
      class="com.google.enterprise.connector.util.filter.ModifyPropertyFilter"&gt;
  &lt;property name="propertyName" value="Category"/&gt;
  &lt;property name="pattern" value="Foo"/&gt;
  &lt;property name="replacement" value="Bar"/&gt;
  &lt;property name="overwrite" value="true"/&gt;
&lt;/bean&gt;
</code></pre>
* The following example adds "Paul Erdös" to the list of {@code Authors}
* of documents for which I am also an author. This will give me an Erdös
* Number of 1!
* <pre><code>
&lt;!-- Add Erdös as co-author of all my documents. --&gt;
&lt;bean id="AddErdosAuthor"
      class="com.google.enterprise.connector.util.filter.ModifyPropertyFilter"&gt;
  &lt;property name="propertyName" value="Author"/&gt;
  &lt;property name="pattern" value="C'est Moi"/&gt;
  &lt;property name="replacement" value="Paul Erdös"/&gt;
  &lt;property name="overwrite" value="false"/&gt;
&lt;/bean&gt;
</code></pre>
* The following example replaces one or more instances of the characters
* '.' or '_' with a single space for all values of the {@code Foo} and
* {@code Bar} properties. The original values are kept, and new values
* with whitespace delimiters are added to the properties.
* <pre><code>
&lt;!-- Replace '.' and '_' with a space. --&gt;
&lt;bean id="DotUnderscoreToWhiteSpace"
      class="com.google.enterprise.connector.util.filter.ModifyPropertyFilter"&gt;
  &lt;property name="propertyNames"&gt;
    &lt;set&gt;
      &lt;value&gt;Foo&lt;/value&gt;
      &lt;value&gt;Bar&lt;/value&gt;
    &lt;/set&gt;
  &lt;/property&gt;
  &lt;property name="pattern" value="[_.]+"/&gt;
  &lt;property name="replacement" value=" "/&gt;
  &lt;property name="overwrite" value="false"/&gt;
&lt;/bean&gt;
</code></pre>
* <p>
* When used with binary values, the entire value is buffered and the
* modified value is stored in a {@code byte} array.
*
* @since 2.8
*/
/*
* TODO: Find a way to process the InputStreams without buffering them,
* maybe using java.util.Scanner or a similar third-party tool.
* TODO: Binary values based on byte arrays are likely rare outside
* the tests, but it might be nice to build the string from the
* underlying byte array directly, rather than copying it.
*/
public class ModifyPropertyFilter extends AbstractDocumentFilter {
  /** The logger for this class. */
  private static final Logger LOGGER =
      Logger.getLogger(ModifyPropertyFilter.class.getName());

  /** The names of the Properties to filter. */
  protected Set<String> propertyNames;

  /**
   * The media types of the documents whose binary values may be modified.
   * If still {@code null} when the first binary value is examined, it is
   * initialized with a default set of textual media types.
   */
  protected Set<String> mimeTypes;

  /** The name of the encoding type used to convert binary data to string. */
  protected String encoding = "UTF-8";

  /** The regex pattern to match in the property {@link Value Values}. */
  protected Pattern pattern;

  /** The replacement string for matching regions in the values. */
  protected String replacement = "";

  /**
   * If {@code true}, overwrite the matching property values; otherwise supply
   * the modified value as an additional Value (like multi-valued Properties).
   */
  protected boolean overwrite = false;

  /**
   * Sets the name of the {@link Property} to filter.
   * <p>
   * A convenience method that is equivalent to calling
   * {@code setPropertyNames(Collections.singleton(propertyName))}.
   *
   * @param propertyName the name of the {@link Property} to filter
   * @throws IllegalArgumentException if {@code propertyName} is {@code null}
   *         or empty
   */
  public void setPropertyName(String propertyName) {
    Preconditions.checkArgument(!Strings.isNullOrEmpty(propertyName),
        "propertyName may not be null or empty");
    this.propertyNames = Collections.singleton(propertyName);
  }

  /**
   * Sets the names of the {@link Property Properties} to filter.
   *
   * @param propertyNames a {@code Set} of names of the
   *        {@link Property Properties} to filter
   * @throws NullPointerException if {@code propertyNames} is {@code null}
   */
  public void setPropertyNames(Set<String> propertyNames) {
    Preconditions.checkNotNull(propertyNames, "propertyNames may not be null");
    this.propertyNames = propertyNames;
  }

  /**
   * Sets the media types of the {@link Document} objects to modify.
   * Only consulted for binary values.
   *
   * @param mimeTypes a {@code Set} of names of the media types to filter
   * @throws NullPointerException if {@code mimeTypes} is {@code null}
   */
  public void setMimeTypes(Set<String> mimeTypes) {
    Preconditions.checkNotNull(mimeTypes, "mimeTypes may not be null");
    this.mimeTypes = mimeTypes;
  }

  /**
   * Sets the single media type of the {@link Document} objects to modify.
   * <p>
   * A convenience method that is equivalent to calling
   * {@code setMimeTypes(Collections.singleton(mimeType))}.
   *
   * @param mimeType the name of the media type to filter
   * @throws NullPointerException if {@code mimeType} is {@code null}
   */
  public void setMimeType(String mimeType) {
    Preconditions.checkNotNull(mimeType, "mimeType may not be null");
    this.mimeTypes = Collections.singleton(mimeType);
  }

  /**
   * Sets the regular expression pattern to match in the values.
   * The supplied {@code pattern} must conform to the syntax defined in
   * <a href="http://docs.oracle.com/javase/7/docs/api/java/util/regex/Pattern.html">
   * {@code java.util.regex.Pattern}</a>.
   *
   * @param pattern the regular expression pattern to match in the values
   * @throws PatternSyntaxException if {@code pattern}'s syntax is invalid
   * @throws IllegalArgumentException if {@code pattern} is {@code
   *         null} or empty
   */
  public void setPattern(String pattern) throws PatternSyntaxException {
    Preconditions.checkArgument(!Strings.isNullOrEmpty(pattern),
        "pattern may not be null or empty");
    this.pattern = Pattern.compile(pattern);
  }

  /**
   * Sets the replacement string for matching regions in the values.
   * The {@code replacement} string may refer to
   * <a href="http://docs.oracle.com/javase/7/docs/api/java/util/regex/Pattern.html#cg">
   * capturing groups</a> from the {@code pattern} as {@code $1, $2}, etc.
   * Therefore, literal instances of {@code '\'} and {@code '$'} in the
   * replacement string need to be properly
   * <a href="http://docs.oracle.com/javase/7/docs/api/java/util/regex/Matcher.html#appendReplacement(java.lang.StringBuffer,%20java.lang.String)">
   * escaped</a>.
   *
   * @param replacement the replacement String for matching regions in the
   *        values; {@code null} is treated as the empty string
   */
  public void setReplacement(String replacement) {
    this.replacement = Strings.nullToEmpty(replacement);
  }

  /**
   * Sets the {@code overwrite} values flag. If {@code true}, matching values
   * are overwritten with the modified value. If {@code false}, matching
   * values are augmented by adding an additional modified value.
   * Default {@code overwrite} is {@code false}.
   *
   * @param overwrite the overwrite flag
   */
  public void setOverwrite(boolean overwrite) {
    this.overwrite = overwrite;
  }

  /**
   * Sets the name of the character encoding used to convert binary
   * values to and from strings. Default {@code encoding} is {@code UTF-8}.
   *
   * @param encoding name of encoding type
   * @throws IllegalArgumentException if {@code encoding} is {@code null}
   *         or empty
   */
  public void setEncoding(String encoding) {
    // Fail at configuration time rather than on the first binary value.
    Preconditions.checkArgument(!Strings.isNullOrEmpty(encoding),
        "encoding may not be null or empty");
    this.encoding = encoding;
  }

  /**
   * Finds a {@link Property} by {@code name}. If {@code name} is not one of
   * the configured {@code propertyNames}, the property is fetched from the
   * {@code source} {@link Document} unchanged.
   * <p>
   * If any of the Property's values (as a string) match the regular
   * expression {@code pattern}, then all matching regions of the value
   * will be replaced with the {@code replacement} string.
   * <p>
   * The modified value may either augment or overwrite the original value,
   * based upon the {@code overwrite} flag.
   * <p>
   * Binary values are only modified if the document's media type is one of
   * the configured {@code mimeTypes}; otherwise the property is fetched from
   * the {@code source} unchanged.
   *
   * @param source the {@link Document} supplying the property
   * @param name the name of the property to find
   * @return the (possibly modified) property
   * @throws IllegalStateException if {@code propertyNames} or {@code pattern}
   *         has not been set
   * @throws RepositoryException if a binary value cannot be read, or cannot
   *         be converted using the configured {@code encoding}
   */
  @Override
  public Property findProperty(Document source, String name)
      throws RepositoryException {
    Preconditions.checkState(propertyNames != null, "must set propertyNames");
    Preconditions.checkState(pattern != null, "must set pattern");

    if (!propertyNames.contains(name)) {
      // Not a property of interest. Just fetch it from the source.
      return source.findProperty(name);
    }

    // For properties of interest, fetch the values and examine them.
    // If a value matches the pattern, either replace or augment that value.
    LinkedList<Value> values = new LinkedList<Value>();
    // The media type is the same for every value of this call, so it is
    // checked only once, when the first binary value is encountered.
    boolean mimeTypeAccepted = false;
    for (Value value : super.getPropertyValues(source, name)) {
      boolean isBinary = value instanceof BinaryValue;
      String original;
      Value originalValue;
      if (isBinary) {
        if (!mimeTypeAccepted) {
          if (!isModifiableMimeType(source)) {
            // Missing or unsupported media type: leave the property alone.
            return source.findProperty(name);
          }
          mimeTypeAccepted = true;
        }
        // The stream can only be consumed once, so the buffered bytes
        // stand in for the original value from here on.
        byte[] data = readFully((BinaryValue) value);
        originalValue = Value.getBinaryValue(data);
        try {
          original = new String(data, encoding);
        } catch (UnsupportedEncodingException e) {
          throw new RepositoryException("Error while converting"
              + " data with " + encoding, e);
        }
      } else {
        original = Strings.nullToEmpty(value.toString());
        originalValue = value;
      }

      String modified = pattern.matcher(original).replaceAll(replacement);
      if (original.equals(modified)) {
        // Nothing matched; no need to build a modified value.
        values.add(originalValue);
        continue;
      }

      Value modifiedValue;
      if (isBinary) {
        try {
          modifiedValue = Value.getBinaryValue(modified.getBytes(encoding));
        } catch (UnsupportedEncodingException e) {
          throw new RepositoryException("Error while converting"
              + " data with " + encoding, e);
        }
      } else {
        modifiedValue = Value.getStringValue(modified);
      }

      if (overwrite) {
        values.add(modifiedValue);
        if (LOGGER.isLoggable(Level.FINEST)) {
          LOGGER.finest("Property Filter replaced " + name + " value "
              + "\"" + originalValue
              + "\" with \"" + modifiedValue + "\"");
        }
      } else {
        values.add(originalValue);
        values.add(modifiedValue);
        if (LOGGER.isLoggable(Level.FINEST)) {
          LOGGER.finest("Property Filter injected " + name
              + " value \"" + modifiedValue + "\"");
        }
      }
    }
    return new SimpleProperty(values);
  }

  /**
   * Determines whether the media type of the {@code source} document is one
   * of the configured {@code mimeTypes}. Any media type parameters (such as
   * {@code ;charset=...}) and surrounding whitespace are ignored.
   */
  private boolean isModifiableMimeType(Document source)
      throws RepositoryException {
    String mimeType =
        Value.getSingleValueString(source, SpiConstants.PROPNAME_MIMETYPE);
    if (Strings.isNullOrEmpty(mimeType)) {
      // There is no mimetype property in the document.
      return false;
    }
    int semicolon = mimeType.indexOf(';');
    if (semicolon >= 0) {
      mimeType = mimeType.substring(0, semicolon);
    }
    // "text/html ; charset=utf-8" must still match "text/html".
    mimeType = mimeType.trim();

    // Initializing with default set
    if (mimeTypes == null) {
      mimeTypes = initDefaultMimeTypes();
    }
    // TODO(kiran) should allow match top-level
    // (e.g. "text/xml" matches "text")
    return mimeTypes.contains(mimeType);
  }

  /**
   * Buffers the entire content of the binary {@code value} and closes its
   * input stream.
   */
  private byte[] readFully(BinaryValue value) throws RepositoryException {
    InputStream in = value.getInputStream();
    try {
      return ByteStreams.toByteArray(in);
    } catch (IOException e) {
      throw new RepositoryException("Error while reading the source", e);
    } finally {
      // ByteStreams.toByteArray does not close the stream.
      if (in != null) {
        try {
          in.close();
        } catch (IOException e) {
          // Best effort; do not mask a failure from the read above.
          LOGGER.log(Level.FINEST, "Error closing the source stream", e);
        }
      }
    }
  }

  /** Returns a new, mutable set of the default textual media types. */
  private Set<String> initDefaultMimeTypes() {
    Set<String> mimeTypes = new TreeSet<String>();
    mimeTypes.add("text/xml");
    mimeTypes.add("text/xhtml");
    mimeTypes.add("text/tab-separated-values");
    mimeTypes.add("text/x-sgml");
    mimeTypes.add("text/calendar");
    mimeTypes.add("text/csv");
    mimeTypes.add("text/plain");
    mimeTypes.add("text/html");
    mimeTypes.add("text/sgml");
    mimeTypes.add("application/plain");
    mimeTypes.add("application/rdf+xml");
    mimeTypes.add("application/xhtml+xml");
    mimeTypes.add("application/xml");
    mimeTypes.add("message/http");
    mimeTypes.add("message/s-http");
    return mimeTypes;
  }

  /**
   * Returns a description of this filter's configuration. Safe to call
   * before the filter has been fully configured.
   */
  @Override
  public String toString() {
    // The pattern may not have been set yet; avoid a NullPointerException.
    String regex = (pattern == null) ? null : pattern.pattern();
    StringBuilder buf = new StringBuilder();
    buf.append(super.toString());
    buf.append(": (");
    buf.append(propertyNames);
    buf.append(" , \"");
    buf.append(regex);
    buf.append("\" , \"");
    buf.append(replacement);
    buf.append("\" , ");
    buf.append(overwrite);
    buf.append(" , \"");
    buf.append(encoding);
    buf.append("\" , ");
    buf.append(mimeTypes);
    buf.append(")");
    return buf.toString();
  }
}