// Copyright 2007-2009 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package com.google.enterprise.connector.traversal;
import com.google.common.annotations.VisibleForTesting;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;
import java.util.logging.Level;
import java.util.logging.Logger;
/**
* Provides context to the traversal process on what mime types are acceptable
* to the GSA. We might think about getting this info dynamically from the GSA
* instead of getting from config (as here).
*/
public class MimeTypeMap {
private static final Logger LOGGER =
Logger.getLogger(MimeTypeMap.class.getName());
private final Map<String, Integer> typeMap;
private int unknownMimeTypeSupportLevel;
public MimeTypeMap() {
// if no setters are called, then all mime types are supported
typeMap = new HashMap<String, Integer>();
unknownMimeTypeSupportLevel = 1;
}
/**
* Set the support level for any mime type not explicitly
* included in the map. There are hundreds of mime types
* and more a being added all the time. Setting this value
* to 0 means that unknown mime types will not be indexed.
* Setting it to 1 means that the content will be sent to
* the GSA, where it may or may not be successfully indexed.
*
* @param unknownMimeTypeSupportLevel an int greater than or equal to 0.
*/
public void setUnknownMimeTypeSupportLevel(int unknownMimeTypeSupportLevel) {
LOGGER.config("Setting unknownMimeTypeSupportLevel to "
+ unknownMimeTypeSupportLevel);
this.unknownMimeTypeSupportLevel = unknownMimeTypeSupportLevel;
}
/**
* Sets the preferred mime types to index. If a repository
* supplies a choice of document representations, the connector
* should try to provide one of these preferred types.
* These mime types require little or no preprocessing
* or file format conversion to extract text and metadata.
*
* @param mimeTypes Set of mime types that are preferred.
*/
public void setPreferredMimeTypes(Set<String> mimeTypes) {
LOGGER.config("Setting preferred mime types to " + mimeTypes);
initMimeTypes(mimeTypes, 8);
}
/**
* Sets the supported mime types to index.
* These mime types may require some preprocessing or
* file format conversion to extract text and metadata.
* Some information may be lost or discarded.
*
* @param mimeTypes Set of mime types that are preferred.
*/
public void setSupportedMimeTypes(Set<String> mimeTypes) {
LOGGER.config("Setting supported mime types to " + mimeTypes);
initMimeTypes(mimeTypes, 4);
}
/**
* Set the unsupported mime types whose content cannot be indexed.
* These mime types provide little or no textual content, or are
* data formats that are either unknown or do not have a format converter.
* The connector may still provide meta-data describing the content,
* but the content itself should not be pushed.
*
* @param mimeTypes Set of mime types that are not indexable.
*/
public void setUnsupportedMimeTypes(Set<String> mimeTypes) {
LOGGER.config("Setting unsupported mime types to " + mimeTypes);
initMimeTypes(mimeTypes, -1);
}
/**
* Set the excluded mime types whose content should not be indexed.
* The connector should not feed these documents at all, supplying
* neither meta-data nor content.
*
* @param mimeTypes Set of mime types that should not be fed.
*/
public void setExcludedMimeTypes(Set<String> mimeTypes) {
LOGGER.config("Setting excluded mime types to " + mimeTypes);
// -5 is for historical reasons, as Excluded was added after Unsupported.
initMimeTypes(mimeTypes, -5);
}
/*
* Add the set of mimetypes to the typeMap at the desired support level.
* Mimetypes with "/vnd.*" subtypes are preferred over others, and
* mimetypes registered with IANA are preferred over those with "/x-*"
* experimental subtypes. This ranking is done by adjusting the support
* level +/- 1, accordingly. Content types sans subtypes are preferred
* least of all, so their support level is adjusted by -2.
*/
@VisibleForTesting
void initMimeTypes(Set<String> mimeTypes, int supportLevel) {
if (mimeTypes == null || mimeTypes.size() == 0)
return;
// Adjust the support level so that "/vdn." and "/x-" subtype
// sorting does not accidentally cross above or below 0.
if (supportLevel == 0) {
supportLevel = -1;
} else if (supportLevel > 0 && supportLevel < 3) {
supportLevel = 3;
}
Integer level0 = Integer.valueOf(supportLevel - 2);
Integer level1 = Integer.valueOf(supportLevel - 1);
Integer level2 = Integer.valueOf(supportLevel);
Integer level3 = Integer.valueOf(supportLevel + 1);
// Add the mimetypes to the map. We adjust the support levels
// slightly to prefer "vnd." subtypes over others, and prefer
// any other subtype over "x-" subtypes. Rank */plain below
// possibly more specific types.
// Content types sans subtypes are ranked below all others.
for (Iterator<String> i = mimeTypes.iterator(); i.hasNext(); ) {
String mimeType = i.next().trim().toLowerCase();
if (mimeType.indexOf('/') < 0) {
typeMap.put(mimeType, level0);
} else if (mimeType.startsWith("x-") || (mimeType.indexOf("/x-") > 0)) {
typeMap.put(mimeType, level1);
} else if (mimeType.endsWith("/plain")) {
typeMap.put(mimeType, level1);
} else if (mimeType.indexOf("/vnd.") > 0) {
typeMap.put(mimeType, level3);
} else if (mimeType.length() > 0) {
typeMap.put(mimeType, level2);
}
}
}
/**
* Return the support level for a given mime type. No validation is
* performed.
*
* @param mimeType
* @return zero (or negative) means that this mimetype is not supported, with
* negative values indicating the document should be skipped entirely.
* Positive integers may be compared to choose which mime types are
* preferred.
*/
/*
* TODO(jlacey): Support globbing or regular expressions in the types,
* so that we can match against "application/vnd.ms-excel*" for example.
*/
public int mimeTypeSupportLevel(String mimeType) {
Integer result = null;
if (mimeType != null) {
result = typeMap.get(mimeType.trim().toLowerCase());
if (result == null) {
// If exact match not found, look for a match on just the
// primary mimetype (sans the subtype).
int i = mimeType.indexOf('/');
if (i > 0) {
result = typeMap.get(mimeType.substring(0, i));
}
}
}
int sl;
if (result == null) {
sl = unknownMimeTypeSupportLevel;
} else {
// Map all Unsupported to 0, and all Excluded to -1.
sl = (result > 0) ? result : ((result < -3) ? -1 : 0);
}
if (LOGGER.isLoggable(Level.FINEST)) {
LOGGER.finest("Mime type support level for " + mimeType + " is " + sl);
}
return sl;
}
/**
* Return the most preferred mime type from a Set of candidates.
* Mime types with higher support levels are preferred over those
* with lower support levels. For those with equal support levels,
* non-'x-*' subtypes are preferred over 'x-*' subtypes, and mimetypes
* with subtypes are preferred over those without.
*
* @param mimeTypes a Set of mime type Strings.
* @return the most preferred mime type from the Set.
*/
public String preferredMimeType(Set<String> mimeTypes) {
if (mimeTypes == null || mimeTypes.size() == 0)
return null;
// Look for an exact match on one of the mimeTypes.
int bestLevel = Integer.MIN_VALUE;
String bestMimeType = null;
for (String mimeType : mimeTypes) {
int thisLevel = mimeTypeSupportLevel(mimeType);
if (thisLevel > bestLevel) {
bestLevel = thisLevel;
bestMimeType = mimeType;
} else if (thisLevel == bestLevel) {
if (mimeType.trim().length() < bestMimeType.trim().length()) {
bestMimeType = mimeType;
}
}
}
if (LOGGER.isLoggable(Level.FINEST)) {
LOGGER.finest("Preferred mime type from " + mimeTypes.toString()
+ " is " + bestMimeType);
}
return bestMimeType;
}
}