/*
* Licensed to Laurent Broudoux (the "Author") under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. Author licenses this
* file to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package com.github.lbroudoux.elasticsearch.river.s3.river;
import static org.elasticsearch.common.xcontent.XContentFactory.jsonBuilder;
import java.util.*;
import org.elasticsearch.common.Strings;
import org.elasticsearch.common.xcontent.XContentBuilder;
import org.elasticsearch.common.xcontent.support.XContentMapValues;
/**
 * Utility class for Amazon S3 indexing management: builds the index mapping for
 * S3 files, reads list-like settings and evaluates include/exclude rules.
 * @author laurent
 */
public class S3RiverUtil{

   public static final String INDEX_TYPE_DOC = "doc";
   public static final String DOC_FIELD_TITLE = "title";
   public static final String DOC_FIELD_MODIFIED_DATE = "modifiedDate";
   public static final String DOC_FIELD_SOURCE_URL = "source_url";
   public static final String DOC_FIELD_METADATA = "metadata";

   /**
    * Build mapping description for Amazon S3 files.
    * @param type The name of type for S3 files
    * @return A content builder for mapping informations
    * @throws Exception if something goes wrong while building the content
    */
   public static XContentBuilder buildS3FileMapping(String type) throws Exception{
      return jsonBuilder().prettyPrint().startObject()
            .startObject(type).startObject("properties")
               // Top-level document fields.
               .startObject(DOC_FIELD_TITLE).field("type", "string").field("analyzer","keyword").endObject()
               .startObject(DOC_FIELD_MODIFIED_DATE).field("type", "date").endObject()
               .startObject(DOC_FIELD_SOURCE_URL).field("type", "string").endObject()
               .startObject(DOC_FIELD_METADATA).field("type", "object").endObject()
               // Nested "file" object and its own stored sub-fields.
               .startObject("file")
                  .startObject("properties")
                     .startObject("title").field("type", "string").field("store", "yes").endObject()
                     .startObject("file").field("type", "string")
                        .field("term_vector", "with_positions_offsets")
                        .field("store", "yes")
                     .endObject()
                     .startObject("metadata").field("type", "object").field("store", "yes").endObject()
                  .endObject()
               .endObject()
            // Close "properties", the type object and the root object.
            .endObject().endObject().endObject();
   }

   /**
    * Extract an array of values from settings. The value found at the given path
    * may be either an array or a comma delimited String. All whitespace is removed
    * from each value, then duplicates are removed and the result is sorted
    * (see {@link #removeDuplicateStrings(String[])}).
    * @param settings Settings
    * @param path Path to settings definition
    * @return Array of settings
    */
   public static String[] buildArrayFromSettings(Map<String, Object> settings, String path){
      // Look the value up once; it is needed for both the type check and the conversion.
      Object value = XContentMapValues.extractValue(path, settings);

      String[] values;
      if (XContentMapValues.isArray(value)) {
         // Array format: elements are expected to be Strings.
         @SuppressWarnings("unchecked")
         List<String> items = (List<String>) value;
         values = new String[items.size()];
         int position = 0;
         for (String item : items) {
            values[position++] = trimAllWhitespace(item);
         }
      } else {
         // Comma separated format.
         values = Strings.commaDelimitedListToStringArray(trimAllWhitespace((String) value));
      }
      return removeDuplicateStrings(values);
   }

   /**
    * Tells if an Amazon S3 file is indexable from its key (file name), based on includes
    * and excludes rules. A {@code null} list is treated the same way as an empty one.
    * <p>
    * Rules are simple wildcard patterns: {@code ?} is translated to the regular
    * expression {@code .?} and {@code *} to {@code .*?}. Every other character of a
    * rule is handed to the regular expression engine as is, and the whole key must match.
    * </p>
    * @param key The key (file name) of the S3 file
    * @param includes Patterns of files to index, may be null or empty
    * @param excludes Patterns of files to skip, may be null or empty
    * @return true if file should be indexed, false otherwise
    */
   public static boolean isIndexable(String key, List<String> includes, List<String> excludes){
      // Null-safe flags: either list may be null independently of the other.
      boolean hasIncludes = includes != null && !includes.isEmpty();
      boolean hasExcludes = excludes != null && !excludes.isEmpty();

      // Exclude rules win: whatever the include rules are, matching files are skipped.
      if (hasExcludes && matchesAnyRule(key, excludes)){
         return false;
      }
      // Without include rules, everything that was not excluded is indexed.
      if (!hasIncludes){
         return true;
      }
      // Otherwise the key has to match at least one include rule.
      return matchesAnyRule(key, includes);
   }

   /** Tells if the key fully matches at least one of the given wildcard rules. */
   private static boolean matchesAnyRule(String key, List<String> rules){
      for (String rule : rules){
         if (key.matches(wildcardToRegex(rule))){
            return true;
         }
      }
      return false;
   }

   /** Translate the '?' and '*' wildcards of a rule into their regular expression form. */
   private static String wildcardToRegex(String rule){
      return rule.replace("?", ".?").replace("*", ".*?");
   }

   /**
    * Trim <i>all</i> whitespace from the given String: leading, trailing, and inbetween characters.
    * @param str the String to check
    * @return the trimmed String, or the given String itself when
    * {@code Strings.hasLength} reports that it has no length
    * @see java.lang.Character#isWhitespace
    */
   public static String trimAllWhitespace(String str) {
      if (!Strings.hasLength(str)) {
         return str;
      }
      // Single pass: keep non-whitespace characters instead of deleting in place.
      StringBuilder kept = new StringBuilder(str.length());
      for (int i = 0; i < str.length(); i++) {
         char current = str.charAt(i);
         if (!Character.isWhitespace(current)) {
            kept.append(current);
         }
      }
      return kept.toString();
   }

   /**
    * Remove duplicate Strings from the given array. Also sorts the array, as it uses a TreeSet.
    * @param array the String array
    * @return an array without duplicates, in natural sort order; the given array
    * itself when it is null or empty
    */
   public static String[] removeDuplicateStrings(String[] array) {
      if (array == null || array.length == 0) {
         return array;
      }
      Set<String> unique = new TreeSet<String>(Arrays.asList(array));
      return unique.toArray(new String[unique.size()]);
   }
}