S3RiverUtil.java example

Explorer

es-amazon-s3-river-master
- src
  - itest
    - java
      - com
        github
        lbroudoux
        elasticsearch
        river
        s3
        connector
        S3ConnectorTest.java
  - main
    - java
      - com
        github
        lbroudoux
        elasticsearch
        river
        s3
        connector
        S3Connector.java
        S3ObjectSummaries.java
        plugin
        S3RiverPlugin.java
        rest
        S3ManageAction.java
        river
        S3River.java
        S3RiverFeedDefinition.java
        S3RiverModule.java
        S3RiverUtil.java
        TikaHolder.java
  - test
    - java
      - com
        github
        lbroudoux
        elasticsearch
        river
        s3
        river
        S3RiverUtilTest.java

/*
 * Licensed to Laurent Broudoux (the "Author") under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. Author licenses this
 * file to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package com.github.lbroudoux.elasticsearch.river.s3.river;

import static org.elasticsearch.common.xcontent.XContentFactory.jsonBuilder;

import java.util.*;

import org.elasticsearch.common.Strings;
import org.elasticsearch.common.xcontent.XContentBuilder;
import org.elasticsearch.common.xcontent.support.XContentMapValues;
/**
 * Utility class for Amazon S3 indexing management.
 * @author laurent
 */
public class S3RiverUtil{

   /** Index type name constant ("doc"). */
   public static final String INDEX_TYPE_DOC = "doc";
   
   /** Name of the title field in the mapping built by {@link #buildS3FileMapping(String)}. */
   public static final String DOC_FIELD_TITLE = "title";
   /** Name of the modification date field in the mapping built by {@link #buildS3FileMapping(String)}. */
   public static final String DOC_FIELD_MODIFIED_DATE = "modifiedDate";
   /** Name of the source url field in the mapping built by {@link #buildS3FileMapping(String)}. */
   public static final String DOC_FIELD_SOURCE_URL = "source_url";
   /** Name of the metadata field in the mapping built by {@link #buildS3FileMapping(String)}. */
   public static final String DOC_FIELD_METADATA = "metadata";
   
   /**
    * Build mapping description for Amazon S3 files.
    * <p>
    * The mapping declares, under the given type, the title (string, keyword analyzer),
    * modification date (date), source url (string) and metadata (object) fields, plus a
    * nested "file" object holding stored "title", "file" (with term vectors) and
    * "metadata" properties.
    * @param type The name of type for S3 files
    * @return A content builder for mapping information
    * @throws Exception if something goes wrong while building the JSON content
    */
   public static XContentBuilder buildS3FileMapping(String type) throws Exception{
      return jsonBuilder().prettyPrint().startObject()
            .startObject(type).startObject("properties")
            .startObject(DOC_FIELD_TITLE).field("type", "string").field("analyzer","keyword").endObject()
            .startObject(DOC_FIELD_MODIFIED_DATE).field("type", "date").endObject()
            .startObject(DOC_FIELD_SOURCE_URL).field("type", "string").endObject()
            .startObject(DOC_FIELD_METADATA).field("type", "object").endObject()
            .startObject("file")
               .startObject("properties")
                  .startObject("title").field("type", "string").field("store", "yes").endObject()
                  .startObject("file").field("type", "string")
                     .field("term_vector", "with_positions_offsets")
                     .field("store", "yes")
                  .endObject()
                  .startObject("metadata").field("type", "object").field("store", "yes").endObject()
               .endObject()
            .endObject()
            .endObject().endObject().endObject();
   }
   
   /**
    * Extract array from settings (array or comma delimited String).
    * <p>
    * Every entry has all of its whitespace removed, then duplicates are removed,
    * which also sorts the result (see {@link #removeDuplicateStrings(String[])}).
    * @param settings Settings
    * @param path Path to settings definition
    * @return Array of settings, without duplicates
    */
   @SuppressWarnings("unchecked")
   public static String[] buildArrayFromSettings(Map<String, Object> settings, String path){
      // Extract once: the same value is needed for the type check and for the conversion.
      Object value = XContentMapValues.extractValue(path, settings);
      String[] includes;

      // We manage comma separated format and arrays
      if (XContentMapValues.isArray(value)) {
         List<String> entries = (List<String>) value;
         includes = new String[entries.size()];
         int i = 0;
         for (String entry : entries) {
            includes[i++] = trimAllWhitespace(entry);
         }
      } else {
         includes = Strings.commaDelimitedListToStringArray(trimAllWhitespace((String) value));
      }
      
      return removeDuplicateStrings(includes);
   }

   /**
    * Tells if an Amazon S3 file is indexable from its key (file name), based on includes
    * and excludes rules.
    * <p>
    * Rules are wildcard patterns where {@code *} and {@code ?} are turned into regular
    * expression fragments; any other character is handed to the regular expression engine
    * unchanged, so regex metacharacters in a rule (such as {@code .}) keep their regex meaning.
    * A {@code null} rule list is treated like an empty one.
    * @param key The S3 key (file name) to check
    * @param includes Include rules, may be null or empty
    * @param excludes Exclude rules, may be null or empty
    * @return true if file should be indexed, false otherwise
    */
   public static boolean isIndexable(String key, List<String> includes, List<String> excludes){
      boolean hasIncludes = includes != null && !includes.isEmpty();
      boolean hasExcludes = excludes != null && !excludes.isEmpty();

      // If no rules specified, we index everything !
      if (!hasIncludes && !hasExcludes){
         return true;
      }
      
      // Exclude rules : we know that whatever includes rules are, we should exclude matching files.
      if (hasExcludes && matchesAnyRule(key, excludes)){
         return false;
      }
      
      // Include rules : without any, everything not excluded is indexed;
      // otherwise the key must match at least one of them.
      if (!hasIncludes){
         return true;
      }
      return matchesAnyRule(key, includes);
   }

   /** Tells if the key fully matches at least one of the given wildcard rules. */
   private static boolean matchesAnyRule(String key, List<String> rules){
      for (String rule : rules){
         // '?' must be replaced before '*': the '*' replacement itself introduces a '?'.
         String regex = rule.replace("?", ".?").replace("*", ".*?");
         if (key.matches(regex)){
            return true;
         }
      }
      return false;
   }

   /**
    * Trim <i>all</i> whitespace from the given String: leading, trailing, and inbetween characters.
    * @param str the String to check
    * @return the trimmed String; the input itself when it is null or empty
    * @see java.lang.Character#isWhitespace
    */
   public static String trimAllWhitespace(String str) {
      if (!Strings.hasLength(str)) {
         return str;
      }
      // Single pass copying the kept characters, instead of deleting in place
      // (each in-place deletion shifts the remaining characters).
      StringBuilder sb = new StringBuilder(str.length());
      for (int i = 0; i < str.length(); i++) {
         char c = str.charAt(i);
         if (!Character.isWhitespace(c)) {
            sb.append(c);
         }
      }
      return sb.toString();
   }

   /**
    * Remove duplicate Strings from the given array. Also sorts the array, as it uses a TreeSet.
    * Null elements are dropped, since a naturally ordered TreeSet cannot hold them.
    * @param array the String array
    * @return an array without duplicates, in natural sort order; the input itself
    *         when it is null or empty
    */
   public static String[] removeDuplicateStrings(String[] array) {
      if (array == null || array.length == 0) {
         return array;
      }
      Set<String> set = new TreeSet<String>();
      for (String element : array) {
         if (element != null) {
            set.add(element);
         }
      }
      return Strings.toStringArray(set);
   }
}