// DefaultJsonSchemaInterpreter.java example (from the sensei-master source explorer)

/**
 * This software is licensed to you under the Apache License, Version 2.0 (the
 * "Apache License").
 *
 * LinkedIn's contributions are made under the Apache License. If you contribute
 * to the Software, the contributions will be deemed to have been made under the
 * Apache License, unless you expressly indicate otherwise. Please do not make any
 * contributions that would be inconsistent with the Apache License.
 *
 * You may obtain a copy of the Apache License at http://www.apache.org/licenses/LICENSE-2.0
 * Unless required by applicable law or agreed to in writing, this software
 * distributed under the Apache License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the Apache
 * License for the specific language governing permissions and limitations for the
 * software governed under the Apache License.
 *
 * © 2012 LinkedIn Corp. All Rights Reserved.  
 */
package com.senseidb.indexing;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.nio.charset.Charset;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import java.util.StringTokenizer;
import java.util.zip.GZIPInputStream;
import java.util.zip.GZIPOutputStream;

import org.apache.commons.configuration.ConfigurationException;
import org.apache.log4j.Logger;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Field.Index;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.index.FieldInfo.IndexOptions;
import org.json.JSONException;
import org.json.JSONObject;

import proj.zoie.api.indexing.AbstractZoieIndexable;
import proj.zoie.api.indexing.AbstractZoieIndexableInterpreter;
import proj.zoie.api.indexing.ZoieIndexable;

import com.senseidb.conf.SenseiSchema;
import com.senseidb.conf.SenseiSchema.FieldDefinition;
import com.senseidb.search.plugin.PluggableSearchEngineManager;

public class DefaultJsonSchemaInterpreter extends
        AbstractZoieIndexableInterpreter<JSONObject> {

    private static final Logger logger = Logger.getLogger(DefaultJsonSchemaInterpreter.class);

    /** Schema that drives how each JSON field is interpreted. */
    private final SenseiSchema _schema;
    /** Entries of the schema's field-definition map (schema field name to definition). */
    private final Set<Entry<String, FieldDefinition>> entries;
    /** Name of the JSON field that is parsed as the document UID. */
    private final String _uidField;
    /** Name of the JSON boolean field consulted for deletion when no event-type field is present. */
    private final String _delField;
    /** Name of the JSON boolean field consulted for skipping when no event-type field is present. */
    private final String _skipField;
    /** Whether the stored source data is gzip-compressed. */
    private final boolean _compressSrcData;

    /** Date extractors for {@code Date}-typed fields, keyed by schema field name; filled in the constructor. */
    private final Map<String, JsonValExtractor> _dateExtractorMap;

    /** Optional filter applied to every incoming JSON object before indexing; {@code null} means none. */
    private JsonFilter _jsonFilter = null;

    /** UTF-8 charset constant. */
    private static final Charset UTF8 = Charset.forName("UTF-8");

    /** Optional hook applied to the Lucene document after the schema fields are added; {@code null} means none. */
    private CustomIndexingPipeline _customIndexingPipeline = null;

    /** Schema field names that are skipped when building the Lucene document. */
    private final Set<String> nonLuceneFields = new HashSet<String>();

    /**
     * Creates an interpreter for the given schema without a pluggable search engine manager.
     *
     * @param schema schema describing the fields to index
     * @throws ConfigurationException declared for callers; see the two-argument constructor
     */
    public DefaultJsonSchemaInterpreter(SenseiSchema schema) throws ConfigurationException {
        this(schema, null);
    }

    /**
     * Creates an interpreter for the given schema.
     *
     * <p>Caches the schema's field definitions and special field names, and prepares one
     * date extractor per {@code Date}-typed field using that field's formatter.
     *
     * @param schema schema describing the fields to index
     * @param pluggableSearchEngineManager optional manager whose field names are excluded
     *        from the Lucene document; may be {@code null}
     * @throws ConfigurationException declared for callers
     */
    public DefaultJsonSchemaInterpreter(SenseiSchema schema, PluggableSearchEngineManager pluggableSearchEngineManager) throws ConfigurationException {
        _schema = schema;
        if (pluggableSearchEngineManager != null) {
            nonLuceneFields.addAll(pluggableSearchEngineManager.getFieldNames());
        }
        entries = _schema.getFieldDefMap().entrySet();
        _uidField = _schema.getUidField();
        _delField = _schema.getDeleteField();
        _skipField = _schema.getSkipField();
        _compressSrcData = _schema.isCompressSrcData();
        _dateExtractorMap = new HashMap<String, JsonValExtractor>();

        for (Entry<String, FieldDefinition> fieldEntry : entries) {
            final FieldDefinition definition = fieldEntry.getValue();
            if (!Date.class.equals(definition.type)) {
                continue;
            }

            // Date fields are parsed with the formatter configured on their own definition.
            final JsonValExtractor dateExtractor = new JsonValExtractor() {
                @Override
                public Object extract(String val) {
                    try {
                        final SimpleDateFormat dateFormat = (SimpleDateFormat) (definition.formatter);
                        return dateFormat.parse(val);
                    } catch (Exception e) {
                        throw new RuntimeException(e.getMessage(), e);
                    }
                }
            };
            _dateExtractorMap.put(fieldEntry.getKey(), dateExtractor);
        }
    }

    /**
     * Converts the raw string form of a field value into the object that is indexed.
     */
    private static interface JsonValExtractor {
        /**
         * @param val raw string value of the field (or of one token of a multi-valued field)
         * @return the extracted value
         */
        Object extract(String val);
    }

    /**
     * Extractors for the built-in meta field types, keyed by the field's declared type.
     * Types without an entry are handled by the caller.
     */
    private final static Map<Class<?>, JsonValExtractor> ExtractorMap = new HashMap<Class<?>, JsonValExtractor>();

    static {
        // Each numeric extractor maps a null or empty string to zero of its own type,
        // so the indexed string form stays consistent with non-empty values.
        ExtractorMap.put(int.class, new JsonValExtractor() {

            @Override
            public Object extract(String val) {
                if (val == null || val.length() == 0) {
                    return 0;
                }
                return Integer.parseInt(val);
            }

        });
        ExtractorMap.put(double.class, new JsonValExtractor() {

            @Override
            public Object extract(String val) {
                if (val == null || val.length() == 0) {
                    return 0.0;
                }
                return Double.parseDouble(val);
            }

        });
        ExtractorMap.put(long.class, new JsonValExtractor() {

            @Override
            public Object extract(String val) {
                if (val == null || val.length() == 0) {
                    // Must be a Long: a Double 0.0 would be rendered as "0.0" instead of "0".
                    return 0L;
                }
                return Long.parseLong(val);
            }

        });
        ExtractorMap.put(String.class, new JsonValExtractor() {

            @Override
            public Object extract(String val) {
                return val;
            }

        });

    }

    /**
     * Gzip-compresses a byte array.
     *
     * @param src bytes to compress; may be {@code null}
     * @return the gzip-encoded bytes, or {@code null} when {@code src} is {@code null}
     * @throws Exception if writing to the gzip stream fails
     */
    public static byte[] compress(byte[] src) throws Exception {
        if (src == null) {
            return null;
        }

        ByteArrayOutputStream bout = new ByteArrayOutputStream();
        GZIPOutputStream gzipStream = new GZIPOutputStream(bout);
        try {
            gzipStream.write(src);
        } finally {
            // close() writes the gzip trailer and releases the stream even if write() failed.
            gzipStream.close();
        }

        return bout.toByteArray();
    }

    /**
     * Decompresses gzip-encoded bytes.
     *
     * @param src gzip-encoded bytes; may be {@code null}
     * @return the decompressed bytes, or {@code null} when {@code src} is {@code null}
     * @throws Exception if {@code src} is not valid gzip data or reading fails
     */
    public static byte[] decompress(byte[] src) throws Exception {
        if (src == null) {
            return null;
        }

        ByteArrayOutputStream bout = new ByteArrayOutputStream();
        GZIPInputStream gzipStream = new GZIPInputStream(new ByteArrayInputStream(src));
        try {
            byte[] buf = new byte[1024];  // 1k buffer
            int len;
            // read() signals end of stream with -1.
            while ((len = gzipStream.read(buf)) != -1) {
                bout.write(buf, 0, len);
            }
        } finally {
            // Release the stream even when the data turns out to be corrupt.
            gzipStream.close();
        }

        return bout.toByteArray();
    }

    /**
     * Installs the hook that is applied to each Lucene document after the schema fields
     * have been added.
     *
     * @param customIndexingPipeline the pipeline to use, or {@code null} to disable it
     */
    public void setCustomIndexingPipeline(CustomIndexingPipeline customIndexingPipeline) {
        this._customIndexingPipeline = customIndexingPipeline;
    }

    /**
     * @return the currently installed custom indexing pipeline, or {@code null} if none
     */
    public CustomIndexingPipeline getCustomIndexingPipeline() {
        return this._customIndexingPipeline;
    }

    /**
     * Installs the filter that is applied to each incoming JSON object before it is indexed.
     *
     * @param jsonFilter the filter to use, or {@code null} to index objects unfiltered
     */
    public void setJsonFilter(JsonFilter jsonFilter) {
        this._jsonFilter = jsonFilter;
    }

    /**
     * Splits a value into non-empty tokens.
     *
     * <ul>
     *   <li>A {@code null} or empty value yields an empty list.</li>
     *   <li>A {@code null} or empty delimiter yields the whole value as a single token.</li>
     *   <li>A one-character delimiter supports backslash escaping: {@code \\} followed by the
     *       delimiter or by another backslash produces that character literally; a backslash
     *       before any other character (or at the end of the value) is kept as is.</li>
     *   <li>A longer delimiter string is handed to {@link StringTokenizer}, which treats every
     *       character of it as a separator; no escaping is applied.</li>
     * </ul>
     *
     * @param val   the value to split
     * @param delim the delimiter specification
     * @return the tokens in order of appearance; never {@code null}
     */
    public static List<String> tokenize(String val, String delim) {
        final List<String> tokens = new ArrayList<String>();

        if (val == null || val.length() == 0) {
            return tokens;
        }
        if (delim == null || delim.length() == 0) {
            tokens.add(val);
            return tokens;
        }
        if (delim.length() > 1) {
            for (StringTokenizer splitter = new StringTokenizer(val, delim); splitter.hasMoreTokens();) {
                tokens.add(splitter.nextToken());
            }
            return tokens;
        }

        final char separator = delim.charAt(0);
        final StringBuilder current = new StringBuilder();
        boolean pendingBackslash = false;

        for (int i = 0, n = val.length(); i < n; i++) {
            final char ch = val.charAt(i);
            if (pendingBackslash) {
                pendingBackslash = false;
                // Only an escaped backslash or separator swallows the escape character.
                if (ch != '\\' && ch != separator) {
                    current.append('\\');
                }
                current.append(ch);
            } else if (ch == '\\') {
                pendingBackslash = true;
            } else if (ch != separator) {
                current.append(ch);
            } else if (current.length() > 0) {
                tokens.add(current.toString());
                current.setLength(0);
            }
        }

        // A dangling backslash at the end of the value is kept literally.
        if (pendingBackslash) {
            current.append('\\');
        }
        if (current.length() > 0) {
            tokens.add(current.toString());
        }
        return tokens;
    }

    /**
     * Wraps a JSON object in a {@link ZoieIndexable} that builds its Lucene document lazily.
     *
     * <p>If a JSON filter is configured it is applied first; the filtered object drives
     * indexing, the UID and the delete/skip flags, while the stored value is taken from
     * the original, unfiltered object.
     *
     * @param obj the JSON object to index
     * @return an indexable view of {@code obj}
     * @throws RuntimeException if the configured JSON filter fails
     */
    @Override
    public ZoieIndexable convertAndInterpret(JSONObject obj) {
        final JSONObject src = obj;
        final JSONObject filtered;
        if (_jsonFilter != null) {
            try {
                filtered = _jsonFilter.filter(src);
            } catch (Exception e) {
                throw new RuntimeException(e.getMessage(), e);
            }
        } else {
            filtered = src;
        }
        return new AbstractZoieIndexable() {
            /**
             * Process a field with the given value according to the field definition and add the result to lucene document.
             * @param schemaFieldName name of the field as declared in the schema; used to look up
             *                        the per-field date extractor. For wildcard fields this is the
             *                        pattern's schema key, not the concrete document key.
             * @param fieldName name under which the value is added to the lucene document.
             * @param fieldVal  value of the field; {@code null} values are ignored.
             * @param fieldDef  field specification
             * @param doc  lucene document to which to add the field.
             */
            void addField(String schemaFieldName, String fieldName, String fieldVal, FieldDefinition fieldDef, Document doc) {

                if (fieldVal == null) {
                    return;
                }

                if (!fieldDef.isMeta) {
                    Field textField = new Field(fieldName, fieldVal,
                            fieldDef.textIndexSpec.store, fieldDef.textIndexSpec.index, fieldDef.textIndexSpec.tv);
                    doc.add(textField);
                    return;
                }

                JsonValExtractor extractor = ExtractorMap.get(fieldDef.type);
                if (extractor == null) {
                    if (Date.class.equals(fieldDef.type)) {
                        // The date extractors are keyed by schema field name. Looking them up by
                        // the concrete document key would miss for wildcard fields and leave the
                        // extractor null.
                        extractor = _dateExtractorMap.get(schemaFieldName);
                    } else {
                        extractor = ExtractorMap.get(String.class);
                    }
                }

                // Extract everything first so a malformed token aborts before the document is touched.
                List<Object> vals = new ArrayList<Object>();
                if (fieldDef.isMulti) {
                    for (String token : tokenize(fieldVal, fieldDef.delim)) {
                        Object extracted = extractor.extract(token);
                        if (extracted != null) {
                            vals.add(extracted);
                        }
                    }
                } else {
                    Object extracted = extractor.extract(fieldVal);
                    if (extracted != null) {
                        vals.add(extracted);
                    }
                }

                for (Object val : vals) {
                    String strVal;
                    if (fieldDef.formatter != null) {
                        strVal = fieldDef.formatter.format(val);
                    } else {
                        strVal = String.valueOf(val);
                    }
                    Field metaField = new Field(fieldName, strVal, Store.NO, Index.NOT_ANALYZED_NO_NORMS);
                    metaField.setOmitNorms(true);
                    metaField.setIndexOptions(IndexOptions.DOCS_ONLY);
                    doc.add(metaField);
                }
            }

            /**
             * Process a field name that has wildcards
             * We iterate through all the fields in the document and add them to Lucene document
             * if they match the pattern.
             * @param fieldName schema name of the wildcard field
             * @param fieldDef  field specification holding the wildcard pattern
             * @param input     JSON object whose keys are matched against the pattern
             * @param doc       lucene document to which matching fields are added
             */
            void processWildCards(String fieldName, FieldDefinition fieldDef, JSONObject input, Document doc) {
                Iterator<?> keyIterator = input.keys();

                while (keyIterator.hasNext()) {
                    String docField = keyIterator.next().toString();
                    if (fieldDef.wildCardPattern.matcher(docField).matches()) {
                        String val = input.optString(docField, null);
                        addField(fieldName, docField, val, fieldDef, doc);
                    }
                }
            }

            /**
             * Builds the single Lucene document for this JSON object from the schema fields,
             * then lets the custom indexing pipeline (if any) adjust it.
             *
             * @throws RuntimeException if any field cannot be extracted
             */
            @Override
            public IndexingReq[] buildIndexingReqs() {

                org.apache.lucene.document.Document luceneDoc = new org.apache.lucene.document.Document();
                for (Entry<String, FieldDefinition> entry : entries) {
                    String name = entry.getKey();
                    try {
                        final FieldDefinition fldDef = entry.getValue();
                        if (nonLuceneFields.contains(name)) {
                            continue;
                        }
                        if (fldDef.hasWildCards) {
                            processWildCards(name, fldDef, filtered, luceneDoc);
                        } else {
                            String val = filtered.optString(name, null);
                            addField(name, name, val, fldDef, luceneDoc);
                        }
                    } catch (Exception e) {
                        logger.error("Problem extracting data for field: " + name, e);
                        throw new RuntimeException(e);
                    }
                }

                if (_customIndexingPipeline != null) {
                    _customIndexingPipeline.applyCustomization(luceneDoc, _schema, filtered);
                }
                return new IndexingReq[]{new IndexingReq(luceneDoc)};
            }

            /**
             * @return the UID parsed from the schema's UID field
             * @throws IllegalStateException if the UID field cannot be read from the JSON object
             */
            @Override
            public long getUID() {
                try {
                    return Long.parseLong(filtered.getString(_uidField));
                } catch (JSONException e) {
                    throw new IllegalStateException(e.getMessage(), e);
                }
            }

            /**
             * The event-type field takes precedence; the boolean delete field is consulted
             * only when no event type is present. Errors are logged and reported as "not deleted".
             */
            @Override
            public boolean isDeleted() {
                try {
                    String type = filtered.optString(SenseiSchema.EVENT_TYPE_FIELD, null);
                    if (type == null) {
                        return filtered.optBoolean(_delField);
                    }
                    return SenseiSchema.EVENT_TYPE_DELETE.equalsIgnoreCase(type);
                } catch (Exception e) {
                    logger.error(e.getMessage(), e);
                    return false;
                }
            }

            /**
             * The event-type field takes precedence; the boolean skip field is consulted
             * only when no event type is present. Errors are logged and reported as "not skipped".
             */
            @Override
            public boolean isSkip() {
                try {
                    String type = filtered.optString(SenseiSchema.EVENT_TYPE_FIELD, null);
                    if (type == null) {
                        return filtered.optBoolean(_skipField);
                    }
                    return SenseiSchema.EVENT_TYPE_SKIP.equalsIgnoreCase(type);
                } catch (Exception e) {
                    logger.error(e.getMessage(), e);
                    return false;
                }
            }

            /**
             * Produces the bytes to store for this document: the schema's source-data field if
             * present, otherwise the whole original JSON object, encoded as UTF-8 and optionally
             * gzip-compressed.
             *
             * @return the bytes to store, or {@code null} if the source is {@code null} or
             *         serialization failed (the failure is logged)
             */
            @Override
            public byte[] getStoreValue() {
                byte[] data = null;
                if (src != null) {
                    // Take the event-type field out while serializing so it is not part of the
                    // stored payload; it is put back below.
                    Object type = src.remove(SenseiSchema.EVENT_TYPE_FIELD);
                    try {
                        String srcData = src.optString(_schema.getSrcDataField(), null);
                        if (srcData == null) {
                            srcData = src.toString();
                        }
                        byte[] raw = srcData.getBytes("UTF-8");
                        data = _compressSrcData ? compress(raw) : raw;
                    } catch (Exception e) {
                        logger.error(e.getMessage(), e);
                    }

                    if (type != null) {
                        try {
                            src.put(SenseiSchema.EVENT_TYPE_FIELD, type);
                        } catch (Exception e) {
                            logger.error("Should never happen", e);
                        }
                    }
                }

                return data;
            }

            /** Every document produced by this interpreter is storable. */
            @Override
            public boolean isStorable() {
                return true;
            }


        };
    }

}