/** * This software is licensed to you under the Apache License, Version 2.0 (the * "Apache License"). * * LinkedIn's contributions are made under the Apache License. If you contribute * to the Software, the contributions will be deemed to have been made under the * Apache License, unless you expressly indicate otherwise. Please do not make any * contributions that would be inconsistent with the Apache License. * * You may obtain a copy of the Apache License at http://www.apache.org/licenses/LICENSE-2.0 * Unless required by applicable law or agreed to in writing, this software * distributed under the Apache License is distributed on an "AS IS" BASIS, WITHOUT * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the Apache * License for the specific language governing permissions and limitations for the * software governed under the Apache License. * * © 2012 LinkedIn Corp. All Rights Reserved. */ package com.senseidb.indexing; import java.io.ByteArrayInputStream; import java.io.ByteArrayOutputStream; import java.nio.charset.Charset; import java.text.SimpleDateFormat; import java.util.ArrayList; import java.util.Date; import java.util.HashMap; import java.util.HashSet; import java.util.Iterator; import java.util.LinkedList; import java.util.List; import java.util.Map; import java.util.Map.Entry; import java.util.Set; import java.util.StringTokenizer; import java.util.zip.GZIPInputStream; import java.util.zip.GZIPOutputStream; import org.apache.commons.configuration.ConfigurationException; import org.apache.log4j.Logger; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.document.Field.Index; import org.apache.lucene.document.Field.Store; import org.apache.lucene.index.FieldInfo.IndexOptions; import org.json.JSONException; import org.json.JSONObject; import proj.zoie.api.indexing.AbstractZoieIndexable; import proj.zoie.api.indexing.AbstractZoieIndexableInterpreter; import proj.zoie.api.indexing.ZoieIndexable; import com.senseidb.conf.SenseiSchema; import com.senseidb.conf.SenseiSchema.FieldDefinition; import com.senseidb.search.plugin.PluggableSearchEngineManager; public class DefaultJsonSchemaInterpreter extends AbstractZoieIndexableInterpreter<JSONObject> { private static final Logger logger = Logger.getLogger(DefaultJsonSchemaInterpreter.class); private final SenseiSchema _schema; private final Set<Entry<String, FieldDefinition>> entries; private final String _uidField; private final String _delField; private final String _skipField; private final boolean _compressSrcData; private final Map<String, JsonValExtractor> _dateExtractorMap; private JsonFilter _jsonFilter = null; private static Charset UTF8 = Charset.forName("UTF-8"); private CustomIndexingPipeline _customIndexingPipeline = null; private Set<String> nonLuceneFields = new HashSet<String>(); public DefaultJsonSchemaInterpreter(SenseiSchema schema) throws ConfigurationException { this(schema, null); } public DefaultJsonSchemaInterpreter(SenseiSchema schema, PluggableSearchEngineManager pluggableSearchEngineManager) throws ConfigurationException { _schema = schema; if (pluggableSearchEngineManager != null) { nonLuceneFields.addAll(pluggableSearchEngineManager.getFieldNames()); } entries = _schema.getFieldDefMap().entrySet(); _uidField = _schema.getUidField(); _delField = _schema.getDeleteField(); _skipField = _schema.getSkipField(); _compressSrcData = _schema.isCompressSrcData(); _dateExtractorMap = new HashMap<String, JsonValExtractor>(); for (Entry<String, FieldDefinition> entry : entries) { final FieldDefinition def = entry.getValue(); if (Date.class.equals(def.type)) { _dateExtractorMap.put(entry.getKey(), new JsonValExtractor() { @Override public Object extract(String val) { try { return ((SimpleDateFormat) (def.formatter)).parse(val); } catch (Exception e) { throw new RuntimeException(e.getMessage(), e); } } }); } } } private static interface JsonValExtractor { Object extract(String val); } private final static Map<Class, JsonValExtractor> ExtractorMap = new HashMap<Class, JsonValExtractor>(); static { ExtractorMap.put(int.class, new JsonValExtractor() { @Override public Object extract(String val) { if (val == null || val.length() == 0) { return 0; } else { int num = Integer.parseInt(val); /*if (num<0){ logger.error("we don't yet support negative values, skipping."); return null; }*/ return num; } } }); ExtractorMap.put(double.class, new JsonValExtractor() { @Override public Object extract(String val) { if (val == null || val.length() == 0) { return 0.0; } else { double num = Double.parseDouble(val); /*if (num<0.0){ logger.error("we don't yet support negative values, skipping."); return null; }*/ return num; } } }); ExtractorMap.put(long.class, new JsonValExtractor() { @Override public Object extract(String val) { if (val == null || val.length() == 0) { return 0.0; } else { long num = Long.parseLong(val); /* if (num<0){ logger.error("we don't yet support negative values, skipping."); return null; }*/ return num; } } }); ExtractorMap.put(String.class, new JsonValExtractor() { @Override public Object extract(String val) { return val; } }); } public static byte[] compress(byte[] src) throws Exception { byte[] data = null; if (src != null) { ByteArrayOutputStream bout = new ByteArrayOutputStream(); GZIPOutputStream gzipStream = new GZIPOutputStream(bout); gzipStream.write(src); gzipStream.flush(); gzipStream.close(); bout.flush(); data = bout.toByteArray(); } return data; } public static byte[] decompress(byte[] src) throws Exception { byte[] data = null; if (src != null) { ByteArrayOutputStream bout = new ByteArrayOutputStream(); byte[] buf = new byte[1024]; // 1k buffer ByteArrayInputStream bin = new ByteArrayInputStream(src); GZIPInputStream gzipStream = new GZIPInputStream(bin); int len; while ((len = gzipStream.read(buf)) > 0) { bout.write(buf, 0, len); } bout.flush(); data = bout.toByteArray(); } return data; } public void setCustomIndexingPipeline(CustomIndexingPipeline customIndexingPipeline) { _customIndexingPipeline = customIndexingPipeline; } public CustomIndexingPipeline getCustomIndexingPipeline() { return _customIndexingPipeline; } public void setJsonFilter(JsonFilter jsonFilter) { _jsonFilter = jsonFilter; } public static List<String> tokenize(String val, String delim) { List<String> result = new ArrayList<String>(); if (val == null || val.length() == 0) return result; if (delim == null || delim.length() == 0) result.add(val); else if (delim.length() == 1) { char de = delim.charAt(0); StringBuilder sb = new StringBuilder(); boolean escape = false; for (char c : val.toCharArray()) { if (escape) { if (c == '\\' || c == de) sb.append(c); else sb.append('\\').append(c); escape = false; } else { if (c == '\\') { escape = true; continue; } else if (c == de) { if (sb.length() > 0) { result.add(sb.toString()); sb.setLength(0); } } else sb.append(c); } } if (escape) sb.append('\\'); if (sb.length() > 0) result.add(sb.toString()); } else { StringTokenizer strtok = new StringTokenizer(val, delim); while (strtok.hasMoreTokens()) { result.add(strtok.nextToken()); } } return result; } @Override public ZoieIndexable convertAndInterpret(JSONObject obj) { final JSONObject src = obj; final JSONObject filtered; if (_jsonFilter != null) { try { filtered = _jsonFilter.filter(src); } catch (Exception e) { throw new RuntimeException(e.getMessage(), e); } } else { filtered = src; } return new AbstractZoieIndexable() { /** * Process a field with the given value according to the field definition and add the result to lucene document. * @param filedName name of the field. * @param fieldVal value of the field. * @param fieldDef field specification * @param doc lucene document to which to add the field. */ void addField(String filedName, String fieldVal, FieldDefinition fieldDef, Document doc) { if (fieldVal == null) { return; } if (fieldDef.isMeta) { JsonValExtractor extractor = ExtractorMap.get(fieldDef.type); if (extractor == null) { if (Date.class.equals(fieldDef.type)) { extractor = _dateExtractorMap.get(filedName); } else { extractor = ExtractorMap.get(String.class); } } List<Object> vals = new LinkedList<Object>(); if (fieldDef.isMulti) { for (String token : tokenize(fieldVal, fieldDef.delim)) { Object obj = extractor.extract(token); if (obj != null) { vals.add(obj); } } } else { Object obj = extractor.extract(fieldVal); if (obj != null) { vals.add(obj); } } for (Object val : vals) { if (val == null) continue; String strVal = null; if (fieldDef.formatter != null) { strVal = fieldDef.formatter.format(val); } else { strVal = String.valueOf(val); } Field metaField = new Field(filedName, strVal, Store.NO, Index.NOT_ANALYZED_NO_NORMS); metaField.setOmitNorms(true); metaField.setIndexOptions(IndexOptions.DOCS_ONLY); doc.add(metaField); } } else { Field textField = new Field(filedName, fieldVal, fieldDef.textIndexSpec.store, fieldDef.textIndexSpec.index, fieldDef.textIndexSpec.tv); doc.add(textField); } } /** * Process a field name that has wildcards * We iterate through all the fields in the document and add them to Lucene document * if they match the pattern. */ void processWildCards(String fieldName, FieldDefinition fieldDef, JSONObject input, Document doc) { Iterator keyIterator = input.keys(); while (keyIterator.hasNext()) { String docField = keyIterator.next().toString(); if (fieldDef.wildCardPattern.matcher(docField).matches()) { String val = input.optString(docField, null); addField(docField, val, fieldDef, doc); } } } @Override public IndexingReq[] buildIndexingReqs() { org.apache.lucene.document.Document luceneDoc = new org.apache.lucene.document.Document(); for (Entry<String, FieldDefinition> entry : entries) { String name = entry.getKey(); try { final FieldDefinition fldDef = entry.getValue(); if (nonLuceneFields.contains(entry.getKey())) { continue; } if (fldDef.hasWildCards) { processWildCards(name, fldDef, filtered, luceneDoc); } else { String val = filtered.optString(name, null); addField(name, val, fldDef, luceneDoc); } } catch (Exception e) { logger.error("Problem extracting data for field: " + name, e); throw new RuntimeException(e); } } if (_customIndexingPipeline != null) { _customIndexingPipeline.applyCustomization(luceneDoc, _schema, filtered); } return new IndexingReq[]{new IndexingReq(luceneDoc)}; } @Override public long getUID() { try { return Long.parseLong(filtered.getString(_uidField)); } catch (JSONException e) { throw new IllegalStateException(e.getMessage(), e); } } @Override public boolean isDeleted() { try { String type = filtered.optString(SenseiSchema.EVENT_TYPE_FIELD, null); if (type == null) return filtered.optBoolean(_delField); else return SenseiSchema.EVENT_TYPE_DELETE.equalsIgnoreCase(type); } catch (Exception e) { logger.error(e.getMessage(), e); return false; } } @Override public boolean isSkip() { try { String type = filtered.optString(SenseiSchema.EVENT_TYPE_FIELD, null); if (type == null) return filtered.optBoolean(_skipField); else return SenseiSchema.EVENT_TYPE_SKIP.equalsIgnoreCase(type); } catch (Exception e) { logger.error(e.getMessage(), e); return false; } } @Override public byte[] getStoreValue() { byte[] data = null; if (src != null) { Object type = src.remove(SenseiSchema.EVENT_TYPE_FIELD); try { String srcData = src.optString(_schema.getSrcDataField(), null); if (srcData == null) { srcData = src.toString(); } if (_compressSrcData) data = compress(srcData.getBytes("UTF-8")); else data = srcData.getBytes("UTF-8"); } catch (Exception e) { logger.error(e.getMessage(), e); } if (type != null) { try { src.put(SenseiSchema.EVENT_TYPE_FIELD, type); } catch (Exception e) { logger.error("Should never happen", e); } } } return data; } @Override public boolean isStorable() { return true; } }; } }