/*
 * chombo: Hadoop Map Reduce utility
 * Author: Pranab Ghosh
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you
 * may not use this file except in compliance with the License. You may
 * obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 * implied. See the License for the specific language governing
 * permissions and limitations under the License.
 */

package org.chombo.transformer;

import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;

import org.chombo.util.BasicUtils;
import org.codehaus.jackson.JsonParseException;
import org.codehaus.jackson.map.JsonMappingException;
import org.codehaus.jackson.map.ObjectMapper;
import org.codehaus.jackson.type.TypeReference;

/**
 * Extracts field values from a JSON record using dot separated paths and
 * turns them into flat records, either denormalized (parent values repeated
 * on every child row) or normalized (one parent record plus child records
 * that carry a reference to the parent id).
 *
 * Path syntax as implemented here: elements are separated by ".". An element
 * containing the marker "@a" addresses a list; the text before the marker is
 * the key. If anything follows the marker, its first character is skipped and
 * the remainder is parsed as the integer index of the list item; with nothing
 * after the marker all list items are visited.
 *
 * @author pranab
 *
 */
public class JsonFieldExtractor implements Serializable {
    // transient: re-created lazily in parse(), so the extractor can be
    // serialized even after it has been used
    private transient ObjectMapper mapper;
    private Map<String, Object> map;
    private AttributeList extField = null;
    private boolean failOnInvalid;
    private boolean normalize;
    private AttributeList[] records;
    private Map<Integer, String> fieldTypes = new HashMap<Integer, String>();
    private Map<String, Integer> childObjectPaths = new HashMap<String, Integer>();
    private int numChildObjects;
    private List<String[]> extractedRecords = new ArrayList<String[]>();
    private int numAttributes;
    private String listChild = "@a";
    private int listChildLen = 2;
    private Map<String, List<Integer>> entityColumnIndexes = new HashMap<String, List<Integer>>();
    private String[] extractedParentRecord;
    private Map<String, List<String[]>> extractedChildRecords = new HashMap<String, List<String[]>>();
    private int numParentFields;
    private boolean debugOn;
    private static final String ROOT_ENTITY = "root";
    private String idFieldPath;
    // offset such that extractedParentRecord[idFieldIndex + 1] is the parent id
    private int idFieldIndex;
    private boolean autoIdGeneration;

    /**
     * @param failOnInvalid if true, parse failures and unreachable fields raise
     * IllegalArgumentException instead of being tolerated
     * @param normalize if true extractAllFields builds parent and child records,
     * otherwise it builds denormalized records
     */
    public JsonFieldExtractor(boolean failOnInvalid, boolean normalize) {
        //mapper = new ObjectMapper();
        this.failOnInvalid = failOnInvalid;
        this.normalize = normalize;
    }

    /**
     * Sets the path of the root level field used as parent id in normalized output
     * @param idFieldPath
     * @return this extractor
     */
    public JsonFieldExtractor withIdFieldPath(String idFieldPath) {
        this.idFieldPath = idFieldPath;
        if (debugOn) {
            System.out.println("parent id field defined");
        }
        return this;
    }

    /**
     * Enables generation of a synthetic parent id in normalized output
     * @return this extractor
     */
    public JsonFieldExtractor withAutoIdGeneration() {
        this.autoIdGeneration = true;
        if (debugOn) {
            System.out.println("parent id auto generated");
        }
        return this;
    }

    /**
     * @param debugOn true to print trace messages to standard output
     */
    public void setDebugOn(boolean debugOn) {
        this.debugOn = debugOn;
    }

    /**
     * @return denormalized records built by the last extractAllFields call
     */
    public List<String[]> getExtractedRecords() {
        return extractedRecords;
    }

    /**
     * @return parent record built by the last extractAllFields call in normalize
     * mode, null if that call did not produce one
     */
    public String[] getExtractedParentRecord() {
        return extractedParentRecord;
    }

    /**
     * @return child records keyed by child path, built by the last
     * extractAllFields call in normalize mode
     */
    public Map<String, List<String[]>> getExtractedChildRecords() {
        return extractedChildRecords;
    }

    /**
     * Parses a JSON object into the internal map. On failure the map is reset
     * to null and, if failOnInvalid is set, IllegalArgumentException is thrown.
     * @param record JSON text
     */
    public void parse(String record) {
        try {
            if (null == mapper) {
                mapper = new ObjectMapper();
            }
            //explicit encoding, platform default charset would corrupt non ASCII content
            InputStream is = new ByteArrayInputStream(record.getBytes("UTF-8"));
            map = mapper.readValue(is, new TypeReference<Map<String, Object>>() {});
        } catch (JsonParseException ex) {
            handleParseError(ex);
        } catch (JsonMappingException ex) {
            handleParseError(ex);
        } catch (IOException ex) {
            handleParseError(ex);
        }
    }

    /**
     * Resets the parsed map and optionally escalates the failure
     * @param ex
     */
    private void handleParseError(Exception ex) {
        map = null;
        if (failOnInvalid) {
            //keep the original exception as cause for diagnosis
            throw new IllegalArgumentException("failed to parse json " + ex.getMessage(), ex);
        }
    }

    /**
     * Extracts all values reachable through a path from the last parsed record
     * @param path dot separated JSON path
     * @return extracted values, empty if nothing has been parsed successfully
     */
    public AttributeList extractField(String path) {
        extField = new AttributeList();
        if (null != map) {
            String[] pathElements = path.split("\\.");
            extractField(map, pathElements, 0);
        }
        return extField;
    }

    /**
     * Walks one path element down from the given object and appends every
     * value found to the current result list.
     * @param map object to descend from
     * @param pathElements all path elements
     * @param index index of the path element to resolve against map
     */
    @SuppressWarnings("unchecked")
    public void extractField(Map<String, Object> map, String[] pathElements, int index) {
        //extract index in case current path element point to list
        String key = null;
        int keyIndex = 0;
        String pathElem = pathElements[index];
        int pos = pathElem.indexOf(listChild);
        if (pos == -1) {
            //scalar
            key = pathElem;
            if (debugOn) {
                System.out.println("non array key: " + key);
            }
        } else {
            //array
            key = pathElem.substring(0, pos);
            if (debugOn) {
                System.out.println("array key: " + key);
            }

            //whole list if no index provided
            String indexPart = pathElem.substring(pos + listChildLen);
            if (debugOn) {
                System.out.println("indexPart: " + indexPart);
            }
            if (!indexPart.isEmpty()) {
                //first character after the marker is a separator
                keyIndex = Integer.parseInt(indexPart.substring(1));
            } else {
                keyIndex = -1;
            }
            if (debugOn) {
                System.out.println("keyIndex: " + keyIndex);
            }
        }

        Object obj = map.get(key);
        if (null == obj) {
            //invalid key
            handleUnreachableField();
        } else if (obj instanceof Map<?,?>) {
            //got object, traverse further
            if (debugOn) {
                System.out.println("got map next");
            }
            if (index == pathElements.length - 1) {
                throw new IllegalArgumentException("got map at end of json path");
            }
            extractField((Map<String, Object>)obj, pathElements, index + 1);
        } else if (obj instanceof List<?>) {
            //got list
            if (debugOn) {
                System.out.println("got list next");
            }
            List<?> listObj = (List<?>)obj;
            if (keyIndex >= 0) {
                //specific item in list
                if (keyIndex >= listObj.size()) {
                    //index beyond end of list, same treatment as a missing key
                    handleUnreachableField();
                } else {
                    extractListItem(listObj.get(keyIndex), pathElements, index);
                }
            } else {
                //all items in list, an empty list contributes nothing
                for (Object item : listObj) {
                    extractListItem(item, pathElements, index);
                }
            }
        } else {
            //primitive
            extField.add(obj.toString());
        }
    }

    /**
     * Handles one list item: descends into it when it is an object, otherwise
     * appends its string form.
     * @param item list item
     * @param pathElements all path elements
     * @param index index of the path element that addressed the list
     */
    @SuppressWarnings("unchecked")
    private void extractListItem(Object item, String[] pathElements, int index) {
        if (item instanceof Map<?,?>) {
            //non primitive list
            if (index == pathElements.length - 1) {
                throw new IllegalArgumentException("got list of map at end of json path");
            }
            extractField((Map<String, Object>)item, pathElements, index + 1);
        } else if (null == item) {
            //json null inside list
            handleUnreachableField();
        } else {
            //element in primitive list
            extField.add(item.toString());
        }
    }

    /**
     * Either fails or records an empty value, depending on failOnInvalid
     */
    private void handleUnreachableField() {
        if (failOnInvalid) {
            throw new IllegalArgumentException("field not reachable with json path");
        }
        extField.add("");
    }

    /**
     * Parses a record, extracts the values for all paths and builds the
     * normalized or denormalized output. Results of any earlier call are
     * discarded first.
     * @param record JSON text
     * @param paths JSON paths, one per output field
     * @return true if the record could be parsed and every path produced at
     * least one value, false otherwise (no output is built in that case)
     */
    public boolean extractAllFields(String record, List<String> paths) {
        records = new AttributeList[paths.size()];
        numChildObjects = 1;
        numAttributes = paths.size();

        //never expose output of an earlier record for a record that fails
        extractedRecords.clear();
        extractedParentRecord = null;
        extractedChildRecords.clear();

        parse(record);
        if (null == map) {
            return false;
        }

        boolean valid = true;
        int i = 0;
        for (String path : paths) {
            if (debugOn) {
                System.out.println("next path: " + path);
            }
            AttributeList fields = extractField(path);
            if (fields.isEmpty()) {
                valid = false;
                break;
            }
            records[i++] = fields;
            if (fields.size() > numChildObjects) {
                numChildObjects = fields.size();
            }
        }

        //records is only fully populated for a valid record
        if (valid) {
            if (!normalize) {
                deNormalize(paths);
            } else {
                normalize(paths);
            }
        }
        return valid;
    }

    /**
     * Builds denormalized records, replicating parent values for each child
     * @param paths
     */
    private void deNormalize(List<String> paths) {
        int index = 0;
        fieldTypes.clear();
        childObjectPaths.clear();

        //all paths
        for (String path : paths) {
            if (isChildObject(path)) {
                String childPath = getChildPath(path);
                fieldTypes.put(index, childPath);

                //number of child object fields for a given child path
                childObjectPaths.put(childPath, records[index].size());
            } else {
                fieldTypes.put(index, ROOT_ENTITY);
            }
            ++index;
        }
        if (childObjectPaths.size() > 1) {
            throw new IllegalStateException("can not denormalize with multiple child object types");
        }

        //replicated parent attributes
        replicateParentAttributes();

        //get denormalized records
        getDeNormalizedRecords();
    }

    /**
     * @param path
     * @return true if the last or the second last path element ends with the
     * list marker
     */
    private boolean isChildObject(String path) {
        //ends with either list of primitives or child of object which is an element of a list
        String[] pathElements = path.split("\\.");
        int len = pathElements.length;
        return (len >= 2 && pathElements[len - 2].endsWith(listChild))
                || pathElements[len - 1].endsWith(listChild);
    }

    /**
     * @param path
     * @return the path itself if its last element ends with the list marker,
     * otherwise the path without its last element
     */
    private String getChildPath(String path) {
        String[] pathElements = path.split("\\.");
        String childPath = path;
        if (!pathElements[pathElements.length - 1].endsWith(listChild)) {
            int pos = path.lastIndexOf(".");
            childPath = path.substring(0, pos);
        }
        return childPath;
    }

    /**
     * Replicate parent attributes
     *
     */
    private void replicateParentAttributes() {
        for (AttributeList fields : records) {
            if (fields.size() == 1) {
                //parent object field
                String value = fields.get(0);

                //replicate as many times as the number of child objects
                for (int i = 1; i < numChildObjects; ++i) {
                    fields.add(value);
                }
            }
        }
    }

    /**
     * Transposes the per field value lists into one record per child object
     */
    private void getDeNormalizedRecords() {
        extractedRecords.clear();

        //rows
        for (int i = 0; i < numChildObjects; ++i) {
            String[] record = new String[numAttributes];

            //fields
            for (int j = 0; j < numAttributes; ++j) {
                record[j] = records[j].get(i);
            }
            extractedRecords.add(record);
        }
    }

    /**
     * Groups field indexes by entity, resolves the parent id column and builds
     * parent and child records
     * @param paths
     */
    private void normalize(List<String> paths) {
        int index = 0;
        childObjectPaths.clear();
        numParentFields = 0;
        entityColumnIndexes.clear();
        idFieldIndex = -1;
        for (String path : paths) {
            if (isChildObject(path)) {
                //child
                String childPath = getChildPath(path);

                //field indexes for this child object
                List<Integer> indexes = getEnityColIndexes(childPath);
                indexes.add(index);
                childObjectPaths.put(childPath, records[index].size());
            } else {
                //root
                List<Integer> indexes = getEnityColIndexes(ROOT_ENTITY);
                indexes.add(index);

                //root object ID field
                if (null != idFieldPath && path.equals(idFieldPath)) {
                    //position among root columns, because the parent record holds
                    //root fields only and child paths may precede the id path
                    idFieldIndex = indexes.size() - 1;
                }
            }
            ++index;
        }

        //auto generated id always sits right after the entity type in the parent record
        if (autoIdGeneration) {
            idFieldIndex = 0;
        }
        if (idFieldIndex < 0) {
            throw new IllegalStateException("parent entity id field not found");
        }

        //build parent record
        buildParentRecord();

        //build all child records
        buildChildRecords();
    }

    /**
     * Returns the field index list of an entity, creating it on first use
     * @param entity
     * @return
     */
    private List<Integer> getEnityColIndexes(String entity) {
        List<Integer> indexes = entityColumnIndexes.get(entity);
        if (null == indexes) {
            indexes = new ArrayList<Integer>();
            entityColumnIndexes.put(entity, indexes);
        }
        return indexes;
    }

    /**
     * Builds the parent record: entity type, optional generated id, then the
     * first value of every root field
     */
    private void buildParentRecord() {
        List<Integer> indexes = getEnityColIndexes(ROOT_ENTITY);
        int i = 0;
        if (autoIdGeneration) {
            //additional field for entity type synthetic Id
            extractedParentRecord = new String[indexes.size() + 2];
            extractedParentRecord[i++] = ROOT_ENTITY;
            extractedParentRecord[i++] = BasicUtils.generateId();
        } else {
            //additional field for entity type
            extractedParentRecord = new String[indexes.size() + 1];
            extractedParentRecord[i++] = ROOT_ENTITY;
        }

        //populate all fields of root object
        for (int index : indexes) {
            extractedParentRecord[i++] = records[index].get(0);
        }
    }

    /**
     * Builds the records of every child entity: entity type, parent id, then
     * the child field values
     */
    private void buildChildRecords() {
        extractedChildRecords.clear();

        //reference to parent record, index shifted to accommodate entity type in parent record
        String parentId = extractedParentRecord[idFieldIndex + 1];

        //all child objects
        for (Map.Entry<String, List<Integer>> entry : entityColumnIndexes.entrySet()) {
            String entity = entry.getKey();
            if (entity.equals(ROOT_ENTITY)) {
                continue;
            }
            List<String[]> childRecList = new ArrayList<String[]>();

            //field indexes for this child object
            List<Integer> indexes = entry.getValue();

            //number of records for this child object
            int numRecs = childObjectPaths.get(entity);

            //for all child records
            for (int i = 0; i < numRecs; ++i) {
                //additional fields for entity type and parent ID
                String[] childRec = new String[indexes.size() + 2];
                int j = 0;
                childRec[j++] = entity;
                childRec[j++] = parentId;

                //all child record fields
                for (int index : indexes) {
                    childRec[j++] = records[index].get(i);
                }
                childRecList.add(childRec);
            }
            extractedChildRecords.put(entity, childRecList);
        }
    }

    /**
     * List of extracted string values for one path
     */
    private static class AttributeList extends ArrayList<String> {}

}