/* (c) 2014 LinkedIn Corp. All rights reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); you may not use * this file except in compliance with the License. You may obtain a copy of the * License at http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software distributed * under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR * CONDITIONS OF ANY KIND, either express or implied. */ package com.linkedin.cubert.block; import java.util.Arrays; import java.util.HashMap; import java.util.HashSet; import java.util.Map; import java.util.Set; import org.codehaus.jackson.JsonNode; import org.codehaus.jackson.map.ObjectMapper; import org.codehaus.jackson.node.ArrayNode; /** * Describes the schema of a block. * * The description is a list of ColumnType for the fields in the Tuple. * * @author Maneesh Varshney * */ public class BlockSchema { private final ColumnType[] columnTypes; private Map<String, Integer> indexMap; public BlockSchema(BlockSchema schema) { this.columnTypes = schema.columnTypes; } public BlockSchema(ColumnType[] columnTypes) { if (columnTypes == null) throw new IllegalArgumentException("input argument is null"); this.columnTypes = columnTypes; } public BlockSchema(JsonNode json) { int ncols = json.size(); columnTypes = new ColumnType[ncols]; for (int i = 0; i < ncols; i++) columnTypes[i] = new ColumnType(json.get(i)); } public BlockSchema(String str) { if (str == null) throw new IllegalArgumentException("input argument is null"); String pairs[] = str.split(","); columnTypes = new ColumnType[pairs.length]; int idx = 0; for (String pair : pairs) { pair = pair.trim(); String[] typeName = pair.split("\\s+"); String type = typeName[0].trim(); String name = typeName[1].trim(); ColumnType ctype = new ColumnType(); ctype.setName(name); ctype.setType(type); columnTypes[idx++] = ctype; } } public int getNumColumns() { return columnTypes.length; } public String getName(int index) { return columnTypes[index].getName(); } public DataType getType(int index) { return columnTypes[index].getType(); } public ColumnType getColumnType(int index) { return columnTypes[index]; } public ColumnType[] getColumnTypes() { return columnTypes; } public boolean hasIndex(String colName) { return getIndexMap().get(colName) != null; } public int getIndex(String columnName) { Integer index = getIndexMap().get(columnName); if (index == null) throw new IllegalArgumentException("Column [" + columnName + "] is not part of schema : " + toString()); return index; } public boolean isFlatSchema() { for (ColumnType ct : columnTypes) if (!ct.getType().isPrimitive()) return false; return true; } public int getMemorySize() { int size = 0; for (ColumnType type : columnTypes) { switch (type.getType()) { case BYTE: size += 1; break; case DOUBLE: size += 8; break; case FLOAT: size += 4; break; case INT: size += 4; break; case LONG: size += 8; break; default: throw new IllegalArgumentException("Cannot estimate memory size of Tuple with non-numerical fields"); } } return size; } public Map<String, Integer> getIndexMap() { if (indexMap == null) { indexMap = new HashMap<String, Integer>(); int idx = 0; for (ColumnType type : columnTypes) { indexMap.put(type.getName(), idx++); } } return indexMap; } /** * Returns an array of column names for the specified ColumnType array. * * @return */ public String[] getColumnNames() { String[] colNames = new String[columnTypes.length]; int ncols = colNames.length; for (int i = 0; i < ncols; i++) { colNames[i] = columnTypes[i].getName(); } return colNames; } /** * Returns the subset of the input ColumnType array, where the name of the columns in * the subset matches the specified array of string. * * The order of the ColumnType in the subset will be identical to the order specified * in the array of strings. * * @param subset * @return */ public BlockSchema getSubset(String[] subset) { ColumnType[] subsetTypes = new ColumnType[subset.length]; Map<String, ColumnType> map = asMap(); int idx = 0; for (String colname : subset) { ColumnType type = map.get(colname); if (type == null) throw new IllegalArgumentException("Column " + colname + " is not present in ColumnTypes"); subsetTypes[idx++] = type; } return new BlockSchema(subsetTypes); } /** * Returns the subset of the input ColumnType array where the name of columns in the * subset DO NOT match the specified array of string. * * The order of ColumnType in the subset will be same as the order found in the input * ColumnType array. * * @param subset * @return */ public BlockSchema getComplementSubset(String[] subset) { ColumnType[] complement = new ColumnType[getNumColumns() - subset.length]; Set<String> set = new HashSet<String>(); set.addAll(Arrays.asList(subset)); int idx = 0; for (ColumnType type : columnTypes) { if (!set.contains(type.getName())) { complement[idx++] = type; } } return new BlockSchema(complement); } public BlockSchema append(BlockSchema other) { ColumnType[] first = this.columnTypes; ColumnType[] second = other.columnTypes; ColumnType[] append = new ColumnType[first.length + second.length]; System.arraycopy(first, 0, append, 0, first.length); System.arraycopy(second, 0, append, first.length, second.length); return new BlockSchema(append); } public boolean allFieldsAllowShallowCopy() { for (ColumnType columnType : columnTypes) if (!columnType.getType().allowShallowCopy()) return false; return true; } /** * Returns a map of column name and the column type for the specified ColumnType * array. * * @return */ public Map<String, ColumnType> asMap() { Map<String, ColumnType> map = new HashMap<String, ColumnType>(); for (ColumnType type : columnTypes) { map.put(type.getName(), type); } return map; } public JsonNode toJson() { ArrayNode node = new ObjectMapper().createArrayNode(); for (ColumnType ct : columnTypes) node.add(ct.toJson()); return node; } @Override public boolean equals(Object obj) { if (this == obj) return true; if (obj == null) return false; if (getClass() != obj.getClass()) return false; BlockSchema other = (BlockSchema) obj; if (!Arrays.equals(columnTypes, other.columnTypes)) return false; return true; } /** * Compares schemas by ignoring mismatches in the numerical types. * * @param other * @return */ public boolean equalsIgnoreNumeric(BlockSchema other) { if (this.columnTypes.length != other.columnTypes.length) return false; for (int i = 0; i < columnTypes.length; i++) { ColumnType type1 = columnTypes[i]; ColumnType type2 = other.columnTypes[i]; if (!type1.getName().equals(type2.getName())) return false; if (type1.getType().isNumerical() && type2.getType().isNumerical()) continue; if (!type1.matches(type2)) return false; } return true; } @Override public String toString() { return Arrays.toString(columnTypes); } }