/* (c) 2014 LinkedIn Corp. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not use
* this file except in compliance with the License. You may obtain a copy of the
* License at http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software distributed
* under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
* CONDITIONS OF ANY KIND, either express or implied.
*/
package com.linkedin.cubert.functions.builtin;
import com.linkedin.cubert.block.BlockSchema;
import com.linkedin.cubert.block.ColumnType;
import com.linkedin.cubert.block.DataType;
import com.linkedin.cubert.functions.Function;
import com.linkedin.cubert.operator.PreconditionException;
import com.linkedin.cubert.operator.PreconditionExceptionType;
import org.apache.pig.data.Tuple;
import java.io.IOException;
import java.util.HashSet;
import java.util.Set;
/**
* Function that uses a HashSet to check if the input column has been seen before.
*
* @author Maneesh Varshney
*/
public class IsDistinct extends Function
{
private final Set<Object> set = new HashSet<Object>();
@Override
public Object eval(Tuple tuple) throws IOException
{
Object obj = tuple.get(0);
boolean isDistinct = !set.contains(obj);
if (isDistinct)
set.add(obj);
return isDistinct;
}
@Override
public ColumnType outputSchema(BlockSchema inputSchema) throws PreconditionException
{
if (inputSchema.getNumColumns() != 1)
{
throw new PreconditionException(PreconditionExceptionType.INVALID_SCHEMA,
"IsDistinct function takes one argument only");
}
return new ColumnType("", DataType.BOOLEAN);
}
}