/* (c) 2014 LinkedIn Corp. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not use
* this file except in compliance with the License. You may obtain a copy of the
* License at http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software distributed
* under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
* CONDITIONS OF ANY KIND, either express or implied.
*/
package com.linkedin.cubert.functions.builtin;
import java.util.regex.Pattern;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.pig.backend.executionengine.ExecException;
import org.apache.pig.backend.hadoop.executionengine.physicalLayer.expressionOperators.regex.CompiledAutomaton;
import org.apache.pig.backend.hadoop.executionengine.physicalLayer.expressionOperators.regex.CompiledRegex;
import org.apache.pig.backend.hadoop.executionengine.physicalLayer.expressionOperators.regex.RegexImpl;
import org.apache.pig.data.Tuple;
import com.linkedin.cubert.block.BlockSchema;
import com.linkedin.cubert.block.ColumnType;
import com.linkedin.cubert.block.DataType;
import com.linkedin.cubert.functions.Function;
import com.linkedin.cubert.operator.PreconditionException;
import com.linkedin.cubert.operator.PreconditionExceptionType;
/**
 * Built-in MATCHES function.
 * <p>
 * Evaluates whether the string in field 0 of the input tuple matches the regular
 * expression in field 1. The pattern is compiled once and reused for as long as the
 * pattern argument stays the same; it is recompiled if a different pattern shows up.
 * <p>
 * Two regex back ends are used: {@link CompiledAutomaton} for patterns that do not
 * appear to need java.util.regex-only syntax, and {@link CompiledRegex} (backed by
 * {@link Pattern}) for everything else.
 *
 * @author Rui Liu
 *
 */
public class Match extends Function
{
    private static final Log LOG = LogFactory.getLog(Match.class);

    /** Return code of {@link #determineBestRegexMethod}: use java.util.regex. */
    private static final int JAVA_UTIL_REGEX = 0;

    /** Return code of {@link #determineBestRegexMethod}: use dk.brics.automaton. */
    private static final int AUTOMATON = 1;

    // Intersection and subtraction (subtraction cannot be used w/o intersection),
    // reluctant and possessive quantifiers, anchors and "(?" constructs
    // are only possible in java.util.regex
    private static final String[] javaRegexOnly = { "&&", "??", "*?", "+?", "}?", "?+",
            "*+", "++", "}+", "^", "$", "(?" };

    // Characters that, when preceded by an unescaped backslash, force java.util.regex
    // (e.g. \d, \D, \s, back references \1..\9, \Q, \b, ...)
    private static final String JAVA_ONLY_ESCAPED_CHARS = "0123456789aexucQwWdDsSpPbBAGzZ";

    /** Matcher compiled from {@link #compiledPattern}; null until the first evaluation. */
    private RegexImpl regexImpl;

    /** The pattern string that {@link #regexImpl} was compiled from. */
    private String compiledPattern;

    /**
     * Matches field 0 of the tuple against the regular expression in field 1.
     *
     * @param tuple
     *            field 0 is the value to test, field 1 is the pattern; both are cast
     *            to String
     * @return the result of the match, or null if either the value or the pattern is
     *         null
     * @throws ExecException
     *             if a field cannot be read from the tuple
     */
    @Override
    public Object eval(Tuple tuple) throws ExecException
    {
        Object val = tuple.get(0);
        if (val == null)
        {
            return null;
        }

        String pattern = (String) tuple.get(1);
        if (pattern == null)
        {
            // A null pattern cannot be compiled; treat it like a null value.
            return null;
        }

        // Recompile only when the pattern differs from the cached one, so that a
        // constant pattern is still compiled exactly once.
        if (regexImpl == null || !pattern.equals(compiledPattern))
        {
            regexImpl = compile(pattern);
            compiledPattern = pattern;
        }

        return regexImpl.match((String) val, null);
    }

    /**
     * Compiles the pattern with the back end chosen by
     * {@link #determineBestRegexMethod}. If the automaton back end rejects the pattern
     * with an IllegalArgumentException, java.util.regex is used instead.
     *
     * @param pattern
     *            the regular expression; must not be null
     * @return a compiled matcher, never null
     */
    private RegexImpl compile(String pattern)
    {
        if (determineBestRegexMethod(pattern) == AUTOMATON)
        {
            try
            {
                return new CompiledAutomaton(pattern);
            }
            catch (IllegalArgumentException e)
            {
                LOG.debug("Got an IllegalArgumentException for Pattern: " + pattern);
                LOG.debug(e.getMessage(), e);
                LOG.debug("Switching to java.util.regex");
            }
        }
        return new CompiledRegex(Pattern.compile(pattern));
    }

    /**
     * This function determines the type of pattern we are working with. The return
     * value of the function determines the type we are expecting.
     *
     * @param pattern
     *            the regular expression to inspect
     * @return int, 0 means this is java.util.regex, 1 means this is dk.brics.automaton
     */
    private int determineBestRegexMethod(String pattern)
    {
        if (containsJavaOnlyToken(pattern) || containsNestedCharacterClass(pattern)
                || containsJavaOnlyEscape(pattern))
        {
            return JAVA_UTIL_REGEX;
        }
        return AUTOMATON;
    }

    /**
     * Tells whether the pattern contains an unescaped occurrence of one of the
     * {@link #javaRegexOnly} tokens.
     */
    private static boolean containsJavaOnlyToken(String pattern)
    {
        for (String token : javaRegexOnly)
        {
            // Scan right to left over every occurrence of the token.
            int from = pattern.length();
            while (from > 0)
            {
                int pos = pattern.lastIndexOf(token, from);
                if (pos < 0)
                {
                    break;
                }
                if (pos == 0)
                {
                    // Nothing can precede it, so it cannot be escaped.
                    return true;
                }
                int escapes = precedingEscapes(pattern, pos);
                if (escapes % 2 == 0)
                {
                    return true;
                }
                // Escaped occurrence: continue the search before its backslash run.
                from = pos - escapes;
            }
        }
        return false;
    }

    /**
     * Tells whether the pattern contains a complex union such as [a-m[n-z]], i.e. an
     * unescaped '[' that appears before the next unescaped ']' following an earlier
     * unescaped '['.
     */
    private static boolean containsNestedCharacterClass(String pattern)
    {
        int open = nextUnescaped(pattern, '[', 0);
        while (open != -1)
        {
            int close = nextUnescaped(pattern, ']', open);
            if (close == -1)
            {
                return false;
            }
            int nextOpen = nextUnescaped(pattern, '[', open + 1);
            if (nextOpen == -1)
            {
                return false;
            }
            if (nextOpen < close)
            {
                return true;
            }
            open = nextOpen;
        }
        return false;
    }

    /**
     * Tells whether the pattern contains an unescaped backslash followed by one of
     * {@link #JAVA_ONLY_ESCAPED_CHARS} (e.g. \d, \D, \s ...).
     */
    private static boolean containsJavaOnlyEscape(String pattern)
    {
        int pos = pattern.lastIndexOf('\\');
        while (pos != -1)
        {
            // pos is the last backslash of a run; with an even number of backslashes
            // before it, it is itself unescaped and escapes the character after it.
            int escapes = precedingEscapes(pattern, pos);
            if (escapes % 2 == 0 && pos + 1 < pattern.length()
                    && JAVA_ONLY_ESCAPED_CHARS.indexOf(pattern.charAt(pos + 1)) >= 0)
            {
                return true;
            }
            // Skip past the whole run of backslashes.
            int beforeRun = pos - (escapes + 1);
            pos = beforeRun >= 0 ? pattern.lastIndexOf('\\', beforeRun) : -1;
        }
        return false;
    }

    /**
     * Finds the first occurrence of ch at or after fromIndex that is not preceded by
     * an odd number of backslashes.
     *
     * @return the index of that occurrence, or -1 if there is none
     */
    private static int nextUnescaped(String pattern, char ch, int fromIndex)
    {
        int pos = pattern.indexOf(ch, fromIndex);
        while (pos != -1 && precedingEscapes(pattern, pos) % 2 == 1)
        {
            pos = pattern.indexOf(ch, pos + 1);
        }
        return pos;
    }

    /**
     * Counts the consecutive backslashes immediately before startIndex.
     *
     * @param pattern
     *            the regular expression
     * @param startIndex
     *            index of the character whose preceding backslashes are counted
     * @return the number of backslashes, 0 if startIndex is 0, or -1 if startIndex is
     *         negative
     */
    private static int precedingEscapes(String pattern, int startIndex)
    {
        if (startIndex < 0)
        {
            return -1;
        }
        int count = 0;
        for (int j = startIndex - 1; j >= 0 && pattern.charAt(j) == '\\'; j--)
        {
            count++;
        }
        return count;
    }

    /**
     * Validates the input schema and reports the output type.
     *
     * @param inputSchema
     *            schema of the function arguments; column 0 must be a STRING
     * @return a BOOLEAN column type
     * @throws PreconditionException
     *             if column 0 is not of type STRING
     */
    @Override
    public ColumnType outputSchema(BlockSchema inputSchema) throws PreconditionException
    {
        if (inputSchema.getColumnType(0).getType() != DataType.STRING)
        {
            throw new PreconditionException(PreconditionExceptionType.INVALID_SCHEMA,
                                            "MATCH function should be applied to string value");
        }
        return new ColumnType(null, DataType.BOOLEAN);
    }
}