/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.pig.backend.hadoop.executionengine.physicalLayer.expressionOperators.regex;
import java.io.Serializable;
import java.util.regex.Pattern;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.pig.backend.hadoop.executionengine.physicalLayer.expressionOperators.PORegexp;
/**
 * Placeholder {@link RegexImpl} that chooses the concrete regex implementation
 * the first time {@link #match(String, String)} is invoked.
 *
 * When the right-hand side (the pattern) is a constant, the pattern text is
 * inspected to decide between java.util.regex and dk.brics.automaton; when it
 * is not constant, a {@code NonConstantRegex} is used. The chosen
 * implementation is handed to the owning {@link PORegexp} via
 * {@code setImplementation}.
 */
public class RegexInit implements RegexImpl, Serializable {

    private static final long serialVersionUID = 1L;

    // Constructs that force java.util.regex:
    //  - intersection and subtraction (subtraction cannot be used w/o intersection)
    //  - reluctant and possessive quantifiers
    //  - the '^' and '$' characters and "(?" special groups
    private static final String[] javaRegexOnly = { "&&", "??", "*?", "+?", "}?",
            "?+", "*+", "++", "}+", "^", "$", "(?" };

    // Characters which, when they directly follow an unescaped backslash,
    // carry a java.util.regex specific meaning:
    //   1-9        back references
    //   0 x u c    octal / hex / unicode / control characters
    //   a e        bell / escape
    //   t n r f    tab / newline / carriage return / form feed
    //   Q          start of a quoted section
    //   w W d D s S p P   predefined and POSIX/unicode character classes
    //   b B A G z Z       boundary matchers
    private static final String JAVA_ONLY_ESCAPED_CHARS =
            "123456789" + "0xuc" + "ae" + "tnrf" + "Q" + "wWdDsSpP" + "bBAGzZ";

    // Operator that receives the concrete implementation once it is chosen.
    PORegexp regexop = null;
    // Not read or written by any method of this class.
    int side = -1;
    // True when the pattern (right-hand side) is known to be a constant.
    boolean rhsConstant = false;

    /**
     * @param regexoperator operator on which the selected implementation is
     *                      installed by {@link #match(String, String)}
     */
    public RegexInit(PORegexp regexoperator ) {
        this.regexop = regexoperator;
    }

    /**
     * Records whether the right-hand side expression is a constant.
     *
     * @param rhsConstant true if the pattern is constant and may be compiled once
     */
    public void setConstExpr( boolean rhsConstant ) {
        this.rhsConstant = rhsConstant;
    }

    /**
     * This function determines the type of pattern we are working with.
     * The return value of the function determines the type we are expecting.
     *
     * java.util.regex is selected as soon as the pattern contains an unescaped
     * java-only token, a character class opened inside another character
     * class, or a backslash escape with a java.util.regex specific meaning.
     *
     * @param pattern the regular expression text
     * @return int, 0 means this is java.util.regex,
     *              1 means this is dk.brics.automaton
     */
    private int determineBestRegexMethod( String pattern ) {
        if( containsUnescapedJavaOnlyToken(pattern)
                || containsNestedCharacterClass(pattern)
                || containsJavaOnlyEscape(pattern) ) {
            return 0;
        }
        return 1;
    }

    /**
     * Looks, from the end of the pattern backwards, for any entry of
     * {@link #javaRegexOnly} that is not escaped (i.e. preceded by an even
     * number of backslashes).
     */
    private boolean containsUnescapedJavaOnlyToken( String pattern ) {
        for( String token : javaRegexOnly ) {
            int pos = pattern.length();
            while( pos > 0 ) {
                pos = pattern.lastIndexOf(token, pos);
                if( pos == 0 ) {
                    // Nothing can precede it, so it cannot be escaped
                    return true;
                }
                if( pos > 0 ) {
                    int escapes = precedingEscapes(pattern, pos);
                    if( escapes % 2 == 0 ) {
                        return true;
                    }
                    // Escaped occurrence: continue searching before the
                    // run of backslashes
                    pos -= escapes;
                }
            }
        }
        return false;
    }

    /**
     * Determines if there are any complex unions in the pattern.
     * Complex unions are e.g. [a-m[n-z]]: an unescaped '[' that appears
     * before the unescaped ']' following the previous unescaped '['.
     */
    private boolean containsNestedCharacterClass( String pattern ) {
        int open = nextUnescaped(pattern, '[', 0);
        while( open != -1 ) {
            int close = nextUnescaped(pattern, ']', open);
            if( close == -1 ) {
                break;
            }
            int nextOpen = nextUnescaped(pattern, '[', open + 1);
            if( nextOpen == -1 ) {
                break;
            }
            if( nextOpen < close ) {
                return true;
            }
            open = nextOpen;
        }
        return false;
    }

    /**
     * Returns the index of the first occurrence of {@code ch} at or after
     * {@code from} that is not preceded by an odd number of backslashes,
     * or -1 if there is none.
     */
    private int nextUnescaped( String pattern, char ch, int from ) {
        int pos = pattern.indexOf(ch, from);
        // precedingEscapes yields -1 for a position of -1, which also ends the loop
        while( precedingEscapes(pattern, pos) % 2 == 1 ) {
            pos = pattern.indexOf(ch, pos + 1);
        }
        return pos;
    }

    /**
     * This is the case where we have complex regexes, e.g. \d, \D, \s...etc.
     * Walks the runs of backslashes from the end of the pattern; when a run
     * has odd length, its last backslash escapes the following character,
     * which is then checked against {@link #JAVA_ONLY_ESCAPED_CHARS}.
     *
     * \t, \n, \r and \f are part of that set: dk.brics.automaton documents a
     * backslash followed by any character as that literal character, so those
     * escapes would not keep their java.util.regex meaning there.
     */
    private boolean containsJavaOnlyEscape( String pattern ) {
        int slash = pattern.lastIndexOf('\\');
        while( slash != -1 ) {
            int escapes = precedingEscapes(pattern, slash);
            if( escapes % 2 == 0 && (slash + 1) < pattern.length()
                    && JAVA_ONLY_ESCAPED_CHARS.indexOf(pattern.charAt(slash + 1)) >= 0 ) {
                return true;
            }
            // We skip past all the escapes of this run
            int before = slash - (escapes + 1);
            slash = ( before >= 0 ) ? pattern.lastIndexOf('\\', before) : -1;
        }
        return false;
    }

    /**
     * Counts the consecutive backslashes immediately before {@code startIndex}.
     *
     * @param pattern    the regular expression text
     * @param startIndex position whose preceding backslashes are counted
     * @return the number of backslashes directly before startIndex,
     *         0 when startIndex is 0, and -1 when startIndex is negative
     */
    private int precedingEscapes( String pattern, int startIndex ) {
        if( startIndex > 0 ) {
            // An odd count means the character at startIndex is itself escaped
            int precedingEscapes = 0;
            for(int j = startIndex - 1; j >= 0; j-- ) {
                if( pattern.charAt(j) == '\\' ) {
                    precedingEscapes++;
                } else {
                    break;
                }
            }
            return precedingEscapes;
        } else if ( startIndex == 0 ) {
            return 0;
        }
        return -1;
    }

    /**
     * Builds the implementation for a constant pattern. A dk.brics.automaton
     * candidate falls back to java.util.regex when the automaton constructor
     * rejects the pattern with an IllegalArgumentException.
     *
     * @param pattern the regular expression text
     * @return the compiled implementation
     */
    private RegexImpl compile( String pattern ) {
        RegexImpl impl = null;
        int regexMethod = determineBestRegexMethod(pattern);
        switch( regexMethod ) {
        case 0:
            impl = new CompiledRegex(Pattern.compile(pattern));
            break;
        case 1:
            try {
                impl = new CompiledAutomaton(pattern);
            } catch( IllegalArgumentException e ) {
                Log log = LogFactory.getLog(getClass());
                log.debug("Got an IllegalArgumentException for Pattern: " + pattern );
                log.debug(e.getMessage());
                log.debug("Switching to java.util.regex" );
                impl = new CompiledRegex(Pattern.compile(pattern));
            }
            break;
        default:
            break;
        }
        return impl;
    }

    /**
     * Selects the implementation (compiled when the pattern is constant,
     * {@code NonConstantRegex} otherwise), installs it on the operator and
     * delegates this match to it.
     *
     * @param lhs the value to test
     * @param rhs the regular expression
     * @return the result of the selected implementation's match
     */
    @Override
    public boolean match(String lhs, String rhs) {
        RegexImpl impl = null;
        if( rhsConstant ) {
            impl = compile( rhs );
        } else {
            impl = new NonConstantRegex();
        }
        this.regexop.setImplementation( impl );
        return impl.match(lhs, rhs);
    }
}