/**
* Copyright (C) 2010-2017 Gordon Fraser, Andrea Arcuri and EvoSuite
* contributors
*
* This file is part of EvoSuite.
*
* EvoSuite is free software: you can redistribute it and/or modify it
* under the terms of the GNU Lesser General Public License as published
* by the Free Software Foundation, either version 3.0 of the License, or
* (at your option) any later version.
*
* EvoSuite is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with EvoSuite. If not, see <http://www.gnu.org/licenses/>.
*/
package org.evosuite.utils;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Queue;
import java.util.Set;
import java.util.regex.Pattern;
import org.jgrapht.DirectedGraph;
import org.jgrapht.alg.CycleDetector;
import org.jgrapht.graph.DefaultDirectedGraph;
import org.jgrapht.graph.DefaultEdge;
import org.jgrapht.traverse.TopologicalOrderIterator;
import dk.brics.automaton.Automaton;
import dk.brics.automaton.RegExp;
import dk.brics.automaton.State;
import dk.brics.automaton.Transition;
/**
* Class used to define the distance between a string and a regex
*/
public class RegexDistanceUtils {
/*
 * Automatons for regex can be expensive to build. So we cache them,
 * as we might need to access them several times during the search.
 *
 * Both caches are keyed by the regex string exactly as passed in by the caller
 * (ie before expansion) and are filled together by cacheRegex():
 *  - regexStateCache:     the automaton states in topological order
 *  - regexAutomatonCache: the automaton itself
 *
 * The references are never reassigned, hence final.
 */
private static final Map<String, List<State>> regexStateCache = new HashMap<String, List<State>>();
private static final Map<String, Automaton> regexAutomatonCache = new HashMap<String, Automaton>();
/**
 * Get the automaton for the given regex, building and caching it
 * the first time the regex is seen.
 *
 * @param regex the regular expression
 * @return the automaton stored in the cache for this regex
 */
public static Automaton getRegexAutomaton(String regex) {
    return getAndCacheAutomaton(regex);
}
/**
 * Get the shortest example string reported by the automaton as accepted
 * for the given regex. The automaton is built and cached on first use.
 *
 * @param regex the regular expression
 * @return the result of {@code getShortestExample(true)} on the cached automaton
 */
public static String getRegexInstance(String regex) {
    final Automaton cached = getAndCacheAutomaton(regex);
    return cached.getShortestExample(true);
}
/**
 * Get the shortest example string reported by the automaton as rejected
 * for the given regex. The automaton is built and cached on first use.
 *
 * @param regex the regular expression
 * @return the result of {@code getShortestExample(false)} on the cached automaton
 */
public static String getNonMatchingRegexInstance(String regex) {
    final Automaton cached = getAndCacheAutomaton(regex);
    return cached.getShortestExample(false);
}
/**
 * An edge of the distance graph. It stores the origin of the edge
 * (row and automaton state), the kind of edit operation it stands for
 * and the cost of taking it. Instances are immutable.
 */
private static class GraphTransition {

    /**
     * The kind of operation represented by a transition.
     */
    public enum TransitionType {
        INSERTION,
        DELETION,
        REPLACEMENT,
        /**
         * A phantom transition is an artificial transition from the sink/final states to a single artificial sink/state.
         * This is used to simplify the recursion calculation of the subpath costs.
         */
        PHANTOM
    }

    /** Cost of following this transition. */
    public final double cost;

    /** Row in which this transition starts. */
    public final int fromRow;

    /** Automaton state in which this transition starts. */
    public final State fromState;

    /** Kind of operation this transition represents. */
    public final TransitionType type;

    public GraphTransition(double cost, int fromRow, State fromState, TransitionType type) {
        this.type = type;
        this.fromState = fromState;
        this.fromRow = fromRow;
        this.cost = cost;
    }
}
/**
 * Normalize x with the formula {@code x / (x + 1)}.
 * For a non-negative finite x the result lies in [0,1).
 *
 * @param x the value to normalize
 * @return the normalized value
 */
private static double normalize(double x) {
    final double denominator = x + 1.0;
    return x / denominator;
}
/**
 * Java regular expressions contain predefined character classes and other
 * constructs which the automaton regex parser cannot handle. This method
 * rewrites the regex into an equivalent (or approximated) form:
 * <ul>
 * <li>the predefined classes \d \D \s \S \w \W are expanded into explicit
 * character classes;</li>
 * <li>embedded flag expressions such as (?i) are dropped;</li>
 * <li>a leading ^ anchor and a trailing, non-escaped $ anchor are dropped;</li>
 * <li>reluctant quantifiers are turned into their greedy counterparts.</li>
 * </ul>
 *
 * @param regex the Java regular expression
 * @return the rewritten regular expression
 */
public static String expandRegex(String regex) {
    /*
     * All substitutions below are literal, so String.replace is used.
     * With String.replaceAll a backslash in the replacement text escapes the
     * following character: a replacement written as "\t" would be inserted as a
     * plain 't', turning the whitespace class into "[ tnfr]". For the same
     * reason the whitespace characters are inserted as real characters and not
     * as backslash escapes.
     */
    final String whitespace = " \t\n\u000B\f\r";

    // . Any character (may or may not match line terminators)
    // \d A digit: [0-9]
    String newRegex = regex.replace("\\d", "[0-9]");
    // \D A non-digit: [^0-9]
    newRegex = newRegex.replace("\\D", "[^0-9]");
    // \s A whitespace character: space, tab, newline, vertical tab, form feed, carriage return
    newRegex = newRegex.replace("\\s", "[" + whitespace + "]");
    // \S A non-whitespace character: [^\s]
    newRegex = newRegex.replace("\\S", "[^" + whitespace + "]");
    // \w A word character: [a-zA-Z_0-9]
    newRegex = newRegex.replace("\\w", "[a-zA-Z_0-9]");
    // \W A non-word character: [^\w]
    newRegex = newRegex.replace("\\W", "[^a-zA-Z_0-9]");

    // TODO: Some of these should be handled, not just ignored!
    // Flags go first, so that an anchor following a leading flag group,
    // eg "(?i)^abc$", is still recognized as being at the start.
    newRegex = removeFlagExpressions(newRegex);

    if (newRegex.startsWith("^")) {
        newRegex = newRegex.substring(1);
    }
    if (endsWithUnescapedDollar(newRegex)) {
        newRegex = newRegex.substring(0, newRegex.length() - 1);
    }

    newRegex = removeReluctantOperators(newRegex);
    return newRegex;
}

/**
 * Check whether the regex ends with a $ that acts as an anchor, ie a $ that
 * is not escaped. The $ is escaped when it is preceded by an odd number of
 * consecutive backslashes.
 */
private static boolean endsWithUnescapedDollar(String regex) {
    if (!regex.endsWith("$")) {
        return false;
    }
    int precedingBackslashes = 0;
    for (int i = regex.length() - 2; i >= 0 && regex.charAt(i) == '\\'; i--) {
        precedingBackslashes++;
    }
    return precedingBackslashes % 2 == 0;
}

/**
 * Remove the embedded single-flag expressions from the regex.
 * The flags are ignored, not emulated.
 *
 * @param regex the regular expression
 * @return the regex without the flag expressions
 */
protected static String removeFlagExpressions(String regex) {
    // Case insensitive
    regex = regex.replace("(?i)", "");
    // Unix lines mode
    regex = regex.replace("(?d)", "");
    // Permit comments and whitespace in pattern
    regex = regex.replace("(?x)", "");
    // Multiline mode
    regex = regex.replace("(?m)", "");
    // Dotall
    regex = regex.replace("(?s)", "");
    // Unicode case
    regex = regex.replace("(?u)", "");
    return regex;
}

/**
 * Replace the reluctant quantifiers +? *? and ?? with + * and ? respectively.
 *
 * @param regex the regular expression
 * @return the regex with greedy quantifiers only
 */
protected static String removeReluctantOperators(String regex) {
    regex = regex.replace("+?", "+");
    regex = regex.replace("*?", "*");
    regex = regex.replace("??", "?");
    return regex;
}
/**
 * Make sure that every row from 0 to numRows (inclusive) exists and has an
 * entry for the given state. Missing rows and missing transition sets are
 * created empty; existing ones are left untouched.
 *
 * @param transitions the row / state / incoming transitions structure to complete
 * @param state the state that must be present in each row
 * @param numRows index of the last row to consider
 */
private static void ensureState(
        Map<Integer, Map<State, Set<GraphTransition>>> transitions, State state,
        int numRows) {
    int row = 0;
    while (row <= numRows) {
        if (!transitions.containsKey(row)) {
            transitions.put(row, new HashMap<State, Set<GraphTransition>>());
        }
        final Map<State, Set<GraphTransition>> statesInRow = transitions.get(row);
        if (!statesInRow.containsKey(state)) {
            statesInRow.put(state, new HashSet<GraphTransition>());
        }
        row++;
    }
}
/**
 * Build the automaton for the given regex and store it in the caches,
 * together with its states sorted in topological order. Both cache entries
 * are keyed by the regex as given, not by its expanded form.
 *
 * @param regex the regular expression to cache
 */
private static void cacheRegex(String regex) {
    final Automaton automaton = new RegExp(expandRegex(regex), RegExp.NONE).toAutomaton();
    automaton.expandSingleton();

    // We convert this to a graph without self-loops and cycles in order to determine the topological order
    final DirectedGraph<State, DefaultEdge> acyclicGraph = new DefaultDirectedGraph<State, DefaultEdge>(
            DefaultEdge.class);
    final Set<State> expanded = new HashSet<State>();
    final Queue<State> pending = new LinkedList<State>();
    pending.add(automaton.getInitialState());

    while (!pending.isEmpty()) {
        final State source = pending.poll();
        if (!expanded.add(source)) {
            // this state was already expanded
            continue;
        }
        // no effect if the vertex is already in the graph
        acyclicGraph.addVertex(source);

        for (Transition transition : source.getTransitions()) {
            final State target = transition.getDest();
            if (target.equals(source)) {
                // self-loops are left out
                continue;
            }
            acyclicGraph.addVertex(target);
            acyclicGraph.addEdge(source, target);
            pending.add(target);

            // Need to get rid of back edges, otherwise there is no topological order!
            final CycleDetector<State, DefaultEdge> detector = new CycleDetector<State, DefaultEdge>(
                    acyclicGraph);
            if (detector.detectCycles()) {
                acyclicGraph.removeEdge(source, target);
            }
        }
    }

    final List<State> topologicalOrder = new ArrayList<State>();
    final TopologicalOrderIterator<State, DefaultEdge> orderIterator = new TopologicalOrderIterator<State, DefaultEdge>(
            acyclicGraph);
    while (orderIterator.hasNext()) {
        topologicalOrder.add(orderIterator.next());
    }

    regexStateCache.put(regex, topologicalOrder);
    regexAutomatonCache.put(regex, automaton);
}
/**
 * <p>
 * Get the distance between the arg and the given regex.
 * All operations (insertion/deletion/replacement) cost 1.
 * There is no assumption on where and how the operations
 * can be done (ie all sequences are valid).
 * </p>
 *
 * <p>
 * If the regex is reported as not supported, the distance is
 * instead 0 when the arg matches the regex and 1 otherwise.
 * </p>
 *
 * @param arg the string to evaluate
 * @param regex the regular expression
 * @return the distance
 */
public static int getStandardDistance(String arg, String regex) {
    if (isSupportedRegex(regex)) {
        final RegexGraph graph = new RegexGraph(arg, regex);
        return new CostMatrix().calculateStandardCost(graph);
    }
    return getDefaultDistance(arg, regex);
}
/**
 * Coarse distance based on java.util.regex only:
 * 0 if the whole arg matches the regex, 1 otherwise.
 *
 * @param arg the string to evaluate
 * @param regex the regular expression
 * @return 0 on a full match, 1 otherwise
 */
private static int getDefaultDistance(String arg, String regex) {
    return Pattern.matches(regex, arg) ? 0 : 1;
}
/**
 * Determine whether the regex requires features that are
 * not supported by the regex automaton library.
 * Currently the only construct checked for is the \b sequence.
 *
 * @param regex the regular expression
 * @return false if the regex contains \b, true otherwise
 */
private static boolean isSupportedRegex(String regex) {
    return !regex.contains("\\b");
}
/**
 * <p>Get the distance between the arg and the given regex.
 * Insertion/deletion cost 1, whereas replacement is in [0,1] depending
 * on the actual character values. </p>
 *
 * <p> Note: the distance is tailored for the <b>StringAVM</b> algorithm,
 * in which characters are only inserted/appended at the end.</p>
 *
 * <p>As in {@link #getStandardDistance(String, String)}, a regex that is
 * reported as not supported is not turned into an automaton: the distance
 * is then 0 when the arg matches the regex and 1 otherwise.</p>
 *
 * @param arg the string to evaluate
 * @param regex the regular expression
 * @return the distance
 */
public static double getDistanceTailoredForStringAVM(String arg, String regex) {
    // Same guard as getStandardDistance: the automaton built for an unsupported
    // regex would not represent the regex, and the distance would be misleading.
    if (!isSupportedRegex(regex)) {
        return getDefaultDistance(arg, regex);
    }
    RegexGraph graph = new RegexGraph(arg, regex);
    CostMatrix matrix = new CostMatrix();
    return matrix.calculateCostForStringAVM(graph);
}
/**
 * Get the automaton representing the regex. The first time a regex is
 * requested the automaton is built and stored in the cache.
 *
 * @param regex the regular expression
 * @return the automaton stored in the cache for this regex
 */
protected static Automaton getAndCacheAutomaton(String regex) {
    final boolean alreadyCached = regexAutomatonCache.containsKey(regex);
    if (!alreadyCached) {
        // first time we see this regex: create the automaton representing it
        cacheRegex(regex);
    }
    return regexAutomatonCache.get(regex);
}
/**
 * A graph created based on an "arg" that is matched against a "regex".
 * There is going to be arg.length+1 copies of the regex automaton.
 * Each copy represents a "row".
 * Each automaton state, in topological order, represents a "column";
 * one extra column is added at the end for an artificial sink state.
 * The graph can be considered as a "rows"x"columns" matrix.
 *
 * @author arcuri
 *
 */
private static class RegexGraph {

    /** row -> state -> transitions entering that state in that row */
    private Map<Integer, Map<State, Set<GraphTransition>>> transitions;

    /** column index -> state */
    private Map<Integer, State> intToStateMap;

    /** state -> column index */
    private Map<State, Integer> stateToIntMap;

    /**
     * Build the graph
     *
     * @param arg the string to match
     * @param regex the regular expression
     */
    public RegexGraph(String arg, String regex) {
        transitions = createGraph(arg, regex);
    }

    public int getNumberOfRows() {
        return transitions.size();
    }

    public int getNumberOfColumns() {
        return stateToIntMap.size();
    }

    /**
     * Get all the incoming transitions to the node located at coordinate "row" and "column"
     *
     * @param row the row of the node
     * @param column the column of the node
     * @return the transitions entering the node
     */
    public Set<GraphTransition> getIncomingTransitions(int row, int column) {
        final Map<State, Set<GraphTransition>> statesInRow = transitions.get(row);
        return statesInRow.get(intToStateMap.get(column));
    }

    public int getColumn(State state) {
        return stateToIntMap.get(state);
    }

    /**
     * Cost of aligning the given character with an automaton transition:
     * 0 if the character lies in the range accepted by the transition,
     * otherwise the normalized distance to the closest end of that range.
     */
    private static double replacementCost(char actual, Transition edge) {
        final boolean inRange = actual >= edge.getMin() && actual <= edge.getMax();
        if (inRange) {
            return 0.0;
        }
        final int toMin = Math.abs(actual - edge.getMin());
        final int toMax = Math.abs(actual - edge.getMax());
        return normalize(Math.min(toMin, toMax));
    }

    private Map<Integer, Map<State, Set<GraphTransition>>> createGraph(String arg, String regex) {
        /*
         * Create a graph to calculate the distance. The algorithm is based on what discussed in:
         *
         * Mohammad Alshraideh and Leonardo Bottaci
         * Search-based software test data generation for string data using program-specific search operators
         * http://neo.lcc.uma.es/mase/attachments/085_TestDataGenerationForStringData.pdf
         *
         * and
         *
         * EUGENE W. MYERS and WEBB MILLER
         * APPROXIMATE MATCHING OF REGULAR EXPRESSIONS
         * http://www.cs.mun.ca/~harold/Courses/Old/Ling6800.W06/Diary/reg.aprox.pdf
         */
        final Automaton automaton = getAndCacheAutomaton(regex);
        final List<State> columnOrder = regexStateCache.get(regex);
        final int lastRow = arg.length();

        final Map<Integer, Map<State, Set<GraphTransition>>> graph = new HashMap<Integer, Map<State, Set<GraphTransition>>>();
        intToStateMap = new HashMap<Integer, State>();
        stateToIntMap = new HashMap<State, Integer>();

        int column = 0;
        for (State source : columnOrder) {
            /*
             * Init data structure to quickly map/access state/index
             */
            stateToIntMap.put(source, column);
            intToStateMap.put(column, source);
            column++;

            for (Transition edge : source.getTransitions()) {
                final State target = edge.getDest();
                ensureState(graph, target, lastRow);

                /*
                 * in every row, add an insertion edge from source to target within the same row
                 */
                for (int row = 0; row <= lastRow; row++) {
                    graph.get(row).get(target).add(
                            new GraphTransition(1.0, row, source, GraphTransition.TransitionType.INSERTION));
                }

                /*
                 * Add a replacement edge from source in row to target in row+1.
                 * If the char at position "row" is accepted by this transition, this is a zero-cost edge.
                 *
                 * Important: even if the cost is 0 (eg match on the arg/regex in which we replace char X with X), we CANNOT
                 * use a PHANTOM transition. Even if we do not replace anything, we still need to consider it as a replacement
                 * transition. Consider the case
                 *
                 * "ac".matches("abc")
                 *
                 * If we used a phantom transition to represent the alignment c/c, then it would be possible to insert 'b' in the
                 * middle of "abc". On the other hand, if we use a replacement c/c, then inserting 'b' would not be allowed, as an
                 * insertion cannot be followed by a replacement.
                 */
                for (int row = 0; row < lastRow; row++) {
                    final double cost = replacementCost(arg.charAt(row), edge);
                    graph.get(row + 1).get(target).add(
                            new GraphTransition(cost, row, source, GraphTransition.TransitionType.REPLACEMENT));
                }
            }

            ensureState(graph, source, lastRow);
            /*
             * add a deletion edge with cost 1 from source to the same state in the next row
             */
            for (int row = 0; row < lastRow; row++) {
                graph.get(row + 1).get(source).add(
                        new GraphTransition(1.0, row, source, GraphTransition.TransitionType.DELETION));
            }
        }

        // Add zero-cost transitions from accepting states to an artificial final state, in the last row
        final State sink = new State();
        ensureState(graph, sink, lastRow);
        final Set<GraphTransition> intoSink = graph.get(lastRow).get(sink);
        for (State candidate : automaton.getStates()) {
            if (candidate.isAccept()) {
                intoSink.add(new GraphTransition(0, lastRow, candidate, GraphTransition.TransitionType.PHANTOM));
            }
        }
        // the sink takes the last column
        intToStateMap.put(column, sink);
        stateToIntMap.put(sink, column);

        return graph;
    }
}
/**
 * Class used to calculate the cost, ie the actual distance, based on a RegexGraph.
 *
 * <p>The costs are computed cell by cell, row after row and, within a row,
 * column after column. Every cell starts at the maximum distance
 * ({@code Double.MAX_VALUE}) and is lowered by its incoming transitions.
 * A transition whose origin is a cell of the same row at a later column
 * (ie an insertion along an edge that goes backwards in the column order,
 * which happens for loops in the automaton) therefore finds the maximum
 * distance in its origin and has no effect.</p>
 *
 * @author arcuri
 */
private static class CostMatrix {

    /** Path made only of deletions */
    private static final int DEL = 0;
    /** Path made of deletions followed by replacements */
    private static final int REP = 1;
    /** Path as above, and then followed by insertions */
    private static final int INS = 2;

    public CostMatrix() {
        super();
    }

    /**
     * Calculate the distance in which every operation has cost 1
     * (transition costs are rounded up with Math.ceil).
     *
     * @param graph the graph built for an arg/regex pair
     * @return the cost found in the sink node (last row, last column), rounded to int
     */
    public int calculateStandardCost(RegexGraph graph) {
        final int ROWS = graph.getNumberOfRows();
        final int COLUMNS = graph.getNumberOfColumns();

        final double[][] matrix = new double[ROWS][COLUMNS];
        /*
         * Unless a cell is explicitly updated, it has maximum distance.
         * This needs to be done before any cell is computed: a cell can be read
         * (through an insertion edge going backwards in the column order) before
         * its own turn comes, and the default 0.0 of a new array would then be
         * taken as a real, too small, cost.
         */
        for (double[] row : matrix) {
            Arrays.fill(row, Double.MAX_VALUE);
        }

        // First row is cost of matching empty sequence on regex
        final int FIRST_ROW = 0;

        /*
         * init first starting state with 0 costs
         */
        matrix[FIRST_ROW][0] = 0;

        //look at first row (which is special)
        for (int col = 1; col < COLUMNS; col++) {
            double min = Double.MAX_VALUE;
            for (GraphTransition t : graph.getIncomingTransitions(FIRST_ROW, col)) {
                int otherCol = graph.getColumn(t.fromState);
                //self transition
                if (col == otherCol) {
                    continue;
                }
                double otherCost = matrix[FIRST_ROW][otherCol];
                min = Math.min(min, getSubPathCost(otherCost, Math.ceil(t.cost)));
            }
            matrix[FIRST_ROW][col] = min;
        }

        //then look at the other rows
        for (int i = 1; i < ROWS; i++) {
            for (int col = 0; col < COLUMNS; col++) {
                for (GraphTransition t : graph.getIncomingTransitions(i, col)) {
                    final double otherCost = matrix[t.fromRow][graph.getColumn(t.fromState)];
                    final double candidate;
                    if (t.type.equals(GraphTransition.TransitionType.PHANTOM)) {
                        /*
                         * artificial transition to final/sink state, so just take same values as previous state
                         */
                        candidate = otherCost;
                    } else {
                        candidate = getSubPathCost(otherCost, Math.ceil(t.cost));
                    }
                    matrix[i][col] = Math.min(matrix[i][col], candidate);
                }
            }
        }

        double min = matrix[ROWS - 1][COLUMNS - 1];
        return (int) Math.round(min);
    }

    /**
     * Note: this is different from normal matching algorithms, as we enforce an order
     * among the operators: delete, replace and then insert.
     *
     * @param graph the graph built for an arg/regex pair
     * @return the minimum among the costs of the three path types in the sink node
     */
    public double calculateCostForStringAVM(RegexGraph graph) {
        final int ROWS = graph.getNumberOfRows();
        final int COLUMNS = graph.getNumberOfColumns();

        /*
         * we create a matrix based on each row and each column in the graph.
         * Each cell has 3 values, each representing the cost of three different types of path:
         *
         * 0) only deletion
         * 1) deletions followed by replacement
         * 2) as above, and then followed by insertions
         */
        final double[][][] matrix = new double[ROWS][COLUMNS][3];

        /*
         * unless a path is explicitly updated, it will have maximum distance by default.
         * This is done for the whole matrix up front, as a cell can be read (through an
         * insertion edge going backwards in the column order) before its own turn comes:
         * the default 0.0 of a new array would then be taken as a real, too small, cost.
         */
        for (double[][] row : matrix) {
            for (double[] cell : row) {
                Arrays.fill(cell, Double.MAX_VALUE);
            }
        }

        calculateInsertionCostOnFirstRow(graph, matrix);

        for (int i = 1; i < ROWS; i++) {
            for (int col = 0; col < COLUMNS; col++) {
                final double[] cell = matrix[i][col];

                for (GraphTransition t : graph.getIncomingTransitions(i, col)) {
                    final double[] other = matrix[t.fromRow][graph.getColumn(t.fromState)];

                    switch (t.type) {
                    case INSERTION: {
                        assert t.fromRow == i;
                        /*
                         * if we have an insertion, only the insertion path can be continued.
                         * that's the reason why we only update [INS].
                         * An insertion can continue any type of path (and so all types are read from the origin).
                         */
                        cell[INS] = Math.min(cell[INS], getSubPathCost(other[DEL], t.cost));
                        cell[INS] = Math.min(cell[INS], getSubPathCost(other[REP], t.cost));
                        cell[INS] = Math.min(cell[INS], getSubPathCost(other[INS], t.cost));
                        break;
                    }
                    case REPLACEMENT: {
                        /*
                         * if we have a replacement, then we cannot continue a delete path.
                         * So, [DEL] is not updated.
                         * A replacement can continue a delete or replace path, but not an insertion one
                         * (and so only [DEL] and [REP] are read from the origin)
                         */
                        final double viaDel = getSubPathCost(other[DEL], t.cost);
                        final double viaRep = getSubPathCost(other[REP], t.cost);
                        final double best = Math.min(viaDel, viaRep);
                        cell[REP] = Math.min(cell[REP], best);
                        /*
                         * from this state on, an insertion path can be followed, with same cost as replacement path
                         */
                        cell[INS] = Math.min(cell[INS], best);
                        break;
                    }
                    case DELETION: {
                        /*
                         * deletion can only follow a deletion path (so only [DEL] is read from the origin).
                         * but, from this state on, any new path can be followed (so all types are updated)
                         */
                        final double viaDel = getSubPathCost(other[DEL], t.cost);
                        cell[DEL] = Math.min(cell[DEL], viaDel);
                        cell[REP] = Math.min(cell[REP], viaDel);
                        cell[INS] = Math.min(cell[INS], viaDel);
                        break;
                    }
                    case PHANTOM: {
                        assert t.cost == 0;
                        /*
                         * artificial transition to final/sink state, so just take same values as previous state
                         */
                        cell[DEL] = Math.min(cell[DEL], other[DEL]);
                        cell[REP] = Math.min(cell[REP], other[REP]);
                        cell[INS] = Math.min(cell[INS], other[INS]);
                        break;
                    }
                    default:
                        break;
                    }
                }
            }
            /*
             * TODO: The algorithm of Myers's paper, at page 12, makes a distinction between D and E transitions.
             * Insertions of type E are done last. Not fully clear if it has an effect here: ie, recall that
             * here we do minimization (calculate distance) and not maximization (similarity)
             */
        }

        /*
         * get the minimum among the 3 different paths in the sink state
         */
        double min = Double.MAX_VALUE;
        for (double value : matrix[ROWS - 1][COLUMNS - 1]) {
            if (value < min) {
                min = value;
            }
        }
        return min;
    }

    /**
     * Sum the cost of a path and the cost of a transition extending it.
     * Double.MAX_VALUE is used as "unreachable": if either value is
     * Double.MAX_VALUE, or if the sum overflows, Double.MAX_VALUE is returned.
     *
     * @param previousStateCost cost of the path so far
     * @param transitionCost cost of the transition
     * @return the cost of the extended path, at most Double.MAX_VALUE
     * @throws IllegalArgumentException if any of the two costs is negative
     */
    private double getSubPathCost(double previousStateCost, double transitionCost) throws IllegalArgumentException {
        if (previousStateCost < 0) {
            throw new IllegalArgumentException("previousStateCost cannot be negative: " + previousStateCost);
        }
        if (transitionCost < 0) {
            throw new IllegalArgumentException("transitionCost cannot be negative: " + transitionCost);
        }

        if (previousStateCost == Double.MAX_VALUE || transitionCost == Double.MAX_VALUE) {
            return Double.MAX_VALUE;
        }

        double sum = previousStateCost + transitionCost;
        if (Double.isInfinite(sum)) {
            /*
             * overflow: a sum of doubles does not wrap around, it becomes infinite
             */
            return Double.MAX_VALUE;
        }

        return sum;
    }

    /**
     * First row is special, ie very different from the others
     *
     * @param graph the graph built for an arg/regex pair
     * @param matrix the cost matrix, whose first row is filled by this method
     */
    private void calculateInsertionCostOnFirstRow(RegexGraph graph, final double[][][] matrix) {
        // First row is cost of matching empty sequence on regex
        final int FIRST_ROW = 0;

        /*
         * init first starting state with 0 costs
         */
        matrix[FIRST_ROW][0][DEL] = 0;
        matrix[FIRST_ROW][0][REP] = 0;
        matrix[FIRST_ROW][0][INS] = 0;

        for (int col = 1; col < graph.getNumberOfColumns(); col++) {
            double min = Double.MAX_VALUE;

            for (GraphTransition t : graph.getIncomingTransitions(FIRST_ROW, col)) {
                /*
                 * on first row, there can be only insertions coming from the same row,
                 * apart from last node that can have a phantom transition to sink state
                 */
                assert t.type.equals(GraphTransition.TransitionType.INSERTION)
                        || t.type.equals(GraphTransition.TransitionType.PHANTOM);
                assert t.fromRow == 0;

                int otherCol = graph.getColumn(t.fromState);
                //self transition
                if (col == otherCol) {
                    continue;
                }
                double otherCost = matrix[FIRST_ROW][otherCol][INS];
                min = Math.min(min, getSubPathCost(otherCost, t.cost));
            }

            /*
             * as there can be only insertions, the delete and replace paths cannot be followed, and
             * so maximum distance
             */
            matrix[FIRST_ROW][col][DEL] = Double.MAX_VALUE;
            matrix[FIRST_ROW][col][REP] = Double.MAX_VALUE;
            matrix[FIRST_ROW][col][INS] = min;
        }
    }
}
}