package com.formulasearchengine.mathosphere.mathpd.text;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
import java.util.regex.Pattern;
/**
* Basic String Matching Algorithm based on the {@link BoyerMooreSearch}.
* All matches are found, including repetitions after the first match.
* <br />
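* Matching is case-insensitive. The tokenizer is deliberately simplistic:
* both texts are lower-cased and every run of characters other than
* {@code a-z} is collapsed into a single blank before they are compared.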
* <p>
* Description: <br/>
* Basic String Matching finds exact text matches with n-words, including
* repetitions after the first match.
*
* @author Vincent Stange
*/
public class BasicStringMatcher {

    /** Matches every run of characters outside {@code a-z}; compiled once and reused. */
    private static final Pattern NON_LETTERS = Pattern.compile("[^a-z]+");

    /* Number of consecutive words that make up one search pattern. */
    private final int minWordLength;

    /* Minimum character length of each pattern found to increase significance */
    private final int minPatternLength;

    /**
     * Creates a BasicStringMatcher with default parameters
     * (6 words per pattern, at least 12 characters per pattern).
     */
    public BasicStringMatcher() {
        this(6, 12);
    }

    /**
     * Creates a BasicStringMatcher.
     *
     * @param minWordLength    Number of words that form one pattern.
     * @param minPatternLength Minimum character length of each pattern found.
     */
    public BasicStringMatcher(int minWordLength, int minPatternLength) {
        this.minWordLength = minWordLength;
        this.minPatternLength = minPatternLength;
    }

    /**
     * Compares two texts and returns every match with the exact length
     * of x words - whereby x is the parameter minWordLength.
     * <p>
     * Both texts are normalized first (lower-cased, non-letters collapsed to a
     * single blank); all positions in the result refer to the normalized texts.
     * Each match is an array
     * [start text 1, end text 1, start text 2, end text 2, pattern length].
     *
     * @param text1 Text 1
     * @param text2 Text 2
     * @return ordered list of matches between text1 and text2
     * @throws Exception propagated from {@link BoyerMooreSearch}; presumably an
     *                   index-out-of-bounds problem when its alphabet is too small
     *                   (not verifiable from this class)
     */
    List<int[]> compare(String text1, String text2) throws Exception {
        // prepare our result
        BoyerMooreSearch bms = new BoyerMooreSearch();
        List<int[]> matches = new ArrayList<>();

        // a pattern needs at least one word; this also keeps the window bound below safe
        if (minWordLength < 1) {
            return matches;
        }

        // prepare text1 - simplistic tokenizer
        String[] split1 = normalizeString(text1).split(" ");
        // prepare text2
        char[] text2Array = normalizeString(text2).toCharArray();

        int startIdxA = 0; // our current pointer for document A
        int startIdxB = 0; // our current pointer for document B

        // "<=" so that the last complete window of minWordLength words is tested, too
        for (int i = 0; i <= split1.length - minWordLength; ) {
            // prepare pattern string: minWordLength words joined by single blanks
            StringBuilder sw = new StringBuilder();
            for (int wc = 0; wc < minWordLength; wc++) {
                if (wc > 0) {
                    sw.append(' ');
                }
                sw.append(split1[i + wc]);
            }
            char[] pattern = sw.toString().toCharArray();
            int patternLng = pattern.length;

            // search for each pattern (minimum requirement)
            if (patternLng >= minPatternLength) {
                // assumed to return the start offset of the hit in text2, or -1 if there is none
                startIdxB = bms.search(pattern, text2Array, startIdxB, 0);
                if (startIdxB != -1) {
                    matches.add(new int[] {startIdxA, startIdxA + patternLng,
                        startIdxB, startIdxB + patternLng, patternLng});
                    // jump over to the next words in docB after a complete match;
                    // i is not advanced, so repetitions of the same pattern are found as well
                    startIdxB += patternLng + 1;
                    continue;
                }
            }

            startIdxA += split1[i].length() + 1; // skip the word and its trailing blank
            startIdxB = 0; // start anew in docB
            i++; // at last just take the next word in docA
        }
        return matches;
    }

    /**
     * Finds all matches between the two texts and merges overlapping ones.
     * <p>
     * A match-array is
     * [start index text 1, end index text 1, start index text 2, end index text 2, length].
     *
     * @param text1 Text 1
     * @param text2 Text 2
     * @return list of matches.
     * @throws Exception propagated from {@link #compare(String, String)}
     */
    public List<int[]> getMatches(String text1, String text2) throws Exception {
        return reconcileOverlappings(compare(text1, text2));
    }

    /**
     * Are two matches overlapping in both documents?
     * <p>
     * This is a simplification that relies on the matches being ordered:
     * all positions in match2 are after match1 or at the same position,
     * so only the start of match2 is tested against the range of match1.
     *
     * @param m1 previous match
     * @param m2 succeeding match
     * @return do they overlap? [..[__]_]
     */
    boolean isOverlapping(int[] m1, int[] m2) {
        if (m1[0] <= m2[0] && m2[0] <= m1[1]) { // [..[__] in docA
            return m1[2] <= m2[2] && m2[2] <= m1[3]; // [..[__] in docB
        }
        return false;
    }

    /**
     * Lower-cases the text and replaces every run of characters other than
     * {@code a-z} with a single blank.
     * <p>
     * {@link Locale#ROOT} is used so that the result does not depend on the
     * default locale (e.g. Turkish, where 'I' would not become 'i').
     *
     * @param text1 text to normalize
     * @return normalized text
     */
    private String normalizeString(String text1) {
        return NON_LETTERS.matcher(text1.toLowerCase(Locale.ROOT)).replaceAll(" ");
    }

    /**
     * Merges overlapping matches and returns every unique match.
     * <p>
     * All entries of the input are considered, including the last one, and a
     * list with a single match is returned unchanged.
     *
     * @param tmpMatches Ordered list of (possibly overlapping) matches.
     * @return ordered list of merged matches; empty if the input is empty
     */
    List<int[]> reconcileOverlappings(final List<int[]> tmpMatches) {
        final List<int[]> resultMatches = new ArrayList<>();
        if (tmpMatches.isEmpty()) {
            return resultMatches;
        }

        int[] curMatch = tmpMatches.get(0);
        for (int i = 1; i < tmpMatches.size(); i++) {
            final int[] nextMatch = tmpMatches.get(i);
            if (isOverlapping(curMatch, nextMatch)) {
                // expand and mash up
                final int startA = Math.min(curMatch[0], nextMatch[0]);
                final int endA = Math.max(curMatch[1], nextMatch[1]);
                curMatch = new int[] {
                    startA,
                    endA,
                    Math.min(curMatch[2], nextMatch[2]),
                    Math.max(curMatch[3], nextMatch[3]),
                    endA - startA
                };
            } else {
                // save as a unique match and move on
                resultMatches.add(curMatch);
                curMatch = nextMatch;
            }
        }
        // save the last (possibly merged) match
        resultMatches.add(curMatch);
        return resultMatches;
    }

    /**
     * Compares two texts and returns a score that represents its similarity.
     * <p>
     * The score is the summed length of all raw matches divided by the length
     * of text1, capped at 1.0.
     *
     * @param text1 Text 1
     * @param text2 Text 2
     * @return similarity score, at most 1.0; 0.0 if text1 is empty
     * @throws Exception propagated from {@link #compare(String, String)}
     */
    double scoreSimilarity(String text1, String text2) throws Exception {
        // avoid 0 / 0.0 == NaN for an empty reference text
        if (text1.isEmpty()) {
            return 0.0;
        }
        final List<int[]> matches = compare(text1, text2);
        int sumMatchLength = 0;
        for (int[] match : matches) {
            sumMatchLength += match[4];
        }
        return Math.min((double) sumMatchLength / text1.length(), 1.0);
    }
}