package arkref.parsestuff;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.lang.ArrayUtils;
import org.apache.commons.lang.NotImplementedException;
/**
* Do a regex substitution on a string and remember its alignments to the original.
*
* This class represents a string ready to have such a computation, as well as its result.
* The result can have further operations done to it and it maintains references
* to the original string. Thus chaining is supported.
*
* Alignments are represented as an integer array parallel to the new string, each one
* being the position in the original. This is only really true for fragments of the string
* which were present in the original. New fragments from the substitution are all assigned
* to be the original starting position.
*
* Capturing groups are not supported in the replacement.
*
* Example:
*
* AlignedSub cleanText = new AlignedSub(text).replaceAll("<\\S+>","");
* AlignedSub better = cleanText.replaceAll("\\s+"," ").replaceAll(":)","[HAPPY]");
*
* Also run main() for more examples.
*
* @author Brendan O'Connor (http://anyall.org)
*
*/
public class AlignedSub {
public String text;
/** parallel to 'text'. **/
public int[] alignments = null;
public AlignedSub(String s) {
text = s;
}
public static AlignedSub selfAligned(String text) {
// has identity alignment
List <Integer> alignments = new ArrayList<Integer>();
AlignedSub as = new AlignedSub(text);
for (int i=0; i<text.length(); i++) alignments.add(i);
as.alignments = convert(alignments);
return as;
}
public String replace(CharSequence target, CharSequence replacement) {
throw new NotImplementedException();
}
public AlignedSub replaceAll(String regex, String replacement) {
Pattern p = Pattern.compile(regex);
return replaceAll(p, replacement);
}
public AlignedSub replaceAll(Pattern pattern, String replacement) {
AlignedSub as = replace(this.text, pattern, replacement, false);
if (this.alignments != null)
as.alignments = project(as.alignments, this.alignments);
return as;
}
public AlignedSub replaceFirst(String regex, String replacement) {
throw new NotImplementedException();
}
private static AlignedSub replace(String text, Pattern pattern, String replacement, boolean justOne) {
Matcher m = pattern.matcher(text);
boolean result = m.find();
if (!result) {
return selfAligned(text);
}
List <Integer> alignments = new ArrayList<Integer>();
StringBuffer sb = new StringBuffer();
int i = 0;
do {
sb.append( text.substring(i, m.start()) );
if (i < m.start()) {
do { alignments.add(i); } while (++i < m.start());
}
sb.append(replacement);
for (int j=0; j<replacement.length(); j++) {
alignments.add(m.start());
}
i = m.end();
if (justOne) break;
result = m.find();
} while (result);
sb.append( text.substring(i, text.length()));
do { alignments.add(i); } while (++i < text.length());
AlignedSub as = new AlignedSub(sb.toString());
as.alignments = convert(alignments);
return as;
}
/**
* pipe x through map. output is parallel to x.
* if map indices don't span the full range of x values, an error will happen.
* this is equivalent to R/Matlab/Python: map[x]
**/
public static int[] project(int[] x, int[] map) {
int[] ret = new int[x.length];
for (int i=0; i < ret.length; i++) {
ret[i] = map[x[i]];
}
return ret;
}
private static int[] convert(List<Integer> list) {
// true magic
return ArrayUtils.toPrimitive((Integer []) list.toArray(new Integer[0]));
}
public String toString() {
String s = "";
for (int i=0; i < text.length(); i++) {
s += String.format("%-2s", text.substring(i,i+1));
}
s += "\n";
for (int i=0; i < text.length(); i++) {
s += String.format("%-2d", alignments[i]);
}
return s;
}
public static void main(String[] args) {
AlignedSub s, s2;
s = selfAligned("hello world");
s2 = s.replaceAll("wor", "ZZ");
U.pl(s);
U.pl(s2);
s2 = s.replaceAll("w", "ABCDEFG");
U.pl(s2);
s2 = s.replaceAll("12345","");
U.pl(s2);
s2 = s.replaceAll("llo","");
U.pl(s2);
U.pl(s.replaceAll("h","H"));
s2 = s.replaceAll("h","H").replaceAll("e","E");
U.pl(s2);
s2 = s.replaceAll("hell","WOW").replaceAll("OWo","=======");
U.pl(s2);
}
}