package folioxml.core;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
public class TokenUtils {

    /**
     * Returns true if the specified string contains only [A-Za-z0-9-].
     * An empty string returns true.
     *
     * @param s the string to test; must not be null
     * @return true if every character is an ASCII letter, an ASCII digit or '-'
     */
    public static boolean isPlaintext(String s) {
        for (int i = 0; i < s.length(); i++) {
            char c = s.charAt(i);
            if (!(c == '-' || isAsciiLetterOrDigit(c))) return false;
        }
        return true;
    }

    /**
     * Returns true if the string is composed of whitespace [ \t\n\x0B\f\r] or is empty.
     *
     * @param s the string to test; must not be null
     * @return true if every character is one of the six listed whitespace characters
     */
    public static boolean isWhitespace(String s) {
        for (int i = 0; i < s.length(); i++) {
            char c = s.charAt(i);
            boolean allowed = c == ' '
                    || c == '\t'
                    || c == '\n'
                    || c == '\u000b'
                    || c == '\f'
                    || c == '\r';
            if (!allowed) return false;
        }
        return true;
    }

    /**
     * Returns true if the specified string contains only [A-Za-z0-9-] and |
     * An empty string returns true.
     *
     * @param s the string to test; must not be null
     * @return true if every character is an ASCII letter, an ASCII digit, '-' or '|'
     */
    protected static boolean isAlternation(String s) {
        for (int i = 0; i < s.length(); i++) {
            char c = s.charAt(i);
            if (!(c == '-' || c == '|' || isAsciiLetterOrDigit(c))) return false;
        }
        return true;
    }

    /**
     * Returns true if c is in [A-Za-z0-9].
     */
    private static boolean isAsciiLetterOrDigit(char c) {
        return (c >= 'A' && c <= 'Z')
                || (c >= 'a' && c <= 'z')
                || (c >= '0' && c <= '9');
    }

    /**
     * Tests whether s equals (case-insensitively) one of the '|'-separated alternatives in alt.
     * If s.length() == 0, false is always returned.
     *
     * @param alt a simple alternation such as "b|i|u"; must not be null
     * @param s   the candidate string; must not be null
     * @return true if s matches one whole alternative of alt, ignoring case
     */
    protected static boolean matchesAlternation(String alt, String s) {
        if (s.length() == 0) return false;
        //a = index of the '|' that opens the current alternative
        //b = index of the '|' that closes the current alternative
        int a = -1; //Imaginary | before the beginning of alt
        int b = -1;
        while (b < alt.length() - 1) {
            a = b;
            b = alt.indexOf('|', a + 1); //Find the next alternation |
            if (b < 0) b = alt.length(); //Imaginary | after the end of alt.
            //s.length() must exactly match the distance between a and b for a valid match.
            //An inequality here would let s match a mere prefix of an alternative.
            if (a + s.length() + 1 == b && alt.regionMatches(true, a + 1, s, 0, s.length())) {
                return true;
            }
        }
        return false;
    }

    /** Matches strings made only of [A-Za-z0-9-]. Not referenced by the methods of this class. */
    protected static Pattern pPlaintext = Pattern.compile("^[A-Za-z0-9\\-]*+$");
    /** Matches strings made only of [A-Za-z0-9-] and |. Not referenced by the methods of this class. */
    protected static Pattern pSimpleAlternation = Pattern.compile("^[A-Za-z0-9\\-\\|]*+$");

    /**
     * Returns true if the tag name (case-insensitive) matches the regex. Results, both successful
     * and failed, are cached per (regex, name) pair; on a cache miss the answer is computed by
     * {@link #fastMatchesNonCached(String, String)}.
     * <p>
     * The cache is keyed by the actual strings. An earlier version keyed it by
     * {@code name.hashCode() ^ ((regex.hashCode() ^ 0xf0f0f0f) >>> 32)}; since Java masks an int
     * shift distance to 5 bits, the shift was a no-op and the key was symmetric in its arguments
     * and constant whenever regex equalled name, so unrelated pairs returned each other's results.
     * <p>
     * Performance note kept from the original author: running the plaintext/alternation checks
     * before the cache lookup was measured to be slower than going to the cache first.
     * <p>
     * The cache is an unsynchronized, unbounded HashMap.
     *
     * @param regex the pattern to match against; null yields false
     * @param name  the tag name to test; null yields false
     * @return true if name matches regex, ignoring case
     */
    public static boolean fastMatches(String regex, String name) {
        if (name == null || regex == null) return false;
        HashMap<String, Boolean> resultsByName = cachedResults.get(regex);
        if (resultsByName == null) {
            resultsByName = new HashMap<String, Boolean>();
            cachedResults.put(regex, resultsByName);
        }
        Boolean cached = resultsByName.get(name);
        if (cached != null) return cached.booleanValue();
        //Never encountered before - compute and remember.
        boolean result = fastMatchesNonCached(regex, name);
        resultsByName.put(name, Boolean.valueOf(result));
        return result;
    }

    /** regex -> (name -> match result). Two lookups on cached String hashes, no key allocation. */
    private static final HashMap<String, HashMap<String, Boolean>> cachedResults =
            new HashMap<String, HashMap<String, Boolean>>();

    /**
     * Returns true if the tag name (case-insensitive) matches the regex. Tries a simple
     * case-insensitive compare first (when regex is plain text), then an alternation-sensitive
     * compare (when regex is only plain text and '|'), then a full case-insensitive regex.
     *
     * @param regex the pattern to match against; null yields false
     * @param name  the tag name to test; null yields false
     * @return true if name matches regex, ignoring case
     */
    public static boolean fastMatchesNonCached(String regex, String name) {
        if (name == null || regex == null) return false;
        if (isPlaintext(regex)) {
            return regex.equalsIgnoreCase(name); //Optimization - quick answer for 60% of cases
        }
        if (isAlternation(regex)) {
            return matchesAlternation(regex, name); //For the other 30%
        }
        return matchesCI(regex, name);
    }

    /**
     * Case-insensitive version of matches(). The whole of s must match the regex.
     *
     * @param regex the regular expression; compiled once and cached
     * @param s     the string to test
     * @return true if the entire string matches the regex, ignoring case
     */
    public static boolean matchesCI(String regex, String s) {
        Matcher m = getPatternCachedCI(regex).matcher(s);
        return m.matches();
    }

    /** Compiled case-insensitive patterns, keyed by their regex source. */
    private static final HashMap<String, Pattern> cachedPatterns = new HashMap<String, Pattern>(2000);

    /**
     * Returns the case-insensitive Pattern for the given regex, compiling it on first use and
     * reusing the compiled instance afterwards. The cache is an unsynchronized HashMap.
     *
     * @param regex the regular expression source
     * @return the compiled pattern (Pattern.CASE_INSENSITIVE)
     */
    public static Pattern getPatternCachedCI(String regex) {
        Pattern p = cachedPatterns.get(regex);
        if (p == null) {
            p = Pattern.compile(regex, Pattern.CASE_INSENSITIVE);
            cachedPatterns.put(regex, p);
        }
        return p;
    }

    /* XML spec
     * 4.1 Character and Entity References
     [Definition: A character reference refers to a specific character in the ISO/IEC 10646 character set, for example one not directly accessible from available input devices.]
     Character Reference
     [66] CharRef ::= '&#' [0-9]+ ';'
     | '&#x' [0-9a-fA-F]+ ';' [WFC: Legal Character]
     Well-formedness constraint: Legal Character
     Characters referred to using character references must match the production for Char.
     If the character reference begins with "&#x", the digits and letters up to the terminating ; provide a hexadecimal representation of the character's code point in ISO/IEC 10646. If it begins just with "&#", the digits up to the terminating ; provide a decimal representation of the character's code point.
     [Definition: An entity reference refers to the content of a named entity.] [Definition: References to parsed general entities use ampersand (&) and semicolon (;) as delimiters.] [Definition: Parameter-entity references use percent-sign (%) and semicolon (;) as delimiters.]
     Named entities:
     *[4] NameStartChar ::= ":" | [A-Z] | "_" | [a-z] | [#xC0-#xD6] | [#xD8-#xF6] | [#xF8-#x2FF] | [#x370-#x37D] | [#x37F-#x1FFF] | [#x200C-#x200D] | [#x2070-#x218F] | [#x2C00-#x2FEF] | [#x3001-#xD7FF] | [#xF900-#xFDCF] | [#xFDF0-#xFFFD] | [#x10000-#xEFFFF]
     [4a] NameChar ::= NameStartChar | "-" | "." | [0-9] | #xB7 | [#x0300-#x036F] | [#x203F-#x2040]
     *
     */

    /**
     * Decodes entity references (XML character refs and the five named XML entities; the HTML
     * list is not yet supported). Delegates to {@link #entityDecodeString(String)}.
     *
     * @param s the encoded attribute value; must not be null
     * @return the decoded value
     */
    public static String attributeDecode(String s) {
        return entityDecodeString(s);
    }

    /**
     * Decodes the name of an entity (the text between the ampersand and the semicolon) to its
     * value. Ex, "quot" decodes to a double quote, "#65" and "#x41" decode to "A".
     * Currently only supports XML 1.0 basic entities (char references and the 5 named, matched
     * case-insensitively). XHTML entities on todo list.
     *
     * @param s the entity name without its delimiters
     * @return the decoded text, or null if the entity is empty, unknown or malformed
     */
    private static String decodeEntityValue(String s) {
        if (s == null || s.length() == 0) return null;
        //Most common first (these are all the XML 1.0 entities)
        if (s.equalsIgnoreCase("apos")) return "'";
        if (s.equalsIgnoreCase("quot")) return "\"";
        if (s.equalsIgnoreCase("amp")) return "&";
        if (s.equalsIgnoreCase("lt")) return "<";
        if (s.equalsIgnoreCase("gt")) return ">";
        //Character references
        if (s.charAt(0) == '#' && s.length() > 1) {
            try {
                int codePoint = (s.charAt(1) == 'x')
                        ? Integer.parseInt(s.substring(2), 16)
                        : Integer.parseInt(s.substring(1));
                return new String(Character.toChars(codePoint));
            } catch (IllegalArgumentException invalid) {
                //Invalid entity: either the digits did not parse (NumberFormatException is a
                //subclass of IllegalArgumentException) or the number is not a valid Unicode code
                //point, which Character.toChars rejects. Either way the caller leaves it as-is.
                return null;
            }
        }
        //Named references
        //TODO: add support (separate class, hashtree lookup) for all XHTML entities in http://en.wikipedia.org/wiki/List_of_XML_and_HTML_character_entity_references
        return null;
    }

    /**
     * Decodes all entities found in the specified string. Unrecognized or malformed entities are
     * copied to the output unchanged. A space or another ampersand before the terminating
     * semicolon aborts the pending entity, which is then also copied unchanged.
     *
     * @param s the text to decode; must not be null
     * @return the decoded text
     */
    public static String entityDecodeString(String s) {
        StringBuilder sb = new StringBuilder(s.length());
        boolean insideEntity = false;
        int entityStart = 0; //Index of the '&' that opened the pending entity
        for (int i = 0; i < s.length(); i++) {
            char c = s.charAt(i);
            if (insideEntity) {
                if (c == ';') {
                    insideEntity = false;
                    String decoded = decodeEntityValue(s.substring(entityStart + 1, i));
                    if (decoded != null) {
                        sb.append(decoded);
                        continue; //We don't need to process the trailing semicolon.
                    }
                    //Invalid entity. Flush it out undecoded; the ';' itself is appended below.
                    //TODO: Add validation warning for invalid entities in SLX attributes.
                    sb.append(s, entityStart, i);
                } else if (c == ' ' || c == '&') {
                    //Flush that false entity out. No decoding. c is handled below.
                    //TODO: Add validation warning for invalid characters in SLX attributes.
                    sb.append(s, entityStart, i);
                    insideEntity = false;
                } else {
                    //Characters of a pending entity are buffered implicitly via entityStart.
                    continue;
                }
            }
            if (c == '&') {
                insideEntity = true;
                entityStart = i;
            } else {
                sb.append(c);
            }
        }
        //Flush last bit out if needed. A valid entity cannot cause this - the semicolon would finish it.
        if (insideEntity) sb.append(s, entityStart, s.length());
        return sb.toString();
    }

    /**
     * Appends the entity reference with the given name, i.e. an ampersand, the name and a
     * semicolon. Mirrors decodeEntityValue, which works on the same bare names.
     */
    private static void appendEntity(StringBuilder sb, String name) {
        sb.append('&').append(name).append(';');
    }

    /**
     * Encodes the 5 special XML characters > < " ' and & as named entities.
     * Delegates to {@link #entityEncode(String)}.
     *
     * @param s the raw attribute value; must not be null
     * @return the encoded value
     */
    public static String attributeEncode(String s) {
        return entityEncode(s);
    }

    /**
     * Encodes the 5 special XML characters > < " ' and & as the named entities
     * gt, lt, quot, apos and amp. Use lightEntityEncode for text bodies.
     * Every occurrence is encoded independently, so "<<" becomes two lt entities.
     *
     * @param s the raw text; must not be null
     * @return the encoded text
     */
    public static String entityEncode(String s) {
        StringBuilder sb = new StringBuilder(s.length() + 16);
        for (int i = 0; i < s.length(); i++) {
            char c = s.charAt(i);
            switch (c) {
                case '"':
                    appendEntity(sb, "quot");
                    break;
                case '\'':
                    appendEntity(sb, "apos");
                    break;
                case '<':
                    appendEntity(sb, "lt");
                    break;
                case '>':
                    appendEntity(sb, "gt");
                    break;
                case '&':
                    appendEntity(sb, "amp");
                    break;
                default:
                    sb.append(c);
            }
        }
        return sb.toString();
    }

    /**
     * Encodes the 2 special XML characters for body text: < and & (as the lt and amp entities).
     * Every occurrence is encoded independently, so "<<" becomes two lt entities.
     *
     * @param s the raw text; must not be null
     * @return the encoded text
     */
    public static String lightEntityEncode(String s) {
        StringBuilder sb = new StringBuilder(s.length() + 16);
        for (int i = 0; i < s.length(); i++) {
            char c = s.charAt(i);
            if (c == '<') appendEntity(sb, "lt");
            else if (c == '&') appendEntity(sb, "amp");
            else sb.append(c);
        }
        return sb.toString();
    }

    /**
     * Like {@link #lightEntityEncode(String)}, but treats each pair of adjacent '<' characters
     * as one bracket, so "<<" produces a single lt entity. A lone '<' also produces one.
     * <p>
     * NOTE(review): pairs are consumed left to right, so "<<<<" yields two lt entities. The
     * previous implementation collapsed a run of any length into a single entity, which dropped
     * brackets. This assumes "<<" is the Folio escape for one literal '<' - confirm against the
     * Folio flat-file format if callers rely on longer runs.
     *
     * @param s the raw Folio text; must not be null
     * @return the encoded text
     */
    public static String lightEntityEncodeAndConvertFolioBrackets(String s) {
        StringBuilder sb = new StringBuilder(s.length() + 16);
        //True when the previous character was a '<' that has been emitted and is still waiting
        //for its (optional) escape partner.
        boolean pairOpen = false;
        for (int i = 0; i < s.length(); i++) {
            char c = s.charAt(i);
            if (c == '<') {
                if (pairOpen) {
                    pairOpen = false; //Second half of "<<"; the bracket was already emitted.
                } else {
                    appendEntity(sb, "lt");
                    pairOpen = true;
                }
            } else {
                if (c == '&') appendEntity(sb, "amp");
                else sb.append(c);
                pairOpen = false;
            }
        }
        return sb.toString();
    }
}