CssClassCleaner.java example

Explorer
folioxml-master
- commandline
  - src
    - folioxml
      - command
        Main.java
      - export
        ExportRunner.java
  - testsrc
    - folioxml
      - export
        TestExportRunner.java
- contrib
  - folioxml-lucene
    - src
      - folioxml
        export
        plugins
        ResolveHyperlinks.java
        lucene
        FieldCollector.java
        IndexFieldOpts.java
        IndexFieldOptsProvider.java
        InfobaseFieldOptsSet.java
        InfobaseSetIndexer.java
        analysis
        AnalyzerPicker.java
        DynamicAnalyzer.java
        ListAnalyzer.java
        ListTokenizer.java
        LowercaseKeywordAnalyzer.java
        folio
        FolioEnuAnalyzer.java
        FolioEnuPhraseAnalyzer.java
        FolioEnuTokenizer.java
        LookAroundCharTokenizer.java
        TokenCombiner.java
        folioQueryParser
        QueryParser.java
        QueryToken.java
        QueryTokenReader.java
    - testsrc
      - apache
        lucene
        CharTokenizer.java
      - folioxml
        directexport
        SimultaneousTest.java
        lucene
        analysis
        folio
        TokenCombinerTest.java
        folioQueryParser
        QueryParserTest.java
        tests
        Indexer.java
- core
  - folioxml
- diff_match_patch
  - oldtest
    - name
      - fraser
        neil
        plaintext
        diff_match_patch_test.java
  - src
    - name
      - fraser
        neil
        plaintext
        diff_match_patch.java
package folioxml.css;

import folioxml.core.InvalidMarkupException;
import folioxml.core.Pair;
import folioxml.core.TokenBase;
import folioxml.slx.SlxRecord;
import folioxml.slx.SlxToken;
import folioxml.xml.NodeFilter;
import folioxml.xml.NodeList;

import java.util.*;
import java.util.regex.Pattern;

/**
 * Don't use this at the same time as SlxTranslator, since this will process tokens before SlxTranslator has finished adding the class names... (They're subsequent tags)
 *
 * @author nathanael
 */
public class CssClassCleaner {

    public CssClassCleaner() {
    }


    /**
     * Map of namespace -> Map of 'originalName'.lowerCaseCultureEnglish -> (newName, originalName)
     */
    public Map<String, Map<String, Pair<String, String>>> dict = new HashMap<String, Map<String, Pair<String, String>>>();
    /**
     * Map of namespace -> Map of 'newName'.lowerCaseCultureEnglish -> (originalName)
     */
    public Map<String, Map<String, String>> reverseDict = new HashMap<String, Map<String, String>>();


    /**
     * Map of namespace -> Collection of newName.lowerCaseCultureEnglish for conflict checking.
     */
    public Map<String, HashSet<String>> valueDict = new HashMap<String, HashSet<String>>();

    /**
     * Valid CSS names (and now, XML IDs) (A subset of the specification, since some browsers don't support all the spec)
     */
    protected static Pattern pName = Pattern.compile("^[_a-zA-Z][_a-zA-Z0-9-]*$");
    //Removed leading dash. It wasn't allowed in XML IDs, and we need this multi-purpose. Was ^(-)?[_a-zA-Z][_a-zA-Z0-9-]*$

	/* XML 1.0 rev 5 spec for IDs
	 * [4]   	NameStartChar	   ::=   	":" | [A-Z] | "_" | [a-z] | [#xC0-#xD6] | [#xD8-#xF6] | [#xF8-#x2FF] | [#x370-#x37D] | [#x37F-#x1FFF] | [#x200C-#x200D] | [#x2070-#x218F] | [#x2C00-#x2FEF] | [#x3001-#xD7FF] | [#xF900-#xFDCF] | [#xFDF0-#xFFFD] | [#x10000-#xEFFFF]
	  [4a]   	NameChar	   ::=   	NameStartChar | "-" | "." | [0-9] | #xB7 | [#x0300-#x036F] | [#x203F-#x2040]
	 */

    /**
     * Provides a CSS-compliant version of the specified identifier.
     * It is suggested that you process the infobase header <style-def/> tags first, since the
     * additional information they contain can assist in making more intelligent naming
     * choices in the case of conflicts. (Such as Normal-highlighter).
     * Appending a random 4-digit hex value such as "-f92a" is a last resort.
     * <p>
     * It is important to use the same CssClassCleaner instance across the entire infobase to maintain consistency (or save and restore the name table).
     * The name table should be preserved so translation back to FFF can be performed.
     * Storage in the root infobase record is a good idea...
     *
     * @param name   The class name to sanitize
     * @param parent The token this class name originated from. Helps choose names in the case of conflicts.
     * @return
     * @throws InvalidMarkupException
     */
    public String cleanId(String name, String namespace, boolean throwExceptionIfDuplicate) throws InvalidMarkupException {
        //Namespaces are separated
        Map<String, Pair<String, String>> mappings = dict.get(namespace);
        if (mappings == null) {
            mappings = new HashMap<String, Pair<String, String>>();
            dict.put(namespace, mappings);
        }

        Map<String, String> reverseMappings = reverseDict.get(namespace);
        if (reverseMappings == null) {
            reverseMappings = new HashMap<String, String>();
            reverseDict.put(namespace, reverseMappings);
        }

        HashSet<String> values = valueDict.get(namespace);
        if (values == null) {
            values = new HashSet<String>();
            valueDict.put(namespace, values);
        }
		
		/*CSS identifiers
		-?[_a-z]|{nonascii}|{escape}([_a-z0-9-]|{nonascii}|{escape})*
		
		Simplified:
		-?[_a-zA-z][_a-zA-Z0-9-]*
		
		
		Folio names are case-insensitive, but preserve case.
		
		Do browsers limit length? Test this! We have names that are probably longer than 255.
	 	*/
        String lowerName = name.toLowerCase(Locale.ENGLISH);


        //TODO: What if there is both a character style named "Normal" and a link style named "Normal"? This needs the intelligence to know which Normal variant to use when the <span> or <link> tag is reached. Style-def isn't enough
        //What if character-style, field, and highlighter have overlapping names?
        //The map doesn't add new conflicts, but what if they already exist between style types?

        //First check if 'name' exists in the mappings.
        Pair<String, String> result = mappings.get(lowerName);

        //If so, return precomputed result.
        if (result != null) {
            if (throwExceptionIfDuplicate)
                throw new InvalidMarkupException("Duplicate mapping for (" + lowerName + ") encountered: " + result.getFirst() + " -> " + result.getSecond() + ". Please rename character styles, highlighters, and fields to use unique names; they cannot overlap in CSS.");
            return result.getFirst();
        }

        //If 'name' is valid anyways, cache to the Map
		/* Optmization causes problem when run twice on the same css class
		 * if (pName.matcher(name).matches()) {
		 
			
			
			mappings.put(lowerName, new Pair<String,String>(name,name));
			values.add(lowerName);
			return name; //Nothing to do - already a valid name.
		}else{*/
        String sanitizedName = sanitizeString(name);
        //Sanitize
        String newName = sanitizedName;
        String lowerNewName = newName.toLowerCase(Locale.ENGLISH);

        //Check for conflicts. Attempt style-def naming
			/*if (values.contains(lowerNewName) && parent != null && parent.matches("style-def")){
				//Append -type if present on parent and valid.
				String type = parent.get("type");
				if (type != null && pName.matcher(type).matches()){
					newName = sanitizedName + "-" + type;
					lowerNewName = newName.toLowerCase(Locale.ENGLISH);
				}
				
			}*/
        boolean foundConflict = false;
        //Check for conflicts. Generate 4-digit hex suffix.
        while (values.contains(lowerNewName)) {
            newName = sanitizedName + "-" + Integer.toHexString(new Random().nextInt(256 * (256 - 16)) + (256 * 16));
            lowerNewName = newName.toLowerCase(Locale.ENGLISH);
            foundConflict = true;
        }
        if (foundConflict) {
            for (Pair<String, String> pair : mappings.values()) {
                if (pair.getFirst().equalsIgnoreCase(sanitizedName)) {
                    System.out.println("Renaming \"" + name + "\" to \"" + newName + "\" to avoid conflicting with a similar Folio Style " + pair.getSecond());
                    break;
                }
            }
        }

        //Add mapping
        mappings.put(lowerName, new Pair<String, String>(newName, name));
        values.add(lowerNewName);
        reverseMappings.put(lowerNewName, name);

        //Return new name
        return newName;
        //}
    }

    private String sanitizeString(String name) {
        //Remove all characters that don't match the regex.
        //When removing spaces, and the next character is lowercase, uppercase it.
        StringBuilder sb = new StringBuilder(name.length());

        boolean lastCharDelimiter = false;

        for (int i = 0; i < name.length(); i++) {
            char c = name.charAt(i);

            //Allow [_a-zA-Z] as the first character
            // Aug 12. Removed hyphen allowance for XML ID compat
            //And the remainder allow [_a-zA-Z0-9-]
            boolean valid = isCharValid(c, i > 0, true, true, i > 0);

            if (valid) {
                //Uppercase lowercase letters following a space.
                if (lastCharDelimiter && sb.length() > 0 && sb.charAt(sb.length() - 1) != '_') {
                    sb.append('_');
                }
                //Keep the character
                sb.append(c);
            }
            lastCharDelimiter = (c == ' ' || c == '.' || c == ',' || c == '+' || c == '/' || c == '\\' || c == '#' || c == '%' || c == '(' || c == ':');
        }
        return sb.toString();
    }

    private boolean isCharValid(char c, boolean allowHyphen, boolean allowUnderline, boolean allowAZ, boolean allowNumbers) {
        if (c == '-' && allowHyphen) return true;
        if (c == '_' && allowUnderline) return true;
        if (((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z')) && allowAZ) return true;
        if (allowNumbers && (c >= '0' && c <= '9')) return true;
        return false;
    }

    public void process(SlxToken t) throws InvalidMarkupException {
        process(t, getNamespace(t), getPrefix(t), false);
    }

    public void process(TokenBase t, String namespace, String prefix, boolean throwExceptionIfDuplicate) throws InvalidMarkupException {
        if (!t.isTag()) return; //Only tags have attributes

        //Requires SLX valid.

        //For paragraphs, spans, links...
        String s = t.get("class");
        if (s != null) {
            String ns = cleanId(prefix + s, namespace, throwExceptionIfDuplicate);
            if (!s.equals(ns)) t.set("class", ns);
        }

        //For <bookmarks name="" and <link jumpdestination=""
        //For objects and bookmarks, use hash instead - don't CSS clean.
    }

    /**
     * Must be called before .process().  TODO: Resolve replaceDefiniton=true first
     *
     * @param r
     * @throws InvalidMarkupException
     */
    public void processRootRecord(SlxRecord r) throws InvalidMarkupException {
        //Index style-def tags only. Then repeat and get the rest.
        for (TokenBase t : r.getTokens()) {
            if (t.isTag() && t.matches("style-def")) {
                process(t, getNamespace(t), getPrefix(t), true);
                //disabled at one point because of Neil's style class name changed quick fix was to disable
            }
        }

    }

    /**
     * objects, bookmarks, and popups are not processed. They are hashed.
     *
     * @param t
     * @return
     * @throws InvalidMarkupException
     */
    private String getNamespace(TokenBase t) throws InvalidMarkupException {
		/* Namespaces
		 * Character styles
			Link Styles
			Paragraph Styles
			Level Styles
			Highlighter Styles
			Field Styles
		 */
        String type = t.get("type");


        if (t.matches("p|paragraph-attribute") || (t.matches("style-def") && "paragraph".equalsIgnoreCase(type)))
            return "paragraph";
        if (t.matches("record|record-attribute") || (t.matches("style-def") && "level".equalsIgnoreCase(type)))
            return "level";
        if (t.matches("link|popupLink|a") || (t.matches("style-def") && "link".equalsIgnoreCase(type)))
            return "link"; //Correct for the 'class' attribute, but not for 'objectName'.

        //if (t.matches("span|style-def") && "character-style".equalsIgnoreCase(type)) return "character-style";
        //if (t.matches("span|style-def") && "highlighter".equalsIgnoreCase(type)) return "highlighter";

        //All other span tags are fields.
        //if (t.matches("span") || (t.matches("style-def") && TokenUtils.fastMatches("text|date|time|integer|decimal", type))) return "field";


        return "span";
    }

    private String getPrefix(TokenBase t) throws InvalidMarkupException {
		/* Namespaces
		 * Character styles
			Link Styles
			Paragraph Styles
			Level Styles
			Highlighter Styles
			Field Styles
		 */
        String type = t.get("type");
        if (t.matches("span|style-def") && "character-style".equalsIgnoreCase(type)) return "cs_";
        if (t.matches("span|style-def") && "highlighter".equalsIgnoreCase(type)) return "hl_";


        //All other span tags are fields.
        //if (t.matches("span") || (t.matches("style-def") && TokenUtils.fastMatches("text|date|time|integer|decimal", type))) return "";

        return "";
    }

    /**
     * Returns the original name for the specified token based on the class attribute and tag name
     *
     * @param t
     * @param cssClass
     * @return
     * @throws InvalidMarkupException
     */
    public String findOriginalName(SlxToken t) throws InvalidMarkupException {
        return findOriginalName(getNamespace(t), t.get("class"));
    }

    /**
     * Returns the original name for the specified cssClass use the token specified to determine the namespace.
     *
     * @param t
     * @param cssClass
     * @return
     * @throws InvalidMarkupException
     */
    public String findOriginalName(SlxToken t, String cssClass) throws InvalidMarkupException {
        return findOriginalName(getNamespace(t), cssClass);
    }

    /**
     * Returns the original name for the specified cssClass
     *
     * @param t
     * @param cssClass
     * @return
     * @throws InvalidMarkupException
     */
    public String findOriginalName(String namespace, String cssClass) throws InvalidMarkupException {
        if (cssClass == null) return null;
        Map<String, String> mappings = reverseDict.get(namespace);
        if (mappings != null) {
            String s = mappings.get(cssClass.toLowerCase(Locale.ENGLISH));
            //if (pair == null)  return cssClass;
            //WARNING: TERRIBLE THING TO DO.... WILL BReAK EVERYTHING
            if (s == null)
                throw new InvalidMarkupException("Cannot find original css name for " + cssClass + " (in " + namespace + " namespace)");
            return s;//return pair.getSecond();
        }
        return null;
    }


    /**
     * NOT IMPLEMENTED Saves the mappings to the specified record. Verify that root.level = "root".
     * Builds a map of String->SlxToken (<style-def> tags).
     * If a <style-def /> doesn't exist for the mapping, insert it after the last style-def.
     * originalName = "" attribute is where the original names are stored.
     *
     * @param root
     * @throws InvalidMarkupException
     */
    public void saveTo(SlxRecord root) throws InvalidMarkupException {

        // TODO: We need this for later - For converting back, and possible for reference
        for (String namespace : dict.keySet()) {
            for (String key : dict.get(namespace).keySet()) {
                Pair<String, String> pair = dict.get(namespace).get(key);
                SlxToken t = new SlxToken("<mapping from=\"" + pair.getSecond() + "\" to=\"" + pair.getFirst() + "\" namespace=\"" + namespace + "\" />");
                root.write(t);
                //System.out.println(t.toTokenString());
            }
        }


    }

    public void loadFrom(NodeList nodes) throws InvalidMarkupException {
        //Index style-def tags only. Then repeat and get the rest.
        ///System.out.print(nodes.toXmlString(true));
        nodes = nodes.searchOuter(new NodeFilter("mapping"));
        for (TokenBase t : nodes.list()) {
            String namespace = t.get("namespace");
            //Namespaces are separated
            Map<String, Pair<String, String>> mappings = dict.get(namespace);
            if (mappings == null) {
                mappings = new HashMap<String, Pair<String, String>>();
                dict.put(namespace, mappings);
            }
            Map<String, String> reverseMappings = reverseDict.get(namespace);
            if (reverseMappings == null) {
                reverseMappings = new HashMap<String, String>();
                reverseDict.put(namespace, reverseMappings);
            }

            HashSet<String> values = valueDict.get(namespace);
            if (values == null) {
                values = new HashSet<String>();
                valueDict.put(namespace, values);
            }
            String lowerName = t.get("from").toLowerCase(Locale.ENGLISH);

            mappings.put(lowerName, new Pair<String, String>(t.get("from"), t.get("to")));
            reverseMappings.put(t.get("to").toLowerCase(Locale.ENGLISH), t.get("from"));
            values.add(lowerName);

        }

        assert (nodes.count() > 1);
    }


}