package folioxml.core; import folioxml.slx.ISlxTokenReader; import folioxml.slx.SlxToken; import java.io.File; import java.io.IOException; import java.io.Writer; import java.nio.charset.Charset; import java.nio.file.Files; import java.nio.file.Paths; import java.util.HashMap; import java.util.Iterator; import java.util.Map; import java.util.Map.Entry; public class FolioToSlxDiagnosticTool implements ISlxTokenReader { /** * To verify that the markup is being converted correctly, we need some diagnostic tools. * <p> * At translation level - * Output a file containing pairs of SlxToken and FolioToken? objects. * - This should help us spot anomolies in tag formatting, and understand the diversity of incoming markup. Adding frequency data would also be helpful. * - Output a table of character, and entity use (with frequency). This should help up spot potential character encoding issues. * <p> * After Transformation - * A table of all unique Tag, Entity, and Comment SlxTokens? and their frequencies. Grouping by (tag|entity|comment) would be good. * Also, the original names -> new CSS names table would be useful. * <p> * I thought I would need it today, but I won't until next Monday. SlxToken objects have a FolioToken? property - this should make it very easy to build the tables. * Both offer .toString(). * Your diagnostic table builder should work as an ISlxTokenReader than wraps another ISlxTokenReader. This will allow it to wrap SlxTranslatingReader?. * It should also have methods for processing a record. * Two instances will be needed to monitor both SLX Valid and SLX Transitional. * You can get the original names->css names from the SlxTokenReader?.cssCleaner instance. * Skip comments for now - I know there are 200,000 unique comments in the infobase. We gotta filter out PID and KPN comments first... * Add/Change #42 (Diagnostic tools) **/ // table contains pairs private HashMap<Pair<String, String>, Long> tagTablePair = null; private HashMap<Pair<String, String>, Long> entityTablePair = null; //TODO: add comments back in //skipping comments for now //private HashMap<Pair<String,String>,Long> commentTablePair = null; public FolioToSlxDiagnosticTool(ISlxTokenReader slxTokenReader) { this.reader = slxTokenReader; tagTablePair = new HashMap<Pair<String, String>, Long>(); entityTablePair = new HashMap<Pair<String, String>, Long>(); //skipping comments for now // commentTablePair = new HashMap<Pair<String,String>,Long>(); } public void outputDataFiles(String filename) throws InvalidMarkupException { exportTableMapToFile(tagTablePair, filename); } private boolean exportTableMapToFile(Map<Pair<String, String>, Long> map, String fileName) throws InvalidMarkupException { System.out.println(fileName); StringBuilder sb = new StringBuilder(); Iterator<Entry<Pair<String, String>, Long>> iterator = null; iterator = map.entrySet().iterator(); while (iterator.hasNext()) { Entry<Pair<String, String>, Long> nextPair = iterator.next(); sb.append(nextPair.getKey().getFirst()); sb.append(" "); sb.append(nextPair.getKey().getSecond()); sb.append(" "); SlxToken t = new SlxToken(nextPair.getKey().getSecond()); sb.append(t.getTagName()); sb.append(" "); String ft = nextPair.getKey().getFirst(); if (ft != null && ft.length() > 3) { if (ft.charAt(1) == '/') ft = ft.substring(2, 4); else ft = ft.substring(1, 3); } sb.append(ft); sb.append(" "); sb.append(nextPair.getValue()); //sb.append("|"); sb.append("\n"); } if (!new File(fileName).getParentFile().exists()) new File(fileName).getParentFile().mkdir(); Writer fw = null; try { fw = Files.newBufferedWriter(Paths.get(fileName), Charset.forName("UTF-8")); fw.write(sb.toString()); fw.close(); } catch (IOException iox) { iox.printStackTrace(); fw = null; return false; } return true; } protected ISlxTokenReader reader; //protected FolioSlxTranslator translator = new FolioSlxTranslator(); /** * Reads the next translated token from the stream * * @return * @throws java.io.IOException * @throws folioxml.core.InvalidMarkupException */ public SlxToken read() throws IOException, InvalidMarkupException { SlxToken st = reader.read(); if (st == null) return null; if (!"record".equals(st.getTagName())) { //classify token type and add new pair or increment pair frequency if (st.isTag()) { //create FolioToken & SlxToken String pair Pair<String, String> pair = new Pair<String, String>(st.sourceToken.text, st.toString()); tagTablePair.put(pair, (tagTablePair.containsKey(pair) ? tagTablePair.get(pair) + 1 : 1)); } else if (st.isEntity()) { //entityTablePair.put(pair, (entityTablePair.containsKey(pair) ? entityTablePair.get(pair) + 1 : 1)); } else if (st.isComment()) { // skipping comments for now // commentTablePair.put(pair, (commentTablePair.containsKey(pair) ? commentTablePair.get(pair) + 1 : 1)); } else if (st.isTextOrEntity()) { //ignore this branch } else { //this should never happen (I think?) throw new InvalidMarkupException("Error in SlxToken " + "FolioToken:" + st.sourceToken.text + "\t\tSlxToken:" + st.toString()); } } //System.err.println("FolioToken:" + st.sourceToken.text +"\nSLXToken:" +st.toString()); return st; } public boolean canRead() { return reader.canRead(); } /** * Closes the underlying reader */ public void close() throws IOException { reader.close(); reader = null; } }