FolioToSlxDiagnosticTool.java example

Explorer

folioxml-master
- commandline
  - src
    - folioxml
      - command
        Main.java
      - export
        ExportRunner.java
  - testsrc
    - folioxml
      - export
        TestExportRunner.java
- contrib
  - folioxml-lucene
    - src
      - folioxml
        export
        plugins
        ResolveHyperlinks.java
        lucene
        FieldCollector.java
        IndexFieldOpts.java
        IndexFieldOptsProvider.java
        InfobaseFieldOptsSet.java
        InfobaseSetIndexer.java
        analysis
        AnalyzerPicker.java
        DynamicAnalyzer.java
        ListAnalyzer.java
        ListTokenizer.java
        LowercaseKeywordAnalyzer.java
        folio
        FolioEnuAnalyzer.java
        FolioEnuPhraseAnalyzer.java
        FolioEnuTokenizer.java
        LookAroundCharTokenizer.java
        TokenCombiner.java
        folioQueryParser
        QueryParser.java
        QueryToken.java
        QueryTokenReader.java
    - testsrc
      - apache
        lucene
        CharTokenizer.java
      - folioxml
        directexport
        SimultaneousTest.java
        lucene
        analysis
        folio
        TokenCombinerTest.java
        folioQueryParser
        QueryParserTest.java
        tests
        Indexer.java
- core
  - folioxml
- diff_match_patch
  - oldtest
    - name
      - fraser
        neil
        plaintext
        diff_match_patch_test.java
  - src
    - name
      - fraser
        neil
        plaintext
        diff_match_patch.java

package folioxml.core;

import folioxml.slx.ISlxTokenReader;
import folioxml.slx.SlxToken;

import java.io.File;
import java.io.IOException;
import java.io.Writer;
import java.nio.charset.Charset;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.Map.Entry;


public class FolioToSlxDiagnosticTool implements ISlxTokenReader {


    /**
     * To verify that the markup is being converted correctly, we need some diagnostic tools.
     * <p>
     * At translation level -
     * Output a file containing pairs of SlxToken and FolioToken? objects.
     * - This should help us spot anomolies in tag formatting, and understand the diversity of incoming markup. Adding frequency data would also be helpful.
     * - Output a table of character, and entity use (with frequency). This should help up spot potential character encoding issues.
     * <p>
     * After Transformation -
     * A table of all unique Tag, Entity, and Comment SlxTokens? and their frequencies. Grouping by (tag|entity|comment) would be good.
     * Also, the original names -> new CSS names table would be useful.
     * <p>
     * I thought I would need it today, but I won't until next Monday. SlxToken objects have a FolioToken? property - this should make it very easy to build the tables.
     * Both offer .toString().
     * Your diagnostic table builder should work as an ISlxTokenReader than wraps another ISlxTokenReader. This will allow it to wrap SlxTranslatingReader?.
     * It should also have methods for processing a record.
     * Two instances will be needed to monitor both SLX Valid and SLX Transitional.
     * You can get the original names->css names from the SlxTokenReader?.cssCleaner instance.
     * Skip comments for now - I know there are 200,000 unique comments in the infobase. We gotta filter out PID and KPN comments first...
     * Add/Change #42 (Diagnostic tools)
     **/


    // table contains pairs
    private HashMap<Pair<String, String>, Long> tagTablePair = null;
    private HashMap<Pair<String, String>, Long> entityTablePair = null;
    //TODO: add comments back in
    //skipping comments for now
    //private HashMap<Pair<String,String>,Long> commentTablePair = null;

    public FolioToSlxDiagnosticTool(ISlxTokenReader slxTokenReader) {
        this.reader = slxTokenReader;
        tagTablePair = new HashMap<Pair<String, String>, Long>();
        entityTablePair = new HashMap<Pair<String, String>, Long>();
        //skipping comments for now
        // commentTablePair = new HashMap<Pair<String,String>,Long>();
    }


    public void outputDataFiles(String filename) throws InvalidMarkupException {
        exportTableMapToFile(tagTablePair, filename);
    }

    private boolean exportTableMapToFile(Map<Pair<String, String>, Long> map, String fileName) throws InvalidMarkupException {
        System.out.println(fileName);
        StringBuilder sb = new StringBuilder();
        Iterator<Entry<Pair<String, String>, Long>> iterator = null;
        iterator = map.entrySet().iterator();

        while (iterator.hasNext()) {
            Entry<Pair<String, String>, Long> nextPair = iterator.next();

            sb.append(nextPair.getKey().getFirst());
            sb.append("	");
            sb.append(nextPair.getKey().getSecond());
            sb.append("	");

            SlxToken t = new SlxToken(nextPair.getKey().getSecond());

            sb.append(t.getTagName());
            sb.append("	");
            String ft = nextPair.getKey().getFirst();
            if (ft != null && ft.length() > 3) {
                if (ft.charAt(1) == '/') ft = ft.substring(2, 4);
                else ft = ft.substring(1, 3);
            }

            sb.append(ft);
            sb.append("	");
            sb.append(nextPair.getValue());

            //sb.append("|");
            sb.append("\n");

        }

        if (!new File(fileName).getParentFile().exists()) new File(fileName).getParentFile().mkdir();

        Writer fw = null;

        try {
            fw = Files.newBufferedWriter(Paths.get(fileName), Charset.forName("UTF-8"));

            fw.write(sb.toString());

            fw.close();
        } catch (IOException iox) {
            iox.printStackTrace();
            fw = null;
            return false;
        }
        return true;
    }

    protected ISlxTokenReader reader;
    //protected FolioSlxTranslator translator = new FolioSlxTranslator();

    /**
     * Reads the next translated token from the stream
     *
     * @return
     * @throws java.io.IOException
     * @throws folioxml.core.InvalidMarkupException
     */
    public SlxToken read() throws IOException, InvalidMarkupException {
        SlxToken st = reader.read();
        if (st == null) return null;

        if (!"record".equals(st.getTagName())) {


            //classify token type and add new pair or increment pair frequency
            if (st.isTag()) {

                //create FolioToken & SlxToken String pair
                Pair<String, String> pair = new Pair<String, String>(st.sourceToken.text, st.toString());
                tagTablePair.put(pair, (tagTablePair.containsKey(pair) ? tagTablePair.get(pair) + 1 : 1));

            } else if (st.isEntity()) {
                //entityTablePair.put(pair, (entityTablePair.containsKey(pair) ? entityTablePair.get(pair) + 1 : 1));

            } else if (st.isComment()) {
                // skipping comments for now
                //	commentTablePair.put(pair, (commentTablePair.containsKey(pair) ? commentTablePair.get(pair) + 1 : 1));

            } else if (st.isTextOrEntity()) {
                //ignore this branch
            } else {
                //this should never happen (I think?)
                throw new InvalidMarkupException("Error in SlxToken  " + "FolioToken:" + st.sourceToken.text + "\t\tSlxToken:" + st.toString());
            }

        }
        //System.err.println("FolioToken:" + st.sourceToken.text +"\nSLXToken:" +st.toString());

        return st;

    }

    public boolean canRead() {
        return reader.canRead();
    }

    /**
     * Closes the underlying reader
     */
    public void close() throws IOException {


        reader.close();
        reader = null;
    }
}