package x10.parser;

import polyglot.util.CollectionUtil;
import x10.util.CollectionFactory;
import java.io.File;
import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.io.BufferedWriter;
import java.io.FileWriter;
import java.util.HashMap;
import java.util.ArrayList;
import java.util.StringTokenizer;
import java.util.HashSet;
import java.util.Collections;
import java.util.Random;
import java.util.Collection;
import java.util.Set;
import java.util.Map;

/*
Not used: all the rules at the beginning including
MethodPrimaryPrefix ::= Primary . ErrorId $ErrorId
MethodSuperPrefix ::= super . ErrorId $ErrorId
PlaceType ::= PlaceExpression
PlaceExpressionSingleListopt ::= $Empty | PlaceExpressionSingleList
delete: -> duplications at the beginning
*/
/**
 * Reads an LPG-style grammar file (e.g. x10.g), parses its %Rules and %Types
 * sections, prunes symbols/types unreachable from {@code CompilationUnit},
 * and writes a transformed grammar file plus generated rule_* method stubs
 * to stdout. A second (currently disabled — see the {@code if (true)} switch
 * in the constructor) mode generates random sentences from the grammar.
 */
public class AutoGenSentences {
    // Maximum recursion depth when generating random sentences.
    private static int MAX_DEPTH = 10;
    private static String CompilationUnit = "CompilationUnit";
    private static String TypeDeclaration = "TypeDeclaration";

    /**
     * One production right-hand side. The list elements are the RHS symbols
     * (with any $alias stripped); {@code names} holds the per-symbol alias
     * (or the symbol itself when no alias was given); {@code code} holds the
     * embedded semantic-action Java lines, with $name macros already
     * substituted by positional numbers.
     */
    static class Rule extends ArrayList<String> {
        private static final long serialVersionUID = 1L; // ArrayList is Serializable
        private final String symbol; // the LHS non-terminal
        private final int line;      // grammar-file line this production started on
        private final ArrayList<String> names = new ArrayList<String>();
        private final ArrayList<String> code = new ArrayList<String>();
        Rule(String symbol, int line) {
            this.symbol = symbol;
            this.line = line;
        }
    }

    /**
     * Entry point. Expects exactly two arguments: the grammar file to read
     * and the output file to write.
     */
    public static void main(String[] args) {
        if (args.length!=2) {
            System.err.println("You need to run AutoGenSentences with two arguments: GRAMMAR_FILE OUTPUT_FILE\nFor example: java AutoGenSentences x10.g Output.x10\n");
            System.exit(-1);
        }
        //only-grammar-productions.txt  auto-gen-sentences.txt
        new AutoGenSentences(args);
    }

    /**
     * Does all the work: parses the grammar file named by {@code args[0]},
     * prunes unused symbols/types, then writes the transformed grammar to
     * {@code args[1]} and prints generated rule_* method bodies to stdout.
     */
    AutoGenSentences(String[] args) {
        ArrayList<String> grammarFile = readFile(new File(args[0]));
        ArrayList<String> newFile = new ArrayList<String>();
        String currProd = null;   // LHS of the production currently being parsed
        int lineNum = -1;
        try {
            boolean isTypes = false;                  // inside a %Types section?
            Map<String, ArrayList<Rule>> rules = null; // target map: grammar (in %Rules) or a scratch map (in %Types)
            boolean inJavaCode = false;               // inside a /.$BeginJava ... $EndJava./ action block?
            for (String line : grammarFile) {
                lineNum++;
                // %Types lines are dropped from the output; a fresh %Types
                // section is regenerated at the end.
                if (!isTypes) newFile.add(line);
                line = line.trim();
                if (line.equals("")) continue;
                if (line.equals("%Rules")) { rules = grammar; continue; }
                if (line.equals("%Types")) { isTypes = true; rules = CollectionFactory.newHashMap(); continue; }
                if (line.equals("%End")) {
                    if (isTypes) {
                        isTypes = false;
                        // assert all productions are of size 1
                        for (String type : rules.keySet())
                            for (ArrayList<String> prod : rules.get(type)) {
                                assert prod.size()==1;
                                String nonTerminal = prod.get(0);
                                assert !isLiteral(nonTerminal);
                                assert !types.containsKey(nonTerminal);
                                types.put(nonTerminal,type);
                            }
                    }
                    rules = null;
                }
                if (rules==null) continue;       // not inside a section we care about
                if (line.startsWith("--")) continue; // ignore comments
                // Ignore: /.$NullAction./
                if (line.equals("/.$NullAction./")) continue;
                if (line.startsWith("/.")) {
                    // Start of a semantic action: replace it in the output with a
                    // call to the generated rule_<symbol><prodNum>(...) method.
                    assert line.equals("/.$BeginJava") : line;
                    inJavaCode = true;
                    ArrayList<Rule> prods = rules.get(currProd);
                    final int prodNum = prods.size() - 1;
                    final Rule rule = prods.get(prodNum);
                    ArrayList<String> ruleArgs = new ArrayList<String>();
                    int k=0;
                    for (String name : rule.names) {
                        String id = rule.get(k++);
                        if (hasArg(id,name)) ruleArgs.add(name);
                    }
                    newFile.add("\t\t\tr.rule_"+rule.symbol+prodNum+"("+simpleJoin(ruleArgs,",")+");");
                    continue;
                }
                if (line.endsWith("./")) {
                    assert inJavaCode;
                    assert line.equals("./") || line.startsWith("$EndJava") : line;
                    inJavaCode = false;
                    continue;
                }
                if (inJavaCode) {
                    // Collect the action body into rule.code, substituting
                    // $name macros by their 1-based RHS position, and remove
                    // the raw line from the output file.
                    ArrayList<Rule> prods = rules.get(currProd);
                    final Rule rule = prods.get(prods.size() - 1);
                    int k=1;
                    for (String name : rule.names)
                        line = line.replace("$"+name,""+(k++));
                    line = line.replace("$sym_type","X10Parsersym");
                    if (!line.equals("$EndJava")) rule.code.add(line);
                    newFile.remove(newFile.size()-1);
                    continue;
                }
                // A production line: either "<LHS> ::= rhs..." or a "| rhs..." continuation.
                StringTokenizer tokenizer = new StringTokenizer(line);
                final String first = tokenizer.nextToken();
                if (!first.equals("|")) {
                    currProd = first;
                    currProd = unescape(currProd);
                    String next = tokenizer.nextToken();
                    assert next.equals("::=") || next.equals("::=?") : next;
                }
                assert currProd!=null;
                ArrayList<Rule> prods = rules.get(currProd); // AssignmentExpression is stated in 2 different rules
                if (prods==null) {
                    prods = new ArrayList<Rule>();
                    rules.put(currProd, prods);
                }
                //ImportDeclarations PackageDeclaration$misplacedPackageDeclaration ImportDeclarationsopt$misplacedImportDeclarations TypeDeclarationsopt
                Rule terms = new Rule(currProd, lineNum);
                while (tokenizer.hasMoreTokens()) {
                    String token = tokenizer.nextToken();
                    if (token.equals("--")) break; // comments
                    if (token.equals("|")) {
                        // alternative on the same line: close current RHS, start a new one
                        prods.add(terms);
                        terms = new Rule(currProd, lineNum);
                        continue;
                    }
                    token = unescape(token);
                    assert token.charAt(0)!='$';
                    int indexDollar = token.indexOf('$');
                    String name = token;
                    if (indexDollar!=-1) {
                        name = token.substring(indexDollar+1);
                        token = token.substring(0,indexDollar); // ImportDeclarationsopt$misplacedImportDeclarations
                    }
                    if (token.equals("%Empty")) continue;
                    terms.add(token);
                    terms.names.add(name);
                }
                // can be empty: assert terms.size()>=1 : currProd;
                prods.add(terms);
            }
        } catch (Throwable e) {
            // Best-effort: report the grammar line that broke parsing and
            // continue with whatever was collected so far.
            System.err.println("Error on line "+lineNum);
            e.printStackTrace();
        }
        // removing unused symbols and types
        findUsedSymbols(CompilationUnit);
        Set<String> unusedSymbols = CollectionFactory.newHashSet(grammar.keySet());
        unusedSymbols.removeAll(usedSymbols);
        if (unusedSymbols.size()>0) {
            System.out.println("Unused symbols are: "+unusedSymbols);
            for (String s : unusedSymbols) grammar.remove(s);
        }
        Set<String> unusedTypes = CollectionFactory.newHashSet(types.keySet());
        unusedTypes.removeAll(usedSymbols);
        if (unusedTypes.size()>0) {
            System.out.println("Unused types are: "+unusedTypes);
            for (String s : unusedTypes) types.remove(s);
        }
        final ArrayList<String> nonTerminals = getNonTerminals();
        System.out.println("Roots are: "+findRoots());
        System.out.println("Literals are: "+getLiterals());
        System.out.println("Non-terminals are: "+ nonTerminals);
        // consistency checks
        Set<String> nonTerminalsWithoutType = CollectionFactory.newHashSet(nonTerminals);
        nonTerminalsWithoutType.removeAll(types.keySet());
        assert nonTerminalsWithoutType.size()==0 : nonTerminalsWithoutType;
        assert grammar.keySet().containsAll(types.keySet());
        // printing output: regenerate a flat %Types section
        newFile.add("%Types");
        newFile.add("\tObject ::= "+simpleJoin(types.keySet()," | "));
        newFile.add("%End");
        File output = new File(args[1]);
        // Mode switch: 'true' = rewrite grammar + emit rule_* stubs;
        // flip to reach the sentence-generation code below.
        if (true) {
            if (false) printSingletons();
            if (false) printGrammar(CompilationUnit,CollectionFactory.<String>newHashSet());
            writeFile(output,newFile);
            // Emit a rule_<symbol><prodNum>(...) method for every production
            // that carried an embedded semantic action.
            for (ArrayList<Rule> prods : grammar.values()) {
                int prodNum = -1;
                for (Rule rule : prods) {
                    prodNum++;
                    if (rule.code.size()>0) {
                        int k = 0;
                        ArrayList<String> ruleArgs = new ArrayList<String>();
                        ArrayList<String> castArgs = new ArrayList<String>();
                        for (String name : rule.names) {
                            String id = rule.get(k++);
                            if (hasArg(id,name)) {
                                ruleArgs.add("Object _"+name);
                                String type = types.get(id);
                                if (type==null) type = "IToken"; // symbols without a declared type are tokens
                                castArgs.add(type+" "+name+" = ("+type+") _"+name+";");
                            }
                        }
                        System.out.println("\t// Production: "+rule.symbol+" ::= "+ join(rule," "));
                        System.out.println("\tvoid rule_"+rule.symbol+prodNum+"("+simpleJoin(ruleArgs,", ")+") {");
                        for (String s : castArgs) System.out.println("\t\t"+s);
                        for (String c : rule.code) System.out.println("\t\t"+c);
                        System.out.println("\t}");
                    }
                }
            }
            return;
        }
        //x10.g root is CompilationUnit, but we want to generate many TypeDeclaration
        printGrammar(TypeDeclaration,CollectionFactory.<String>newHashSet());
        final Set<String> res = gen(TypeDeclaration, MAX_DEPTH);
        assert EMPTY_STR.size()==1 : EMPTY_STR;
        writeFile(output,res);
    }

    /**
     * Strips the single quotes the grammar uses to escape special tokens.
     * E.g. {@code "'|'"} becomes {@code "|"}; unquoted tokens pass through.
     */
    private static String unescape(String token) {
        if (token.charAt(0)=='\'') {
            // we escaped some tokens, like '|' '%' '-->'
            assert token.charAt(token.length()-1)=='\'' : token;
            token = token.substring(1,token.length()-1);
        }
        return token;
    }

    // LHS symbol -> its alternative productions (filled from the %Rules section).
    final Map<String, ArrayList<Rule>> grammar = CollectionFactory.newHashMap();
    // non-terminal -> declared AST type (filled from the %Types section).
    final Map<String, String> types = CollectionFactory.newHashMap();
    // Singleton set {""}: result of generating an empty production.
    Set<String> EMPTY_STR = CollectionFactory.newHashSet(Collections.singleton(""));

    /**
     * Joins symbols with {@code sep}, re-quoting tokens that don't start with
     * a letter/digit; an empty collection renders as {@code "%Empty"}.
     */
    String join(Collection<String> arr, String sep) {
        if (arr.size()==0) return "%Empty";
        String res = "";
        for (String s : arr) {
            final char c = s.charAt(0);
            res = res + (res.equals("") ? "" : sep) + (Character.isLetterOrDigit(c) ? s : "'"+s+"'");
        }
        return res;
    }

    /** Plain separator join; an empty collection renders as the empty string. */
    String simpleJoin(Collection<String> arr, String sep) {
        if (arr.size()==0) return "";
        String res = "";
        for (String s : arr) {
            res = res + (res.equals("") ? "" : sep) + s;
        }
        return res;
    }

    // State for printSingletons/dfs: edges symbol -> its unit-production targets,
    // plus DFS post-order ids (-1 marks "on the current DFS stack").
    Map<String,Set<String>> graph = CollectionFactory.newHashMap();
    Map<String,Integer> visited = CollectionFactory.newHashMap();
    int currID = 0;

    /**
     * Builds the graph of unit productions (RHS of size 1) and verifies via
     * DFS post-order numbering that it is acyclic, printing each edge.
     */
    void printSingletons() {
        // I want to make sure the singletons don't have cycles
        for (String symbol : grammar.keySet()) {
            final Set<String> set = CollectionFactory.newHashSet();
            for (ArrayList<String> prods : grammar.get(symbol)) {
                if (prods.size()==1) {
                    final String other = prods.get(0);
                    set.add(other);
                }
            }
            if (set.size()>0) graph.put(symbol, set);
        }
        // do a DFS and assert we do not have a cycle
        for (String symbol : graph.keySet()) dfs(symbol);
        for (String symbol : graph.keySet()) {
            int id = visited.get(symbol);
            for (String child : graph.get(symbol)) {
                int id2 = visited.get(child);
                assert id2<id; // children finish before parents in an acyclic graph
                System.out.println(symbol+"("+id+") -> "+ child+"("+id2+")");
            }
        }
    }

    /**
     * DFS over {@code graph}, assigning increasing post-order ids in
     * {@code visited}; asserts no back edge (id -1) is revisited.
     */
    void dfs(String v) {
        final Integer i = visited.get(v);
        assert i==null || i.intValue()!=-1; // -1 means v is on the stack => cycle
        if (i!=null) return; // already visited
        visited.put(v,-1);
        final Set<String> children = graph.get(v);
        if (children==null) {
            //assert isLiteral(v); , e.g., DepNamedType
        } else {
            for (String child : children) {
                dfs(child);
            }
        }
        visited.put(v,currID++);
    }

    /**
     * Recursively prints the productions reachable from {@code symbol} in
     * grammar notation, visiting each symbol once via {@code alreadyPrinted}.
     */
    void printGrammar(String symbol, Set<String> alreadyPrinted) {
        if (alreadyPrinted.contains(symbol)) return;
        alreadyPrinted.add(symbol);
        ArrayList<Rule> prods = grammar.get(symbol);
        if (prods==null) {
            // literal
            genLiteral(symbol); // for testing
            return;
        }
        System.out.println(symbol+" ::= " + (prods.size()==0 ? "" : join(prods.get(0)," ")));
        for (int i=1; i<prods.size(); i++)
            System.out.println("\t| "+join(prods.get(i)," "));
        for (ArrayList<String> prod : prods)
            for (String s : prod)
                printGrammar(s,alreadyPrinted);
    }

    /**
     * Generates sample strings for one production RHS; returns null when the
     * depth budget is exhausted for some symbol. Rather than the full (too
     * large) cartesian product, it randomly samples combinations.
     */
    Set<String> genProd(ArrayList<String> prod, int depth) {
        final int prodNum = prod.size();
        if (prodNum==0) return EMPTY_STR;
        ArrayList<Set<String>> acc = new ArrayList<Set<String>>(prodNum);
        int size = 0;
        for (String s : prod) {
            Set<String> set = gen(s,depth-1);
            if (set==null) return null; // symbol could not bottom out within depth
            size += set.size();
            acc.add(set);
        }
        if (prodNum ==1) return acc.get(0);
        ArrayList<String[]> acc2 = new ArrayList<String[]>(prodNum);
        for (Set<String> s : acc) acc2.add(s.toArray(new String[s.size()]));
        // should be the cartesian prod of all sets, but it is too big, so we sum the sets
        size *= 2;
        Set<String> res = CollectionFactory.newHashSet(2*size);
        Random r = new Random();
        for (int i=0; i<size; i++) {
            StringBuilder s = new StringBuilder();
            for (String[] arr : acc2) {
                s.append(arr[r.nextInt(arr.length)]);
            }
            res.add(s.toString());
        }
        return res;
    }

    /**
     * Generates sample sentences for {@code rule}: a literal yields its
     * rendered token plus a trailing space; a non-terminal unions samples of
     * all its productions. Returns null when nothing fits within {@code depth}.
     */
    Set<String> gen(String rule, int depth) {
        Set<String> res = CollectionFactory.newHashSet();
        ArrayList<Rule> prods = grammar.get(rule);
        if (prods==null) {
            // literal
            res.add(genLiteral(rule)+" ");
        } else {
            if (depth<=0) return null;
            for (ArrayList<String> prod : prods) {
                Set<String> acc = genProd(prod,depth);
                if (acc!=null) {
                    assert acc.size()>0 : rule;
                    res.addAll(acc);
                }
            }
            if (res.size()==0) return null;
        }
        return res;
    }

    /** A symbol with no productions is a terminal/literal. */
    boolean isLiteral(String s) { return !grammar.containsKey(s); }

    /** Fair coin flip for picking between two sample literal renderings. */
    static boolean random() { return Math.random()<0.5; }

    /** A RHS symbol becomes a rule_* method argument iff it is not a literal. */
    static boolean hasArg(String id, String name) { return !isLiteral2(id); }

    static boolean isLiteral2(String s) {
        // because we need to know if something is a literal before we parse the entire file
        return genLiteral2(s)!=null;
    }

    /**
     * Renders a sample token for a literal symbol, or null for symbols that
     * look like non-terminals. Keywords (not starting with an uppercase
     * letter) render as themselves; the known uppercase pseudo-terminals get
     * a randomly chosen concrete spelling.
     */
    static String genLiteral2(String s) {
        char first = s.charAt(0);
        if (first>='A' && first<='Z') {
            // special literal
            // special literals (all start with uppercase):
            if (s.equals("UnsignedIntegerLiteral")) { return random() ? "0u" : "1u"; }
            else if (s.equals("UnsignedLongLiteral")) { return random() ? "0ul" : "1ul"; }
            else if (s.equals("IntegerLiteral")) { return random() ? "0" : "1"; }
            else if (s.equals("LongLiteral")) { return random() ? "0l" : "1l"; }
            else if (s.equals("FloatingPointLiteral")) { return random() ? "0.0f" : "1.1f"; }
            else if (s.equals("DoubleLiteral")) { return random() ? "0.0" : "1.1"; }
            else if (s.equals("StringLiteral")) { return random() ? "\"\"" : "\"a\""; }
            else if (s.equals("CharacterLiteral")) { return random() ? "' '" : "'a'"; }
            else if (s.equals("IDENTIFIER")) { return random() ? "x" : "y"; }
            else if (s.equals("ErrorId")) { return "ERR"; }
            else { return null; }
        }
        return s;
    }

    /** Like {@link #genLiteral2}, but asserts the symbol really is a literal. */
    static String genLiteral(String s) {
        String res = genLiteral2(s);
        assert res!=null;
        return res;
    }

    // Symbols reachable from the start symbol (filled by findUsedSymbols).
    Set<String> usedSymbols = CollectionFactory.newHashSet();

    /** Marks {@code v} and everything reachable from it as used. */
    void findUsedSymbols(String v) {
        if (usedSymbols.contains(v)) return;
        usedSymbols.add(v);
        final ArrayList<Rule> prods = grammar.get(v);
        if (prods==null) return; // literal: no outgoing productions
        for (ArrayList<String> prod : prods)
            for (String s : prod)
                findUsedSymbols(s);
    }

    /** Returns the symbols that never appear on any right-hand side. */
    Set<String> findRoots() {
        Set<String> res = CollectionFactory.newHashSet(grammar.keySet());
        for (ArrayList<Rule> products : grammar.values())
            for (ArrayList<String> prod : products)
                for (String s : prod)
                    res.remove(s);
        return res;
    }

    /** All grammar LHS symbols minus the literals, sorted alphabetically. */
    ArrayList<String> getNonTerminals() {
        ArrayList<String> res = new ArrayList<String>(grammar.keySet());
        res.removeAll(getLiterals());
        Collections.sort(res);
        return res;
    }

    /** All distinct literals used on any RHS, each sanity-checked, sorted. */
    ArrayList<String> getLiterals() {
        Set<String> res = CollectionFactory.newHashSet();
        for (ArrayList<Rule> products : grammar.values())
            for (ArrayList<String> prod : products)
                for (String s : prod)
                    if (isLiteral(s)) {
                        genLiteral(s); // to test it
                        res.add(s);
                    }
        ArrayList<String> sorted = new ArrayList<String>(res);
        Collections.sort(sorted);
        return sorted;
    }

    /**
     * Reads a text file into a list of lines (platform default charset, as
     * before). Uses try-with-resources so the reader is closed even when
     * reading throws — the original leaked it on the error path.
     *
     * @throws RuntimeException wrapping any IOException
     */
    public static ArrayList<String> readFile(File f) {
        try (BufferedReader in = new BufferedReader(new FileReader(f))) {
            ArrayList<String> res = new ArrayList<String>();
            String line;
            while ((line=in.readLine())!=null) {
                res.add(line);
            }
            return res;
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
    }

    public static final String NL = System.getProperty("line.separator");

    /**
     * Writes each line followed by the platform separator. Uses
     * try-with-resources so the writer is closed (and buffers flushed or
     * discarded deterministically) even when a write throws — the original
     * leaked it on the error path.
     *
     * @throws RuntimeException wrapping any IOException
     */
    public static void writeFile(File f, Collection<String> lines) {
        try (BufferedWriter out = new BufferedWriter(new FileWriter(f))) {
            for (String s : lines) {
                out.write(s);
                out.write(NL);
            }
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
    }
}