import java.io.File; import java.util.Arrays; import java.util.Vector; /** * Hyper String Finite State Automata Object holding all possible combinations * of first letter capitalization and punctuation symbols .,? * * @author joakimlilja * */ public class HyperStringFSA2 { public static final String EMPTY_PUNCT = "" + ((char) 007) + "EMPTY "; //public static final String[] TRANSITIONS = { EMPTY_PUNCT, ",COMMA ", ".PERIOD ", "?QMARK ", "!EXCL " }; public static final String[] TRANSITIONS = { EMPTY_PUNCT, ",COMMA ", ".PERIOD "}; public static final String[] POSTPROCESSES = { " ", ", ", ". ", "? ", "! " }; public static final int TRANSITION_COUNT = TRANSITIONS.length; public static final int STATES_COUNT = 2; Vector<String[]> outputs; NGramWrapper nGram; /** * Constructor creating a FSA based on the specified String array consisting * of words * * @param s * Array of words * */ public HyperStringFSA2(String[] s, NGramWrapper nGram) { outputs = new Vector<String[]>(); this.nGram = nGram; constructFSA(s, outputs); } /** * Construct the FSA with all possible outputs with each emission having a * cost * * @param s * Array of words * * @param outputs * Vector holding all possible outputs * */ private void constructFSA(String[] s, Vector<String[]> outputs) { Node root = new Node("", 1.0); root = generateNodes(s, root); generateOutputs(root, outputs); } /** * Generate the outputs using a tree structure * * @param node * @param outputs */ private void generateOutputs(Node node, Vector<String[]> outputs) { if (node.children.size() == 0) { String s = backTrack(node, "", Integer.MAX_VALUE) + node.cost; // System.out.println(s); outputs.add(s.split(" ")); } else { for (int i = 0; i < node.children.size(); i++) { generateOutputs(node.children.elementAt(i), outputs); } } } /** * Backtrack from end node to generate the output of that path * * @param node * @param s * @return output */ private String backTrack(Node node, String s, int n) { String value = node.value.equals(EMPTY_PUNCT) ? "" : node.value; if (node.parent == null || n == 0) { return value + s; } else { return backTrack(node.parent, value + s, n - 1); } } /** * Generate children * * @param s * @param parent * @return */ private Node generateNodes(String[] s, Node parent) { String unCapWord = deCapitalizeWord(s[0]); Node unCapNode = new Node(unCapWord + " ", parent.cost * getCost(parent, unCapWord)); unCapNode.parent = parent; generateTransitions(unCapNode); String capWord = capitalizeWord(s[0]); Node capNode = new Node(capWord + " ", parent.cost * getCost(parent, capWord)); capNode.parent = parent; generateTransitions(capNode); parent.children.add(capNode); parent.children.add(unCapNode); if (s.length > 1) { for (int i = 0; i < unCapNode.children.size(); i++) { unCapNode.children.set( i, generateNodes(Arrays.copyOfRange(s, 1, s.length), unCapNode.children.get(i))); capNode.children.set( i, generateNodes(Arrays.copyOfRange(s, 1, s.length), capNode.children.get(i))); } // Add empty emission with zero count (WHAT COST FOR EMPTY // EMISSION??) // String[] nextWord = Arrays.copyOfRange(s, 1, 2); // unCapNode = generateNodes(nextWord, unCapNode); // capNode = generateNodes(nextWord, capNode); } // if (s.length == 1) { // System.out.println("---CHECK HERE ---" + Arrays.toString(s)); // Node emptyEndNode1 = new Node("", parent.cost); // emptyEndNode1.parent = unCapNode; // unCapNode.children.add(emptyEndNode1); // // Node emptyEndNode2 = new Node("", parent.cost); // emptyEndNode2.parent = capNode; // capNode.children.add(emptyEndNode2); // } return parent; } private double getCost(Node parent, String word) { String[] ngram = (backTrack(parent, word, nGram.getNGramLength() - 2) .split(" ")); // System.err.println(Arrays.toString(ngram.split(" "))); double cost = 1.0; if (ngram.length > 1) { //Varför större än 1 istället för större än 0 ... ? cost = nGram.getCostOfNGram(ngram); } // System.err.println("Cost: " + cost); return cost; } /** * Generate the possible punctuation transitions * * @param node */ private void generateTransitions(Node parent) { for (int i = 0; i < TRANSITION_COUNT; i++) { String emission = TRANSITIONS[i]; Node transNode = null; if (emission.equals(EMPTY_PUNCT)) { transNode = new Node(emission, parent.cost*0.5); //För att du vill ha en kostnad för att inte ha en punctuation ? } else { transNode = new Node(emission, parent.cost * getCost(parent, emission)); } transNode.parent = parent; parent.children.add(transNode); } } private String capitalizeWord(String input) { return input.substring(0, 1).toUpperCase() + input.substring(1); } private String deCapitalizeWord(String input) { return input.substring(0, 1).toLowerCase() + input.substring(1); } public String toString() { return outputs.toString(); } public Vector<String[]> getOutputs() { return outputs; } public static String postProcessing(String input) { for (int i = 0; i < TRANSITION_COUNT; i++) { input = input.replaceAll(TRANSITIONS[i], POSTPROCESSES[i]); } return input; } public static void main(String... args) { String[] words = { "mars", "scientists" }; NGramWrapper ngw = new NGramWrapper(3); ngw.readFile(new File("sentences.txt")); HyperStringFSA2 fsa = new HyperStringFSA2(words, ngw); for (String[] s : fsa.outputs) { System.out.println(Arrays.toString(s)); } } private class Node { String value; double cost; Node parent; Vector<Node> children; public Node(String value, double cost) { children = new Vector<Node>(); this.cost = cost; this.value = value; } public String toString() { return value; } } }