package markov;

import java.io.*;
import java.util.*;

/**
 * Markov learning and recall: trains a visible Markov model from manually
 * tagged text and uses it to assign part-of-speech tags to word sequences.
 *
 * <p/>
 * Copyright 2002-2012 by Mark Watson. All rights reserved.
 * <p/>
 * This software can be used under either of the following licenses:
 * <p/>
 * 1. LGPL v3<br/>
 * 2. Apache 2
 * <p/>
 */
public class Markov {

  /** Number of trailing rows and columns left out of the tables written by {@code WriteLatexFile}. */
  private static final int LATEX_SKIP = 6;

  /** Lower-cased word -> every tag it was seen with, one entry per occurrence (duplicates kept). */
  Map<String,List<String>> lexicon = new Hashtable<String,List<String>>();
  /** Tag -> number of occurrences of that tag in the training text. */
  Map<String, Integer> tags = new Hashtable<String, Integer>();
  /** Lower-cased word -> sequence number assigned when the word was first seen. */
  Map<String, Integer> words = new Hashtable<String, Integer>();
  /** Distinct tags; the position in this list is the tag's row/column index in the matrices. */
  List<String> uniqueTags = new ArrayList<String>();
  /** Distinct words; the position in this list is the word's row index in the matrices. */
  List<String> uniqueWords = new ArrayList<String>();
  int uniqueTagCount;
  int uniqueWordCount;
  //String [] tagNames;
  int tagCount = 0; // from training text: number of distinct tags
  int wordCount = 0; // from training text: number of distinct words
  List<String> wordList = new ArrayList<String>(); // from training text, in reading order
  List<String> tagList = new ArrayList<String>(); // from training text, parallel to wordList
  float[][] tagToTagTransitionCount; // [num_tag][num_tag]
  int[][] wordCountByTag; // declared but never assigned in this class
  float [][]probabilityTag1ToTag2; // [previous tag][next tag]
  float [][]probabilityWordGivenTag; // [word][tag]

  // data for exponential method of evaluating most probable tags for a sequence of words:
  int [] indices; // currently selected candidate tag, per word position
  int [] counts; // number of candidate tags, per word position
  ArrayList<ArrayList<String>> possibleTags; // distinct candidate tags, per word position

  static public void main(String[] args) {
    new Markov();
  }

  /**
   * The public constructor does everything:
   *
   *   1. reads a tagged input file tagged_text.txt and builds
   *      a list of all possible tags and words
   *   2. trains a visible Markov model
   *   3. tests the Markov model by tagging new test sentences
   *      containing only words originally in tagged text file.
   *
   * @throws IllegalStateException if no training tokens could be read
   */
  public Markov() {
    build_words_and_tags();
    print_statistics();
    train_model();
    test_model();
  }

  /**
   * Read an input file of manually tagged text. Get a sequence of words and
   * associated tags and also build sequences of unique words and unique tags.
   *
   * <p>Each line of test_data/markov/tagged_text.txt holds whitespace separated
   * {@code word/TAG} tokens. Blank lines are ignored and tokens without a
   * {@code /} are reported on standard error and skipped, so one malformed
   * line no longer discards the rest of the corpus. An I/O failure is printed
   * and whatever was read up to that point is kept.
   */
  public void build_words_and_tags() {
    BufferedReader reader = null;
    try {
      reader = new BufferedReader(new FileReader("test_data/markov/tagged_text.txt"));
      String line;
      while ((line = reader.readLine()) != null) {
        p(line);
        addTaggedLine(line);
      }
    } catch (IOException e) {
      e.printStackTrace();
    } finally {
      closeQuietly(reader);
    }
    uniqueTags.addAll(tags.keySet());
    uniqueWords.addAll(words.keySet());
    uniqueTagCount = uniqueTags.size();
    uniqueWordCount = uniqueWords.size();
  }

  /** Splits one line of training text into word/TAG tokens and records each of them. */
  private void addTaggedLine(String line) {
    String trimmed = line.trim();
    if (trimmed.length() == 0) {
      return;
    }
    for (String token : trimmed.split("\\s+")) {
      // the first '/' separates the word from its tag, e.g. "./." is the word "." tagged "."
      int slash = token.indexOf('/');
      if (slash < 0) {
        System.err.println("Skipping token without a tag: " + token);
        continue;
      }
      addTaggedWord(token.substring(0, slash).toLowerCase(), token.substring(slash + 1));
    }
  }

  /** Records a single (word, tag) observation in the lexicon, the counters and the token lists. */
  private void addTaggedWord(String key, String tag) {
    List<String> seenTags = lexicon.get(key);
    if (seenTags == null) {
      seenTags = new ArrayList<String>(5);
      lexicon.put(key, seenTags);
    }
    seenTags.add(tag);

    Integer oldCount = tags.get(tag);
    if (oldCount == null) {
      tags.put(tag, Integer.valueOf(1));
      tagCount++;
    } else {
      tags.put(tag, Integer.valueOf(oldCount.intValue() + 1));
    }

    if (!words.containsKey(key)) {
      words.put(key, Integer.valueOf(wordCount++));
    }
    wordList.add(key);
    tagList.add(tag);
  }

  /**
   * For debug only: print out statistics of number of unique words and unique tags.
   */
  public void print_statistics() {
    int word_count = 0;
    for (Map.Entry<String,List<String>> entry : lexicon.entrySet()) {
      word_count++;
      p0(entry.getKey() + ":");
      for (String s : entry.getValue()) {
        p0(" " + s);
      }
      p("");
    }
    p("\ntotal number of unique words is " + word_count + "\n");
    p("wordCount=" + wordCount + ", tagCount=" + tagCount);
  }

  /**
   * This is for debug only: print out training matrices in a CSV type format
   * so that the matrices can be examined in a spreadsheet program for debugging purposes.
   *
   * @param rowNames label written at the start of each data row
   * @param colNames labels written in the title row
   * @param buf      matrix to write, indexed [row][column]
   * @param fileName output path without extension; ".txt" is appended
   */
  private void WriteCSVfile(List<String> rowNames, List<String> colNames,
                            float[][] buf, String fileName) {
    p("tagList.size()="+tagList.size());
    PrintWriter out = null;
    try {
      out = new PrintWriter(new BufferedWriter(new FileWriter(fileName + ".txt")));
      // write the first title row (its first cell, above the row labels, is empty):
      StringBuilder sb = new StringBuilder(500);
      for (String colName : colNames) {
        sb.append(',').append(colName);
      }
      out.println(sb.toString());
      // loop on remaining rows:
      for (int i = 0; i < buf.length; i++) {
        sb.setLength(0);
        sb.append(rowNames.get(i));
        for (int j = 0; j < buf[i].length; j++) {
          sb.append(',').append(buf[i][j]);
        }
        out.println(sb.toString());
      }
    } catch (IOException ioe) {
      ioe.printStackTrace();
    } finally {
      // close even when a runtime exception interrupts the write
      if (out != null) {
        out.close();
      }
    }
  }

  /**
   * This is for book production only: print out training matrices in a Latex type format
   * so that the matrices can be inserted into my manuscript:
   *
   * <pre>
   * \begin{table}[htdp]
   * \caption{Runtimes by Method}
   * \centering
   * \begin{tabular}{|l|l|l|}
   * \hline
   * \textbf{Class.method name}&\textbf{Percent of total runtime}&\textbf{Percent in this method}\\
   * \hline
   * Chess.main&97.7&0.0\\
   * GameSearch.playGame&96.5&0.0\\
   * Chess.calcPieceMoves&1.7&0.8\\
   * \hline
   * \end{tabular}
   * \label{tab:runtimes_by_method}
   * \end{table}
   * </pre>
   *
   * The last {@code LATEX_SKIP} rows and columns of the matrix are not written
   * (presumably so the table fits on a page - confirm before changing).
   *
   * @param rowNames label written at the start of each table row
   * @param colNames labels written in the header row
   * @param buf      matrix to write, indexed [row][column]
   * @param fileName output path without extension; ".latex" is appended
   */
  private void WriteLatexFile(List<String> rowNames, List<String> colNames,
                              float[][] buf, String fileName) {
    p("tagList.size()="+tagList.size());
    PrintWriter out = null;
    try {
      out = new PrintWriter(new BufferedWriter(new FileWriter(fileName + ".latex")));
      int size = colNames.size() - LATEX_SKIP;
      out.print("\\begin{table*}[htdp]\n\\caption{ADD CAPTION}\\centering\\begin{tabular}{|");
      // one column for the row labels plus one per printed matrix column
      for (int i = 0; i < size+1; i++) {
        out.print("l|");
      }
      out.println("}\n\\hline");
      out.print(" &");
      for (int i = 0; i < size; i++) {
        out.print("\\emph{" + colNames.get(i)+"}");
        if (i < (size-1)) {
          out.print("&");
        }
      }
      out.println("\\\\\n\\hline");
      // loop on remaining rows:
      for (int i = 0, rowLimit = buf.length - LATEX_SKIP; i < rowLimit; i++) {
        out.print(rowNames.get(i)+"&");
        for (int j = 0, colLimit = buf[i].length - LATEX_SKIP; j < colLimit; j++) {
          out.printf("%.2f",buf[i][j]);
          if (j < (colLimit-1)) {
            out.print("&");
          }
        }
        out.println("\\\\");
      }
      out.println("\\hline\n\\end{tabular}\n\\label{tab:CHANGE_THIS_LABEL}\n\\end{table*}");
    } catch (IOException ioe) {
      ioe.printStackTrace();
    } finally {
      // close even when a runtime exception interrupts the write
      if (out != null) {
        out.close();
      }
    }
  }

  /**
   * Train a Markov model using manually tagged input text.
   *
   * <p>Fills {@code tagToTagTransitionCount}, {@code probabilityTag1ToTag2}
   * and {@code probabilityWordGivenTag} from the tokens collected by
   * {@link #build_words_and_tags()} and writes each matrix out as a CSV type
   * file and as a Latex table.
   *
   * @throws IllegalStateException if there are no training tokens
   */
  public void train_model() {
    if (tagList.isEmpty()) {
      throw new IllegalStateException(
          "no training data: build_words_and_tags() did not read any tagged words");
    }
    Map<String, Integer> tagIndex = indexByName(uniqueTags);

    // start by filling in the tag to tag transition count matrix
    // (a new float array is already filled with zeros):
    tagToTagTransitionCount = new float[uniqueTagCount][uniqueTagCount];
    p("tagCount="+tagCount);
    p("uniqueTagCount="+uniqueTagCount);
    int previous = tagIndex.get(tagList.get(0)).intValue();
    for (int i = 1, size = tagList.size(); i < size; i++) {
      int current = tagIndex.get(tagList.get(i)).intValue();
      tagToTagTransitionCount[previous][current]++;
      previous = current;
    }
    WriteCSVfile(uniqueTags, uniqueTags, tagToTagTransitionCount, "tag_to_tag");
    WriteLatexFile(uniqueTags, uniqueTags, tagToTagTransitionCount, "tag_to_tag");

    // now calculate the probabilities of transitioning from tagN to tagM:
    probabilityTag1ToTag2 = new float[uniqueTagCount][uniqueTagCount];
    for (int i = 0; i < uniqueTagCount; i++) {
      int count = tags.get(uniqueTags.get(i)).intValue();
      p("tag: " + uniqueTags.get(i) + ", count="+count);
      for (int j = 0; j < uniqueTagCount; j++) {
        // the small constant keeps transitions never seen in training from scoring exactly zero
        probabilityTag1ToTag2[i][j] = 0.0001f + tagToTagTransitionCount[i][j] / (float)count;
      }
    }
    WriteCSVfile(uniqueTags, uniqueTags, probabilityTag1ToTag2, "test_data/markov/prob_tag_to_tag");
    WriteLatexFile(uniqueTags, uniqueTags, probabilityTag1ToTag2, "test_data/markov/prob_tag_to_tag");

    // now calculate the probability of a word, given its tag: count every
    // (word, tag) pair of the training text - including the last token - in a
    // single pass, then divide each column by the number of occurrences of its tag.
    probabilityWordGivenTag = new float[uniqueWordCount][uniqueTagCount];
    Map<String, Integer> wordIndex = indexByName(uniqueWords);
    for (int n = 0, size = wordList.size(); n < size; n++) {
      int w = wordIndex.get(wordList.get(n)).intValue();
      int t = tagIndex.get(tagList.get(n)).intValue();
      probabilityWordGivenTag[w][t]++;
    }
    for (int j = 0; j < uniqueTagCount; j++) {
      int countTagOccurence = tags.get(uniqueTags.get(j)).intValue();
      for (int i = 0; i < uniqueWordCount; i++) {
        probabilityWordGivenTag[i][j] = probabilityWordGivenTag[i][j] / (float)countTagOccurence;
      }
    }
    WriteLatexFile(uniqueWords, uniqueTags, probabilityWordGivenTag, "test_data/markov/prob_word_given_tag");
    WriteCSVfile(uniqueWords, uniqueTags, probabilityWordGivenTag, "test_data/markov/prob_word_given_tag");
  }

  /** Maps each name to its first position in the list (the same answer List.indexOf gives). */
  private static Map<String, Integer> indexByName(List<String> names) {
    Map<String, Integer> index = new HashMap<String, Integer>();
    for (int i = 0, size = names.size(); i < size; i++) {
      if (!index.containsKey(names.get(i))) {
        index.put(names.get(i), Integer.valueOf(i));
      }
    }
    return index;
  }

  /**
   * Increment the class variable indices[] to point to the next possible set of tags
   * to check.
   *
   * @param num number of word positions in use
   * @return true if indices[] now holds a new combination, false when the
   *         combinations are exhausted
   */
  private boolean incrementIndices(int num) { // uses the global arrays indices and counts
    for (int i=0; i<num; i++) {
      if (indices[i] < (counts[i] - 1)) {
        indices[i] += 1;
        // like an odometer: every position before the one just advanced restarts at zero
        for (int j=0; j<i; j++) {
          indices[j] = 0;
        }
        return true;
      }
    }
    return false;
  }

  /**
   *  For a sequence of words, values of class variable indices[], and class variable
   *  possibleTags, evaluate how well tags rate using equation 10.7 in [Manning/Schutze, 1999]
   *
   * @param words the words being tagged; each must be in {@code uniqueWords}
   * @return product over all positions after the first of
   *         P(tag | previous tag) * P(word | tag); the first position contributes 1
   */
  float score(List<String> words) { // uses global variables
    float s = 1.0f;
    int num = words.size();
    float prob_tag_i_given_tag_i_minus_1 = 1.0f;
    for (int i=0; i<num; i++) {
      System.out.println("words["+i+"]="+words.get(i));
      if (i > 0) {
        List<String> v0 = possibleTags.get(i - 1);
        List<String> v1 = possibleTags.get(i);
        int index1 = uniqueTags.indexOf(v0.get(indices[i - 1]));
        int index2 = uniqueTags.indexOf(v1.get(indices[i]));
        System.out.println("index1="+index1+"[tag: "+uniqueTags.get(index1)+"], index2="+index2
            +"[tag: "+uniqueTags.get(index2)+"]");
        prob_tag_i_given_tag_i_minus_1 = probabilityTag1ToTag2[index1][index2];
        int index3 = uniqueWords.indexOf("" + words.get(i));
        float p = probabilityWordGivenTag[index3][index2];
        System.out.println("word: " + words.get(i) + ", p="+p);
        prob_tag_i_given_tag_i_minus_1 *= p;
      }
      s *= prob_tag_i_given_tag_i_minus_1;
    }
    return s;
  }

  /**
   * Use exponential runtime tagging algorithm (evaluates trained Markov model).
   * NOTE: do not use this algorithm for long input word sequences - instead,
   * break up long sequences of text into smaller pieces (i.e., process just
   * a few sentences at a time).
   *
   * @param words lower-cased words to tag; each must occur in the training text
   * @return the best scoring tag for each word, in the same order as {@code words}
   * @throws IllegalArgumentException if a word does not occur in the training text
   */
  public List<String> exponential_tagging_algorithm(List<String> words) {
    possibleTags = new ArrayList<ArrayList<String>>();
    int num = words.size();
    indices = new int[num]; // new int arrays start out filled with zeros
    counts = new int[num];
    int [] best_indices = new int[num];
    for (int i=0; i<num; i++) {
      String word = String.valueOf(words.get(i));
      List<String> v = lexicon.get(word);
      if (v == null) {
        throw new IllegalArgumentException("word not in training text: " + word);
      }
      // the lexicon keeps one entry per occurrence; reduce it to the distinct tags
      ArrayList<String> v2 = new ArrayList<String>(); // possible tags at index i
      for (String tag : v) {
        if (!v2.contains(tag)) {
          v2.add(tag);
          counts[i]++;
        }
      }
      possibleTags.add(v2); // possible tags at index i
      System.out.print("^^ word: " + word + ", tag count: " + counts[i] + ", tags: ");
      for (int j=0; j<v2.size(); j++) {
        System.out.print(" " + v2.get(j));
      }
      System.out.println();
    }
    float best_score = -9999;
    do {
      System.out.print("Current indices:");
      for (int k=0; k<num; k++) {
        System.out.print(" " + indices[k]);
      }
      System.out.println();
      float currentScore = score(words);
      if (currentScore > best_score) {
        best_score = currentScore;
        System.out.println(" **  ** new best score: " + best_score);
        System.arraycopy(indices, 0, best_indices, 0, num);
      }
    } while (incrementIndices(num));

    List<String> bestTags = new ArrayList<String>(num);
    for (int i=0; i<num; i++) {
      bestTags.add(possibleTags.get(i).get(best_indices[i]));
    }
    return bestTags;
  }

  /**
   * Throw away test method: tags a fixed word sequence and prints the result.
   */
  public void test_model() {
    List<String> words = new ArrayList<String>(Arrays.asList(
        ".", "the", "dog", "chased", "the", "cat",
        ".", "mary", "went", "to", "the", "river",
        ".", "john", "saw", "mary", "bank", "the", "airplane",
        "."));
    List<String> tags = exponential_tagging_algorithm(words);
    p("");
    // NOTE(review): the final word/tag pair is not printed - confirm whether that is intended
    for (int i=0; i<words.size()-1; i++) {
      p(""+words.get(i)+"\t: "+tags.get(i));
    }
  }

  /**
   * Utility print method - with line feed
   */
  public void p(String s) {
    System.out.println(s);
  }

  /**
   * Utility print method - no line feed
   */
  public void p0(String s) {
    System.out.print(s);
  }

  /** Closes a stream, printing (not propagating) any failure; does nothing for null. */
  private static void closeQuietly(Closeable stream) {
    if (stream == null) {
      return;
    }
    try {
      stream.close();
    } catch (IOException e) {
      e.printStackTrace();
    }
  }
}