Markov.java example

Explorer
Java-AI-Book-Code-master
- mr_temp
  - nlp
    - com
      - knowledgebooks
        mapreduce
        NameFinder.java
        nlp
        ExtractNames.java
        util
        ScoredList.java
        Tokenizer.java
- src
  - database
    - CreateSampleDatabases.java
    - DumpMetaData.java
  - geneticalgorithm
    - Genetic.java
    - TestGenetic.java
  - markov
    - Markov.java
  - neuralnetworks
  - nlp
    - com
      - knowledgebooks
        mapreduce
        NameFinder.java
        nlp
        ASpellWrapper.java
        AutoTagger.java
        ComparableDocument.java
        ExtractNames.java
        FastTag.java
        util
        NameValue.java
        NoiseWords.java
        RunExternal.java
        ScoredList.java
        Tokenizer.java
    - public_domain
      - Stemmer.java
  - opencalais
    - OpenCalaisClient.java
  - powerloom
  - search
  - semanticweb
  - spelling
    - jazzy
      - SpellingJazzyTester.java
    - norvig
      - SpellingSuggestions.java
    - norvigwordpairs
      - SpellingSuggestionsWordPairs.java
  - textsearch
  - weka
    - WekaStocks.java
  - wordnet
    - WordNetTest.java
package markov;

import java.util.*;
import java.io.*;

/**
 * Markov learning and recall
 *
 * <p/>
 * Copyright 2002-2012 by Mark Watson. All rights reserved.
 * <p/>
 * This software can be used under either of the following licenses:
 * <p/>
 * 1. LGPL v3<br/>
 * 2. Apache 2
 * <p/>
 */

public class Markov {
    /** Training file read by the no-argument constructor. */
    private static final String DEFAULT_TRAINING_FILE = "test_data/markov/tagged_text.txt";

    /** Path of the manually tagged text that {@link #build_words_and_tags()} reads. */
    private final String trainingFile;

    static public void main(String[] args) {
        new Markov();
    }

    /**
     *  The no-argument constructor does everything:
     *
     *   1. reads the tagged input file test_data/markov/tagged_text.txt and builds
     *      a list of all possible tags and words
     *   2. trains a visible Markov model
     *   3. tests the Markov model by tagging new test sentences
     *      containing only words originally in tagged text file.
     */
    public Markov() {
        this(DEFAULT_TRAINING_FILE);
        test_model();
    }

    /**
     *  Reads the given tagged text file, prints the lexicon statistics and trains
     *  the model. Unlike the no-argument constructor this does not run
     *  {@link #test_model()}, whose sentences only make sense for the bundled
     *  training text.
     *
     *  @param trainingFile path of a text file made of whitespace separated
     *                      <code>word/TAG</code> tokens
     *  @throws IllegalArgumentException if trainingFile is null
     *  @throws IllegalStateException if no tagged token could be read from the file
     */
    public Markov(String trainingFile) {
        if (trainingFile == null) {
            throw new IllegalArgumentException("trainingFile must not be null");
        }
        this.trainingFile = trainingFile;
        build_words_and_tags();
        print_statistics();
        train_model();
    }

    /**
     *  Read an input file of manually tagged text. Get a sequence of words and
     *  associated tags and also build sequences of unique words and unique tags.
     *
     *  Each line is echoed to standard output. Blank lines are ignored, tokens may be
     *  separated by any amount of whitespace, and a token without a '/' separator is
     *  reported on standard error and skipped instead of aborting the whole read.
     *  An I/O failure is reported with a stack trace; whatever was read before the
     *  failure is kept. This method appends to the instance state, so it is meant to
     *  be called once per instance.
     */
    public void build_words_and_tags() {
        // try-with-resources: the reader is closed even when reading fails.
        try (BufferedReader br = new BufferedReader(new FileReader(trainingFile))) {
            String line;
            while ((line = br.readLine()) != null) {
                p(line);
                line = line.trim();
                if (line.isEmpty()) {
                    continue;
                }
                for (String token : line.split("\\s+")) {
                    addTaggedToken(token);
                }
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
        uniqueTags.addAll(tags.keySet());
        uniqueWords.addAll(words.keySet());
        uniqueTagCount = uniqueTags.size();
        uniqueWordCount = uniqueWords.size();
    }

    /**
     *  Records one <code>word/TAG</code> token in the lexicon, the tag and word tables
     *  and the running word/tag sequences. The word is lower-cased; the text after the
     *  first '/' is taken as the tag.
     */
    private void addTaggedToken(String token) {
        int slash = token.indexOf('/');
        if (slash < 0) {
            System.err.println("Skipping token without a tag: " + token);
            return;
        }
        String key = token.substring(0, slash).toLowerCase(Locale.ROOT);
        String tag = token.substring(slash + 1);

        List<String> seenTags = lexicon.get(key);
        if (seenTags == null) {
            seenTags = new ArrayList<String>(5);
            lexicon.put(key, seenTags);
        }
        seenTags.add(tag);

        Integer oldCount = tags.get(tag);
        if (oldCount == null) {
            tags.put(tag, Integer.valueOf(1));
            tagCount++;
        } else {
            tags.put(tag, Integer.valueOf(oldCount.intValue() + 1));
        }
        if (!words.containsKey(key)) {
            words.put(key, Integer.valueOf(wordCount++));
        }
        wordList.add(key);
        tagList.add(tag);
    }

    /**
     *   For debug only: print out every word with the tags it was seen with, followed by
     *   the number of unique words and unique tags.
     */
    public void print_statistics() {
        int word_count = 0;
        for (Map.Entry<String, List<String>> entry : lexicon.entrySet()) {
            word_count++;
            p0(entry.getKey() + ":");
            for (String s : entry.getValue()) {
                p0(" " + s);
            }
            p("");
        }
        p("\ntotal number of unique words is " + word_count + "\n");
        p("wordCount=" + wordCount + ", tagCount=" + tagCount);
    }

    /**
     *  This is for debug only: print out training matrices in a CSV type format
     *  so that the matrices can be examined in a spreadsheet program for debugging purposes.
     *
     *  @param rowNames one label per row of buf
     *  @param colNames labels written as the title row
     *  @param buf      matrix to dump
     *  @param fileName output path without extension; ".txt" is appended
     */
    private void WriteCSVfile(List<String> rowNames, List<String> colNames, float[][] buf, String fileName) {
        p("tagList.size()="+tagList.size());
        try (PrintWriter bw = new PrintWriter(new BufferedWriter(new FileWriter(fileName + ".txt")))) {
            // title row: an empty first cell followed by the column names
            StringBuilder sb = new StringBuilder(500);
            for (String colName : colNames) {
                sb.append(',').append(colName);
            }
            bw.println(sb.toString());
            // one line per matrix row: row name, then the values
            for (int i = 0; i < buf.length; i++) {
                sb.setLength(0);
                sb.append(rowNames.get(i));
                for (int j = 0; j < buf[i].length; j++) {
                    sb.append(',').append(buf[i][j]);
                }
                bw.println(sb.toString());
            }
            // PrintWriter never throws on write errors, so ask it explicitly.
            if (bw.checkError()) {
                System.err.println("Error while writing " + fileName + ".txt");
            }
        } catch (IOException ioe) {
            ioe.printStackTrace();
        }
    }

    /**
     *  This is for book production only: print out training matrices as a LaTeX
     *  table (a table* environment wrapping a tabular) so that the matrices can be
     *  inserted into a manuscript. The caption and label are written as placeholders
     *  to be edited by hand. The last SKIP rows and the last SKIP columns of the
     *  matrix are left out of the table; values are printed with two decimals.
     *
     *  @param rowNames one label per row of buf
     *  @param colNames labels written as the header row
     *  @param buf      matrix to dump
     *  @param fileName output path without extension; ".latex" is appended
     */
    private void WriteLatexFile(List<String> rowNames, List<String> colNames, float[][] buf, String fileName) {
        p("tagList.size()="+tagList.size());
        final int SKIP = 6;
        try (PrintWriter bw = new PrintWriter(new BufferedWriter(new FileWriter(fileName + ".latex")))) {
            int size = colNames.size() - SKIP;
            bw.print("\\begin{table*}[htdp]\n\\caption{ADD CAPTION}\\centering\\begin{tabular}{|");
            // one extra column for the row labels
            for (int i = 0; i < size+1; i++) {
                bw.print("l|");
            }
            bw.println("}\n\\hline");
            bw.print(" &");
            for (int i = 0; i < size; i++) {
                bw.print("\\emph{" + colNames.get(i)+"}");
                if (i < (size-1)) {
                    bw.print("&");
                }
            }
            bw.println("\\\\\n\\hline");

            // loop on remaining rows:
            for (int i = 0, size3 = buf.length - SKIP; i < size3; i++) {
                bw.print(rowNames.get(i)+"&");
                for (int j = 0, size2 = buf[i].length - SKIP; j < size2; j++) {
                    bw.printf("%.2f",buf[i][j]);
                    if (j < (size2-1)) {
                        bw.print("&");
                    }
                }
                bw.println("\\\\");
            }
            bw.println("\\hline\n\\end{tabular}\n\\label{tab:CHANGE_THIS_LABEL}\n\\end{table*}");
            // PrintWriter never throws on write errors, so ask it explicitly.
            if (bw.checkError()) {
                System.err.println("Error while writing " + fileName + ".latex");
            }
        } catch (IOException ioe) {
            ioe.printStackTrace();
        }
    }

    /**
     *  Train a Markov model using manually tagged input text.
     *
     *  Fills tagToTagTransitionCount, probabilityTag1ToTag2 and probabilityWordGivenTag
     *  from the word/tag sequences collected by {@link #build_words_and_tags()} and
     *  dumps each matrix as CSV and LaTeX files.
     *
     *  @throws IllegalStateException if no training data has been loaded
     */
    public void train_model() {
        if (tagList.isEmpty()) {
            throw new IllegalStateException(
                "No training data loaded; build_words_and_tags() must read at least one tagged token first");
        }
        // name -> matrix index lookups, built once instead of List.indexOf() per token
        Map<String, Integer> tagIndex = buildIndex(uniqueTags);
        Map<String, Integer> wordIndex = buildIndex(uniqueWords);

        // start by filling in the tag to tag transition count matrix
        // (a new float array is already zero filled):
        tagToTagTransitionCount = new float[uniqueTagCount][uniqueTagCount];
        p("tagCount="+tagCount);
        p("uniqueTagCount="+uniqueTagCount);
        int previous = tagIndex.get(tagList.get(0)).intValue();
        for (int i = 1, size1 = tagList.size(); i < size1; i++) {
            int current = tagIndex.get(tagList.get(i)).intValue();
            tagToTagTransitionCount[previous][current]++;
            previous = current;
        }
        WriteCSVfile(uniqueTags, uniqueTags, tagToTagTransitionCount, "tag_to_tag");
        WriteLatexFile(uniqueTags, uniqueTags, tagToTagTransitionCount, "tag_to_tag");

        // now calculate the probabilities of transitioning from tagN to tagM;
        // the small constant keeps unseen transitions from having zero probability:
        probabilityTag1ToTag2 = new float[uniqueTagCount][uniqueTagCount];
        for (int i = 0; i < uniqueTagCount; i++) {
            int count = tags.get(uniqueTags.get(i)).intValue();
            p("tag: " + uniqueTags.get(i) + ", count="+count);
            for (int j = 0; j < uniqueTagCount; j++) {
                probabilityTag1ToTag2[i][j] = 0.0001f + tagToTagTransitionCount[i][j] / (float)count;
            }
        }
        WriteCSVfile(uniqueTags, uniqueTags, probabilityTag1ToTag2, "test_data/markov/prob_tag_to_tag");
        WriteLatexFile(uniqueTags, uniqueTags, probabilityTag1ToTag2, "test_data/markov/prob_tag_to_tag");

        // now calculate the probability of a word, given the tag it was seen with.
        // One pass over the training tokens counts every (word, tag) pair, including
        // the last token, so that the counts match the per-tag totals used below.
        probabilityWordGivenTag = new float[uniqueWordCount][uniqueTagCount];
        for (int n = 0, size = wordList.size(); n < size; n++) {
            int w = wordIndex.get(wordList.get(n)).intValue();
            int t = tagIndex.get(tagList.get(n)).intValue();
            probabilityWordGivenTag[w][t]++;
        }
        for (int j = 0; j < uniqueTagCount; j++) {
            float countTagOccurence = tags.get(uniqueTags.get(j)).intValue();
            for (int i = 0; i < uniqueWordCount; i++) {
                probabilityWordGivenTag[i][j] = probabilityWordGivenTag[i][j] / countTagOccurence;
            }
        }
        WriteLatexFile(uniqueWords, uniqueTags, probabilityWordGivenTag, "test_data/markov/prob_word_given_tag");
        WriteCSVfile(uniqueWords, uniqueTags, probabilityWordGivenTag, "test_data/markov/prob_word_given_tag");
    }

    /**
     *  Maps every name to the position of its first occurrence in the list,
     *  i.e. the same value List.indexOf() would return.
     */
    private static Map<String, Integer> buildIndex(List<String> names) {
        Map<String, Integer> index = new HashMap<String, Integer>();
        for (int i = 0, size = names.size(); i < size; i++) {
            if (!index.containsKey(names.get(i))) {
                index.put(names.get(i), Integer.valueOf(i));
            }
        }
        return index;
    }

    // data for exponential method of evaluating most probable tags for a sequence of words:
    int [] indices;                             // currently selected candidate tag for each word
    int [] counts;                              // number of candidate tags for each word
    ArrayList<ArrayList<String>> possibleTags;  // candidate tags for each word

    /**
     *  Increment the class variable indices[] to point to the next possible set of tags
     *  to check. Works like an odometer whose position i has counts[i] settings.
     *
     *  @param num number of positions (words) in use
     *  @return false once every combination has been visited
     */
    private boolean incrementIndices(int num) { // uses the global arrays indices and counts
        for (int i=0; i<num; i++) {
            if (indices[i] < (counts[i] - 1)) {
                indices[i] += 1;
                Arrays.fill(indices, 0, i, 0);   // reset all lower positions
                return true;
            }
        }
        return false;
    }

    /**
     *  For a sequence of words, values of class variable indices[], and class variable
     *  possibleTags, evaluate how well tags rate using equation 10.7 in [Manning/Schutze, 1999].
     *  The first word contributes a factor of 1; every later word contributes the
     *  transition probability from the previous tag times the probability of the word
     *  given its tag. Intermediate values are printed to standard output.
     *
     *  @param words words being tagged; each must be present in the training text
     *  @return product of the per-word factors for the currently selected tags
     */
    float score(List<String> words) { // uses global variables
        float s = 1.0f;
        int num = words.size();
        for (int i=0; i<num; i++) {
            System.out.println("words["+i+"]="+words.get(i));
            float prob_tag_i_given_tag_i_minus_1 = 1.0f;
            if (i > 0) {
                List<String> v0 = possibleTags.get(i - 1);
                List<String> v1 = possibleTags.get(i);
                int index1 = uniqueTags.indexOf(v0.get(indices[i - 1]));
                int index2 = uniqueTags.indexOf(v1.get(indices[i]));
                System.out.println("index1="+index1+"[tag: "+uniqueTags.get(index1)+"], index2="+index2
                            +"[tag: "+uniqueTags.get(index2)+"]");
                int index3 = uniqueWords.indexOf("" + words.get(i));
                float p = probabilityWordGivenTag[index3][index2];
                System.out.println("word: " + words.get(i) + ", p="+p);
                prob_tag_i_given_tag_i_minus_1 = probabilityTag1ToTag2[index1][index2] * p;
            }
            s *= prob_tag_i_given_tag_i_minus_1;
        }
        return s;
    }

    /**
     *  Use exponential runtime tagging algorithm (evaluates trained Markov model).
     *  NOTE: do not use this algorithm for long input word sequences - instead,
     *  break up long sequences of text into smaller pieces (i.e., process just
     *  a few sentences at a time). Every combination of candidate tags is scored,
     *  so the work grows with the product of the per-word tag counts.
     *
     *  @param words lower-case words to tag; each must occur in the training text
     *  @return the best scoring tag for each word, in input order
     *  @throws IllegalArgumentException if a word does not occur in the training text
     */
    public List<String> exponential_tagging_algorithm(List<String> words) {
        int num = words.size();
        possibleTags = new ArrayList<ArrayList<String>>(num);
        indices = new int[num];   // new int arrays start out zero filled
        counts = new int[num];
        for (int i=0; i<num; i++) {
            String word = String.valueOf(words.get(i));
            List<String> observed = lexicon.get(word);
            if (observed == null) {
                throw new IllegalArgumentException("Word not present in the training text: " + word);
            }
            // distinct tags seen for this word, in order of first appearance
            ArrayList<String> candidates = new ArrayList<String>();
            for (String tag : observed) {
                if (!candidates.contains(tag)) {
                    candidates.add(tag);
                }
            }
            counts[i] = candidates.size();
            possibleTags.add(candidates);
            System.out.print("^^ word: " + word + ", tag count: " + counts[i] + ", tags: ");
            for (String tag : candidates) {
                System.out.print(" " + tag);
            }
            System.out.println();
        }

        int [] best_indices = new int[num];
        float best_score = -9999;
        do {
            System.out.print("Current indices:");
            for (int k=0; k<num; k++) {
                System.out.print(" " + indices[k]);
            }
            System.out.println();
            float score = score(words);
            if (score > best_score) {
                best_score = score;
                System.out.println(" **  ** new best score: " + best_score);
                System.arraycopy(indices, 0, best_indices, 0, num);
            }
        } while (incrementIndices(num));

        List<String> bestTags = new ArrayList<String>(num);
        for (int i=0; i<num; i++) {
            bestTags.add(possibleTags.get(i).get(best_indices[i]));
        }
        return bestTags;
    }

    /**
     *   Throw away test method: tags three short sentences and prints every word
     *   with the tag chosen for it.
     */
    public void test_model() {
        List<String> words = new ArrayList<String>(Arrays.asList(
            ".",
            "the", "dog", "chased", "the", "cat", ".",
            "mary", "went", "to", "the", "river", ".",
            "john", "saw", "mary", "bank", "the", "airplane", "."));
        List<String> tags = exponential_tagging_algorithm(words);
        p("");
        // print every word/tag pair, including the last one
        for (int i=0; i<words.size(); i++) {
            p(""+words.get(i)+"\t: "+tags.get(i));
        }
    }

    /**
     *  Utility print method - with line feed
     */
    public void p(String s) {
        System.out.println(s);
    }

    /**
     *  Utility print method - no line feed
     */
    public void p0(String s) {
        System.out.print(s);
    }

    // Hashtable is kept on purpose: its iteration order determines the order of
    // uniqueTags/uniqueWords and therefore the row/column order of the dumped matrices.
    Map<String,List<String>> lexicon = new Hashtable<String,List<String>>(); // word -> every tag it was seen with (with repeats)
    Map<String, Integer> tags = new Hashtable<String, Integer>();            // tag -> number of occurrences
    Map<String, Integer> words = new Hashtable<String, Integer>();           // word -> order of first appearance
    List<String> uniqueTags = new ArrayList<String>();     // row/column order of the tag matrices
    List<String> uniqueWords = new ArrayList<String>();    // row order of probabilityWordGivenTag
    int uniqueTagCount;
    int uniqueWordCount;
    int tagCount = 0;                  // number of distinct tags in the training text
    int wordCount = 0;                 // number of distinct words in the training text
    List<String> wordList = new ArrayList<String>();    // from training text
    List<String> tagList = new ArrayList<String>();     // from training text
    float[][] tagToTagTransitionCount;      // [num_tag][num_tag]
    int[][] wordCountByTag;                 // never assigned in this class
    float [][]probabilityTag1ToTag2;        // [num_tag][num_tag]
    float [][]probabilityWordGivenTag;      // [num_word][num_tag]


}