/** * OpenKM, Open Document Management System (http://www.openkm.com) * Copyright (c) 2006-2011 Paco Avila & Josep Llort * * No bytes were intentionally harmed during the development of this application. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License along * with this program; if not, write to the Free Software Foundation, Inc., * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. */ package com.openkm.util.markov; import java.io.Reader; import java.util.HashMap; import java.util.LinkedList; import java.util.List; import java.util.Map; import java.util.Random; /** * A Markov chain for characters. For each set of prefix strings, * keeps track of possible next characters and the probability * of going to each. * * @author Lawrence Kesteloot * @author Paco Avila */ public class Markov { /** * Map from the prefix string (String) to list of characters (Chain). */ private Map<String, Chain> map; private String bootstrapPrefix; /** * Creates a chain based on the Reader with a prefix of * length "length". Reads the entire input stream and * creates the Markov chain. */ public Markov(Reader in, int length) throws java.io.IOException { map = new HashMap<String, Chain>(); CharQueue queue = new CharQueue(length); int c; for (int i = 0; i < length; i++) { c = in.read(); if (c == -1) { System.out.println("Input is too short"); return; } queue.put((char)c); } bootstrapPrefix = queue.toString(); // for collapsing whitespace boolean wasWhitespace = false; while ((c = in.read()) != -1) { if (Character.isWhitespace((char)c)) { if (wasWhitespace) { // collapse continuous whitespace continue; } c = ' '; wasWhitespace = true; } else { wasWhitespace = false; } String prefix = queue.toString(); Chain chain = map.get(prefix); if (chain == null) { chain = new Chain(prefix); map.put(prefix, chain); } chain.add((char)c); queue.put((char)c); } } /** * Returns the first "length" characters that were read. */ public String getBootstrapPrefix() { return bootstrapPrefix; } /** * Returns the next character to print given the prefix. * Returns -1 when there are no possible next characters. */ public int get(String prefix, Random random) { Chain chain = map.get(prefix); if (chain == null) { return -1; } int index = random.nextInt(chain.getTotal()); return chain.get(index); } /** * Prints the contents of the Markov graph. */ public void dump() { for (Chain chain : map.values()) { chain.dump(); } } /** * List of possible next characters and their probabilities. */ private static class Chain { private String prefix; private int total; private List<Link> list; public Chain(String prefix) { this.prefix = prefix; total = 0; list = new LinkedList<Link>(); } public int getTotal() { return total; } public char get(int index) { for (Link link : list) { int count = link.getCount(); if (index < count) { return link.getChar(); } index -= count; } // weird return '@'; } public void add(char c) { boolean found = false; for (Link link : list) { if (c == link.getChar()) { link.increment(); found = true; break; } } if (!found) { Link link = new Link(c); list.add(link); } total++; } public void dump() { System.out.println(prefix + ": (" + total + ")"); for (Link link: list) { System.out.println(" " + link.getChar() + " (" + link.getCount() + ")"); } } /** * Possible next character and the number of times we've seen it. */ private static class Link { private char c; private int count; public Link(char c) { this.c = c; count = 1; } public void increment() { count++; } public int getCount() { return count; } public char getChar() { return c; } } } }