Recorder.java example

Explorer
Syntactic-master
- src
  - main
    - java
      - syntaxLearner
        Cluster.java
        ClusterContext.java
        Learner.java
        LearnerMain.java
        Recorder.java
        UI
        Console.java
        Report.java
        corpus
        Context.java
        Corpus.java
        Vocabulary.java
        VocabularyContext.java
        Word.java
        source
        CorpusSource.java
        PlainTextFile.java
        WikiDump.java
package syntaxLearner;

import java.io.BufferedWriter;
import java.io.File;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.nio.charset.Charset;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;

import syntaxLearner.corpus.Corpus;
import syntaxLearner.corpus.Vocabulary;
import syntaxLearner.corpus.Word;
/**
 * A library that records the algorithm in JSON for a standard interpreter
 * @author Omer Shapira
 *
 */
public class Recorder {
	/** Encoding of every generated file, fixed so the output does not depend on the platform default. */
	private static final Charset OUTPUT_CHARSET = Charset.forName("UTF-8");

	private final File mainPath;
	private File currentPath;
	private File wordPath;
	private final String name;
	private final boolean isRecording;
	private int iterationCounter = 0;
	private File corpusFile;
	private final Learner l;
	private File iterationFile;
	private Corpus c;

	/**
	 * Creates a recorder that writes its files below {@code mainPath}.
	 *
	 * @param l           learner whose iteration count and cluster count are read while recording
	 * @param mainPath    directory that receives the corpus file and one sub-directory per iteration
	 * @param name        corpus name; it is embedded in every file name and JavaScript variable name
	 * @param isRecording when {@code false} every {@code record...} method returns immediately
	 */
	public Recorder(Learner l, File mainPath,  String name, boolean isRecording){
		this.mainPath = mainPath;
		this.name = name;
		this.isRecording = isRecording;
		this.l=l;
	}

	/**
	 * Writes the corpus summary to {@code corpus_<name>.js} inside the main path and
	 * remembers {@code c} for later calls to {@link #recordWordInfo}.
	 * <p>
	 * Layout of the generated file:
	 * <pre>
	 * corpus_%corpusName% =
	 * {
	 *     name: "...",
	 *     tokenCount: 500,
	 *     typeCount: 1000,
	 *     commonTypes: 900,
	 *     clusterCount: 100,
	 *     identityEps: 0.00004,
	 *     iterationCount: 7,
	 *     typeToId: { "typeName": 0, "typeName2": 1, ... },
	 *     idToType: { 1: "word", 4: "bleh", ... }
	 * };
	 * </pre>
	 * Both maps list only words whose frequency is at least
	 * {@code l.RARE_WORD_THRESHOLD}. {@code iterationCount} is the value this recorder
	 * holds at the time of the call. Words are written as escaped string literals.
	 * An I/O failure is reported on {@code System.err} and does not propagate.
	 *
	 * @param c corpus to describe
	 * @param l learner supplying the thresholds and cluster count written to the file
	 */
	public void recordCorpusData (Corpus c, Learner l){
		if (!isRecording) return;
		this.c=c;
		Vocabulary v = c.getVocabulary();
		Set<Map.Entry<String, Integer>> entrySet = v.getWordIndicesEntrySet();
		Set<Map.Entry<Integer, Word>> wordSet = v.getWordEntrySet();
		StringBuilder s = new StringBuilder(v.getNumOfWords()*30);
		corpusFile = new File(mainPath, "corpus_"+name+".js");

		s.append(String.format("corpus_%1$s = \n{\n\tname:\"%1$s\",\n", name));
		s.append(String.format("\ttokenCount: %1$s,\n", c.tokenCount));
		s.append(String.format("\ttypeCount: %1$s,\n", v.getNumOfWords()));
		s.append(String.format("\tcommonTypes: %1$s,\n", (v.getNumOfWords()-v.countWordsBelowThreshold(l.RARE_WORD_THRESHOLD))));
		s.append(String.format("\tclusterCount: %1$s,\n", l.NUMBER_OF_CLUSTERS));
		//TODO add proper number formatters
		s.append(String.format("\tidentityEps: %1$s,\n", l.IDENTITY_EPSILON));
		s.append(String.format("\titerationCount: %1$s,\n", iterationCounter));
		s.append("\ttypeToId: \n\t{\n\t");
		for (Map.Entry<String, Integer> e: entrySet){
			if (v.getWord(e.getValue()).frequency >= l.RARE_WORD_THRESHOLD){ //TODO see if necessary
				// quote() escapes the word so quotes or backslashes in a token cannot break the literal
				s.append(String.format("\t\t%1$s: %2$s,\n", quote(e.getKey()), e.getValue()));
			}
		}
		s.append("},\n");
		s.append("\tidToType: {");
		for (Map.Entry<Integer, Word> e: wordSet){
			// Rare words are skipped here as well, mirroring typeToId above
			if (e.getValue().frequency >= l.RARE_WORD_THRESHOLD){
				s.append(String.format("\t\t%1$s: %2$s,\n",e.getKey(), quote(e.getValue().name)));
			}
		}
		s.append("}");
		s.append("\n};");

		writeFile(corpusFile, s);
	}

	/**
	 * Advances {@code iterationCounter} to the learner's iteration count when the
	 * learner is ahead; the counter never moves backwards.
	 */
	private void updateIterationData(){
		if (iterationCounter<l.getIterationCount()){
			iterationCounter = l.getIterationCount();
			//TODO Write iteration number to file
		}
	}

	/**
	 * Starts recording a new iteration: syncs the iteration counter with the learner,
	 * creates {@code <mainPath>/<iteration>/words} and writes
	 * {@code iteration_<name>_<iteration>.js} into the iteration directory.
	 * <p>
	 * Layout of the generated file:
	 * <pre>
	 * iteration_%corpusName%_%iterationNumber% = {
	 *     iteration_number : 2,
	 *     unsorted_words: ["a", "b", ...]
	 * }
	 * </pre>
	 * An I/O failure is reported on {@code System.err} and does not propagate.
	 *
	 * @param words text written verbatim as the value of {@code unsorted_words};
	 *              the caller is responsible for it being a valid JavaScript expression
	 */
	public void recordNewIteration(String words){
		if (!isRecording) return;
		updateIterationData();
		currentPath = new File(mainPath, Integer.toString(iterationCounter));
		wordPath = new File(currentPath, "words");
		// mkdirs() also creates currentPath and any missing ancestor such as mainPath
		if (!wordPath.mkdirs() && !wordPath.isDirectory()){
			System.err.println("Recorder: could not create directory "+wordPath);
		}
		iterationFile = new File(currentPath,
				String.format("iteration_%1$s_%2$s.js",name,iterationCounter));
		StringBuilder s = new StringBuilder();
		s.append(String.format("iteration_%1$s_%2$s = {\n",name,iterationCounter));
		s.append(String.format("\titeration_number : %1$s,\n",iterationCounter));
		s.append("\tunsorted_words: ");
		s.append(words);
		s.append("\n}");
		writeFile(iterationFile, s);
	}

	/**
	 * Writes one cluster to {@code cluster_<name>_<iteration>_<clusterId>.js} in the
	 * current iteration directory (the one set by the last {@link #recordNewIteration}).
	 * <p>
	 * Layout of the generated file:
	 * <pre>
	 * cluster_%corpusName%_%iterationNumber%_%clusterIndex% =
	 * {
	 *     index: 30,
	 *     words: [ 8, 9, ... ],
	 *     distribution: [ 0.100000, 0.100000, ... ]
	 * }
	 * </pre>
	 * NOTE(review): all rows of {@code c.clusterDistribution()} are written into ONE flat
	 * array, although earlier notes in this file sketched an array of arrays. The flat
	 * form is kept so existing readers of these files see no change; confirm which form
	 * the interpreter expects.
	 * <p>
	 * Empty word lists and empty distributions are written as {@code [ ]}. Numbers always
	 * use {@code '.'} as the decimal separator. An I/O failure is reported on
	 * {@code System.err} and does not propagate.
	 *
	 * @param c cluster to record
	 */
	public void recordClusterInfo(Cluster c){
		if (!isRecording) return;
		StringBuilder s = new StringBuilder();
		String filename = String.format("cluster_%1$s_%2$s_%3$s", name, iterationCounter,c.ID);
		s.append(filename+" =\n{\n");
		s.append(String.format("\tindex: %1$s,\n", c.ID));
		s.append("\twords: [");
		// The comma is written BEFORE every element but the first, so an empty
		// list never needs (and never wrongly removes) a trailing character.
		boolean first = true;
		for (int i: c.words){
			if (!first) s.append(',');
			s.append(' ').append(i);
			first = false;
		}
		s.append(" ],\n");
		s.append("\tdistribution: [");
		first = true;
		for (double[] row : c.clusterDistribution()){
			for (double value: row){
				if (!first) s.append(',');
				s.append(' ').append(formatNumber(value));
				first = false;
			}
		}
		s.append(" ]\n}");
		writeFile(new File (currentPath, filename+".js"), s);
	}

	/**
	 * Writes one word to {@code word_<name>_<iteration>_<wordId>.js} in the {@code words}
	 * directory of the current iteration.
	 * <p>
	 * Layout of the generated file:
	 * <pre>
	 * word_%corpusName%_%iterationNumber%_%wordIndex% =
	 * {
	 *     word: "..",
	 *     wordIndex: 5,
	 *     freq: 80,
	 *     rank: 150,
	 *     distribution: ...,
	 *     dist_to_cluster : [ 0.100000, ... ]
	 * }
	 * </pre>
	 * {@code dist_to_cluster} has one slot per cluster ({@code l.NUMBER_OF_CLUSTERS});
	 * slots for clusters missing from {@code distances} stay {@code 0.0}. Because
	 * {@code distances} is keyed by distance, it can hold only one cluster per distinct
	 * distance value.
	 * <p>
	 * An I/O failure is reported on {@code System.err} and does not propagate.
	 *
	 * @param w            word to record
	 * @param distances    map from distance to cluster index
	 * @param distribution text written verbatim as the value of {@code distribution}
	 * @throws IllegalStateException if {@link #recordCorpusData} has not been called yet,
	 *                               since the word's rank is looked up in that corpus
	 */
	public void recordWordInfo(Word w, TreeMap<Double, Short> distances, StringBuilder distribution){
		if (!isRecording) return;
		if (c == null){
			throw new IllegalStateException("recordCorpusData must be called before recordWordInfo");
		}
		double[] distanceArray = new double[l.NUMBER_OF_CLUSTERS];
		for (Map.Entry<Double, Short> e : distances.entrySet()){
			assert (e.getValue()<distanceArray.length);
			distanceArray[e.getValue()]=e.getKey();
		}
		String filename = String.format("word_%1$s_%2$s_%3$s", name, iterationCounter, w.ID);
		StringBuilder s = new StringBuilder();
		s.append(filename+" =\n{\n");
		s.append(String.format("\tword: %1$s,\n\twordIndex: %2$s,\n", quote(w.name), w.ID));
		s.append(String.format("\tfreq: %1$s,\n", w.frequency));
		s.append(String.format("\trank: %1$s,\n", c.getVocabulary().getRank(w.ID)));
		s.append("\tdistribution: ");
		s.append(distribution);
		s.append(",\n\tdist_to_cluster : [");
		boolean first = true;
		for (double d: distanceArray){
			if (!first) s.append(',');
			s.append(' ').append(formatNumber(d));
			first = false;
		}
		s.append(" ]\n}");
		writeFile(new File(wordPath, filename+".js"), s);
	}

	/**
	 * Formats a number for the generated JavaScript. The root locale is used so the
	 * decimal separator is always {@code '.'}, whatever the JVM's default locale is.
	 */
	private static String formatNumber(double value){
		return String.format(Locale.ROOT, "%1$-4f", value);
	}

	/**
	 * Returns {@code value} as a double-quoted JavaScript string literal. Double quotes,
	 * backslashes, control characters and the Unicode line/paragraph separators are
	 * escaped; every other character is copied unchanged.
	 */
	private static String quote(Object value){
		String text = String.valueOf(value);
		StringBuilder quoted = new StringBuilder(text.length()+2);
		quoted.append('"');
		for (int i=0; i<text.length(); i++){
			char ch = text.charAt(i);
			switch (ch){
			case '"':
				quoted.append("\\\"");
				break;
			case '\\':
				quoted.append("\\\\");
				break;
			case '\n':
				quoted.append("\\n");
				break;
			case '\r':
				quoted.append("\\r");
				break;
			case '\t':
				quoted.append("\\t");
				break;
			default:
				if (ch < 0x20 || ch == 0x2028 || ch == 0x2029){
					// Not allowed raw inside a string literal: emit a four-digit hex escape
					quoted.append(String.format(Locale.ROOT, "\\u%04x", (int) ch));
				} else {
					quoted.append(ch);
				}
				break;
			}
		}
		quoted.append('"');
		return quoted.toString();
	}

	/**
	 * Writes {@code content} to {@code target} as UTF-8, replacing any existing file and
	 * creating missing parent directories. The writer is closed even when writing fails.
	 * Recording is best-effort: a failure is reported on {@code System.err} and swallowed
	 * so that it cannot abort the learning run.
	 */
	private void writeFile(File target, CharSequence content){
		try {
			File parent = target.getParentFile();
			if (parent != null && !parent.isDirectory() && !parent.mkdirs() && !parent.isDirectory()){
				throw new IOException("Could not create directory "+parent);
			}
			BufferedWriter out = new BufferedWriter(
					new OutputStreamWriter(new FileOutputStream(target), OUTPUT_CHARSET));
			try {
				out.append(content);
			} finally {
				out.close();
			}
		} catch (IOException e) {
			System.err.println("Recorder: could not write "+target);
			e.printStackTrace();
		}
	}
}