ExtractRuleProfiler.java example

Explorer
relax-decode-master
- third-party
/* This file is part of the Joshua Machine Translation System.
 * 
 * Joshua is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free
 * Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
 * MA 02111-1307 USA
 */
package joshua.prefix_tree;

import java.io.File;
import java.io.IOException;
import java.io.PrintStream;
import java.io.PrintWriter;
import java.util.Date;
import java.util.HashMap;
import java.util.Map;
import java.util.Scanner;
import java.util.logging.Logger;

import joshua.corpus.Corpus;
import joshua.corpus.alignment.AlignmentGrids;
import joshua.corpus.alignment.Alignments;
import joshua.corpus.suffix_array.AbstractHierarchicalPhrases;
import joshua.corpus.suffix_array.HierarchicalPhrases;
import joshua.corpus.suffix_array.ParallelCorpusGrammarFactory;
import joshua.corpus.suffix_array.SuffixArrayFactory;
import joshua.corpus.suffix_array.Suffixes;
import joshua.corpus.vocab.Vocabulary;
import joshua.decoder.JoshuaConfiguration;
import joshua.util.FormatUtil;

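/**
 * Stand-alone profiling driver for hierarchical rule extraction.
 * <p>
 * Writes a small synthetic aligned parallel corpus to temporary
 * files, builds suffix arrays over it, and then repeatedly adds the
 * same source sentence to a fresh prefix tree, logging the elapsed
 * wall-clock time of every repetition.
 *
 * @author Lane Schwartz
 */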
public class ExtractRuleProfiler {

	/** Logger for this class. */
	private static final Logger logger =
		Logger.getLogger(ExtractRuleProfiler.class.getName());

	/** Character encoding used to write and read every temporary corpus file. */
	private static final String ENCODING = "UTF-8";

	/**
	 * Builds a synthetic parallel corpus, then times several
	 * repetitions of adding one source sentence to a new prefix tree.
	 * <p>
	 * The corpus consists of one source sentence, one target sentence
	 * and one monotone word alignment, each repeated
	 * <code>trainingLines</code> times in its own temporary file.
	 * The elapsed time of every repetition is logged at the end.
	 * <p>
	 * An earlier version of this profiler carried a (disabled)
	 * assertion that the second extraction was faster than the first;
	 * the per-iteration times logged here allow that to be checked
	 * by eye.
	 *
	 * @param args command-line arguments; not read
	 * @throws IOException if a temporary file cannot be created,
	 *                     written, or opened for reading
	 */
	public static void main(String[] args) throws IOException {

		// Tell System.out and System.err to use UTF8
		FormatUtil.useUTF8();

		logger.info("Starting up - current count is " + AbstractHierarchicalPhrases.counter);

		int trainingLines = 1000;

		String sourceCorpusString = 
			"it makes him and it mars him , it sets him on yet it takes him off .";

		String targetCorpusString = 
			"das macht ihn und es besch\u00E4digt ihn , es setzt ihn auf und es f\u00FChrt ihn aus .";

		// One link per token position: both sentences above have 18 tokens.
		String alignmentString = 
			"0-0 1-1 2-2 3-3 4-4 5-5 6-6 7-7 8-8 9-9 10-10 11-11 12-12 13-13 14-14 15-15 16-16 17-17";

		String sourceFileName = writeRepeatedLines("source", sourceCorpusString, trainingLines);
		String targetFileName = writeRepeatedLines("target", targetCorpusString, trainingLines);
		String alignmentFileName = writeRepeatedLines("alignment", alignmentString, trainingLines);

		int maxCacheSize = 100000;//12566;

		// initializeVocabulary returns a two-element array that is read
		// below as { number of words, number of sentences }.
		Vocabulary sourceVocab = new Vocabulary();
		int[] sourceWordsSentences = Vocabulary.initializeVocabulary(sourceFileName, sourceVocab, true);
		int numSourceWords = sourceWordsSentences[0];
		int numSourceSentences = sourceWordsSentences[1];

		Corpus sourceCorpusArray = SuffixArrayFactory.createCorpusArray(sourceFileName, sourceVocab, numSourceWords, numSourceSentences);
		Suffixes sourceSuffixArray = SuffixArrayFactory.createSuffixArray(sourceCorpusArray, maxCacheSize);

		Vocabulary targetVocab = new Vocabulary();
		int[] targetWordsSentences = Vocabulary.initializeVocabulary(targetFileName, targetVocab, true);
		int numTargetWords = targetWordsSentences[0];
		int numTargetSentences = targetWordsSentences[1];

		Corpus targetCorpusArray = SuffixArrayFactory.createCorpusArray(targetFileName, targetVocab, numTargetWords, numTargetSentences);
		Suffixes targetSuffixArray = SuffixArrayFactory.createSuffixArray(targetCorpusArray, maxCacheSize);

		int trainingSize = sourceCorpusArray.getNumSentences();
		boolean requireTightSpans = true;

		// The scanner is kept open until all work is finished and is
		// closed in the finally block, so the file handle is released
		// whether or not AlignmentGrids consumes its input eagerly.
		Scanner alignmentScanner = new Scanner(new File(alignmentFileName), ENCODING);
		try {
			Alignments alignments = new AlignmentGrids(alignmentScanner, sourceCorpusArray, targetCorpusArray, trainingSize, requireTightSpans);

			// Not read anywhere below; retained so that this code keeps
			// touching PrefixTree at the same point as before.
			// NOTE(review): candidate for removal.
			Map<Integer,String> ntVocab = new HashMap<Integer,String>();
			ntVocab.put(PrefixTree.X, "X");

			int ruleSampleSize = 300;
			int maxPhraseSpan = 10;
			int maxPhraseLength = 10;
			int minNonterminalSpan = 2;
			int maxNonterminals = 2;

			int[] words = sourceVocab.getIDs(sourceCorpusString);

			int numIterations = 5;
			long[] times = new long[numIterations];

			for (int i=0; i<numIterations; i++) {
				logger.info("Extracting rules for sentence " + (i+1) + ".");
				long startTime = System.currentTimeMillis();
				{
					// A new factory and prefix tree are built on every
					// iteration; only the suffix arrays and alignments
					// are shared between iterations.
					ParallelCorpusGrammarFactory parallelCorpus = new ParallelCorpusGrammarFactory(sourceSuffixArray, targetSuffixArray, alignments, null, ruleSampleSize, maxPhraseSpan, maxPhraseLength, maxNonterminals, minNonterminalSpan, Float.MIN_VALUE, JoshuaConfiguration.phrase_owner, JoshuaConfiguration.default_non_terminal, JoshuaConfiguration.oovFeatureCost);

					PrefixTree prefixTree = new PrefixTree(parallelCorpus);

					prefixTree.sentenceInitialX = true;
					prefixTree.sentenceFinalX   = true;
					prefixTree.edgeXMayViolatePhraseSpan = true;
					prefixTree.add(words);
				}
				long endTime = System.currentTimeMillis();
				logger.info("Cached HPs: " + sourceSuffixArray.getCachedHierarchicalPhrases().size());
				logger.info("Current count is " + AbstractHierarchicalPhrases.counter);
				logger.info("HP Constructor counts: " + HierarchicalPhrases.publicCounter + ", " + HierarchicalPhrases.protectedCounter + "," + HierarchicalPhrases.privateCounter + "," + HierarchicalPhrases.emptyListCounter);

				times[i] = endTime - startTime;
			}

			for (long time : times) {
				logger.info("Time == " + time);
			}
		} finally {
			alignmentScanner.close();
		}
	}

	/**
	 * Creates a temporary UTF-8 text file containing the same line
	 * repeated a given number of times.
	 * <p>
	 * The file is registered for deletion when the JVM exits. A fixed
	 * <code>.txt</code> suffix is used so that the generated name
	 * contains no characters (such as the colons and spaces of a
	 * formatted date) that some file systems reject.
	 *
	 * @param prefix prefix of the temporary file name; must be at
	 *               least three characters long, as required by
	 *               {@link File#createTempFile(String, String)}
	 * @param line   text to write on every line
	 * @param count  number of times to write <code>line</code>
	 * @return absolute path of the file that was written
	 * @throws IOException if the file cannot be created or if an
	 *                     error occurs while writing to it
	 */
	private static String writeRepeatedLines(String prefix, String line, int count) throws IOException {
		File file = File.createTempFile(prefix, ".txt");
		file.deleteOnExit();

		boolean failed;
		PrintWriter writer = new PrintWriter(file, ENCODING);
		try {
			for (int i=0; i<count; i++) {
				writer.println(line);
			}
			// PrintWriter never throws on write errors; checkError()
			// flushes and reports whether one has occurred.
			failed = writer.checkError();
		} finally {
			writer.close();
		}

		if (failed) {
			throw new IOException("Failed to write temporary file " + file.getAbsolutePath());
		}
		return file.getAbsolutePath();
	}
}