/* This file is part of the Joshua Machine Translation System.
 *
 * Joshua is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free
 * Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
 * MA 02111-1307 USA
 */
package joshua.prefix_tree;

import java.io.File;
import java.io.IOException;
import java.io.PrintStream;
import java.io.PrintWriter;
import java.util.Date;
import java.util.HashMap;
import java.util.Map;
import java.util.Scanner;
import java.util.logging.Logger;

import joshua.corpus.Corpus;
import joshua.corpus.alignment.AlignmentGrids;
import joshua.corpus.alignment.Alignments;
import joshua.corpus.suffix_array.AbstractHierarchicalPhrases;
import joshua.corpus.suffix_array.HierarchicalPhrases;
import joshua.corpus.suffix_array.ParallelCorpusGrammarFactory;
import joshua.corpus.suffix_array.SuffixArrayFactory;
import joshua.corpus.suffix_array.Suffixes;
import joshua.corpus.vocab.Vocabulary;
import joshua.decoder.JoshuaConfiguration;
import joshua.util.FormatUtil;

/**
 * Command-line profiler for rule extraction.
 * <p>
 * Writes a small synthetic parallel corpus (one source sentence, one
 * target sentence and one alignment line, each repeated many times)
 * to temporary files, builds suffix arrays and alignment grids over
 * it, and then repeatedly adds the source sentence to a freshly
 * constructed {@link PrefixTree}, logging the elapsed time and
 * several hierarchical-phrase counters after every iteration.
 *
 * @author Lane Schwartz
 */
public class ExtractRuleProfiler {

	/** Logger for this class. */
	private static final Logger logger =
		Logger.getLogger(ExtractRuleProfiler.class.getName());

	/** Character encoding used to write and read every temporary file. */
	private static final String FILE_ENCODING = "UTF-8";

	/**
	 * Suffix for the temporary files. A fixed, filesystem-safe
	 * suffix is sufficient because File.createTempFile already
	 * generates a unique name.
	 */
	private static final String TEMP_FILE_SUFFIX = ".txt";

	/** Number of nanoseconds in one millisecond. */
	private static final long NANOS_PER_MILLI = 1000000L;

	/**
	 * Runs the profiler.
	 *
	 * @param args command-line arguments; not used
	 * @throws IOException if a temporary file cannot be created,
	 *         written or read
	 */
	public static void main(String[] args) throws IOException {

		// Tell System.out and System.err to use UTF8
		FormatUtil.useUTF8();

		logger.info("Starting up - current count is " + AbstractHierarchicalPhrases.counter);

		int trainingLines = 1000;

		String sourceCorpusString = "it makes him and it mars him , it sets him on yet it takes him off .";
		String sourceFileName =
			writeRepeatedLines("source", sourceCorpusString, trainingLines);

		String targetCorpusString = "das macht ihn und es besch\u00E4digt ihn , es setzt ihn auf und es f\u00FChrt ihn aus .";
		String targetFileName =
			writeRepeatedLines("target", targetCorpusString, trainingLines);

		String alignmentString = "0-0 1-1 2-2 3-3 4-4 5-5 6-6 7-7 8-8 9-9 10-10 11-11 12-12 13-13 14-14 15-15 16-16 17-17";
		String alignmentFileName =
			writeRepeatedLines("alignment", alignmentString, trainingLines);

		int maxCacheSize = 100000;

		Vocabulary sourceVocab = new Vocabulary();
		int[] sourceWordsSentences = Vocabulary.initializeVocabulary(sourceFileName, sourceVocab, true);
		int numSourceWords = sourceWordsSentences[0];
		int numSourceSentences = sourceWordsSentences[1];
		Corpus sourceCorpusArray = SuffixArrayFactory.createCorpusArray(sourceFileName, sourceVocab, numSourceWords, numSourceSentences);
		Suffixes sourceSuffixArray = SuffixArrayFactory.createSuffixArray(sourceCorpusArray, maxCacheSize);

		Vocabulary targetVocab = new Vocabulary();
		int[] targetWordsSentences = Vocabulary.initializeVocabulary(targetFileName, targetVocab, true);
		int numTargetWords = targetWordsSentences[0];
		int numTargetSentences = targetWordsSentences[1];
		Corpus targetCorpusArray = SuffixArrayFactory.createCorpusArray(targetFileName, targetVocab, numTargetWords, numTargetSentences);
		Suffixes targetSuffixArray = SuffixArrayFactory.createSuffixArray(targetCorpusArray, maxCacheSize);

		int trainingSize = sourceCorpusArray.getNumSentences();
		boolean requireTightSpans = true;

		// Read the alignments with the same encoding they were written in.
		Scanner alignmentScanner = new Scanner(new File(alignmentFileName), FILE_ENCODING);
		try {
			Alignments alignments = new AlignmentGrids(alignmentScanner, sourceCorpusArray, targetCorpusArray, trainingSize, requireTightSpans);

			// NOTE(review): ntVocab is never read afterwards. It is kept
			// because referencing PrefixTree.X here may initialize
			// PrefixTree before the timed loop starts - confirm before
			// removing.
			Map<Integer,String> ntVocab = new HashMap<Integer,String>();
			ntVocab.put(PrefixTree.X, "X");

			int ruleSampleSize = 300;
			int maxPhraseSpan = 10;
			int maxPhraseLength = 10;
			int minNonterminalSpan = 2;
			int maxNonterminals = 2;

			int[] words = sourceVocab.getIDs(sourceCorpusString);

			// Earlier revisions timed exactly two runs and asserted
			// time2 < time1; this version only logs each timing.
			int numIterations = 5;
			long[] times = new long[numIterations];

			for (int i=0; i<numIterations; i++) {

				logger.info("Extracting rules for sentence " + (i+1) + ".");

				// nanoTime is meant for measuring elapsed time and is
				// unaffected by wall-clock adjustments.
				long startNanos = System.nanoTime();
				{
					ParallelCorpusGrammarFactory parallelCorpus = new ParallelCorpusGrammarFactory(sourceSuffixArray, targetSuffixArray, alignments, null, ruleSampleSize, maxPhraseSpan, maxPhraseLength, maxNonterminals, minNonterminalSpan, Float.MIN_VALUE, JoshuaConfiguration.phrase_owner, JoshuaConfiguration.default_non_terminal, JoshuaConfiguration.oovFeatureCost);

					PrefixTree prefixTree = new PrefixTree(parallelCorpus);
					prefixTree.sentenceInitialX = true;
					prefixTree.sentenceFinalX = true;
					prefixTree.edgeXMayViolatePhraseSpan = true;
					prefixTree.add(words);
				}
				long endNanos = System.nanoTime();

				logger.info("Cached HPs: " + sourceSuffixArray.getCachedHierarchicalPhrases().size());
				logger.info("Current count is " + AbstractHierarchicalPhrases.counter);
				logger.info("HP Constructor counts: " + HierarchicalPhrases.publicCounter + ", " + HierarchicalPhrases.protectedCounter + "," + HierarchicalPhrases.privateCounter + "," + HierarchicalPhrases.emptyListCounter);

				// Reported in milliseconds, as before.
				times[i] = (endNanos - startNanos) / NANOS_PER_MILLI;
			}

			for (long time : times) {
				logger.info("Time == " + time);
			}

		} finally {
			// Closed only after all work is done: it is not visible here
			// whether AlignmentGrids consumes the scanner eagerly.
			alignmentScanner.close();
		}
	}

	/**
	 * Creates a temporary file containing the given line repeated
	 * a fixed number of times, one copy per line, encoded as UTF-8.
	 * The file is registered for deletion when the JVM exits.
	 *
	 * @param prefix prefix for the temporary file name; must be
	 *        at least three characters long
	 * @param line text to write on every line
	 * @param copies number of times to write the line
	 * @return absolute path of the file that was written
	 * @throws IOException if the file cannot be created or an
	 *         error occurs while writing to it
	 */
	private static String writeRepeatedLines(String prefix, String line, int copies) throws IOException {

		File file = File.createTempFile(prefix, TEMP_FILE_SUFFIX);
		file.deleteOnExit();

		PrintWriter writer = new PrintWriter(file, FILE_ENCODING);
		try {
			for (int i=0; i<copies; i++) {
				writer.println(line);
			}
			// PrintWriter swallows I/O errors; surface them explicitly.
			if (writer.checkError()) {
				throw new IOException("Error writing temporary file " + file.getAbsolutePath());
			}
		} finally {
			writer.close();
		}

		return file.getAbsolutePath();
	}
}