LMAdaptingRuleExtractor.java example

Explorer
relax-decode-master
- third-party
/* This file is part of the Joshua Machine Translation System.
 * 
 * Joshua is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free
 * Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
 * MA 02111-1307 USA
 */
package joshua.prefix_tree;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Map;

import joshua.corpus.Corpus;
import joshua.corpus.Phrase;
import joshua.corpus.alignment.Alignments;
import joshua.corpus.lexprob.LexicalProbabilities;
import joshua.corpus.suffix_array.HierarchicalPhrase;
import joshua.corpus.suffix_array.Pattern;
import joshua.corpus.suffix_array.Suffixes;
import joshua.corpus.vocab.SymbolTable;
import joshua.corpus.vocab.Vocabulary;
import joshua.decoder.JoshuaConfiguration;
import joshua.decoder.ff.FeatureFunction;
import joshua.decoder.ff.lm.NGramLanguageModel;
import joshua.decoder.ff.lm.buildin_lm.LMGrammarJAVA;

/**
 * Hierarchical rule extractor that additionally computes one weight
 * per sentence of the source corpus from two n-gram language models.
 * <p>
 * For each sentence, the weight is the value returned by the "test"
 * language model's {@code ngramLogProbability} minus the value
 * returned by the "large" language model's {@code ngramLogProbability}
 * for the same word IDs.
 *
 * @author Lane Schwartz
 */
public class LMAdaptingRuleExtractor extends HierarchicalRuleExtractor {

	/**
	 * Per-sentence weights, indexed by sentence number
	 * in the corpus of the source suffix array.
	 */
	final float[] weights;
	
	/**
	 * Constructs a rule extractor for 
	 * Hiero-style hierarchical phrase-based translation,
	 * and computes a language-model-based weight 
	 * for every sentence in the source corpus.
	 * 
	 * @param largeArpaLM        Identifies the large language model;
	 *                           handed to {@link LMGrammarJAVA} 
	 *                           (presumably the path of an ARPA file)
	 * @param testArpaLM         Identifies the test language model;
	 *                           handed to {@link LMGrammarJAVA} 
	 *                           (presumably the path of an ARPA file)
	 * @param lmOrder            N-gram order used when constructing
	 *                           both language models
	 * @param suffixArray        Suffix array representing the 
	 *                           source language corpus
	 * @param targetSuffixArray  Target-side suffix array, forwarded 
	 *                           unchanged to the superclass constructor
	 * @param alignments         Represents alignments between words in the 
	 *                           source corpus and the target corpus 
	 * @param lexProbs           Lexical translation probability table
	 * @param models             Feature functions, forwarded unchanged
	 *                           to the superclass constructor
	 * @param sampleSize         Specifies the maximum number of rules 
	 *                           that will be extracted for any source pattern
	 * @param maxPhraseSpan      Max span in the source corpus of any 
	 *                           extracted hierarchical phrase
	 * @param maxPhraseLength    Maximum number of terminals plus nonterminals
	 *                           allowed in any extracted hierarchical phrase
	 * @param minNonterminalSpan Minimum span in the source corpus of any 
	 *                           nonterminal in an extracted hierarchical 
	 *                           phrase
	 * @param maxNonterminalSpan Maximum span in the source corpus of any 
	 *                           nonterminal in an extracted hierarchical 
	 *                           phrase
	 * @throws IOException declared by this constructor; presumably raised
	 *                     when a language model cannot be read
	 */
	public LMAdaptingRuleExtractor(
			String largeArpaLM, String testArpaLM, int lmOrder,
			Suffixes suffixArray, 
			Suffixes targetSuffixArray, 
			Alignments alignments, 
			LexicalProbabilities lexProbs, 
			ArrayList<FeatureFunction> models,
			int sampleSize, 
			int maxPhraseSpan, 
			int maxPhraseLength, 
			int minNonterminalSpan, 
			int maxNonterminalSpan) throws IOException {
		
		super(suffixArray, targetSuffixArray, alignments, lexProbs, models, 
				sampleSize, maxPhraseSpan, maxPhraseLength, 
				minNonterminalSpan, maxNonterminalSpan);
		
		// NOTE(review): both language models are loaded against this
		// freshly created vocabulary, whereas the word IDs scored below
		// come from the corpus. Confirm that the two share one ID space.
		SymbolTable lmSymbols = new Vocabulary();
		
		Corpus sourceCorpus = suffixArray.getCorpus();
		
		// The large model is loaded before the test model; 
		// keep this order, since both share the same symbol table.
		NGramLanguageModel largeModel = new LMGrammarJAVA(
				lmSymbols, lmOrder, largeArpaLM,
				JoshuaConfiguration.use_left_equivalent_state,
				JoshuaConfiguration.use_right_equivalent_state);
		
		NGramLanguageModel testModel = new LMGrammarJAVA(
				lmSymbols, lmOrder, testArpaLM,
				JoshuaConfiguration.use_left_equivalent_state,
				JoshuaConfiguration.use_right_equivalent_state);
		
		this.weights = scoreSentences(sourceCorpus, largeModel, testModel);
	}
	
	/**
	 * Computes, for every sentence in the corpus, 
	 * the test model's score minus the large model's score.
	 * 
	 * @param corpus     Corpus whose sentences are to be scored
	 * @param largeModel Language model whose score is subtracted
	 * @param testModel  Language model whose score is subtracted from
	 * @return array holding one weight per sentence, in corpus order
	 */
	private static float[] scoreSentences(
			Corpus corpus, 
			NGramLanguageModel largeModel, 
			NGramLanguageModel testModel) {
		
		int sentenceCount = corpus.getNumSentences();
		float[] result = new float[sentenceCount];
		
		for (int index = 0; index < sentenceCount; index++) {
			int[] wordIDs = corpus.getSentence(index).getWordIDs();
			
			// Score with the large model first, then the test model;
			// the difference is taken in double precision before narrowing.
			double largeScore = largeModel.ngramLogProbability(wordIDs);
			double testScore = testModel.ngramLogProbability(wordIDs);
			
			result[index] = (float) (testScore - largeScore);
		}
		
		return result;
	}
	
	/**
	 * Returns the feature values computed by the superclass, unchanged.
	 * <p>
	 * The per-sentence weights held by this object 
	 * are not consulted by this method.
	 */
	@Override
	protected float[] calculateFeatureValues(
			Pattern sourcePattern, 
			int sourcePatternCount, 
			HierarchicalPhrase translation, 
			Map<Pattern,Integer> counts, 
			float totalTranslationCount) {
		
		return super.calculateFeatureValues(
				sourcePattern, sourcePatternCount, 
				translation, counts, totalTranslationCount);
	}
	
}