// Subsampler.java example

// Explorer
// relax-decode-master
// - third-party
/* This file is part of the Joshua Machine Translation System.
 * 
 * Joshua is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free
 * Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
 * MA 02111-1307 USA
 */
/*
 * This file is based on the edu.umd.clip.mt.subsample.Subsampler
 * class from the University of Maryland's jmtTools project (in
 * conjunction with the umd-hadoop-mt-0.01 project). That project
 * is released under the terms of the Apache License 2.0, but with
 * special permission for the Joshua Machine Translation System to
 * release modifications under the LGPL version 2.1. LGPL version
 * 3 requires no special permission since it is compatible with
 * Apache License 2.0
 */
package joshua.subsample;

import joshua.corpus.Phrase;
import joshua.corpus.vocab.Vocabulary;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.util.ArrayList;
import java.util.List;
import java.util.HashMap;
import java.util.Map;
import java.util.HashSet;
import java.util.Set;


/**
 * A class for subsampling a large (F,E)-parallel sentence-aligned
 * corpus to generate a smaller corpus whose N-grams are relevant
 * to some seed corpus. The idea of subsampling owes to Kishore
 * Papineni.
 *
 * @author UMD (Jimmy Lin, Chris Dyer, et al.)
 * @author wren ng thornton <wren@users.sourceforge.net>
 * @version $LastChangedDate: 2009-05-27 20:14:28 -0500 (Wed, 27 May 2009) $
 */
public class Subsampler {
	/** Vocabulary handed to the {@link BiCorpusFactory} for the E side. */
	protected Vocabulary ve = new Vocabulary();
	
	/**
	 * Vocabulary handed to the {@link BiCorpusFactory} for the F
	 * side; also used when reading the test files.
	 */
	protected Vocabulary vf = new Vocabulary();
	
	/**
	 * Every n-gram collected from the test files, mapped to the
	 * number of times it has been credited to a selected training
	 * sentence so far. Entries start at zero.
	 */
	protected Map<Phrase,Integer> ngramCounts;
	
	/**
	 * Argument passed to {@code Phrase.getSubPhrases}; presumably
	 * the maximum n-gram length (confirm against Phrase).
	 */
	protected int maxN;
	
	/**
	 * Once an n-gram's count reaches this value it no longer causes
	 * sentences containing it to be selected.
	 */
	protected int targetCount;
	
	/**
	 * Soft cap on the total number of selected sentence pairs. It
	 * is only checked between length bins and between files, so the
	 * final total may overshoot it.
	 */
	protected int maxSubsample = 1500000;
	
	protected static final int MAX_SENTENCE_LENGTH = 100;
	protected static final int MIN_RATIO_LENGTH    = 10;
	
	/** Width of the F-length bins that each corpus file is scanned in. */
	private static final int   BIN_SIZE        = 10;
	
	/**
	 * Multiplicative slack allowed between a sentence pair's F/E
	 * length ratio and the target ratio, in either direction.
	 */
	private static final float RATIO_TOLERANCE = 1.3f;
	
	
	/**
	 * Builds a subsampler seeded with the n-grams of the given test
	 * files.
	 *
	 * @param testFiles   paths of the test-set files to collect
	 *                    n-grams from
	 * @param maxN        passed to {@code Phrase.getSubPhrases}
	 * @param targetCount how many selected sentences each test
	 *                    n-gram should be credited to before it stops
	 *                    triggering selection
	 * @throws IOException if a test file cannot be read
	 */
	public Subsampler(String[] testFiles, int maxN, int targetCount)
	throws IOException {
		this.maxN        = maxN;
		this.targetCount = targetCount;
		this.ngramCounts = loadNgrams(testFiles);
	}
	
	/**
	 * Reads every phrase from each file and registers all of its
	 * sub-phrases (as returned by {@code getSubPhrases(maxN)}) with
	 * a count of zero. Progress is reported on stderr.
	 *
	 * @param files paths of the test-set files
	 * @return map from each test n-gram to an initial count of zero
	 * @throws IOException if a file cannot be opened or read
	 */
	private Map<Phrase,Integer> loadNgrams(String[] files)
	throws IOException {
		Map<Phrase,Integer> map = new HashMap<Phrase,Integer>();
		for (String fn : files) {
			System.err.println("Loading test set from " +fn+ "...");
			
			// The test set is indexed with the F-side vocabulary so
			// that its n-grams are comparable with the F side of the
			// training corpus.
			// NOTE(review): meaning of the (byte)1 argument is defined
			// by PhraseReader and not visible here.
			PhraseReader reader = new PhraseReader(
					new FileReader(fn), this.vf, (byte)1);
			int lineCount = 0;
			try {
				Phrase phrase;
				while ((phrase = reader.readPhrase()) != null) {
					lineCount++;
					for (Phrase ngram : phrase.getSubPhrases(this.maxN)) {
						map.put(ngram, 0);
					}
				}
			} finally {
				reader.close();
			}
			System.err.println("Processed " +lineCount+ " lines in " +fn);
		}
		System.err.println("Test set: " +map.size()+ " ngrams");
		return map;
	}
	
	
	/**
	 * The general subsampler function for external use.
	 *
	 * @param filelist list of source files to subsample from
	 * @param targetFtoERatio goal for ratio of output F length
	 *                        to output E length
	 * @param extf   extension of F files
	 * @param exte   extension of E files
	 * @param fpath  path to source F files
	 * @param epath  path to source E files
	 * @param output basename for output files (will append
	 *               extensions)
	 * @throws IOException if the file list or an output file cannot
	 *                     be opened, or on any read/write failure
	 */
	public void subsample(
		String filelist, float targetFtoERatio,
		String extf,  String exte,
		String fpath, String epath,
		String output
	) throws IOException {
		// Build the factory before touching the output files: if it
		// fails we have neither created/truncated the outputs nor
		// left an open stream behind.
		BiCorpusFactory bcFactory = new BiCorpusFactory(
			fpath, epath, null,
			extf,  exte,  null,
			this.vf, this.ve);
		
		BufferedWriter fOut = openUtf8Writer(output + "." + extf);
		BufferedWriter eOut = null;
		try {
			eOut = openUtf8Writer(output + "." + exte);
		} finally {
			// Opening the E file failed; don't leak the F stream.
			if (null == eOut) fOut.close();
		}
		
		// From here on the worker owns the writers and closes them.
		this.subsample(
			filelist,
			targetFtoERatio,
			new PhraseWriter(fOut, eOut),
			bcFactory);
	}
	
	/**
	 * Opens a buffered UTF-8 writer on the given path, closing the
	 * underlying stream again if the writer cannot be constructed.
	 */
	private static BufferedWriter openUtf8Writer(String path)
	throws IOException {
		FileOutputStream stream = new FileOutputStream(path);
		try {
			return new BufferedWriter(
				new OutputStreamWriter(stream, "UTF8"));
		} catch (IOException e) {
			stream.close();
			throw e;
		}
	}
	
	/** Reads the file list: one entry per line, in file order. */
	private static List<String> readFileList(String filelist)
	throws IOException {
		List<String> files = new ArrayList<String>();
		// Closing the BufferedReader also closes the FileReader.
		BufferedReader br = new BufferedReader(new FileReader(filelist));
		try {
			String file;
			while ((file = br.readLine()) != null) {
				files.add(file);
			}
		} finally {
			br.close();
		}
		return files;
	}
	
	/**
	 * The main wrapper for the subsample worker. Closes the
	 * PhraseWriter before exiting.
	 *
	 * For each entry of the file list, in order, the corpus is
	 * scanned once per F-length bin (short bins first), the selected
	 * pairs are written out, and a summary is printed to stderr.
	 * Processing stops once the running total exceeds
	 * {@link #maxSubsample}.
	 *
	 * @param filelist  path of a file naming one corpus per line
	 * @param targetFtoERatio goal for ratio of output F length to
	 *                  output E length; 0 disables the ratio filter
	 * @param out       destination for selected pairs; always closed
	 * @param bcFactory used to load each corpus named in the list
	 * @throws IOException on any read or write failure
	 */
	protected void subsample(
		String filelist, float targetFtoERatio,
		PhraseWriter out, BiCorpusFactory bcFactory
	) throws IOException {
		try {
			List<String> files = readFileList(filelist);
			
			final int maxBin = MAX_SENTENCE_LENGTH / BIN_SIZE;
			int totalSubsampled = 0;
			// Iterating on files in order biases towards files
			// earlier in the list
			for (String f : files) {
				// Previously the cap only cut the bin loop short, so
				// every remaining file still contributed its first
				// bin. Stop outright once the cap has been exceeded.
				if (totalSubsampled > this.maxSubsample) {
					System.err.println(
						"Subsample limit of " + this.maxSubsample
						+ " exceeded; skipping remaining files");
					break;
				}
				
				System.err.println("Loading training data: " + f);
				
				BiCorpus bc = bcFactory.fromFiles(f);
				Set<PhrasePair> set = new HashSet<PhrasePair>();
				
				System.err.print("Looking in length range");
				// Iterating bins from small to large biases
				// towards short sentences
				for (int k = 0; k < maxBin; k++) {
					int lo = k * BIN_SIZE + 1;
					int hi = (k + 1) * BIN_SIZE;
					System.err.print(" [" + lo + "," + hi + "]");
					System.err.flush();
					
					this.subsample(set, bc, lo, hi, targetFtoERatio);
					
					if (set.size() + totalSubsampled > this.maxSubsample) break;
				}
				
				// Total F and E lengths of this file's selection, for
				// the ratio reported below.
				float ff = 0.0f;
				float ef = 0.0f;
				for (PhrasePair pp : set) {
					ff += pp.getF().size();
					ef += pp.getE().size();
					
					out.write(pp);
					out.newLine();
				}
				out.flush();
				
				totalSubsampled += set.size();
				// Note: prints NaN for the ratio when nothing was
				// selected from this file (0.0f / 0.0f).
				System.err.println(
					"\n  current="       +set.size()+
					" [total="           +totalSubsampled+
					"]    currentRatio=" +(ff/ef) );
				System.err.flush();
				
				// TODO: is this gc actually dubious? Or
				// does profiling show it helps? We only
				// do it once per file, so it's not a
				// performance blackhole.
				set = null; bc = null; System.gc();
			}
		} finally {
			out.close();
		}
	}
	
	/**
	 * The worker function for subsampling.
	 *
	 * A pair is skipped when either side is empty or longer than
	 * {@link #MAX_SENTENCE_LENGTH}, when its F length is outside
	 * [minLength, maxLength], when it is already in the set, or
	 * (for F longer than {@link #MIN_RATIO_LENGTH} and a non-zero
	 * target) when its F/E ratio is more than a factor of 1.3 away
	 * from the target. A surviving pair is selected if at least
	 * one of its F-side n-grams is a test n-gram whose count is
	 * still below {@link #targetCount}; every such n-gram's count
	 * is then incremented.
	 *
	 * @param set       The set to put selected sentences into
	 * @param bc        The sentence-aligned corpus to read from
	 * @param minLength The minimum F sentence length
	 * @param maxLength The maximum F sentence length
	 * @param targetFtoERatio The desired ratio of F length to
	 *                  E length
	 */
	private void subsample(
		Set<PhrasePair> set,
		BiCorpus bc,
		int minLength,
		int maxLength,
		float targetFtoERatio
	) {
		for (PhrasePair pp : bc) {
			int eLength = pp.getE().size();
			if (eLength == 0
			||  eLength >  MAX_SENTENCE_LENGTH) continue;
			
			int fLength = pp.getF().size();
			if (fLength == 0
			||  fLength <  minLength
			||  fLength >  maxLength
			||  fLength >  MAX_SENTENCE_LENGTH) continue;
			
			// The original tested "fLength > 10" and then
			// "fLength >= MIN_RATIO_LENGTH"; the second test was
			// always true, so this single test is equivalent.
			if (fLength > MIN_RATIO_LENGTH
			&&  targetFtoERatio != 0.0f) {
				float ratio = pp.ratioFtoE();
				if (ratio > RATIO_TOLERANCE * targetFtoERatio
				||  ratio * RATIO_TOLERANCE < targetFtoERatio) continue;
			}
			if (set.contains(pp)) continue;
			
			// at this point, length checks out and the sentence hasn't
			// been selected yet
			
			boolean useSentence = false;
			for (Phrase ng : pp.getF().getSubPhrases(this.maxN)) {
				Integer count = this.ngramCounts.get(ng);
				// Not a test n-gram, or already seen often enough.
				if (count == null || count >= this.targetCount) continue;
				
				useSentence = true;
				this.ngramCounts.put(ng, count + 1);
			}
			if (useSentence) set.add(pp);
		}
	}
	
	
	/** Command-line entry point; delegates to {@code SubsamplerCLI}. */
	public static void main(String[] args) {
		new SubsamplerCLI().runMain(args);
	}
}