/* This file is part of the Joshua Machine Translation System.
*
* Joshua is free software; you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as
* published by the Free Software Foundation; either version 2.1
* of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free
* Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
* MA 02111-1307 USA
*/
package joshua.subsample;
import java.io.BufferedWriter;
import java.io.OutputStreamWriter;
import java.io.FileOutputStream;
import java.io.IOException;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.OptionBuilder;
import org.apache.commons.cli.Options;
/**
* A subsampler which takes in word-alignments as well as the F and
* E files. To remove redundant code, this class uses callback
* techniques in order to "override" the superclass methods.
*
* @see joshua.subsample.Subsampler
* @author wren ng thornton <wren@users.sourceforge.net>
* @version $LastChangedDate: 2009-05-27 20:14:28 -0500 (Wed, 27 May 2009) $
*/
public class AlignedSubsampler extends Subsampler {
public AlignedSubsampler(String[] testFiles, int maxN, int targetCount)
throws IOException {
super(testFiles, maxN, targetCount);
}
/**
* @param filelist list of source files to subsample from
* @param targetFtoERatio goal for ratio of output F length
* to output E length
* @param extf extension of F files
* @param exte extension of E files
* @param exta extension of alignment files
* @param fpath path to source F files
* @param epath path to source E files
* @param apath path to source alignment files
* @param output basename for output files (will append
* extensions)
*/
public void subsample(
String filelist, float targetFtoERatio,
String extf, String exte, String exta,
String fpath, String epath, String apath,
String output
) throws IOException {
this.subsample(
filelist,
targetFtoERatio,
new PhraseWriter(
new BufferedWriter(
new OutputStreamWriter(
new FileOutputStream(output + "." + extf),
"UTF8")),
new BufferedWriter(
new OutputStreamWriter(
new FileOutputStream(output + "." + exte),
"UTF8")),
new BufferedWriter(
new OutputStreamWriter(
new FileOutputStream(output + "." + exta),
"UTF8"))
),
new BiCorpusFactory(
fpath, epath, apath,
extf, exte, exta,
this.vf, this.ve) { /* Local class definition */
public BiCorpus fromFiles(String f) throws IOException {
return this.alignedFromFiles(f);
}
}
);
}
@SuppressWarnings("static-access")
public static void main(String[] args) {
new SubsamplerCLI() { /* Local class definition */
//TODO hasArg is a static method. It should be accessed as OptionBuilder.hasArg()
protected final Option oa = OptionBuilder
.withArgName("lang")
.hasArg()
.withDescription("Word alignment extension")
.isRequired()
.create("a");
//TODO hasArg is a static method. It should be accessed as OptionBuilder.hasArg()
protected final Option oapath = OptionBuilder
.withArgName("path")
.hasArg()
.withDescription("Directory containing word alignment files")
.create("apath");
public Options getCliOptions() {
return super.getCliOptions()
.addOption(oa)
.addOption(oapath);
}
public String getClassName() {
return AlignedSubsampler.class.getName();
}
public void runSubsampler(
String[] testFiles, int maxN, int targetCount, float ratio
) throws IOException {
new AlignedSubsampler(testFiles, maxN, targetCount).subsample(
ot.getValue(),
ratio,
of.getValue(),
oe.getValue(),
oa.getValue(),
ofpath.getValue(),
oepath.getValue(),
oapath.getValue(),
ooutput.getValue() );
}
}.runMain(args);
}
}