/* This file is part of the Joshua Machine Translation System. * * Joshua is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as * published by the Free Software Foundation; either version 2.1 * of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free * Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, * MA 02111-1307 USA */ package joshua.corpus.lexprob; import java.io.BufferedReader; import java.io.File; import java.io.FileInputStream; import java.io.FileNotFoundException; import java.io.FileOutputStream; import java.io.IOException; import java.io.InputStreamReader; import java.io.OutputStreamWriter; import java.io.PrintStream; import java.io.UnsupportedEncodingException; import java.io.Writer; import java.util.Arrays; import java.util.HashSet; import java.util.NoSuchElementException; import java.util.Scanner; import java.util.Set; import java.util.logging.Level; import java.util.logging.Logger; import java.util.zip.GZIPInputStream; import java.util.zip.GZIPOutputStream; import joshua.util.CommandLineParser; import joshua.util.CommandLineParser.Option; /** * Utility to extract aligned word pairs from an aligned corpus. * <p> * The files used must use Unix-style newlines. * * @author Lane Schwartz * @version $LastChangedDate: 2009-05-22 23:31:12 -0500 (Fri, 22 May 2009) $ * @see "Section 4.4 of 'Statistical Phrase-Based Translation' by * Philipp Koehn, Franz Josef Och, & Daniel Marcu (HLT-NAACL, 2003)" */ public class ExtractWordPairs { private static final Logger logger = Logger.getLogger(ExtractWordPairs.class.getName()); /** Special marker to use with unaligned words */ public static final String UNALIGNED_MARKER = "NULL"; /** * Extract aligned word pairs from an aligned corpus. * <p> * This method does not convert from upper case to lower * case. All input needs to already be in the proper case. * <p> * NOTE: The scanners provided for source text, target text, * and alignments must all be backed by data that uses * Unix-style newlines. * * @param number_of_lines The number of lines to process * from the aligned corpus. * @param source_text Scanner backed by the source language text * @param target_text Scanner backed by the target language text * @param alignments Scanner backed by the sentence alignment data * @param outputFile Writer to use when producing output results * @throws IOException Thrown if an I/O error occurs when writing results */ public static void extract(int number_of_lines, Scanner source_text, Scanner target_text, Scanner alignments, Writer outputFile) throws IOException { if (logger.isLoggable(Level.INFO)) { logger.info("Extracting aligned word pairs from aligned sentences..."); } // Iterate over all lines of input for (int line_number=1; line_number<=number_of_lines; line_number++) { // Read in the next line from the files // BUG: use joshua.util.Regex.spaces.split(...) String[] source_words = source_text.nextLine().split("\\s+"); String[] target_words = target_text.nextLine().split("\\s+"); String[] raw_alignment_points = alignments.nextLine().split("\\s+"); try { // We have a new sentence pair. // Initially assume that all words are unaligned. // As each alignment point is processed, aligned words will be removed from the appropriate set Set<Integer> unaligned_source_words = new HashSet<Integer>(source_words.length); Set<Integer> unaligned_target_words = new HashSet<Integer>(target_words.length); for (int i=0; i<source_words.length; i++) { unaligned_source_words.add(i); } for (int i=0; i<target_words.length; i++) { unaligned_target_words.add(i); } // Iterate over each alignment point in the aligned sentence pair for (String raw_alignment_point : raw_alignment_points) { // Alignment points must be of the format #-#, where # is a number int split_point = raw_alignment_point.indexOf('-'); int x = Integer.valueOf(raw_alignment_point.substring(0,split_point)); int y = Integer.valueOf(raw_alignment_point.substring(split_point+1)); // Remove this source word from the set of unaligned source words unaligned_source_words.remove(x); // Remove this target word from the set of unaligned target words unaligned_target_words.remove(y); // Lowercase the words, // then print the word pair to the output file outputFile.write(source_words[x].toLowerCase() + " " + target_words[y].toLowerCase() + "\n"); } // For each unaligned source word, // lowercase the word, // then print the word, aligned with the special token NULL for (int source_word_index : unaligned_source_words) { outputFile.write(source_words[source_word_index].toLowerCase() + " " + UNALIGNED_MARKER + "\n"); } // For each unaligned target word, // lowercase the word, // then print the word, aligned with the special token NULL for (int target_word_index : unaligned_target_words) { outputFile.write(UNALIGNED_MARKER + " " + target_words[target_word_index].toLowerCase() + "\n"); } } catch (ArrayIndexOutOfBoundsException e) { if (logger.isLoggable(Level.SEVERE)) { logger.severe("ArrayIndexOutOfBoundsException at sentence pair:\n" + Arrays.toString(source_words) + "\n"+Arrays.toString(target_words) +"\n"+Arrays.toString(raw_alignment_points) + "\n"); } throw e; } } // Tidy up outputFile.flush(); outputFile.close(); if (logger.isLoggable(Level.INFO)) { logger.info("...done."); } } /** * Utility to extract aligned word pairs from an aligned * corpus. * * @param args Command line arguments */ public static void main(String[] args) { CommandLineParser commandLine = new CommandLineParser(); Option<String> source_file = commandLine.addStringOption('s',"source-text","SOURCE_FILENAME","name of file containing source language corpus"); //Option<String> source_file_encoding = commandLine.addStringOption("source-encoding","SOURCE_ENCODING","ISO-8859-1","source language file encoding"); Option<String> source_file_encoding = commandLine.addStringOption("source-encoding","SOURCE_ENCODING","UTF-8","source language file encoding"); Option<Boolean> source_file_gz = commandLine.addBooleanOption("source-text-gzipped",false,"is the source text gzipped"); Option<String> target_file = commandLine.addStringOption('t',"target-text","TARGET_FILENAME","name of file containing target language corpus"); //Option<String> target_file_encoding = commandLine.addStringOption("target-encoding","TARGET_ENCODING","ISO-8859-1","target language file encoding"); Option<String> target_file_encoding = commandLine.addStringOption("target-encoding","TARGET_ENCODING","UTF-8","target language file encoding"); Option<Boolean> target_file_gz = commandLine.addBooleanOption("target-text-gzipped",false,"is the target text gzipped"); Option<String> alignment_file = commandLine.addStringOption('a',"alignment","ALIGNMENT_FILENAME","name of file containing word alignments for the sentences in the corpus"); Option<Boolean> alignment_file_gz = commandLine.addBooleanOption("alignment-file-gzipped",false,"is the alignment file gzipped"); Option<Integer> num_lines = commandLine.addIntegerOption('l',"lines","LINE_COUNT","number of aligned sentences in the corpus"); Option<String> output_file = commandLine.addStringOption('o',"output","OUTPUT_FILENAME","file where aligned word pairs will be written"); Option<String> output_file_encoding = commandLine.addStringOption("output-encoding","OUTPUT_ENCODING","UTF-8","output file encoding"); Option<Boolean> output_file_gz = commandLine.addBooleanOption("output-text-gzipped",false,"should the output file be gzipped"); commandLine.parse(args); try { // Set System.out and System.err to use the provided character encoding try { System.setOut(new PrintStream(System.out, true, commandLine.getValue(source_file_encoding))); System.setErr(new PrintStream(System.err, true, commandLine.getValue(source_file_encoding))); } catch (UnsupportedEncodingException e1) { System.err.println(commandLine.getValue(source_file_encoding) + " is not a valid encoding; using system default encoding for System.out and System.err."); } catch (SecurityException e2) { System.err.println("Security manager is configured to disallow changes to System.out or System.err; using system default encoding."); } // The number of lines to read int number_of_lines = commandLine.getValue(num_lines); // Set up the source text for reading Scanner source_text; if (commandLine.getValue(source_file).endsWith(".gz") || commandLine.getValue(source_file_gz)) { source_text = new Scanner(new BufferedReader(new InputStreamReader(new GZIPInputStream(new FileInputStream(commandLine.getValue(source_file))),commandLine.getValue(source_file_encoding)))); } else { source_text = new Scanner( new File(commandLine.getValue(source_file)), commandLine.getValue(source_file_encoding)); } // Set up the target text for reading Scanner target_text; if (commandLine.getValue(target_file).endsWith(".gz") || commandLine.getValue(target_file_gz)) { target_text = new Scanner(new BufferedReader(new InputStreamReader(new GZIPInputStream(new FileInputStream(commandLine.getValue(target_file))),commandLine.getValue(target_file_encoding)))); } else { target_text = new Scanner( new File(commandLine.getValue(target_file)), commandLine.getValue(target_file_encoding)); } // Set up the alignment file for reading Scanner alignments; if (commandLine.getValue(alignment_file).endsWith(".gz") || commandLine.getValue(alignment_file_gz)) { alignments = new Scanner(new BufferedReader(new InputStreamReader(new GZIPInputStream(new FileInputStream(commandLine.getValue(alignment_file)))))); } else { alignments = new Scanner( new File(commandLine.getValue(alignment_file))); } // Set up the output file for writing Writer outputFile; if (commandLine.getValue(output_file).endsWith(".gz") || commandLine.getValue(output_file_gz)) { outputFile = new OutputStreamWriter(new GZIPOutputStream(new FileOutputStream(commandLine.getValue(output_file))),commandLine.getValue(output_file_encoding)); } else { outputFile = new OutputStreamWriter(new FileOutputStream(commandLine.getValue(output_file)),commandLine.getValue(output_file_encoding)); } try { extract(number_of_lines, source_text, target_text, alignments, outputFile); } catch (NoSuchElementException e) { System.err.println("There are more than " + number_of_lines + " lines of input. Please determine the actual number of lines of input, and re-run with the appropriate command line flag."); commandLine.printUsage(); System.exit(-1); } } catch (FileNotFoundException e) { e.printStackTrace(); } catch (UnsupportedEncodingException e) { e.printStackTrace(); } catch (IOException e) { e.printStackTrace(); } } }