SuffixArrayFactory.java example

Explorer
relax-decode-master
- third-party
/* This file is part of the Joshua Machine Translation System.
 * 
 * Joshua is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free
 * Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
 * MA 02111-1307 USA
 */
package joshua.corpus.suffix_array;

import joshua.corpus.Corpus;
import joshua.corpus.CorpusArray;
import joshua.corpus.alignment.AlignmentArray;
import joshua.corpus.alignment.Alignments;
import joshua.corpus.vocab.ExternalizableSymbolTable;
import joshua.corpus.vocab.Vocabulary;
import joshua.util.io.LineReader;

import java.util.*;
import java.util.logging.Logger;
import java.io.*;


/**
 * SuffixArrayFactory is the class that handles the loading and
 * saving of SuffixArrays and their associated classes.
 * 
 * @author Chris Callison-Burch
 * @since  8 February 2005
 * @version $LastChangedDate:2008-07-30 17:15:52 -0400 (Wed, 30 Jul 2008) $
 */

public class SuffixArrayFactory {

	//===============================================================
	// Constants
	//===============================================================

	/**
	 * When <code>true</code>, {@link #count(String)} logs the running
	 * word total each time the number of lines read so far is a
	 * multiple of 10000. Note that this field is not <code>final</code>,
	 * so callers may change it at runtime.
	 */
	public static boolean SHOW_PROGRESS = false;

	/** Logger for this class, named after the fully qualified class name. */
	private static final Logger logger = Logger.getLogger(SuffixArrayFactory.class.getName());

	//===============================================================
	// Static
	//===============================================================

	/**
	 * Builds the standard file name used for a Vocabulary.
	 *
	 * @param lang   language identifier; becomes the first part of the name
	 * @param corpus corpus name; becomes the second part of the name
	 * @return the string <code>lang_corpus_vocab.txt</code>
	 */
	public static String getVocabularyFileName(String lang, String corpus) {
		StringBuilder fileName = new StringBuilder();
		fileName.append(lang).append("_").append(corpus).append("_vocab.txt");
		return fileName.toString();
	}


	/**
	 * Builds the standard file name used for a CorpusArray.
	 *
	 * @param lang   language identifier; becomes the first part of the name
	 * @param corpus corpus name; becomes the second part of the name
	 * @return the string <code>lang_corpus_sentences.txt</code>
	 */
	public static String getCorpusArrayFileName(String lang, String corpus) {
		String stem = lang + "_" + corpus;
		return stem + "_sentences.txt";
	}


	/**
	 * Builds the standard file name used for a SuffixArray.
	 *
	 * @param lang   language identifier; becomes the first part of the name
	 * @param corpus corpus name; becomes the second part of the name
	 * @return the string <code>lang_corpus_suffixes.txt</code>
	 */
	public static String getSuffixArrayFileName(String lang, String corpus) {
		final String suffix = "_suffixes.txt";
		return lang + "_" + corpus + suffix;
	}


	/**
	 * Builds the standard file name used for an AlignmentArray.
	 *
	 * @param sourceLang source language identifier; first part of the name
	 * @param targetLang target language identifier; second part of the name
	 * @param corpus     corpus name; third part of the name
	 * @return the string
	 *         <code>sourceLang_targetLang_corpus_alignment_array.txt</code>
	 */
	public static String getAlignmentArrayFileName(String sourceLang, String targetLang, String corpus) {
		StringBuilder fileName = new StringBuilder();
		fileName.append(sourceLang);
		fileName.append("_");
		fileName.append(targetLang);
		fileName.append("_");
		fileName.append(corpus);
		fileName.append("_alignment_array.txt");
		return fileName.toString();
	}


	// TODO: fuse createVocabulary and createCorpusArray together to avoid allocating the trivial array.
	/**
	 * Builds a CorpusArray from a plain text file, using a newly
	 * created Vocabulary that is initialized from that same file.
	 *
	 * @param inputFilename the plain text file to read
	 * @return the CorpusArray built from the file
	 * @throws IOException declared by this method; presumably raised
	 *                     when the file cannot be read
	 */
	public static CorpusArray createCorpusArray(String inputFilename)
	throws IOException {
		Vocabulary symbolTable = new Vocabulary();

		// Element 0 is passed on as the word count, element 1 as the sentence count.
		int[] counts = Vocabulary.initializeVocabulary(inputFilename, symbolTable, true);
		int wordTotal = counts[0];
		int sentenceTotal = counts[1];

		return createCorpusArray(inputFilename, symbolTable, wordTotal, sentenceTotal);
	}


	// TODO: fuse count and createCorpusArray together to avoid allocating the trivial array.
	/**
	 * Builds a CorpusArray from a plain text file using an existing
	 * symbol table. The file is first scanned by {@link #count(String)}
	 * to obtain the word and sentence totals.
	 *
	 * @param inputFilename the plain text file to read
	 * @param vocabulary    symbol table handed on to the CorpusArray construction
	 * @return the CorpusArray built from the file
	 * @throws IOException declared by this method; presumably raised
	 *                     when the file cannot be read
	 */
	public static CorpusArray createCorpusArray(String inputFilename, ExternalizableSymbolTable vocabulary) throws IOException {
		int[] counts = count(inputFilename);
		int wordTotal = counts[0];
		int sentenceTotal = counts[1];
		return createCorpusArray(inputFilename, vocabulary, wordTotal, sentenceTotal);
	}


	/**
	 * Tallies the words and sentences of a plain text file.
	 * <p>
	 * Each line is trimmed and split on runs of whitespace; the
	 * number of resulting tokens is added to the word total and the
	 * line itself counts as one sentence. A blank line still yields
	 * one (empty) token, so it adds one to the word total. When
	 * {@link #SHOW_PROGRESS} is set, the running word total is logged
	 * each time the sentence total reaches a multiple of 10000.
	 *
	 * @param inputFilename the plain text file
	 * @return a two-element array: index 0 holds the number of
	 *         words, index 1 holds the number of sentences
	 * @throws IOException declared by this method; presumably raised
	 *                     when the file cannot be read
	 */
	static int[] count(String inputFilename) throws IOException {

		int wordTotal = 0;
		int sentenceTotal = 0;

		// NOTE(review): the reader is not explicitly closed here;
		// confirm whether LineReader releases the file on its own.
		LineReader reader = new LineReader(inputFilename);

		for (String line : reader) {
			String[] tokens = line.trim().split("\\s+");
			wordTotal += tokens.length;
			sentenceTotal += 1;

			boolean atProgressMark = (sentenceTotal % 10000 == 0);
			if (SHOW_PROGRESS && atProgressMark) {
				logger.info("" + wordTotal);
			}
		}

		return new int[] { wordTotal, sentenceTotal };
	}


	/**
	 * Reads a plain text file into a new CorpusArray, converting
	 * each line to word IDs with the supplied symbol table.
	 * <p>
	 * The IDs of all lines are stored back to back in one array of
	 * length <code>numWords</code>, and the position at which each
	 * line begins is recorded in a second array of length
	 * <code>numSentences</code>.
	 *
	 * @param inputFilename the plain text file to read
	 * @param vocab         symbol table used to turn each line into
	 *                      word IDs; also passed to the CorpusArray
	 * @param numWords      the number of words in the file, e.g. as
	 *                      reported by {@link #count(String)}
	 * @param numSentences  the number of lines in the file, e.g. as
	 *                      reported by {@link #count(String)}
	 * @return the CorpusArray holding the word IDs, the sentence
	 *         start positions and the symbol table
	 * @throws IOException declared by this method; presumably raised
	 *                     when the file cannot be read
	 */
	public static CorpusArray createCorpusArray(String inputFilename, ExternalizableSymbolTable vocab, int numWords, int numSentences) throws IOException {
		// Backing storage, sized from the counts supplied by the caller.
		int[] wordIds = new int[numWords];
		int[] sentenceStarts = new int[numSentences];

		// Next free slot in each of the two arrays.
		int nextWord = 0;
		int nextSentence = 0;

		// NOTE(review): the reader is not explicitly closed here;
		// confirm whether LineReader releases the file on its own.
		LineReader reader = new LineReader(inputFilename);

		for (String sentenceText : reader) {
			int[] ids = vocab.getIDs(sentenceText);

			// Record where this sentence begins, then append its IDs.
			sentenceStarts[nextSentence++] = nextWord;
			System.arraycopy(ids, 0, wordIds, nextWord, ids.length);
			nextWord += ids.length;
		}

		return new CorpusArray(wordIds, sentenceStarts, vocab);
	}


	/**
	 * Constructs a new SuffixArray over the given corpus.
	 *
	 * @param corpusArray  the corpus handed to the SuffixArray constructor
	 * @param maxCacheSize the cache size handed to the SuffixArray constructor
	 * @return the newly constructed SuffixArray
	 * @throws IOException declared by this method
	 */
	public static SuffixArray createSuffixArray(Corpus corpusArray, int maxCacheSize) throws IOException {
		SuffixArray suffixes = new SuffixArray(corpusArray, maxCacheSize);
		return suffixes;
	}


	/**
	 * Creates an Alignments object from a file containing
	 * Moses-style alignments, and a source and target corpus.
	 * <p>
	 * Line <em>k</em> of the file is applied to sentence <em>k</em>
	 * of both corpora. A line consists of whitespace-separated
	 * points of the form <code>s-t</code>, where <code>s</code> and
	 * <code>t</code> are word positions counted from the start of
	 * the source and target sentence respectively. A blank line is
	 * treated as a sentence pair without any alignment points.
	 * <p>
	 * For every corpus position the aligned positions on the other
	 * side are stored in ascending order; positions with no
	 * alignment point are left <code>null</code>.
	 *
	 * @param alignmentsFilename file with one line of alignment
	 *                           points per sentence pair
	 * @param sourceCorpus       source side; supplies its size and
	 *                           the start position of each sentence
	 * @param targetCorpus       target side; supplies its size and
	 *                           the start position of each sentence
	 * @return an AlignmentArray built from the per-position aligned
	 *         indices and the number of lines read
	 * @throws IOException           declared by this method; presumably
	 *                               raised when the file cannot be read
	 * @throws NumberFormatException if the text before or after the
	 *                               dash of a point is not an integer
	 */
	public static Alignments createAlignments(String alignmentsFilename, Suffixes sourceCorpus, Suffixes targetCorpus) throws IOException {

		// Scratch maps for the current sentence pair only.
		// Maps indices from the target corpus to a list of their aligned source points
		Map<Integer,List<Integer>> alignedSourceList = new HashMap<Integer,List<Integer>>();

		// Maps indices from the source corpus to a list of their aligned target points
		Map<Integer,List<Integer>> alignedTargetList = new HashMap<Integer,List<Integer>>();

		// Maps indices from the target corpus to an array of their aligned source points
		int[][] alignedSourceIndices = new int[targetCorpus.size()][];

		// Maps indices from the source corpus to an array of their aligned target points
		int[][] alignedTargetIndices = new int[sourceCorpus.size()][];

		int sentenceCounter = 0;

		// NOTE(review): the reader is not explicitly closed here;
		// confirm whether LineReader releases the file on its own.
		LineReader lineReader = new LineReader(alignmentsFilename);

		for (String line : lineReader) {

			// Start indices of the current source and target sentences
			int sourceOffset = sourceCorpus.getSentencePosition(sentenceCounter);
			int targetOffset = targetCorpus.getSentencePosition(sentenceCounter);

			// To save memory, clear old items that will not be used again
			alignedSourceList.clear();
			alignedTargetList.clear();

			// Splitting an empty or whitespace-led line would produce
			// an empty token, which cannot be parsed as a point. Trim
			// first and skip parsing when nothing is left.
			String trimmedLine = line.trim();
			if (trimmedLine.length() > 0) {
				for (String alignment : trimmedLine.split("\\s+")) {
					String[] points = alignment.split("-");
					int sourceIndex = sourceOffset + Integer.parseInt(points[0]);
					int targetIndex = targetOffset + Integer.parseInt(points[1]);

					addAlignedPoint(alignedSourceList, targetIndex, sourceIndex);
					addAlignedPoint(alignedTargetList, sourceIndex, targetIndex);
				}
			}

			// The next sentence's start marks the end of the current one.
			int nextSourceOffset = sourceCorpus.getSentencePosition(sentenceCounter+1);
			int nextTargetOffset = targetCorpus.getSentencePosition(sentenceCounter+1);

			storeSortedPoints(alignedSourceList, alignedSourceIndices, targetOffset, nextTargetOffset);
			storeSortedPoints(alignedTargetList, alignedTargetIndices, sourceOffset, nextSourceOffset);

			sentenceCounter++;
		}

		return new AlignmentArray(alignedTargetIndices, alignedSourceIndices, sentenceCounter);
	}

	/**
	 * Appends <code>value</code> to the list stored under
	 * <code>key</code>, creating that list if it does not exist yet.
	 */
	private static void addAlignedPoint(Map<Integer,List<Integer>> pointsByIndex, int key, int value) {
		List<Integer> points = pointsByIndex.get(key);
		if (points == null) {
			points = new ArrayList<Integer>();
			pointsByIndex.put(key, points);
		}
		points.add(value);
	}

	/**
	 * For each index in <code>[start, end)</code>, writes the sorted
	 * points recorded for that index into <code>destination</code>,
	 * or <code>null</code> when no points were recorded for it.
	 */
	private static void storeSortedPoints(Map<Integer,List<Integer>> pointsByIndex, int[][] destination, int start, int end) {
		for (int i = start; i < end; i++) {
			List<Integer> points = pointsByIndex.get(i);
			if (points == null) {
				destination[i] = null;
				continue;
			}

			Collections.sort(points);
			int size = points.size();
			int[] sorted = new int[size];
			for (int j = 0; j < size; j++) {
				sorted[j] = points.get(j);
			}
			destination[i] = sorted;
		}
	}



	//===============================================================
	// Private 
	//===============================================================

}