/* This file is part of the Joshua Machine Translation System.
 *
 * Joshua is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free
 * Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
 * MA 02111-1307 USA
 */
package joshua.decoder.ff.lm;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.util.Iterator;
import java.util.NoSuchElementException;
import java.util.Scanner;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.zip.GZIPInputStream;

import joshua.corpus.vocab.SymbolTable;
import joshua.corpus.vocab.Vocabulary;
import joshua.util.Regex;
import joshua.util.io.LineReader;

/**
 * Utility class for reading ARPA language model files.
 *
 * @author Lane Schwartz
 */
public class ArpaFile implements Iterable<ArpaNgram> {

  /** Logger for this class. */
  private static final Logger logger =
      Logger.getLogger(ArpaFile.class.getName());

  /** Regular expression representing a blank line. */
  public static final Regex BLANK_LINE = new Regex("^\\s*$");

  /**
   * Regular expression representing a line
   * starting a new section of n-grams in an ARPA language model file.
   */
  public static final Regex NGRAM_HEADER = new Regex("^\\\\\\d-grams:\\s*$");

  /**
   * Regular expression representing a line
   * ending an ARPA language model file.
   */
  public static final Regex NGRAM_END = new Regex("^\\\\end\\\\s*$");

  /** ARPA file for this object. */
  private final File arpaFile;

  /** The symbol table associated with this object. */
  private final SymbolTable vocab;
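  /*
   * For orientation, here is an illustrative sketch (not taken from the original
   * source) of the ARPA layout that the constants above are matched against:
   *
   *   \data\
   *   ngram 1=4
   *   ngram 2=2
   *
   *   \1-grams:
   *   -1.2041  <s>  -0.2553
   *   ...
   *
   *   \2-grams:
   *   ...
   *
   *   \end\
   *
   * NGRAM_HEADER matches the "\N-grams:" section markers, BLANK_LINE matches the
   * empty separator lines, and NGRAM_END matches the closing "\end\" marker.
   */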
  /**
   * Constructs an object that represents an ARPA language model file.
   *
   * @param arpaFileName File name of an ARPA language model file
   * @param vocab Symbol table to be used by this object
   */
  public ArpaFile(String arpaFileName, SymbolTable vocab) {
    this.arpaFile = new File(arpaFileName);
    this.vocab = vocab;
  }

  /**
   * Constructs an object that represents an ARPA language model file,
   * building a new symbol table from the words in the file.
   *
   * @param arpaFileName File name of an ARPA language model file
   * @throws IOException if the file cannot be read
   */
  public ArpaFile(String arpaFileName) throws IOException {
    this.arpaFile = new File(arpaFileName);
    this.vocab = new Vocabulary();

    LineReader grammarReader = new LineReader(arpaFileName);
    try {
      for (String line : grammarReader) {

        // Add the token following the probability field to the vocabulary.
        // Every word in the model appears in this position somewhere in the
        // unigram section, so this pass collects the complete vocabulary.
        String[] parts = Regex.spaces.split(line);
        if (parts.length > 1) {
          String[] words = Regex.spaces.split(parts[1]);
          for (String word : words) {
            if (logger.isLoggable(Level.FINE)) logger.fine("Adding to vocab: " + word);
            vocab.addTerminal(word);
          }
        } else {
          logger.info(line);
        }
      }
    } finally {
      grammarReader.close();
    }

    logger.info("Done constructing ArpaFile");
  }

  /**
   * Gets the symbol table associated with this object.
   *
   * @return the symbol table associated with this object
   */
  public SymbolTable getVocab() {
    return vocab;
  }

  /**
   * Gets the total number of n-grams
   * in this ARPA language model file.
   *
   * @return total number of n-grams
   *         in this ARPA language model file
   */
  @SuppressWarnings("unused")
  public int size() {

    logger.fine("Counting n-grams in ARPA file");
    int count = 0;

    for (ArpaNgram ngram : this) {
      count++;
    }
    logger.fine("Done counting n-grams in ARPA file");

    return count;
  }

  /**
   * Gets the order of this language model,
   * as declared in the "ngram N=count" header lines.
   *
   * @return the order of this language model
   * @throws FileNotFoundException if the ARPA file cannot be found
   */
  public int getOrder() throws FileNotFoundException {

    Pattern pattern = Pattern.compile("^ngram (\\d+)=\\d+$");
    if (logger.isLoggable(Level.FINEST)) logger.finest("Pattern is " + pattern.toString());

    final Scanner scanner = new Scanner(arpaFile);
    int order = 0;

    try {
      // Read the initial header lines; the last "ngram N=count" entry
      // seen before the first n-gram section gives the model order.
      while (scanner.hasNextLine()) {

        String line = scanner.nextLine();

        if (NGRAM_HEADER.matches(line)) {
          break;
        } else {
          Matcher matcher = pattern.matcher(line);
          if (matcher.matches()) {
            if (logger.isLoggable(Level.FINEST)) logger.finest("DOES match: '" + line + "'");
            order = Integer.valueOf(matcher.group(1));
          } else if (logger.isLoggable(Level.FINEST)) {
            logger.finest("Doesn't match: '" + line + "'");
          }
        }
      }
    } finally {
      scanner.close();
    }

    return order;
  }
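  /*
   * Illustrative note (not from the original source): each n-gram entry line in the
   * body of an ARPA file has the form
   *
   *   log10(prob)  w1 ... w(n-1) wn  [log10(backoff)]
   *
   * For example, in the 3-gram section a line such as
   *
   *   -0.4214  the quick fox  -0.1337
   *
   * splits on whitespace into parts[0] = "-0.4214" (probability),
   * parts[1..2] = "the quick" (context), parts[3] = "fox" (the predicted word),
   * and parts[4] = "-0.1337" (backoff weight, absent on highest-order entries).
   * The iterator returned below maps these positions onto ArpaNgram fields.
   */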
  /**
   * Gets an iterator capable of iterating
   * over all n-grams in the ARPA file.
   *
   * @return an iterator capable of iterating
   *         over all n-grams in the ARPA file
   */
  public Iterator<ArpaNgram> iterator() {

    try {
      final Scanner scanner;

      if (arpaFile.getName().endsWith("gz")) {
        InputStream in = new GZIPInputStream(
            new FileInputStream(arpaFile));
        scanner = new Scanner(in);
      } else {
        scanner = new Scanner(arpaFile);
      }

      // Eat initial header lines
      while (scanner.hasNextLine()) {
        String line = scanner.nextLine();
        logger.finest("Discarding line: " + line);
        if (NGRAM_HEADER.matches(line)) {
          break;
        }
      }

      return new Iterator<ArpaNgram>() {

        /** The next n-gram line to be parsed, or null if none is buffered. */
        String nextLine = null;

        /** Order of the n-gram section currently being read. */
        int ngramOrder = 1;

        public boolean hasNext() {

          // A line may already be buffered from a previous call to hasNext().
          if (nextLine != null) {
            return true;
          }

          if (scanner.hasNext()) {

            String line = scanner.nextLine();

            boolean lineIsHeader = NGRAM_HEADER.matches(line) || NGRAM_END.matches(line);

            // Skip section headers, the end marker, and blank lines,
            // advancing the n-gram order whenever a new section starts.
            while (lineIsHeader || BLANK_LINE.matches(line)) {

              if (lineIsHeader) {
                ngramOrder++;
              }

              if (scanner.hasNext()) {
                line = scanner.nextLine().trim();
                lineIsHeader = NGRAM_HEADER.matches(line) || NGRAM_END.matches(line);
              } else {
                nextLine = null;
                return false;
              }
            }

            nextLine = line;
            return true;

          } else {
            nextLine = null;
            return false;
          }
        }

        public ArpaNgram next() {

          // Buffer a line if the caller did not invoke hasNext() first.
          if (nextLine == null && !hasNext()) {
            throw new NoSuchElementException();
          }

          String[] parts = Regex.spaces.split(nextLine);

          float value = Float.valueOf(parts[0]);

          int word = vocab.getID(parts[ngramOrder]);

          int[] context = new int[ngramOrder - 1];
          for (int i = 1; i < ngramOrder; i++) {
            context[i - 1] = vocab.getID(parts[i]);
          }

          // A trailing field, if present, is the backoff weight.
          float backoff;
          if (parts.length > ngramOrder + 1) {
            backoff = Float.valueOf(parts[parts.length - 1]);
          } else {
            backoff = ArpaNgram.DEFAULT_BACKOFF;
          }

          nextLine = null;
          return new ArpaNgram(word, context, value, backoff);
        }

        public void remove() {
          throw new UnsupportedOperationException();
        }
      };

    } catch (FileNotFoundException e) {
      logger.severe(e.toString());
      return null;
    } catch (IOException e) {
      logger.severe(e.toString());
      return null;
    }
  }
}
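/*
 * Illustrative usage sketch, not part of the original Joshua source: it shows one
 * way the ArpaFile API above might be exercised. The class name and the file path
 * are placeholders.
 */
class ArpaFileUsageExample {

  public static void main(String[] args) throws IOException {

    // Build a vocabulary directly from the ARPA file (second constructor above).
    ArpaFile arpa = new ArpaFile("example.arpa");  // placeholder path

    // Report the model order declared in the header section.
    System.out.println("Model order: " + arpa.getOrder());

    // Count every n-gram entry by walking the iterator.
    int count = 0;
    for (ArpaNgram ngram : arpa) {
      count++;
    }
    System.out.println("Total n-grams: " + count);
  }
}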