/* This file is part of the Joshua Machine Translation System.
 *
 * Joshua is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free
 * Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
 * MA 02111-1307 USA
 */
package joshua.decoder.ff.lm;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.util.Iterator;
import java.util.NoSuchElementException;
import java.util.Scanner;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.zip.GZIPInputStream;

import joshua.corpus.vocab.SymbolTable;
import joshua.corpus.vocab.Vocabulary;
import joshua.util.Regex;
import joshua.util.io.LineReader;

/**
 * Utility class for reading ARPA language model files.
 *
 * @author Lane Schwartz
 */
public class ArpaFile implements Iterable<ArpaNgram> {

  /** Logger for this class. */
  private static final Logger logger =
      Logger.getLogger(ArpaFile.class.getName());

  /** Regular expression representing a blank line. */
  public static final Regex BLANK_LINE = new Regex("^\\s*$");

  /**
   * Regular expression representing a line
   * starting a new section of n-grams in an ARPA language model file.
   */
  public static final Regex NGRAM_HEADER = new Regex("^\\\\\\d-grams:\\s*$");

  /**
   * Regular expression representing a line
   * ending an ARPA language model file.
   */
  public static final Regex NGRAM_END = new Regex("^\\\\end\\\\s*$");

  /** ARPA file for this object. */
  private final File arpaFile;

  /** The symbol table associated with this object. */
  private final SymbolTable vocab;
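  /*
   * For orientation, here is an illustrative sketch (not taken from the original
   * source) of the ARPA layout that the constants above are matched against:
   *
   *   \data\
   *   ngram 1=4
   *   ngram 2=2
   *
   *   \1-grams:
   *   -1.2041  <s>  -0.2553
   *   ...
   *
   *   \2-grams:
   *   ...
   *
   *   \end\
   *
   * NGRAM_HEADER matches the "\N-grams:" section markers, BLANK_LINE matches the
   * empty separator lines, and NGRAM_END matches the closing "\end\" marker.
   */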
  /**
   * Constructs an object that represents an ARPA language model file.
   *
   * @param arpaFileName File name of an ARPA language model file
   * @param vocab Symbol table to be used by this object
   */
  public ArpaFile(String arpaFileName, SymbolTable vocab) {
    this.arpaFile = new File(arpaFileName);
    this.vocab = vocab;
  }

  /**
   * Constructs an object that represents an ARPA language model file,
   * building a new symbol table from the words in the file.
   *
   * @param arpaFileName File name of an ARPA language model file
   * @throws IOException if the file cannot be read
   */
  public ArpaFile(String arpaFileName) throws IOException {
    this.arpaFile = new File(arpaFileName);
    this.vocab = new Vocabulary();

    LineReader grammarReader = new LineReader(arpaFileName);
    try {
      for (String line : grammarReader) {

        // Add the token following the probability field to the vocabulary.
        // Every word in the model appears in this position somewhere in the
        // unigram section, so this pass collects the complete vocabulary.
        String[] parts = Regex.spaces.split(line);
        if (parts.length > 1) {
          String[] words = Regex.spaces.split(parts[1]);
          for (String word : words) {
            if (logger.isLoggable(Level.FINE)) logger.fine("Adding to vocab: " + word);
            vocab.addTerminal(word);
          }
        } else {
          logger.info(line);
        }
      }
    } finally {
      grammarReader.close();
    }

    logger.info("Done constructing ArpaFile");
  }

  /**
   * Gets the symbol table associated with this object.
   *
   * @return the symbol table associated with this object
   */
  public SymbolTable getVocab() {
    return vocab;
  }

  /**
   * Gets the total number of n-grams
   * in this ARPA language model file.
   *
   * @return total number of n-grams
   *         in this ARPA language model file
   */
  @SuppressWarnings("unused")
  public int size() {

    logger.fine("Counting n-grams in ARPA file");
    int count = 0;

    for (ArpaNgram ngram : this) {
      count++;
    }
    logger.fine("Done counting n-grams in ARPA file");

    return count;
  }

  /**
   * Gets the order of this language model,
   * as declared in the "ngram N=count" header lines.
   *
   * @return the order of this language model
   * @throws FileNotFoundException if the ARPA file cannot be found
   */
  public int getOrder() throws FileNotFoundException {

    Pattern pattern = Pattern.compile("^ngram (\\d+)=\\d+$");
    if (logger.isLoggable(Level.FINEST)) logger.finest("Pattern is " + pattern.toString());

    final Scanner scanner = new Scanner(arpaFile);
    int order = 0;

    try {
      // Read the initial header lines; the last "ngram N=count" entry
      // seen before the first n-gram section gives the model order.
      while (scanner.hasNextLine()) {

        String line = scanner.nextLine();

        if (NGRAM_HEADER.matches(line)) {
          break;
        } else {
          Matcher matcher = pattern.matcher(line);
          if (matcher.matches()) {
            if (logger.isLoggable(Level.FINEST)) logger.finest("DOES match: '" + line + "'");
            order = Integer.valueOf(matcher.group(1));
          } else if (logger.isLoggable(Level.FINEST)) {
            logger.finest("Doesn't match: '" + line + "'");
          }
        }
      }
    } finally {
      scanner.close();
    }

    return order;
  }
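  /*
   * Illustrative note (not from the original source): each n-gram entry line in the
   * body of an ARPA file has the form
   *
   *   log10(prob)  w1 ... w(n-1) wn  [log10(backoff)]
   *
   * For example, in the 3-gram section a line such as
   *
   *   -0.4214  the quick fox  -0.1337
   *
   * splits on whitespace into parts[0] = "-0.4214" (probability),
   * parts[1..2] = "the quick" (context), parts[3] = "fox" (the predicted word),
   * and parts[4] = "-0.1337" (backoff weight, absent on highest-order entries).
   * The iterator returned below maps these positions onto ArpaNgram fields.
   */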
  /**
   * Gets an iterator capable of iterating
   * over all n-grams in the ARPA file.
   *
   * @return an iterator capable of iterating
   *         over all n-grams in the ARPA file
   */
  public Iterator<ArpaNgram> iterator() {

    try {
      final Scanner scanner;

      if (arpaFile.getName().endsWith("gz")) {
        InputStream in = new GZIPInputStream(
            new FileInputStream(arpaFile));
        scanner = new Scanner(in);
      } else {
        scanner = new Scanner(arpaFile);
      }

      // Eat initial header lines
      while (scanner.hasNextLine()) {
        String line = scanner.nextLine();
        logger.finest("Discarding line: " + line);
        if (NGRAM_HEADER.matches(line)) {
          break;
        }
      }

      return new Iterator<ArpaNgram>() {

        /** The next n-gram line to be parsed, or null if none is buffered. */
        String nextLine = null;

        /** Order of the n-gram section currently being read. */
        int ngramOrder = 1;

        public boolean hasNext() {

          // A line may already be buffered from a previous call to hasNext().
          if (nextLine != null) {
            return true;
          }

          if (scanner.hasNext()) {

            String line = scanner.nextLine();

            boolean lineIsHeader = NGRAM_HEADER.matches(line) || NGRAM_END.matches(line);

            // Skip section headers, the end marker, and blank lines,
            // advancing the n-gram order whenever a new section starts.
            while (lineIsHeader || BLANK_LINE.matches(line)) {

              if (lineIsHeader) {
                ngramOrder++;
              }

              if (scanner.hasNext()) {
                line = scanner.nextLine().trim();
                lineIsHeader = NGRAM_HEADER.matches(line) || NGRAM_END.matches(line);
              } else {
                nextLine = null;
                return false;
              }
            }

            nextLine = line;
            return true;

          } else {
            nextLine = null;
            return false;
          }
        }

        public ArpaNgram next() {

          // Buffer a line if the caller did not invoke hasNext() first.
          if (nextLine == null && !hasNext()) {
            throw new NoSuchElementException();
          }

          String[] parts = Regex.spaces.split(nextLine);

          float value = Float.valueOf(parts[0]);

          int word = vocab.getID(parts[ngramOrder]);

          int[] context = new int[ngramOrder - 1];
          for (int i = 1; i < ngramOrder; i++) {
            context[i - 1] = vocab.getID(parts[i]);
          }

          // A trailing field, if present, is the backoff weight.
          float backoff;
          if (parts.length > ngramOrder + 1) {
            backoff = Float.valueOf(parts[parts.length - 1]);
          } else {
            backoff = ArpaNgram.DEFAULT_BACKOFF;
          }

          nextLine = null;
          return new ArpaNgram(word, context, value, backoff);
        }

        public void remove() {
          throw new UnsupportedOperationException();
        }
      };

    } catch (FileNotFoundException e) {
      logger.severe(e.toString());
      return null;
    } catch (IOException e) {
      logger.severe(e.toString());
      return null;
    }
  }
}
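/*
 * Illustrative usage sketch, not part of the original Joshua source: it shows one
 * way the ArpaFile API above might be exercised. The class name and the file path
 * are placeholders.
 */
class ArpaFileUsageExample {

  public static void main(String[] args) throws IOException {

    // Build a vocabulary directly from the ARPA file (second constructor above).
    ArpaFile arpa = new ArpaFile("example.arpa");  // placeholder path

    // Report the model order declared in the header section.
    System.out.println("Model order: " + arpa.getOrder());

    // Count every n-gram entry by walking the iterator.
    int count = 0;
    for (ArpaNgram ngram : arpa) {
      count++;
    }
    System.out.println("Total n-grams: " + count);
  }
}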