/* This file is part of the Joshua Machine Translation System.
*
* Joshua is free software; you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as
* published by the Free Software Foundation; either version 2.1
* of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free
* Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
* MA 02111-1307 USA
*/
package joshua.corpus.suffix_array;
import java.io.IOException;
import java.io.ObjectOutput;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import joshua.corpus.Phrase;
import joshua.corpus.vocab.SymbolTable;
/**
* Represents a pattern of terminals and nonterminals.
* <p>
* The integer representation of each terminal must be positive.
* The integer representation of each nonterminal must be negative.
*
* @author Lane Schwartz
* @version $LastChangedDate:2008-09-18 12:47:23 -0500 (Thu, 18 Sep 2008) $
*/
public class Pattern extends BasicPhrase implements PatternFormat {
//===============================================================
// Member variables
//===============================================================
/** The number of nonterminals in this pattern. */
final int arity;
//===============================================================
// Constructor(s)
//===============================================================
/**
* Constructs a pattern of terminals and nonterminals.
* <p>
* The integer representation of each terminal must be
* positive. The integer representation of each nonterminal
* must be negative.
*
* @param vocab Vocabulary capable of mapping between symbols
* and integers.
*/
public Pattern(SymbolTable vocab, int... words) {
super(words, vocab);
this.arity = calculateArity(this.words);
}
/**
* Constructs a pattern by copying an existing phrase.
*
* @param phrase an existing phrase
*/
public Pattern(Phrase phrase) {
super(new int[phrase.size()], phrase.getVocab());
for(int i = 0 ; i < phrase.size(); i++) {
words[i] = phrase.getWordID(i);
}
this.arity = calculateArity(this.words);
}
/**
* Constructs a pattern by copying an existing pattern, and
* then appending additional words to the new pattern.
*
* @param pattern Existing pattern to copy.
* @param word Words to append to the new pattern.
*/
public Pattern(Pattern pattern, int... word) {
super(pattern(pattern.words, word),pattern.vocab);
this.arity = calculateArity(this.words);
}
/**
* Constructs a pattern by copying an int[] pattern, and
* then appending additional words to the new pattern.
*
* @param vocab Vocabulary capable of mapping between symbols
* and integers.
* @param patternStart Existing pattern to copy.
* @param patternEnd Words to append to the new pattern.
*/
public Pattern(SymbolTable vocab, int[] patternStart, int... patternEnd) {
super(pattern(patternStart,patternEnd), vocab);
this.arity = calculateArity(this.words);
}
/**
* Constructs a new integer array by concatenating two
* existing integer arrays together.
*
* @param oldPattern
* @param newPattern
* @return a new integer array representing two existing
* integer arrays concatenated together
*/
protected static int[] pattern(int[] oldPattern, int... newPattern) {
int[] pattern = new int[oldPattern.length + newPattern.length];
for (int index=0; index<oldPattern.length; index++) {
pattern[index] = oldPattern[index];
}
for (int index=oldPattern.length; index<oldPattern.length+newPattern.length; index++) {
pattern[index] = newPattern[index - oldPattern.length];
}
return pattern;
}
//===============================================================
// Public
//===============================================================
//===========================================================
// Accessor methods (set/get)
//===========================================================
public boolean startsWithNonterminal() {
if (words.length > 0) {
// we assume that the nonterminal symbols will be denoted with negative numbers
return words[0] < 0;
} else {
return false;
}
}
public boolean endsWithNonterminal() {
if (words.length > 0) {
// we assume that the nonterminal symbols will be denoted with negative numbers
return words[words.length-1] < 0;
} else {
return false;
}
}
public boolean endsWithTwoTerminals() {
return (words.length > 1 && words[words.length-1] >= 0 && words[words.length-2] >= 0);
}
public boolean secondTokenIsTerminal() {
return (words.length > 1 && words[1] >= 0);
}
/**
* Gets the lengths of each terminal sequence in this
* pattern.
* <p>
* The result of this method is not well-defined for patterns
* that consist only of nonterminals.
*
* @return the lengths of each terminal sequence in this pattern
*/
// TODO Write unit tests for this method.
public byte[] getTerminalSequenceLengths() {
int size = 0;
boolean readyToStartSequence = true;
for (int word : words) {
if (word < 0) {
readyToStartSequence = true;
} else {
if (readyToStartSequence) {
size++;
readyToStartSequence = false;
}
}
}
byte[] result = new byte[size];
if (size > 0) {
int index=0;
byte count=0;
for (int word : words) {
if (word < 0) {
if (count > 0) {
result[index] = count;
index++;
count = 0;
}
} else {
count++;
}
}
if (count > 0) {
result[index] = count;
}
}
return result;
}
//===========================================================
// Methods
//===========================================================
public List<Pattern> split() {
int arity = this.arity();
if (arity==0) {
return Collections.singletonList(this);
} else {
List<Pattern> patternList = new ArrayList<Pattern>(arity);
List<Integer> tokenList = new ArrayList<Integer>(this.size());
for (int token : this.getWordIDs()) {
if (token < 0) {
if (! tokenList.isEmpty()) {
int[] tokens = new int[tokenList.size()];
patternList.add(new Pattern(this.getVocab(), tokens));
tokenList.clear();
}
} else {
tokenList.add(token);
}
}
if (! tokenList.isEmpty()) {
int[] tokens = new int[tokenList.size()];
patternList.add(new Pattern(this.getVocab(), tokens));
tokenList.clear();
}
return patternList;
}
}
public int arity() {
return arity;
}
public String toString() {
StringBuilder s = new StringBuilder();
s.append('[');
for (int i=0; i<words.length; i++) {
if (i>0) {
s.append(' ');
}
if (words[i] >= 0) {
if (vocab==null) {
s.append(words[i]);
} else {
s.append(vocab.getWord(words[i]));
}
} else {
s.append('X');
}
}
s.append(']');
return s.toString();
}
//===============================================================
// Protected
//===============================================================
//===============================================================
// Methods
//===============================================================
//===============================================================
// Private
//===============================================================
//===============================================================
// Methods
//===============================================================
/**
* Gets the number of nonterminals in this pattern.
*
* @return the number of nonterminals in this pattern.
*/
private static int calculateArity(int[] words) {
int arity = 0;
for (int element : words) {
if (element < 0) arity++;
}
return arity;
}
public void writeExternal(ObjectOutput out) throws IOException {
out.writeInt(arity);
for (int word : words) {
out.writeInt(word);
}
}
}