/* This file is part of the Joshua Machine Translation System.
*
* Joshua is free software; you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as
* published by the Free Software Foundation; either version 2.1
* of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free
* Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
* MA 02111-1307 USA
*/
package joshua.corpus.alignment;
import java.io.Externalizable;
import java.io.IOException;
import java.io.ObjectInput;
import java.io.ObjectOutput;
import java.util.Arrays;
import java.util.Collection;
import java.util.HashSet;
import java.util.Iterator;
/**
* Representation of a 2-dimensional grid. This implementation is
* designed to be as memory-efficient as possible for storing many
* grids in memory. Most JVMs use 32bit ints to store byte, short,
* and boolean individual primitives, but will actually make efficient
* use of memory when these small primitives are stored in arrays.
* Therefore, we create a single sorted array of shorts, where both
* the X and Y coordinate for a given "true" point are the grid are
* encoded in one short.
*
* @author Josh Schroeder
* @since 09 Dec 2004
* @author Lane Schwartz
* @since 15 Dec 2008
*/
public class AlignmentGrid implements Externalizable {
//===============================================================
// Constants
//===============================================================
/**
* Maximum size of a dimension.
*/
static public final int MAX_LENGTH = 100;
/**
* Constant used for generating coordinate short value.
*/
static private final short X_SHIFT = 100;
//===============================================================
// Member variables
//===============================================================
/** Width of the grid. */
protected int width;
/** Height of the grid. */
protected int height;
/**
* Array of alignment points, encoded as shorts.
*
* @see #getKey(int, int)
* @see #getLocation(short)
*/
protected short[] coordinates;
/**
* Array of reverse alignment points, encoded as shorts.
*
* @see #getKey(int, int)
* @see #getLocation(short)
*/
protected short[] transposedCoordinates;
/**
* Constructor takes the small string representation of
* alignment points.
*
* @param alignmentPoints the string representation of
* alignments.
*/
public AlignmentGrid(String alignmentPoints) {
HashSet<Coordinate> coordinates = new HashSet<Coordinate>();
String[] alignmentPointsArray = alignmentPoints.split(",|\\s");
for (int i = 0; i < alignmentPointsArray.length; i++) {
if (!alignmentPointsArray[i].trim().equals("")) {
Coordinate coord = new Coordinate(alignmentPointsArray[i]);
width = Math.max(width, coord.x+1);
height = Math.max(height, coord.y+1);
if (width>MAX_LENGTH || height>MAX_LENGTH) {
throw new RuntimeException("Encountered alignment point " + coord + " which exceeds the maximum that can be represented " + new Coordinate(MAX_LENGTH-1, MAX_LENGTH-1) + ". Please ensure that each training sentence contains fewer than " + MAX_LENGTH + " words.");
}
coordinates.add(coord);
}
}
initializeCoordinates(coordinates);
}
/**
* Constructs a completely empty, utterly uninitialized
* alignment grid, containing <em>absolutely nothing</em>.
* <p>
* This constructor only exists to allow this class to be
* properly <code>Externalizable</code>.
*/
public AlignmentGrid() {
// This method intentionally left blank.
}
//===============================================================
// Public
//===============================================================
//===========================================================
// Accessor methods (set/get)
//===========================================================
/**
* Gets the width of this object.
*
* @return the width (X size) of the Grid
*/
public int getWidth() {
return width;
}
/**
* Gets the height of this object.
*
* @return the height (Y size) of the Grid
*/
public int getHeight() {
return height;
}
/**
* Checks if a coordinate's values fall within the bounds
* of the Grid. DOES NOT check for the existance of the
* coordinate in the grid. Use contains for that purpose.
*
* @param x the x value of the location to check validity for
* @param y the y value of the location to check validity for
* @return true if the location is in bounds
* @see #contains(int,int)
*/
public boolean isValid(int x, int y) {
return (x>=0 &&
y>=0 &&
x<getWidth() &&
y<getHeight());
}
//===========================================================
// Methods
//===========================================================
/**
* Compares this object to another. If it is also a grid,
* first checks height and width compatibility, then checks
* that all coordinates are equal.
*
* @param o object to comare to
* @return <code>true</code> if o is a Grid of the same
* size and containing the same points as this one,
* <code>false</code> otherwise
*/
public boolean equals(Object o) {
if (this==o) {
return true;
} else if (o instanceof AlignmentGrid) {
AlignmentGrid other = (AlignmentGrid)o;
return (this.getWidth()==other.getWidth() &&
this.getHeight()==other.getHeight() &&
Arrays.equals(this.getCoordinates(),other.getCoordinates()));
} else {
return false;
}
}
/* See Javadoc for java.lang.Object#hashCode */
public int hashCode() {
return
Arrays.hashCode(this.getCoordinates()) +
this.getHeight()*31 +
this.getWidth()*317;
}
/**
* Checks if a given location is present in the grid.
*
* @param x the x value of the coordinate to check for
* @param y the y value of the coordinate to check for
* @return <code>true</code> if the specified coordinate
* is in bounds and exists, <code>false</code>
* otherwise
*/
public boolean contains(int x, int y) {
if (isValid(x,y)) {
int index = Arrays.binarySearch(getCoordinates(), getKey(x,y));
//the index returned by a binarySearch is positive if the number exists
return (index >= 0);
} else {
throw new ArrayIndexOutOfBoundsException("("+x+","+y+")");
}
}
/**
* Exports the contents of this grid to a 2-d boolean array,
* of the size array[width][height]. Coordinates contained
* in this grid will be set to true, all others false.
*
* @return a 2-d boolean array representation of this grid
*/
public boolean[][] generateBooleanArray() {
int width = getWidth();
int height = getHeight();
boolean[][] array = new boolean[width][height];
for (int x = 0; x < width; x++)
for (int y = 0; y < height; y++)
array[x][y]=false;
for (int i=0;i<getCoordinates().length;i++) {
short[] location = getLocation(getCoordinates()[i]);
//location may be null if some coordinates are masked (see MaskedGrid)
if (location !=null) array[location[0]][location[1]] = true;
}
return array;
}
/**
* Returns a sorted list (includes any duplicates) of the
* target language indices that align with the given source
* language span.
*
* @param sourceSpanStart Inclusive start index
* into the source language sentence
* @param sourceSpanEnd Exclusive end index into
* the source language sentence.
*
* @return a sorted list (includes any duplicates) of the
* target language indices that align with the given
* source language span
*/
public int[] getTargetPoints(int sourceSpanStart, int sourceSpanEnd) {
return getPoints(sourceSpanStart, sourceSpanEnd, getHeight(), getCoordinates());
}
/**
* Returns a sorted list (includes any duplicates) of the
* source language indices that align with the given target
* language span.
*
* @param targetSpanStart Inclusive start index
* into the target language sentence.
* @param targetSpanEnd Exclusive end index into
* the target language sentence.
*
* @return a sorted list (includes any duplicates) of the
* source language indices that align with the given
* target language span.
*/
public int[] getSourcePoints (int targetSpanStart, int targetSpanEnd) {
return getPoints(targetSpanStart, targetSpanEnd, getWidth(), getTransposedCoordinates());
}
/**
* Returns a sorted list (includes any duplicates) of
* alignment indices for the given span, constructed from
* the provided array of encoded points.
*
* @param start Inclusive start index
* @param end Exclusive end index
* @param maxKey Maximum allowed coordinate value
* @param points Encoded alignment points
*
* @return a sorted list (includes any duplicates) of
* alignment indices for the given span, constructed
* from the provided array of encoded points.
*/
public static int[] getPoints(int start, int end, int maxKey, short[] points) {
short startKey = getKey(start,0);
short endKey = getKey(end-1,maxKey);
int startIndex = Arrays.binarySearch(points, startKey);
int endIndex = Arrays.binarySearch(points,endKey);
if (startIndex < 0) startIndex = (startIndex+1)*(-1);
if (endIndex < 0) endIndex = (endIndex+1)*(-1);
int[] result = new int[endIndex-startIndex];
for (int i=startIndex;i<endIndex;i++) {
result[i-startIndex]=points[i] % X_SHIFT;
}
Arrays.sort(result);
return result;
}
/**
* Gets a String representation of the grid's contents in
* the smallest number of characters.
*
* The format is <code>x1.y1,x2.y2,....xN.yN</code>.
* <em>Note</em>: The returned String does <em>not</em>
* imply the full width and height of the array, only the
* points contained in the grid.
*
* @return a "thin" String representation of the grid.
*/
public String toString() {
StringBuffer buf = new StringBuffer();
for (int i=0;i<getCoordinates().length;i++) {
short[] location = getLocation(getCoordinates()[i]);
//location may be null if some coordinates are masked (see MaskedGrid)
if (location !=null){
buf.append(location[0]);
buf.append('-');
buf.append(location[1]);
buf.append(' ');
}
}
if(buf.length() > 0) buf.deleteCharAt(buf.length()-1);
return buf.toString();
}
/**
* Gets a String representation of the grid represented as
* an ASCII graph of the grid.
*
* @return String displaying the grid as an ASCII graph
*/
public String toAsciiGraph() {
StringBuffer buffer = new StringBuffer();
boolean[][] array = generateBooleanArray();
if(array.length == 0) return "";
for(int y = 0; y < array[0].length; y++) {
buffer.append('|');
for(int x = 0; x < array.length; x++) {
if(array[x][y]) {
buffer.append("XX");
} else {
buffer.append(" ");
}
buffer.append('|');
}
buffer.append('\n');
}
return buffer.toString();
}
//===============================================================
// Protected
//===============================================================
//===============================================================
// Methods
//===============================================================
/**
* Called by the constructor to load a set of coordinates.
*
* @param coordinates Coordinates to be used during initialization
*/
protected void initializeCoordinates(Collection<Coordinate> coordinates) {
Iterator<Coordinate> it = coordinates.iterator();
this.coordinates = new short[coordinates.size()];
this.transposedCoordinates = new short[coordinates.size()];
int index=0;
while (it.hasNext()) {
Coordinate coordinate = it.next();
this.coordinates[index]=getKey(coordinate.x,coordinate.y);
this.transposedCoordinates[index]=getKey(coordinate.y,coordinate.x);
index++;
}
Arrays.sort(this.coordinates);
Arrays.sort(this.transposedCoordinates);
}
/**
* Gets an encoded short value for a given x,y pair.
*
* @param x X coordinate
* @param y Y coordinate
* @return an encoded short value for a given x,y pair.
*/
protected static short getKey(int x, int y) {
int key = x*X_SHIFT+y;
return (short)key;
}
/**
* Generates the location of a coordinate from a key.
*
* @param key Encoded short value
* @return the coordinate from the key
*/
protected short[] getLocation(short key) {
short[] location = new short[2];
location[0] = (short)(key / X_SHIFT);
location[1] = (short)(key % X_SHIFT);
return location;
}
/**
* Gets the encoded coordinates for this grid.
*
* @return the encoded coordinates for this grid
*/
protected short[] getCoordinates() {
return coordinates;
}
/**
* Gets the encoded reverse coordinates for this grid.
*
* @return the encoded reverse coordinates for this grid
*/
protected short[] getTransposedCoordinates() {
return transposedCoordinates;
}
/* See Javadoc for java.io.Externalizable interface. */
public void readExternal(ObjectInput in) throws IOException,
ClassNotFoundException {
// Read the width and height of the grid
this.width = in.readInt();
this.height = in.readInt();
// Read the number of alignment points
int numPoints = in.readInt();
// Read the alignment points
this.coordinates = new short[numPoints];
for (int i=0; i<numPoints; i++) {
coordinates[i] = in.readShort();
}
// Read the reverse alignment points
this.transposedCoordinates = new short[numPoints];
for (int i=0; i<numPoints; i++) {
transposedCoordinates[i] = in.readShort();
}
}
/* See Javadoc for java.io.Externalizable interface. */
public void writeExternal(ObjectOutput out) throws IOException {
// Write the width and height of the grid
out.writeInt(width);
out.writeInt(height);
// Write the number of alignment points
out.writeInt(coordinates.length);
// Write the alignment points
for (short point : coordinates) {
out.writeShort(point);
}
// Write the reverse alignment points
for (short point : transposedCoordinates) {
out.writeShort(point);
}
}
}