/* Copyright (C) 2002 Univ. of Massachusetts Amherst, Computer Science Dept. This file is part of "MALLET" (MAchine Learning for LanguagE Toolkit). http://www.cs.umass.edu/~mccallum/mallet This software is provided under the terms of the Common Public License, version 1.0, as published by http://www.opensource.org. For further information, see the file `LICENSE' included with this distribution. */ /** Create new features from all possible conjunctions with other (possibly position-offset) features. @author Andrew McCallum <a href="mailto:mccallum@cs.umass.edu">mccallum@cs.umass.edu</a> */ package cc.mallet.pipe.tsf; import java.io.*; import cc.mallet.pipe.Pipe; import cc.mallet.types.Instance; import cc.mallet.types.Token; import cc.mallet.types.TokenSequence; import cc.mallet.util.PropertyList; public class OffsetPropertyConjunctions extends Pipe implements Serializable { int[][] conjunctions; boolean includeOriginalSingletons; String propertyKey; // To include all the old previous singleton features, pass {{0}} // For a conjunction at the current time step, pass {{0,0}} // For a conjunction of current and previous, pass {{0,-1}} // For a conjunction of the current and next two, pass {{0,1,2}} private OffsetPropertyConjunctions (boolean includeOriginalSingletons, String propertyKey, int[][] conjunctions) { this.conjunctions = conjunctions; this.includeOriginalSingletons = includeOriginalSingletons; this.propertyKey = propertyKey; } public OffsetPropertyConjunctions (boolean includeOriginalSingletons, int[][] conjunctions) { this (includeOriginalSingletons, null, conjunctions); } public OffsetPropertyConjunctions (int[][] conjunctions) { this (true, conjunctions); } public Instance pipe (Instance carrier) { TokenSequence ts = (TokenSequence) carrier.getData(); int tsSize = ts.size(); PropertyList[] oldfs = new PropertyList[ts.size()]; PropertyList[] newfs = new PropertyList[ts.size()]; for (int i = 0; i < tsSize; i++) oldfs[i] = ts.get(i).getFeatures (); if (includeOriginalSingletons) for (int i = 0; i < tsSize; i++) newfs[i] = ts.get(i).getFeatures (); for (int i = 0; i < ts.size(); i++) { //System.out.println ("OffsetPropertyConjunctions: ts index="+i+", conjunction ="); conjunctionList: for (int j = 0; j < conjunctions.length; j++) { // Make sure that the offsets in the conjunction are all available at this position for (int k = 0; k < conjunctions[j].length; k++) { if (conjunctions[j][k] + i < 0 || conjunctions[j][k] + i > tsSize-1 || oldfs[i+conjunctions[j][k]] == null) continue conjunctionList; //System.out.print (" "+conjunctions[j][k]); } //System.out.print ("\n"); // Add the features for this conjunction if (conjunctions[j].length == 1) { int offset = conjunctions[j][0]; if (offset == 0 && includeOriginalSingletons) throw new IllegalArgumentException ("Original singletons already there."); PropertyList.Iterator iter = oldfs[i+offset].iterator(); while (iter.hasNext()) { iter.next(); if (propertyKey != null && !propertyKey.equals(iter.getKey())) continue; String key = iter.getKey() + (offset==0 ? "" : "@"+offset); newfs[i] = PropertyList.add (key, iter.getNumericValue(), newfs[i]); } } else if (conjunctions[j].length == 2) { //System.out.println ("token="+ts.getToken(i).getText()+" conjunctionIndex="+j); int offset0 = conjunctions[j][0]; int offset1 = conjunctions[j][1]; PropertyList.Iterator iter0 = oldfs[i+offset0].iterator(); int iter0i = -1; while (iter0.hasNext()) { iter0i++; iter0.next(); if (propertyKey != null && !propertyKey.equals(iter0.getKey())) continue; PropertyList.Iterator iter1 = oldfs[i+offset1].iterator(); int iter1i = -1; while (iter1.hasNext()) { iter1i++; iter1.next(); if (propertyKey != null && !propertyKey.equals(iter1.getKey())) continue; // Avoid redundant doubling of feature space; include only upper triangle //System.out.println ("off0="+offset0+" off1="+offset1+" iter0i="+iter0i+" iter1i="+iter1i); if (offset0 == offset1 && iter1i <= iter0i) continue; //System.out.println (">off0="+offset0+" off1="+offset1+" iter0i="+iter0i+" iter1i="+iter1i); String key = iter0.getKey() + (offset0==0 ? "" : "@"+offset0) +"&"+iter1.getKey() + (offset1==0 ? "" : "@"+offset1); newfs[i] = PropertyList.add (key, iter0.getNumericValue() * iter1.getNumericValue(), newfs[i]); } } } else if (conjunctions[j].length == 3) { int offset0 = conjunctions[j][0]; int offset1 = conjunctions[j][1]; int offset2 = conjunctions[j][2]; PropertyList.Iterator iter0 = oldfs[i+offset0].iterator(); int iter0i = -1; while (iter0.hasNext()) { iter0i++; iter0.next(); if (propertyKey != null && !propertyKey.equals(iter0.getKey())) continue; PropertyList.Iterator iter1 = oldfs[i+offset1].iterator(); int iter1i = -1; while (iter1.hasNext()) { iter1i++; iter1.next(); if (propertyKey != null && !propertyKey.equals(iter1.getKey())) continue; // Avoid redundant doubling of feature space; include only upper triangle if (offset0 == offset1 && iter1i <= iter0i) continue; PropertyList.Iterator iter2 = oldfs[i+offset2].iterator(); int iter2i = -1; while (iter2.hasNext()) { iter2i++; iter2.next(); if (propertyKey != null && !propertyKey.equals(iter2.getKey())) continue; // Avoid redundant doubling of feature space; include only upper triangle if (offset1 == offset2 && iter2i <= iter1i) continue; String key = iter0.getKey() + (offset0==0 ? "" : "@"+offset0) +"&"+iter1.getKey() + (offset1==0 ? "" : "@"+offset1) +"&"+iter2.getKey() + (offset2==0 ? "" : "@"+offset2); newfs[i] = PropertyList.add (key, iter0.getNumericValue() * iter1.getNumericValue() * iter2.getNumericValue(), newfs[i]); } } } } else { throw new UnsupportedOperationException ("Conjunctions of length 4 or more not yet implemented."); } } } // Put the new PropertyLists in place for (int i = 0; i < ts.size(); i++) ts.get(i).setFeatures (newfs[i]); return carrier; } // Serialization private static final long serialVersionUID = 1; private static final int CURRENT_SERIAL_VERSION = 0; private static final int NULL_INTEGER = -1; private void writeObject (ObjectOutputStream out) throws IOException { out.writeInt (CURRENT_SERIAL_VERSION); int size1, size2; size1 = (conjunctions == null) ? NULL_INTEGER : conjunctions.length; out.writeInt(size1); if (size1 != NULL_INTEGER) { for (int i = 0; i <size1; i++) { size2 = (conjunctions[i] == null) ? NULL_INTEGER: conjunctions.length; out.writeInt(size2); if (size2 != NULL_INTEGER) { for (int j = 0; j <size2; j++) { out.writeInt(conjunctions[i][j]); } } } } out.writeBoolean(includeOriginalSingletons); } private void readObject (ObjectInputStream in) throws IOException, ClassNotFoundException { int size1, size2; int version = in.readInt (); size1 = in.readInt();; if (size1 == NULL_INTEGER) { conjunctions = null; } else { conjunctions = new int[size1][]; for (int i = 0; i < size1; i++) { size2 = in.readInt(); if (size2 == NULL_INTEGER) { conjunctions[i] = null; } else { conjunctions[i] = new int[size2]; for (int j = 0; j < size2; j++) { conjunctions[i][j] = in.readInt(); } } } } includeOriginalSingletons = in.readBoolean(); } }