/** FeatureGenCache.java
* Created on Jul 20, 2005
*
* @author Sunita Sarawagi
* @since 1.2
* @version 1.3
*
* For each distinct feature-id there is a IntHashArray of variants of
* the values and labels of the feature seen throughout the data. This list is a vector of
* variantIds. There is hash-map from variantIds to FeatureImpl.
*
* TODO: keeping vector of featureIds implies that insertion is quadratic--- need to make this efficient.
*/
package iitb.CRF;
import gnu.trove.list.array.TIntArrayList;
import gnu.trove.map.hash.TIntObjectHashMap;
import gnu.trove.procedure.TObjectProcedure;
import java.util.ArrayList;
import java.util.BitSet;
import java.util.Hashtable;
import java.util.Iterator;
import cern.colt.matrix.tdouble.DoubleMatrix1D;
public class FeatureGenCache implements FeatureGeneratorNested {
private static final long serialVersionUID = 1L;
FeatureGeneratorNested fgen;
FeatureGenerator sfgen;
TIntArrayList featureIds = new TIntArrayList();
ArrayList<int[][][]> perSegmentFeatureOffsets = new ArrayList<int[][][]>();
protected boolean firstScan=true;
int dataIndex=-1;
int scanNum=0;
int dataIndexStart=0;
static class DBKeysToIndexMap extends Hashtable<Integer,Integer> {
/**
*
*/
private static final long serialVersionUID = -5371025071227735691L;
int prevId=-1;
Integer pos;
public int getDataIndex(DataSequence data) {
int id = ((KeyedDataSequence)data).getKey();
if (prevId==id)
return pos;
prevId = id;
pos = get(id);
if (pos==null) {
pos=-1;
}
return pos;
}
public DBKeysToIndexMap(DataIter dataIter) {
int pos = 0;
for (dataIter.startScan(); dataIter.hasNext();pos++) {
DataSequence data = dataIter.next();
assert(getDataIndex(data)==-1);
put(((KeyedDataSequence)data).getKey(),pos);
// System.out.println("Inserting "+((KeyedDataSequence)data).getKey()+" "+pos+ " "+data.length());
}
}
}
DBKeysToIndexMap dbKeyToIndexMap=null;
public static class AllFeatureCache {
ArrayList<FeatureImpl> distinctFeatures;
ArrayList<FeatureImpl> featureVariants;
// public FeatureVector edgeFeatureIds = new FeatureVector();
public EdgeFeatures edgeFeatures = new EdgeFeatures();
public boolean edgeFeaturesXIndependent = false;
//public boolean cacheEdgeFeaturesDone = false;
class FeatureImpl implements Feature {
int _index;
int _y;
float _value;
void init(int _index, int _y, float _value) {
this._index = _index;
this._y = _y;
this._value = _value;
}
void copy(Feature f) {
this._index = f.index();
this._y = f.y();
this._value = f.value();
}
public FeatureImpl(Feature f) {
if (f != null) {
copy(f);
}
}
public int index() {
return _index;
}
public int y() {
return _y;
}
public int yprev() {
return -1;
}
public float value() {
return _value;
}
public int[] yprevArray() {
return null;
}
public boolean allButValueEqual(Object obj) {
Feature feature = (Feature)obj;
return (_y==feature.y());
}
public int add(Feature f){return index();}
@Override
public int hashCode() {
final int PRIME = 31;
int result = 1;
result = PRIME * result + Float.floatToIntBits(_value);
result = PRIME * result + _y;
return result;
}
@Override
public boolean equals(Object obj) {
if (this == obj)
return true;
if (obj == null)
return false;
return (allButValueEqual(obj)
&& (Math.abs(_value-((Feature)obj).value()) < Float.MIN_VALUE));
}
}
class FeatureImplWithYPrev extends FeatureImpl {
public FeatureImplWithYPrev(Feature f) {
super(f);
_yprev=f.yprev();
}
int _yprev;
void init(int _index, int _y, int yprev, float _value) {
super.init(_index, _y, _value);
this._yprev=yprev;
}
void copy(Feature f) {
super.copy(f);
_yprev=f.yprev();
}
public boolean allButValueEqual(Object obj) {
return (super.allButValueEqual(obj) && (_yprev==((Feature)obj).yprev()));
}
public int yprev() {
return _yprev;
}
@Override
public int hashCode() {
final int PRIME = 31;
int result = 1;
result = PRIME * result + Float.floatToIntBits(_value);
result = PRIME * result + _y;
result = PRIME * result + _yprev;
return result;
}
}
class FeatureCache extends FeatureImpl {
Hashtable<FeatureImpl,Integer> featureVariantIds = null;
FeatureCache(Feature f) {super(f);}
/**
* @param f
* @return
*/
public int add(Feature f) {
if (equals(f)) {
return f.index();
}
if (featureVariantIds == null) {
featureVariantIds = new Hashtable<FeatureImpl,Integer>();
}
//Object diffObject = createDiff(f);
FeatureImpl diffObject = new FeatureImpl(f);
int variantId;
if (featureVariantIds.containsKey(diffObject))
variantId = featureVariantIds.get(diffObject); //findFeatureInVariantList(f);
else {
variantId = featureVariants.size();
featureVariantIds.put(diffObject,variantId);
featureVariants.add(diffObject);
}
return -1*variantId-1;
}
}
class FeatureCacheWithYPrev extends FeatureImplWithYPrev {
Hashtable<FeatureImplWithYPrev,Integer> featureVariantIds = null;
public FeatureCacheWithYPrev(Feature f) {super(f);}
/**
* @param f
* @return
*/
public int add(Feature f) {
if (equals(f)) {
return f.index();
}
if (featureVariantIds == null) {
featureVariantIds = new Hashtable<FeatureImplWithYPrev,Integer>();
}
//Object diffObject = createDiff(f);
FeatureImplWithYPrev diffObject = new FeatureImplWithYPrev(f);
int variantId;
if (featureVariantIds.containsKey(diffObject))
variantId = featureVariantIds.get(diffObject); //findFeatureInVariantList(f);
else {
variantId = featureVariants.size();
featureVariantIds.put(diffObject,variantId);
featureVariants.add(diffObject);
}
return -1*variantId-1;
}
}
public AllFeatureCache(boolean edgeFeaturesXIndependent) {
this.edgeFeaturesXIndependent = edgeFeaturesXIndependent;
distinctFeatures = new ArrayList<FeatureImpl>();
featureVariants = new ArrayList<FeatureImpl>();
}
public int add(Feature f) {
int numAdd = f.index()+1-distinctFeatures.size();
for (int i = 0; i < numAdd; i++) {
distinctFeatures.add(null);
}
if (distinctFeatures.get(f.index())==null) {
if (f.yprev() >= 0)
distinctFeatures.set(f.index(), new FeatureCacheWithYPrev(f));
else
distinctFeatures.set(f.index(), new FeatureCache(f));
return f.index();
} else {
return ((FeatureImpl)(distinctFeatures.get(f.index()))).add(f);
}
}
public class EdgeFeaturesTest extends EdgeFeatures {
FeatureVector testEdgeFeatureIds[];
EdgeFeaturesTest() {
super();
testEdgeFeatureIds = new FeatureVector[4];
for (int i = 0; i < 4; testEdgeFeatureIds[i] = new FeatureVector(),i++);
}
public void addEdgeFeature(int edgeId, int prevPos, int pos, int dataLen) {
super.addEdgeFeature(edgeId, prevPos, pos, dataLen);
int edgeType = getEdgeType(prevPos,pos, dataLen);
if (edgeTypeCached[edgeType]) {
assert(edgeFeatureIds[edgeType].contains(edgeId));
testEdgeFeatureIds[edgeType].add(edgeId);
return;
}
}
public void doneOneRoundEdges() {
super.doneOneRoundEdges();
for (int i = 0; i < 4; i++) {
if (testEdgeFeatureIds[i].size() > 0) {
//System.out.println("Edge features "+i + " size "+ testEdgeFeatureIds[i].size());
assert(testEdgeFeatureIds[i].equals(edgeFeatureIds[i]));
testEdgeFeatureIds[i].clear();
}
}
}
}
public class EdgeFeatures {
FeatureVector edgeFeatureIds[];
boolean edgeTypeCached[];
EdgeFeatures() {
edgeFeatureIds = new FeatureVector[4];
edgeTypeCached = new boolean[4];
for (int i = 0; i < 4; edgeFeatureIds[i] = new FeatureVector(), edgeTypeCached[i++]=false);
}
int getEdgeType(int prevPos, int pos, int dataLen) {
return 2*((prevPos == 0)?1:0) + ((dataLen == pos+1)?1:0);
}
public void addEdgeFeature(int edgeId, int prevPos, int pos, int dataLen) {
int edgeType = getEdgeType(prevPos,pos, dataLen);
if (edgeTypeCached[edgeType]) {
return;
}
edgeFeatureIds[edgeType].add(edgeId);
}
public FeatureVector getEdgeIds(int prevPos, int pos, int dataLen) {
return edgeFeatureIds[getEdgeType(prevPos, pos,dataLen)];
}
public void doneOneRoundEdges() {
for (int i = 0; i < 4; i++) {
if (edgeFeatureIds[i].size() > 0) {
edgeTypeCached[i] = true;
}
}
}
}
public Feature get(int featureId) {
if (featureId >= 0) {
return (Feature) distinctFeatures.get(featureId); // distinctFeatures[featureId];
} else {
return (Feature) featureVariants.get(-1*featureId-1);
}
}
public class FIterator implements Iterator<Feature> {
int index;
int sz;
TIntArrayList intArr;
FIterator(TIntArrayList intArr) {
index = 0;
this.intArr = intArr;
sz = intArr.size();
}
public boolean hasNext() {
return index < sz;
}
public Feature next() {
return AllFeatureCache.this.get(intArr.get(index++));
}
public void remove() {
}
}
public class FeatureVector {
TIntArrayList intList = new TIntArrayList();
public void add(int value) {
this.intList.add(value);
}
public void add(Feature f) {
this.intList.add(AllFeatureCache.this.add(f));
}
public int get(int value) {
return this.intList.get(value);
}
public void clear() {
this.intList.clear();
}
public int size() {
return this.intList.size();
}
public Iterator<Feature> iterator() {
return new FIterator(this.intList);
}
public boolean contains(int value) {
return this.intList.contains(value);
}
}
double DEFAULT_VALUE = RobustMath.LOG0;
public class Flist extends FeatureVector {
/**
*
*/
private static final long serialVersionUID = -8388201269131208682L;
public DoubleMatrix1D mat;
Flist(int numLabels) {
mat = new LogDenseDoubleMatrix1D(numLabels);
mat.assign(DEFAULT_VALUE);
}
public void clear() {super.clear();mat.assign(DEFAULT_VALUE);}
public void add(Feature f, double lambda[]) {
super.add(f);
double oldVal = mat.get(f.y());
if (oldVal == DEFAULT_VALUE)
oldVal = 0;
mat.set(f.y(),oldVal+f.value()*lambda[f.index()]);
}
public void calcMatrix(double lambda[]) {
if (size()==0) return;
mat.assign(DEFAULT_VALUE);
for (Iterator<Feature> iter = iterator(); iter.hasNext();) {
Feature f = iter.next();
double oldVal = mat.get(f.y());
if (oldVal == DEFAULT_VALUE)
oldVal = 0;
mat.set(f.y(),oldVal+f.value()*lambda[f.index()]);
}
}
}
public FeatureVector newFeatureVector() {return new FeatureVector();}
public Flist newFlist(int numLabels) {return new Flist(numLabels);}
}
AllFeatureCache featureCache;
public FeatureGenCache(FeatureGeneratorNested fgen, boolean edgeFeaturesXIndependent) {
alloc(fgen,edgeFeaturesXIndependent);
}
public FeatureGenCache(FeatureGenCache sharedCache, int startDataIndex) {
assert (sharedCache.scanNum>0);
firstScan=false;
dataIndexStart = startDataIndex;
scanNum = sharedCache.scanNum;
fgen = sharedCache.fgen;
sfgen = sharedCache.sfgen;
featureCache = sharedCache.featureCache;
featureIds = sharedCache.featureIds;
perSegmentFeatureOffsets = sharedCache.perSegmentFeatureOffsets;
stats = sharedCache.stats;
}
/**
* @param fgen2
* @param edgeFeaturesXIndependent
*/
private void alloc(FeatureGenerator fgen, boolean edgeFeaturesXIndependent) {
this.sfgen = fgen;
if (sfgen instanceof FeatureGeneratorNested)
this.fgen = ((FeatureGeneratorNested)sfgen);
else
this.fgen = null;
featureCache = new AllFeatureCache(edgeFeaturesXIndependent);
}
public FeatureGenCache(FeatureGenerator fgen, boolean edgeFeaturesXIndependent) {
alloc(fgen,edgeFeaturesXIndependent);
}
public FeatureGenCache(FeatureGenerator fgen, boolean edgeFeaturesXIndependent, DataIter dataIter) {
alloc(fgen,edgeFeaturesXIndependent);
cacheFeaturesOnKeys(dataIter);
}
public void cacheFeaturesOnKeys(DataIter dataIter) {
setDataKeys(dataIter);
startDataScan();
dataIter.startScan();
while (dataIter.hasNext()) {
DataSequence dataSeq = dataIter.next();
nextDataIndex();
for (int p = 0; p < dataSeq.length(); p++) {
startScanFeaturesAt(dataSeq, p);
while (hasNext()) {
next();
}
}
}
startDataScan();
}
// for each distinct feature-id this stores all various forms of the features.
class Stats {
int dataLen;
int maxSegSize;
int pos, prevPos;
int thisSegmentOffsets[];
TIntObjectHashMap<int[]> segmentFeatureOffsets;
BitSet seenSegments = new BitSet();
boolean cacheThis;
boolean cacheEdgeFeatures = false;
class InitProc implements TObjectProcedure {
public boolean execute(Object arg0) {
int vals[] = (int[])arg0;
vals[0]=0; vals[1]=-1;
return true;
}
}
InitProc initProc = new InitProc();
Stats() {
segmentFeatureOffsets = new TIntObjectHashMap();
}
public void clear() {
maxSegSize = 1;
segmentFeatureOffsets.forEachValue(initProc);
seenSegments.clear();
}
int getKey(int prevPos, int pos) {
return pos*dataLen+pos-prevPos-1;
}
int[] getStartEndOffsets(int prevPos, int pos) {
return (int[]) segmentFeatureOffsets.get(getKey(prevPos,pos));
}
/**
* @param data
* @param pos
* @param prevPos
*/
public boolean initSegment(DataSequence data, int prevPos, int pos) {
dataLen = data.length();
maxSegSize = Math.max(maxSegSize, pos-prevPos);
this.pos = pos;
this.prevPos = prevPos;
cacheThis=true;
thisSegmentOffsets = (int[]) segmentFeatureOffsets.get(getKey(prevPos,pos));
if (thisSegmentOffsets==null) {
thisSegmentOffsets = new int[2];
segmentFeatureOffsets.put(getKey(prevPos,pos),thisSegmentOffsets);
}
if (!seenSegments.get(getKey(prevPos,pos))) {
thisSegmentOffsets[0] = thisSegmentOffsets[1] = featureIds.size();
} else {
cacheThis=false;
}
seenSegments.set(getKey(prevPos,pos));
return cacheThis;
/*
if (cacheEdgeFeatures) {
// features already cached in previous segment.
cacheEdgeFeatures = false;
featureCache.cacheEdgeFeaturesDone = true;
} else if ((prevPos >= 0) && !featureCache.cacheEdgeFeaturesDone) {
cacheEdgeFeatures = true;
}
*/
}
/**
* @param f
*/
public void add(Feature f) {
if (!cacheThis) {
// segment has already been seen before and cached.
return;
}
if (featureCache.edgeFeaturesXIndependent && (f.yprev() >= 0)) {
featureCache.edgeFeatures.addEdgeFeature(featureCache.add(f),prevPos, pos,dataLen);
return;
}
assert(f.yprevArray()==null);
featureIds.add(featureCache.add(f));
thisSegmentOffsets[1]++;
}
public boolean checkFeaturesEnd(boolean hasNextFeature) {
if (!hasNextFeature) featureCache.edgeFeatures.doneOneRoundEdges();
return hasNextFeature;
}
}
Stats stats = new Stats();
public void setDataKeys(DataIter dataIter) {
dataIter.startScan();
if (dataIter.hasNext()) {
DataSequence data = dataIter.next();
if (data instanceof KeyedDataSequence)
dbKeyToIndexMap = new DBKeysToIndexMap(dataIter);
}
}
public void startDataScan() {
dataIndex = dataIndexStart-1;
scanNum++;
if (scanNum ==2) {
firstScan = false;
// cache the last data item.
cachePreviousDataSequence();
System.out.println("First scan done..distinct features "+(featureCache.featureVariants.size()+featureCache.distinctFeatures.size()));
}
}
/**
*
*/
private void cachePreviousDataSequence() {
int dataLen = stats.dataLen;
int[][] featureOffsets[] = new int[dataLen][stats.maxSegSize][2];
for (int p = 0; p < dataLen; p++) {
for (int l = 0; (l < stats.maxSegSize) && (p-l >= 0); l++) {
int offsets[] = stats.getStartEndOffsets(p-l-1,p);
featureOffsets[p][l][0] = (offsets==null)?0:offsets[0];
featureOffsets[p][l][1] = (offsets==null)?-1:offsets[1];
}
}
perSegmentFeatureOffsets.add(featureOffsets);
}
protected void cacheFeature(Feature f) {
stats.add(f);
}
public void nextDataIndex() {
dataIndex++;
if (!firstScan) {
return;
}
if (dataIndex > 0) {
cachePreviousDataSequence();
}
stats.clear();
}
public void setDataIndex(int dIndex) {
dataIndex = dIndex;
if (!firstScan) {
return;
}
if (dataIndex > 0) {
cachePreviousDataSequence();
}
stats.clear();
}
/**
* @param data
* @return
*/
protected int getDataIndex(DataSequence data) {
return dataIndex;
}
class Cursor {
int currentFeatureOffset;
int featureOffsetEnd;
int edgeFeatureId = 0;
AllFeatureCache.FeatureVector edgeFeatureIds;
/**
* @param data
* @param pos
* @param prevPos
*/
public void init(DataSequence data, int prevPos, int pos) {
int[][] tfeatures[] = (int[][][]) perSegmentFeatureOffsets.get(getDataIndex(data));
currentFeatureOffset = tfeatures[pos][pos-prevPos-1][0];
featureOffsetEnd = tfeatures[pos][pos-prevPos-1][1];
assert(featureOffsetEnd >= currentFeatureOffset);
edgeFeatureId = -1;
if ((prevPos >= 0) && (featureCache.edgeFeaturesXIndependent)) {
edgeFeatureIds = featureCache.edgeFeatures.getEdgeIds(prevPos,pos,data.length());
edgeFeatureId = edgeFeatureIds.size()-1;
}
}
public void noEdgeFeatures() {
edgeFeatureId=-1;
}
/**
* @return
*/
public boolean hasNext() {
return ((currentFeatureOffset < featureOffsetEnd) || (edgeFeatureId >= 0));
}
/**
* @param feature
*/
public Feature nextFeature() {
int featureId = (currentFeatureOffset < featureOffsetEnd)?
featureIds.get(currentFeatureOffset++):edgeFeatureIds.get(edgeFeatureId--);
return featureCache.get(featureId);
}
}
Cursor cursor = new Cursor();
/* (non-Javadoc)
* @see iitb.CRF.FeatureGeneratorNested#startScanFeaturesAt(iitb.CRF.DataSequence, int, int)
*/
protected void startScanFeaturesAt(DataSequence data, int prevPos, int pos, boolean nested) {
if (dbKeyToIndexMap != null) {
dataIndex = dbKeyToIndexMap.getDataIndex(data);
assert(dataIndex >= 0);
}
if (firstScan) {
boolean cached = stats.initSegment(data,prevPos,pos);
//assert(!nested || !cached);
if (nested)
fgen.startScanFeaturesAt(data,prevPos,pos);
else
sfgen.startScanFeaturesAt(data,pos);
} else {
cursor.init(data,prevPos,pos);
}
}
/* (non-Javadoc)
* @see iitb.CRF.FeatureGenerator#hasNext()
*/
public boolean hasNext() {
return (firstScan)?stats.checkFeaturesEnd(sfgen.hasNext()):cursor.hasNext();
}
/* (non-Javadoc)
* @see iitb.CRF.FeatureGenerator#next()
*/
public Feature next() {
if (firstScan) {
Feature f = sfgen.next();
stats.add(f);
return f;
} else {
return cursor.nextFeature();
}
}
/* (non-Javadoc)
* @see iitb.CRF.FeatureGenerator#featureName(int)
*/
public String featureName(int featureIndex) {
return fgen.featureName(featureIndex);
}
public void startScanFeaturesAt(DataSequence data, int prevPos, int pos) {
startScanFeaturesAt(data,prevPos,pos,true);
}
/* (non-Javadoc)
* @see iitb.CRF.FeatureGenerator#numFeatures()
*/
public int numFeatures() {
return sfgen.numFeatures();
}
/* (non-Javadoc)
* @see iitb.CRF.FeatureGenerator#startScanFeaturesAt(iitb.CRF.DataSequence, int)
*/
public void startScanFeaturesAt(DataSequence data, int pos) {
startScanFeaturesAt(data,pos-1,pos,false);
}
/* (non-Javadoc)
* @see iitb.CRF.FeatureGeneratorNested#maxMemory()
*/
public int maxMemory() {
return (sfgen instanceof FeatureGeneratorNested)?((FeatureGeneratorNested)sfgen).maxMemory():1;
}
public void noEdgeFeatures() {
cursor.noEdgeFeatures();
}
}