/*
* Ivory: A Hadoop toolkit for web-scale information retrieval
*
* Licensed under the Apache License, Version 2.0 (the "License"); you
* may not use this file except in compliance with the License. You may
* obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied. See the License for the specific language governing
* permissions and limitations under the License.
*/
package ivory.ltr;
import ivory.core.ConfigurationException;
import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
/**
 * A collection of learning-to-rank training instances read from a
 * tab-delimited feature file: one header row naming the columns, then one row
 * per instance. The {@code qid}, {@code docid}, and {@code grade} columns are
 * required; every other column is treated as a feature.
 *
 * @author metzler
 */
public class Instances {
  /** Header name of the query-id column. */
  private static final String QID_COLUMN_NAME = "qid";
  /** Header name of the document-id column. */
  private static final String DOCID_COLUMN_NAME = "docid";
  /** Header name of the relevance-grade column. */
  private static final String GRADE_COLUMN_NAME = "grade";

  private String[] queryIds = null;               // query id of each instance
  private String[] docIds = null;                 // document id of each instance
  private float[] grades = null;                  // relevance grade of each instance
  private String[] featureNames = null;           // names of the feature columns
  private int[] featureCols = null;               // file-column index of each feature
  private Map<String, Integer> featureMap = null; // feature name -> feature index
  private float[][] features = null;              // feature values, [instance][feature]

  /**
   * Loads training instances from a tab-delimited feature file.
   *
   * @param featFile path to the feature file
   * @throws IOException if the file cannot be read
   * @throws ConfigurationException if the file is empty, is missing one of the
   *         required {@code qid}/{@code docid}/{@code grade} columns, or
   *         contains a row with the wrong number of columns
   */
  public Instances(String featFile) throws IOException, ConfigurationException {
    initialize(featFile);
  }

  /**
   * Reads the feature file in two passes: the first pass parses the header and
   * counts the data rows so the arrays can be sized exactly; the second pass
   * populates them. Both readers are closed via try-with-resources so the file
   * handle is not leaked when a parse error aborts loading.
   */
  private void initialize(String featFile) throws IOException, ConfigurationException {
    int numRows = 0;  // number of data rows (instances)
    int numCols;      // total number of columns in the file
    int numFeats = 0; // number of feature columns (numCols minus the 3 reserved ones)

    // query id, document id, and grade column indexes
    int qidCol = -1;
    int docidCol = -1;
    int gradeCol = -1;

    String[] colNames;

    // first pass: parse the header, locate the reserved columns, count rows
    try (BufferedReader in = new BufferedReader(new FileReader(featFile))) {
      String line = in.readLine();
      if (line == null) {
        // previously this fell through to a NullPointerException on split()
        throw new ConfigurationException("Feature file " + featFile + " is empty!");
      }
      colNames = line.split("\t");
      numCols = colNames.length;
      for (int i = 0; i < numCols; i++) {
        String colName = colNames[i];
        if (QID_COLUMN_NAME.equals(colName)) {
          qidCol = i;
        } else if (DOCID_COLUMN_NAME.equals(colName)) {
          docidCol = i;
        } else if (GRADE_COLUMN_NAME.equals(colName)) {
          gradeCol = i;
        } else {
          numFeats++;
        }
      }
      if (qidCol == -1 || docidCol == -1 || gradeCol == -1) {
        // previously a missing reserved column surfaced later as an
        // ArrayIndexOutOfBoundsException on fvals[-1]
        throw new ConfigurationException("Feature file " + featFile
            + " is missing a required column (qid, docid, or grade)!");
      }
      while (in.readLine() != null) {
        numRows++;
      }
    }

    System.err.println("Query ID column: " + qidCol);
    System.err.println("Document ID column: " + docidCol);
    System.err.println("Grade column: " + gradeCol);
    System.err.println("Number of features: " + numFeats);
    System.err.println("Number of instances: " + numRows);

    // allocate storage now that the dimensions are known
    queryIds = new String[numRows];
    docIds = new String[numRows];
    grades = new float[numRows];
    featureNames = new String[numFeats];
    featureCols = new int[numFeats];
    featureMap = new HashMap<String, Integer>();
    features = new float[numRows][numFeats];

    // record the name and file-column index of every feature, in column order
    int featureId = 0;
    for (int i = 0; i < numCols; i++) {
      String colName = colNames[i];
      if (QID_COLUMN_NAME.equals(colName) || DOCID_COLUMN_NAME.equals(colName)
          || GRADE_COLUMN_NAME.equals(colName)) {
        continue; // reserved column, not a feature
      }
      featureNames[featureId] = colName;
      featureCols[featureId] = i;
      featureMap.put(colName, featureId);
      featureId++;
    }

    // second pass: read the instances
    try (BufferedReader in = new BufferedReader(new FileReader(featFile))) {
      in.readLine(); // skip the header row

      int rowNum = 0;
      String line;
      while ((line = in.readLine()) != null) {
        String[] fvals = line.split("\t");
        if (fvals.length != numCols) {
          throw new ConfigurationException("Line -- " + line
              + " has the incorrect number of columns! " + fvals.length + " " + numCols);
        }
        queryIds[rowNum] = fvals[qidCol];
        docIds[rowNum] = fvals[docidCol];
        grades[rowNum] = Float.parseFloat(fvals[gradeCol]);
        for (int i = 0; i < featureCols.length; i++) {
          features[rowNum][i] = Float.parseFloat(fvals[featureCols[i]]);
        }
        rowNum++;
        if (rowNum % 1000 == 0) {
          System.err.println("Read " + rowNum + " instances...");
        }
      }
    }
  }

  /** Returns the number of loaded instances. */
  public int getNumInstances() {
    return features.length;
  }

  /** Returns the feature vector of instance {@code i} (the backing array, not a copy). */
  public float[] getInstance(int i) {
    return features[i];
  }

  /** Returns the mapping from feature name to feature index. */
  public Map<String, Integer> getFeatureMap() {
    return featureMap;
  }

  /** Returns the query id of every instance, in file order. */
  public String[] getQids() {
    return queryIds;
  }

  /** Returns the document id of every instance, in file order. */
  public String[] getDocids() {
    return docIds;
  }

  /** Returns the relevance grade of every instance, in file order. */
  public float[] getGrades() {
    return grades;
  }

  /**
   * Returns true if {@code f} evaluates to the same value on every pair of
   * consecutive instances that share a query id, i.e. the feature never varies
   * within a query (assuming instances are grouped by query in file order) and
   * therefore carries no ranking signal.
   */
  public boolean featureIsConstant(Feature f) {
    String lastQid = null;
    float lastFv = Float.NaN;
    for (int i = 0; i < queryIds.length; i++) {
      String qid = queryIds[i];
      float fv = f.eval(features[i]);
      if (lastQid == null) {
        // first instance: seed the trackers so the comparison below is a no-op
        lastQid = qid;
        lastFv = fv;
      }
      if (lastQid.equals(qid) && lastFv != fv) {
        return false; // value changed within the same query
      }
      lastQid = qid;
      lastFv = fv;
    }
    return true;
  }

  /**
   * Computes the Pearson correlation coefficient between two features across
   * all instances, using the standard single-pass formula
   * r = (n*Sxy - Sx*Sy) / sqrt((n*Sxx - Sx^2) * (n*Syy - Sy^2)).
   *
   * <p>Fix: the previous implementation mixed a sum-based numerator with an
   * (n-1)-scaled denominator built from population standard deviations, which
   * inflated every result by a factor of n/(n-1) — e.g. a perfectly correlated
   * pair did not return 1.0. Since the old value was the true r times a
   * constant (for fixed n), relative comparisons between feature pairs are
   * unaffected by this correction.
   */
  public double getCorrelation(Feature featA, Feature featB) {
    double sumA = 0.0;  // Sx
    double sumB = 0.0;  // Sy
    double sumAB = 0.0; // Sxy
    double sumAA = 0.0; // Sxx
    double sumBB = 0.0; // Syy
    int n = features.length;
    for (int i = 0; i < n; i++) {
      float x = featA.eval(features[i]);
      float y = featB.eval(features[i]);
      sumA += x;
      sumB += y;
      sumAB += x * y;
      sumAA += x * x;
      sumBB += y * y;
    }
    double cov = n * sumAB - sumA * sumB;        // n^2 * covariance
    double varA = n * sumAA - sumA * sumA;       // n^2 * variance of A
    double varB = n * sumBB - sumB * sumB;       // n^2 * variance of B
    return cov / Math.sqrt(varA * varB);
  }
}