// Instances.java example
//
// Explorer
// Ivory-master
// - src
//   - java
/*
 * Ivory: A Hadoop toolkit for web-scale information retrieval
 * 
 * Licensed under the Apache License, Version 2.0 (the "License"); you
 * may not use this file except in compliance with the License. You may
 * obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0 
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 * implied. See the License for the specific language governing
 * permissions and limitations under the License.
 */

package ivory.ltr;



import ivory.core.ConfigurationException;

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;


/**
 * In-memory table of learning-to-rank instances loaded from a tab-separated
 * feature file.
 *
 * <p>The first line of the file is a header of column names. The columns named
 * {@code qid}, {@code docid} and {@code grade} supply the query id, document id
 * and relevance grade of each row; every other column is parsed as a
 * {@code float} feature value.</p>
 *
 * @author metzler
 *
 */
public class Instances {

	private static final String QID_COLUMN_NAME = "qid";      // query id feature name
	private static final String DOCID_COLUMN_NAME = "docid";  // document id feature name
	private static final String GRADE_COLUMN_NAME = "grade";  // relevance grade feature name
	
	private String [] queryIds = null;              // query ids
	private String [] docIds = null;                // document ids
	private float [] grades = null;                 // relevance grades
	private String [] featureNames = null;          // feature names
	private int [] featureCols = null;              // feature column indexes
	private Map<String,Integer> featureMap = null;  // mapping from feature name to feature index
	private float [][] features = null;             // feature values, indexed [row][feature]
	
	/**
	 * Loads all instances from the given feature file.
	 *
	 * @param featFile path of the tab-separated feature file
	 * @throws IOException if the file cannot be opened or read
	 * @throws ConfigurationException if the file is empty, the header lacks one of
	 *         the qid/docid/grade columns, or a row has the wrong number of columns
	 */
	public Instances(String featFile) throws IOException, ConfigurationException {
		initialize(featFile);
	}

	/**
	 * Reads the feature file in two passes: the first parses the header and counts
	 * the rows so the arrays can be allocated at their exact size, the second
	 * fills them in.
	 *
	 * @param featFile path of the tab-separated feature file
	 * @throws IOException if the file cannot be opened or read
	 * @throws ConfigurationException if the file is empty, the header lacks one of
	 *         the qid/docid/grade columns, or a row has the wrong number of columns
	 */
	private void initialize(String featFile) throws IOException, ConfigurationException {
		String [] colNames = null;
		int numCols = 0;
		int numFeats = 0;
		int numRows = 0;
		
		// query id, document id, and grade column indexes
		int qidCol = -1;
		int docidCol = -1;
		int gradeCol = -1;
		
		// first pass: parse the header and count the instances
		BufferedReader in = new BufferedReader(new FileReader(featFile));
		try {
			String header = in.readLine();
			if(header == null) {
				// readLine() returns null at end of stream, i.e. the file has no header at all
				throw new ConfigurationException("Feature file " + featFile + " is empty -- expected a tab-separated header line!");
			}
			
			colNames = header.split("\t");
			numCols = colNames.length;
			
			for(int i = 0; i < numCols; i++) {
				String colName = colNames[i];
				if(QID_COLUMN_NAME.equals(colName)) {
					qidCol = i;
				}
				else if(DOCID_COLUMN_NAME.equals(colName)) {
					docidCol = i;
				}
				else if(GRADE_COLUMN_NAME.equals(colName)) {
					gradeCol = i;
				}
				else {
					numFeats++;
				}
			}
			
			System.err.println("Query ID column: "  + qidCol);
			System.err.println("Document ID column: "  + docidCol);
			System.err.println("Grade column: "  + gradeCol);
			System.err.println("Number of features: "  + numFeats);
			
			// fail here with a clear message rather than later with an
			// ArrayIndexOutOfBoundsException on fvals[-1]
			if(qidCol < 0 || docidCol < 0 || gradeCol < 0) {
				throw new ConfigurationException("Feature file " + featFile + " must have columns named \""
						+ QID_COLUMN_NAME + "\", \"" + DOCID_COLUMN_NAME + "\" and \"" + GRADE_COLUMN_NAME + "\"!");
			}
			
			// every remaining line is one instance
			while(in.readLine() != null) {
				numRows++;
			}
			
			System.err.println("Number of instances: " + numRows);
		}
		finally {
			// close even when parsing fails so the file handle is not leaked
			in.close();
		}
		
		// initialize query ids, doc ids, grades, and features
		queryIds = new String[numRows];
		docIds = new String[numRows];
		grades = new float[numRows];
		featureNames = new String[numFeats];
		featureCols = new int[numFeats];
		featureMap = new HashMap<String,Integer>();
		features = new float[numRows][numFeats];
		
		// assign consecutive feature ids to the non-metadata columns, in header order
		int featureId = 0;
		for(int i = 0; i < numCols; i++) {
			String colName = colNames[i];
			if(isMetadataColumn(colName)) {
				continue;
			}
			featureNames[featureId] = colName;
			featureCols[featureId] = i;
			featureMap.put(colName, featureId);
			featureId++;
		}
		
		// second pass: read the instances
		in = new BufferedReader(new FileReader(featFile));
		try {
			// skip the header, which was already parsed in the first pass
			in.readLine();
			
			int rowNum = 0;
			String line;
			while((line = in.readLine()) != null) {
				String [] fvals = line.split("\t");
				if(fvals.length != numCols) {
					throw new ConfigurationException("Line -- " + line + " has the incorrect number of columns! "+fvals.length+" "+numCols);
				}
				
				// NOTE(review): the explicit copies presumably detach the ids from the
				// line's backing storage on older JVMs -- confirm before removing
				queryIds[rowNum] = new String(fvals[qidCol]);
				docIds[rowNum] = new String(fvals[docidCol]);
				grades[rowNum] = Float.parseFloat(fvals[gradeCol]);
				
				for(int i = 0; i < featureCols.length; i++) {
					features[rowNum][i] = Float.parseFloat(fvals[featureCols[i]]);
				}
				
				rowNum++;
				if(rowNum % 1000 == 0) {
					System.err.println("Read " + rowNum + " instances...");
				}
			}
		}
		finally {
			in.close();
		}
	}

	/**
	 * Tells whether a header column holds the query id, document id or grade
	 * rather than a feature value.
	 */
	private static boolean isMetadataColumn(String colName) {
		return QID_COLUMN_NAME.equals(colName)
				|| DOCID_COLUMN_NAME.equals(colName)
				|| GRADE_COLUMN_NAME.equals(colName);
	}

	/**
	 * @return the number of instances (rows) that were loaded
	 */
	public int getNumInstances() {
		return features.length;
	}

	/**
	 * @param i row index of the instance
	 * @return the feature vector of row {@code i}; this is the internal array, not a copy
	 */
	public float [] getInstance(int i) {
		return features[i];
	}

	/**
	 * @return the internal mapping from feature name to its index within a feature vector
	 */
	public Map<String, Integer> getFeatureMap() {
		return featureMap;
	}

	/**
	 * @return the query id of every row; this is the internal array, not a copy
	 */
	public String[] getQids() {
		return queryIds;
	}

	/**
	 * @return the document id of every row; this is the internal array, not a copy
	 */
	public String[] getDocids() {
		return docIds;
	}

	/**
	 * @return the relevance grade of every row; this is the internal array, not a copy
	 */
	public float[] getGrades() {
		return grades;
	}

	/**
	 * Checks whether a feature takes a single value within each query.
	 *
	 * <p>Only adjacent rows are compared, so a change in value is detected only
	 * between consecutive rows that carry the same query id.</p>
	 *
	 * @param f feature to evaluate on every instance
	 * @return {@code false} as soon as two adjacent rows with the same query id
	 *         evaluate to different values, {@code true} otherwise
	 */
	public boolean featureIsConstant(Feature f) {
		String lastQid = null;
		float lastFv = Float.NaN;
		for(int i = 0; i < queryIds.length; i++) {
			String qid = queryIds[i];
			float fv = f.eval(features[i]);
			if(lastQid == null) {
				// first row: there is nothing to compare against yet
				lastQid = qid;
				lastFv = fv;
			}
			if(lastQid.equals(qid) && lastFv != fv) {
				return false;
			}
			lastQid = qid;
			lastFv = fv;
		}
		
		return true;
	}

	/**
	 * Computes the Pearson correlation coefficient between two features over all
	 * instances.
	 *
	 * @param featA first feature
	 * @param featB second feature
	 * @return the correlation, or {@code NaN} when there are no instances or
	 *         either feature has no variance
	 */
	public double getCorrelation(Feature featA, Feature featB) {
		int n = features.length;
		if(n == 0) {
			return Double.NaN;
		}
		
		double sumA = 0.0;
		double sumB = 0.0;
		double sumAB = 0.0;
		double sumAA = 0.0;
		double sumBB = 0.0;
		
		for(int i = 0; i < n; i++) {
			// widen before multiplying so the products are formed in double precision
			double x = featA.eval(features[i]);
			double y = featB.eval(features[i]);
			
			sumA += x;
			sumB += y;
			sumAB += x*y;
			sumAA += x*x;
			sumBB += y*y;
		}
		
		// centred sums: sum((x-ma)*(y-mb)), sum((x-ma)^2), sum((y-mb)^2)
		double crossDev = sumAB - sumA*sumB / n;
		double sqDevA = sumAA - sumA*sumA / n;
		double sqDevB = sumBB - sumB*sumB / n;
		
		// zero variance (or a slightly negative value from rounding) has no defined correlation
		if(sqDevA <= 0.0 || sqDevB <= 0.0) {
			return Double.NaN;
		}
		
		// the same normalisation is used above and below the division, so the
		// result stays within [-1, 1] up to rounding
		return crossDev / ( Math.sqrt(sqDevA) * Math.sqrt(sqDevB) );
	}
}