MyArrays.java example

Explorer
fudannlp-master
package edu.fudan.util;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.Map;
import java.util.Map.Entry;
import java.util.TreeMap;

import edu.fudan.ml.types.sv.HashSparseVector;
import gnu.trove.iterator.TIntIntIterator;
import gnu.trove.map.hash.TIntFloatHashMap;
import gnu.trove.map.hash.TIntIntHashMap;

/**
 * 实现数组排序、直方图的功能
 * 
 * @author xpqiu
 * @version 1.0
 * @since FudanNLP 1.5
 */
public class MyArrays {
	/**
	 * 记录之前的label和得分，保留前n个
	 * 
	 * @param score
	 * @param pred
	 * @return 插入位置
	 */
	public static int addBest(float[] scores, Object[] predList, float score, Object pred) {
		int n = scores.length;
		int i;
		for (i = 0; i < n; i++) {
			if (score > scores[i])
				break;
		}
		if (i >= n)
			return -1;
		for (int k = n - 2; k >= i; k--) {
			scores[k + 1] = scores[k];
			predList[k + 1] = predList[k];
		}
		scores[i] = score;
		predList[i] = pred;
		return i;
	}
	
	/**
	 * 
	 * @param freqmap
	 * @return
	 */
	public static TreeMap<Integer, Integer> countFrequency(TIntIntHashMap freqmap) {
		TreeMap<Integer, Integer> map = new TreeMap<Integer,Integer>();
		TIntIntIterator it = freqmap.iterator();
		while(it.hasNext()){
			it.advance();
			int freq = it.value();
			if(map.containsKey(freq)){
				map.put(freq, map.get(freq)+1);
			}else
				map.put(freq, 1);
		}
		return map;		
	}

	/**
	 * 
	 * @param count
	 * @param nbin
	 * @return 直方图
	 */
	public static float[][] histogram(float[] count, int nbin) {
		float maxCount = Float.NEGATIVE_INFINITY;
		float minCount = Float.MAX_VALUE;
		for (int i = 0; i < count.length; i++) {
			if (maxCount < count[i]) {
				maxCount = count[i];
			}
			if (minCount > count[i]) {
				minCount = count[i];
			}
		}
		float[][] hist = new float[2][nbin];
		float interv = (maxCount - minCount) / nbin;
		for (int i = 0; i < count.length; i++) {
			int idx = (int) Math.floor((count[i] - minCount) / interv);
			if (idx == nbin)
				idx--;
			hist[0][idx]++;
		}
		for (int i = 0; i < nbin; i++) {
			hist[1][i] = minCount + i * interv;
		}
		return hist;
	}

	/**
	 * 归一化
	 * 
	 * @param c
	 */
	public static void normalize(float[] c) {
		float max = Float.MIN_VALUE;
		float min = Float.MAX_VALUE;
		for (int i = 0; i < c.length; i++) {
			if (min > c[i])
				min = c[i];
			if (max < c[i])
				max = c[i];
		}
		float val = max - min;
		if (val == 0)
			return;
		for (int i = 0; i < c.length; i++) {
			c[i] = (c[i] - min) / val;
		}
	}
	
	/**
	 * 概率归一化
	 * 
	 * @param c 数组元素必须大于等于0
	 */
	public static void normalize2Prop(float[] c) {
		float sum = sum(c);
		
		if (sum == 0)
			return;
		for (int i = 0; i < c.length; i++) {
			c[i] = c[i] / sum;
		}
	}

	/**
	 * 对数组的绝对值由大到小排序，返回调整后元素对于的原始下标
	 * 
	 * @param c
	 *            待排序数组
	 * @return 原始下标
	 */
	public static int[] sort(float[] c) {

		HashMap<Integer, Float> map = new HashMap<Integer, Float>();

		for (int i = 0; i < c.length; i++) {
			if (c[i] != 0.0) {
				map.put(i, Math.abs(c[i]));
			}
		}
		ArrayList<Map.Entry<Integer, Float>> list = new ArrayList<Map.Entry<Integer, Float>>(
				map.entrySet());

		Collections.sort(list, new Comparator<Map.Entry<Integer, Float>>() {
			@Override
			public int compare(Entry<Integer, Float> o1,
					Entry<Integer, Float> o2) {

				if (o2.getValue() > o1.getValue()) {
					return 1;
				} else if (o1.getValue() > o2.getValue()) {
					return -1;
				} else {
					return 0;
				}
			}
		});
		int[] idx = new int[list.size()];
		for (int i = 0; i < list.size(); i++) {
			idx[i] = list.get(i).getKey();
		}
		return idx;
	}

	/**
	 * 得到总能量值大于thres的元素对应的下标
	 * 
	 * @param c
	 * @param thres
	 * @param r
	 *            true表示返回最大的，false表示返回剩余的
	 * @return 元素下标
	 */
	public static int[] getTop(float[] c, float thres, boolean r) {
		int[] idx = sort(c);
		int i;
		float total = 0;
		float[] cp = new float[idx.length];
		for (i = 0; i < idx.length; i++) {
			cp[i] = (float) Math.pow(c[idx[i]], 2);
			total += cp[i];
		}

		float ratio = 0;
		for (i = 0; i < idx.length; i++) {
			ratio += cp[i] / total;
			if (ratio > thres)
				break;
		}
		int[] a;
		if (r)
			a = Arrays.copyOfRange(idx, 0, i);
		else
			a = Arrays.copyOfRange(idx, i, idx.length);
		return a;
	}

	/**
	 * 对部分下标的元素赋值
	 * 
	 * @param c
	 *            数组
	 * @param idx
	 *            赋值下标
	 * @param v
	 *            值
	 */
	public static void set(float[] c, int[] idx, float v) {
		for (int i = 0; i < idx.length; i++) {
			c[idx[i]] = v;
		}
	}
	
	/**
	 * 移除能量值小于一定阈值的项
	 * @param c 数组
	 * @param v 阈值
	 */
	public static void trim(float[] c, float v) {
		int[] idx = getTop(c, v, false);
		set(c, idx, 0.0f);
	}

	/**
	 * 求和 
	 * @param c
	 * @return 所有元素的和
	 */
	public static int sum(int[] c) {
		int s = 0;
		for (int i = 0; i < c.length; i++) {
			if (c[i] != 0)
				s+=c[i];
		}
		return s;
	}
	/**
	 * 累加
	 * @param c
	 * @return 所有元素的和
	 */
	public static int[] accumulate(int[] c) {
		int[] s = new int[c.length];
		s[0] =c[0];
		for (int i = 1; i < c.length; i++) {			
				s[i]+=s[i-1]+c[i];
		}
		return s;
	}
	/**
	 * 计算方差
	 * @param c
	 * @return
	 */
	public static float viarance(float[] c) {
		float aver = average(c);
		float via = 0.0f;
		for(int i=0;i<c.length;i++){
			float diff = c[i]-aver;
			via+=diff*diff;
		}
		return via/c.length;
	}
	/**
	 * 计算熵
	 * @param c 概率数组
	 * @return
	 */
	public static float entropy(float[] c) {
		
		float e = 0.0f;
		for(int i=0;i<c.length;i++){
			if(c[i]!=0.0&&c[i]!=1){
				e -= c[i]*Math.log(c[i]);
			}
			
		}
		return e;
	}
	
	/**
	 * 计算熵，先将频率转换为概率
	 * @param c 频率数组
	 * @return
	 */
	public static float entropy(int[] c) {
		//total 频率总数
		float total = sum(c);
		if(total==0f)
			return 0f;
		float[] prop = new float[c.length];
		for(int i=0;i<c.length;i++){
			prop[i] = c[i]/(total);	
		}
		return entropy(prop);
	}
	
	/**
	 * 求和 
	 * @param c float
	 * @return 所有元素的和
	 */
	public static float average(float[] c) {
		float s = sum(c);
		return s/c.length;
	}
	/**
	 * 求和 
	 * @param c float
	 * @return 所有元素的和
	 */
	public static float sum(float[] c) {
		float s = 0;
		for (int i = 0; i < c.length; i++) {
			s+=c[i];
		}
		return s;
	}
	
	/**
	 * 统计非零个数
	 * 
	 * @param c
	 * @return 非零元素数量
	 */
	public static int countNoneZero(float[] c) {
		int count = 0;
		for (int i = 0; i < c.length; i++) {
			if (c[i] != 0.0)
				count++;
		}
		return count;
	}

	/**
	 * 统计非零元素
	 * 
	 * @param c
	 * @return 非零元素标记
	 */
	public static boolean[] getNoneZeroIdx(float[] c) {
		boolean[] b = new boolean[c.length];
		for (int i = 0; i < c.length; i++) {
			if (c[i] != 0.0)
				b[i] = true;
		}
		return b;
	}

	public static int[] string2int(String[] c) {
		int[] d = new int[c.length];
		for (int i = 0; i < c.length; i++) {
			d[i] = Integer.parseInt(c[i]);
		}
		return d;
	}
	public static String[] int2string(int[] c) {
		String[] d = new String[c.length];
		for (int i = 0; i < c.length; i++) {
			d[i] = String.valueOf(c[i]);
		}
		return d;
	}

	/**
	 * 对数组的绝对值由大到小排序，返回调整后元素对于的原始下标
	 * 
	 * @param data
	 *            待排序数组
	 * @return 原始下标
	 */
	public static int[] sort(TIntFloatHashMap data) {
		

		return MyCollection.sort(data);
	}

	/**
	 * 得到总能量值大于thres的元素对应的下标
	 * 
	 * @param data 稀疏向量
	 * @param thres
	 * @return 元素下标 int[][] 第一列表示大于阈值的元素 第二列表示小于阈值的元素
	 */
	public static int[][] getTop(TIntFloatHashMap data, float thres) {
		int[] idx = sort(data);
		int i;
		float total = 0;
		float[] cp = new float[idx.length];
		for (i = idx.length; i-- > 0;) {
			cp[i] = (float) Math.pow(data.get(idx[i]), 2);
			total += cp[i];
		}

		float ratio = 0;
		for (i = 0; i < idx.length; i++) {
			ratio += cp[i] / total;
			if (ratio > thres)
				break;
		}
		int[][] a = new int[2][];
		a[0] = Arrays.copyOfRange(idx, 0, i);
		a[1] = Arrays.copyOfRange(idx, i, idx.length);
		return a;
	}

	/**
	 * 对部分下标的元素赋零
	 * 
	 * @param data
	 *            数组
	 * @param idx
	 *            赋值下标
	 */
	public static void setZero(TIntFloatHashMap data, int[] idx) {
		for(int i = 0; i < idx.length; i++)	{
			if (data.containsKey(idx[i]))	{
				data.remove(idx[i]);
			}
		}
	}
	/**
	 * 移除能量值小于一定阈值的项
	 * @return 
	 * 
	 */
	public static int[] trim(TIntFloatHashMap data, float v) {
		int[][] idx = getTop(data, v);
		setZero(data, idx[1]);
		return idx[0];
	}
	
	/**
	 * 移除能量值小于一定阈值的项
	 * 
	 */
	public static void trim(HashSparseVector c, float v) {
		trim(c.data,v);		
	}
}