package nicetext;

import org.jblas.DoubleMatrix;

import java.util.*;

/**
 * Scores every n-gram in a corpus with TF-IDF and reports, per document, the
 * n-grams whose score clears THRESHOLD, after pruning lower-order n-grams
 * that are contained in a surviving higher-order one.
 *
 * @author vikasing
 */
public class TfIdf {

    private static final double ALPHA = 0.4;      // smoothing for the (commented-out) augmented-frequency TF
    private static final double THRESHOLD = 4.5;  // minimum TF-IDF score for an n-gram to count as a keyword

    // Corpus state. The code that populates these maps is not part of this
    // file; they are declared here (package-private) so a driver in this
    // package can fill them in before calling calculateWordRarity().
    Map<String, Integer> allTextNgMap = new HashMap<>();             // every n-gram in the corpus -> corpus-wide count
    Map<String, Map<String, Integer>> ngramDocMap = new HashMap<>(); // file name -> (n-gram -> in-document count)
    int totalDocs;

    public void calculateWordRarity() {
        // Term-frequency matrix: one row per n-gram, one column per document.
        double[][] nGramMatrix = new double[allTextNgMap.size()][totalDocs];
        String[] allGsArr = allTextNgMap.keySet().toArray(new String[allTextNgMap.size()]);
        Arrays.sort(allGsArr); // Arrays.binarySearch below requires a sorted array

        Map<Integer, String> fileNameMap = new HashMap<>();
        int i = 0;
        for (String fileName : ngramDocMap.keySet()) {
            fileNameMap.put(i, fileName);
            Map<String, Integer> singleDocNGramMap = ngramDocMap.get(fileName);
            for (String gram : singleDocNGramMap.keySet()) {
                nGramMatrix[Arrays.binarySearch(allGsArr, gram)][i] = singleDocNGramMap.get(gram);
            }
            i++;
        }

        Map<String, Map<String, Double>> keywordMap = calculateTFIDF(nGramMatrix, allGsArr, fileNameMap);
        for (String fileName : keywordMap.keySet()) {
            System.out.println("=========================== keywords for " + fileName + " ===========================");
            Set<String> keywordSet = new TreeSet<>(keywordMap.get(fileName).keySet());
            Set<String> gramsToRemove = new HashSet<>();
            String[] keywordArr = keywordSet.toArray(new String[keywordSet.size()]);
            for (int j = 0; j < keywordArr.length; j++) {
                getOverlapping(gramsToRemove, keywordMap.get(fileName), keywordArr, keywordArr[j]);
            }
            keywordSet.removeAll(gramsToRemove);
            for (String keyword : keywordSet) {
                System.out.println(keyword + " " + keywordMap.get(fileName).get(keyword));
            }
        }
    }

    /**
     * Marks lower-order n-grams for removal when they are contained in a
     * higher-order keyword: a trigram subsumes its two constituent bigrams,
     * and a bigram subsumes its individual words.
     */
    private void getOverlapping(Set<String> gramsToRemove, Map<String, Double> gramMap,
                                String[] keywordArr, String word) {
        String[] tempArr = word.split(" ");
        if (tempArr.length > 2) {
            searchArray(gramsToRemove, gramMap, keywordArr, word, tempArr[0] + " " + tempArr[1]);
            searchArray(gramsToRemove, gramMap, keywordArr, word, tempArr[1] + " " + tempArr[2]);
        } else if (tempArr.length > 1) {
            for (int k = 0; k < tempArr.length; k++) {
                searchArray(gramsToRemove, gramMap, keywordArr, word, tempArr[k]);
            }
        }
    }

    private void searchArray(Set<String> gramsToRemove, Map<String, Double> gramMap,
                             String[] keywordArr, String word, String tempWord) {
        // keywordArr comes from a TreeSet, so it is already sorted.
        int pos = Arrays.binarySearch(keywordArr, tempWord);
        if (pos > -1) { // && (gramMap.get(tempWord).compareTo(gramMap.get(word)) == 0)) {
            gramsToRemove.add(keywordArr[pos]);
        }
    }

    private Map<String, Map<String, Double>> calculateTFIDF(double[][] bigArr, String[] allGs,
                                                            Map<Integer, String> fileNameMap) {
        DoubleMatrix doubleMatrix = new DoubleMatrix(bigArr);
        int columns = doubleMatrix.columns;
        Map<String, Map<String, Double>> keywordMap = new HashMap<>();

        // Per-document maximum raw frequency; only needed by the
        // augmented-frequency TF variant that is commented out below.
        Map<Integer, Integer> maxFreqMap = new LinkedHashMap<>();
        for (int i = 0; i < columns; i++) {
            DoubleMatrix aColumn = doubleMatrix.getColumn(i);
            int maxFrequency = 0;
            for (int j = 0; j < aColumn.rows; j++) {
                int temp = (int) aColumn.get(j);
                if (temp > maxFrequency) {
                    maxFrequency = temp;
                }
            }
            maxFreqMap.put(i, maxFrequency);
        }

        for (int j = 0; j < bigArr.length; j++) {
            // Document frequency: in how many documents does n-gram j appear?
            double counter = 0;
            int numOfDocs = bigArr[j].length;
            for (int k = 0; k < numOfDocs; k++) {
                if (bigArr[j][k] != 0) {
                    counter++;
                }
            }
            for (int k = 0; k < numOfDocs; k++) {
                if (bigArr[j][k] != 0) {
                    double tf = Math.log(bigArr[j][k] + 1); // log-scaled term frequency
                    //double tf = bigArr[j][k];
                    //double tf = ALPHA + ((1 - ALPHA) * bigArr[j][k]) / (double) maxFreqMap.get(k);
                    double idf = Math.log((double) numOfDocs / counter);
                    if (tf * idf > THRESHOLD) {
                        keywordMap.computeIfAbsent(fileNameMap.get(k), f -> new HashMap<>())
                                  .put(allGs[j], tf * idf);
                    }
                }
            }
        }
        return keywordMap;
    }

    public static void main(String[] args) {
        TfIdf tfIdf = new TfIdf();
        tfIdf.calculateWordRarity();
    }
}
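
/*
 * Usage sketch: a minimal, hypothetical driver showing the shape of input
 * TfIdf expects. The tokenization code that builds allTextNgMap/ngramDocMap
 * is not part of this file, so the toy counts below are assumptions about
 * that shape; with a corpus this small every TF-IDF score stays below
 * THRESHOLD (4.5), so a realistic run needs many documents before any
 * keywords are printed.
 */
class TfIdfDemo {

    public static void main(String[] args) {
        TfIdf tfIdf = new TfIdf();

        // Hand-built n-gram counts for two tiny "documents".
        Map<String, Integer> doc1 = new HashMap<>();
        doc1.put("neural", 3);
        doc1.put("network", 3);
        doc1.put("neural network", 3);

        Map<String, Integer> doc2 = new HashMap<>();
        doc2.put("network", 1);
        doc2.put("protocol", 2);

        tfIdf.ngramDocMap.put("doc1.txt", doc1);
        tfIdf.ngramDocMap.put("doc2.txt", doc2);
        tfIdf.totalDocs = 2;

        // allTextNgMap holds every n-gram seen anywhere in the corpus.
        for (Map<String, Integer> doc : tfIdf.ngramDocMap.values()) {
            for (Map.Entry<String, Integer> e : doc.entrySet()) {
                tfIdf.allTextNgMap.merge(e.getKey(), e.getValue(), Integer::sum);
            }
        }

        tfIdf.calculateWordRarity();
    }
}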