package focusedCrawler.link.linkanalysis; import java.io.IOException; import java.util.Collections; import java.util.HashMap; import java.util.Iterator; import java.util.Vector; import focusedCrawler.link.BipartiteGraphRepository; import focusedCrawler.util.persistence.Tuple; import focusedCrawler.util.vsm.VSMElement; import focusedCrawler.util.vsm.VSMElementComparator; public class SALSA { private HashMap<String,Vector<VSMElement>> incidenceHubMatrix; private HashMap<String,Vector<VSMElement>> incidenceAuthMatrix; private BipartiteGraphRepository graphRep; private HashMap<String,VSMElement> initialValues; private HashMap<String,VSMElement> nodeRelevance; private boolean pageRank = false; private HashMap<String,VSMElement> hubValues = new HashMap<String, VSMElement>(); private HashMap<String,VSMElement> authValues = new HashMap<String, VSMElement>(); public SALSA(BipartiteGraphRepository graphRep){ this.graphRep = graphRep; this.incidenceHubMatrix = new HashMap<String, Vector<VSMElement>>(); this.incidenceAuthMatrix = new HashMap<String, Vector<VSMElement>>(); this.initialValues = new HashMap<String, VSMElement>(); this.nodeRelevance = new HashMap<String, VSMElement>(); } public VSMElement[] getHubValues() throws IOException{ Iterator<VSMElement> elems = hubValues.values().iterator(); VSMElement[] result = new VSMElement[hubValues.values().size()]; int i = 0; while(elems.hasNext()){ VSMElement elem = elems.next(); String id = elem.getWord(); String url = graphRep.getHubURL(id); elem.setWord(url); result[i] = elem; i++; } return result; } public VSMElement[] getAuthValues(){ Iterator<VSMElement> elems = authValues.values().iterator(); VSMElement[] result = new VSMElement[authValues.values().size()]; int i = 0; while(elems.hasNext()){ VSMElement elem = elems.next(); String id = elem.getWord(); String url = graphRep.getAuthURL(id); elem.setWord(url); result[i] = elem; i++; } return result; } public void setPageRank(boolean pageRank){ this.pageRank = pageRank; } public void setNodeRelevance(HashMap<String,VSMElement> nr){ this.nodeRelevance = nr; } public void execute() throws Exception{ createInitialMatrices(); hubValues = initialValues; authValues = initialValues; for (int i = 0; i < 10; i++) { System.out.println("Iteration:" + i); hubValues = updateVector(incidenceHubMatrix,hubValues,"hub"); authValues = updateVector(incidenceAuthMatrix,authValues,"auth"); } Vector<VSMElement> sortedNodes = new Vector<VSMElement>(hubValues.values()); Collections.sort(sortedNodes, new VSMElementComparator()); for (int i = 0; i < sortedNodes.size() && i < 100; i++) { VSMElement elem = sortedNodes.elementAt(i); String hubURL = graphRep.getHubURL(elem.getWord()).toString(); System.out.println(i + ":HUB:" + elem.getWord() + ":" + hubURL + "=" + elem.getWeight()); } sortedNodes = new Vector<VSMElement>(authValues.values()); Collections.sort(sortedNodes, new VSMElementComparator()); for (int i = 0; i < sortedNodes.size() && i < 100; i++) { VSMElement elem = sortedNodes.elementAt(i); if(graphRep.getAuthURL(elem.getWord()) != null){ String authURL = graphRep.getAuthURL(elem.getWord()).toString(); System.out.println(i + ":AUTH:" + elem.getWord() + ":" + authURL + "=" + elem.getWeight()); } } } private void createInitialMatrices() throws Exception{ HashMap<String,Vector<VSMElement>> lr = new HashMap<String,Vector<VSMElement>>(); HashMap<String,Vector<VSMElement>> lcTranspose = new HashMap<String,Vector<VSMElement>>(); // Tuple[] authTuples = new Tuple[8]; // Tuple[] hubTuples = new Tuple[3]; // authTuples[0] = new Tuple("1","2###"); // authTuples[1] = new Tuple("0","2###"); // authTuples[2] = new Tuple("3","2###5###"); // authTuples[3] = new Tuple("4","5###"); // authTuples[4] = new Tuple("6","5###7###"); // authTuples[5] = new Tuple("8","7###"); // authTuples[6] = new Tuple("9","7###"); // authTuples[7] = new Tuple("10","7###"); // hubTuples[0] = new Tuple("2","0###1###3###"); // hubTuples[1] = new Tuple("5","3###4###6###"); // hubTuples[2] = new Tuple("7","6###8###9###10###"); Tuple<String>[] hubTuples = graphRep.getHubGraph(); for (int i = 0; i < hubTuples.length; i++) {//prob for hubs String key = hubTuples[i].getKey(); if(initialValues.get(key) == null){ initialValues.put(key, new VSMElement(key, 1)); } Vector<VSMElement> row = lr.get(key); if(row == null){ row = new Vector<VSMElement>(); } String values = hubTuples[i].getValue(); String[] outlinks = parseRecord(values); for (int j = 0; j < outlinks.length; j++) { if(initialValues.get(outlinks[j]) == null){ initialValues.put(outlinks[j], new VSMElement(outlinks[j], 1)); } row.add(new VSMElement(outlinks[j], 1/(double)outlinks.length)); } lr.put(key,row); } System.out.println("LR:" + lr.size()); Tuple<String>[] authTuples = graphRep.getAuthGraph(); for (int i = 0; i < authTuples.length; i++) { String key = authTuples[i].getKey(); if(initialValues.get(key) == null){ initialValues.put(key, new VSMElement(key, 1)); } Vector<VSMElement> row = lcTranspose.get(key); if(row == null){ row = new Vector<VSMElement>(); } String values = authTuples[i].getValue(); String[] backlinks = parseRecord(values); for (int j = 0; j < backlinks.length; j++) { if(initialValues.get(backlinks[j]) == null){ initialValues.put(backlinks[j], new VSMElement(backlinks[j], 1)); } row.add(new VSMElement(backlinks[j], 1/(double)backlinks.length)); } lcTranspose.put(key,row); } System.out.println("LC:" + lcTranspose.size()); incidenceHubMatrix = multiply(lr,lcTranspose); System.out.println("incidenceHubMatrix:" + incidenceHubMatrix.size()); incidenceAuthMatrix = multiply(lcTranspose,lr); System.out.println(incidenceAuthMatrix.size()); } private HashMap<String,Vector<VSMElement>> transpose(HashMap<String,Vector<VSMElement>> matrix){ HashMap<String,Vector<VSMElement>> result = new HashMap<String, Vector<VSMElement>>(); Iterator<String> keys = matrix.keySet().iterator(); while(keys.hasNext()){ String key = keys.next(); Vector<VSMElement> elems = matrix.get(key); for (int i = 0; i < elems.size(); i++) { String newKey = elems.elementAt(i).getWord(); Vector<VSMElement> newValues = result.get(newKey); if(newValues == null){ newValues = new Vector<VSMElement>(); } newValues.add(new VSMElement(key, elems.elementAt(i).getWeight())); result.put(newKey,newValues); } } return result; } private HashMap<String,Vector<VSMElement>> multiply(HashMap<String,Vector<VSMElement>> matrix1, HashMap<String,Vector<VSMElement>> matrix2){ HashMap<String,Vector<VSMElement>> result = new HashMap<String, Vector<VSMElement>>(); HashMap<String,Vector<VSMElement>> matrix2Trans = transpose(matrix2); Iterator<String> keys = matrix1.keySet().iterator(); while(keys.hasNext()){ String key = keys.next(); Vector<VSMElement> elemsMatrix1 = matrix1.get(key); Iterator<String> keys2 = matrix2Trans.keySet().iterator(); while(keys2.hasNext()){ String key2 = keys2.next(); Vector<VSMElement> elemsMatrix2 = matrix2Trans.get(key2); double sum = sumVectors(elemsMatrix1,elemsMatrix2); if(sum != 0){ Vector<VSMElement> resultElems = result.get(key); if(resultElems == null){ resultElems = new Vector<VSMElement>(); } resultElems.add(new VSMElement(key2, sum)); result.put(key, resultElems); } } } return result; } private double sumVectors(Vector<VSMElement> v1, Vector<VSMElement> v2){ double result = 0; for (int i = 0; i < v1.size(); i++) { VSMElement elem1 = v1.elementAt(i); for (int j = 0; j < v2.size(); j++) { VSMElement elem2 = v2.elementAt(j); if(elem1.getWord().equals(elem2.getWord())){ result = result + elem1.getWeight()*elem2.getWeight(); } } } return result; } private HashMap<String,VSMElement> updateVector(HashMap<String,Vector<VSMElement>> incidenceMatrix, HashMap<String,VSMElement> values, String prefix){ HashMap<String,VSMElement> newValues = new HashMap<String, VSMElement>(); for (Iterator<String> iterator = incidenceMatrix.keySet().iterator(); iterator.hasNext();) { String key = (String) iterator.next(); Vector<VSMElement> neighbours = incidenceMatrix.get(key); for (int j = 0; j < neighbours.size(); j++) { VSMElement value = values.get(neighbours.elementAt(j).getWord()); if(value != null){ double newValue = (value.getWeight()*neighbours.elementAt(j).getWeight()); VSMElement elem = newValues.get(neighbours.elementAt(j).getWord()); if(elem != null){ newValue = newValue + elem.getWeight(); } newValues.put(neighbours.elementAt(j).getWord(), new VSMElement(neighbours.elementAt(j).getWord(),newValue)); } } } if(pageRank){ Iterator<String> iter = newValues.keySet().iterator(); while(iter.hasNext()){ String key = iter.next(); double rel = 0; VSMElement nodeRel = nodeRelevance.get(key + "_" + prefix); if(nodeRel != null){ rel = nodeRel.getWeight(); } VSMElement elem = newValues.get(key); elem.setWeight(0.15 * elem.getWeight() + 0.85 * rel); } } normalize(newValues); return newValues; } private void normalize(HashMap<String,VSMElement> values){ //normalize double total = 0; Iterator<String> iter = values.keySet().iterator(); while(iter.hasNext()){ String key = iter.next(); VSMElement elem = values.get(key); total = total + elem.getWeight(); } iter = values.keySet().iterator(); while(iter.hasNext()){ String key = iter.next(); VSMElement elem = values.get(key); elem.setWeight(elem.getWeight()/total); } } private String[] parseRecord(String strLinks){ if(strLinks != null){ return strLinks.split("###"); }else{ return null; } } private double[][] transpose(double[][] matrix){ double[][] result = new double[matrix.length][matrix.length]; for (int i = 0; i < matrix.length; i++) { for (int j = 0; j < matrix.length; j++) { result[j][i] = matrix[i][j]; } } return result; } public double[][] multiply(double[][] matrixA, double[][] matrixB){ double[][] result = new double[matrixA.length][matrixB.length]; double[][] matrixC = transpose(matrixB); for (int i = 0; i < result.length; i++) { double[] matrixArow = matrixA[i]; for (int j = 0; j < matrixArow.length; j++) { double[] matrixCrow = matrixC[j]; for (int k = 0; k < matrixCrow.length; k++) { result[i][j] = result[i][j] + matrixArow[k]*matrixCrow[k]; } } } return result; } public void pageRank(){ double[] seeds = new double[12]; seeds[1] = 0.33; seeds[3] = 0.33; seeds[4] = 0.33; double[] weights = new double[12]; weights[0] = 0; weights[1] = 0; weights[2] = 1; weights[3] = 0; weights[4] = 0; weights[5] = 0; weights[6] = 0; weights[7] = 0; weights[8] = 0; weights[9] = 0; weights[10] = 0; weights[11] = 0; double[][] transMatrix = new double[12][12]; transMatrix[0][2] = 1; transMatrix[1][2] = 1; transMatrix[2][0] = 1; transMatrix[2][1] = 1; transMatrix[2][3] = 1; transMatrix[3][2] = 1; transMatrix[3][5] = 1; transMatrix[4][5] = 1; transMatrix[5][3] = 1; transMatrix[5][4] = 1; transMatrix[5][6] = 1; transMatrix[6][5] = 1; transMatrix[6][7] = 1; transMatrix[7][6] = 1; transMatrix[7][8] = 1; transMatrix[7][9] = 1; transMatrix[7][10] = 1; transMatrix[7][11] = 1; transMatrix[8][7] = 1; transMatrix[9][7] = 1; transMatrix[10][7] = 1; transMatrix[11][7] = 1; double[][] lr = new double[12][12]; for (int i = 0; i < lr.length; i++) { double[] row = transMatrix[i]; double total = 0; for (int j = 0; j < row.length; j++) { total = total + row[j]; } for (int j = 0; j < row.length; j++) { lr[i][j] = row[j]/total; } } double[][] lc = new double[12][12]; for (int i = 0; i < lc.length; i++) { double total = 0; for (int j = 0; j < lc.length; j++) { total = total + transMatrix[j][i]; } for (int j = 0; j < lc.length; j++) { lc[j][i] = transMatrix[j][i]/total; } } double[][] hubMatrix = new double[12][12]; hubMatrix = multiply(lr,transpose(lc)); // double[][] authMatrix = new double[12][12]; // authMatrix = multiply(transpose(lc),lr); for (int l = 0; l < 100; l++) { double total = 0; for (int i = 0; i < weights.length; i++) { total = total + weights[i]; } for (int i = 0; i < weights.length; i++) { weights[i] = weights[i]/total; } double[] newWeights = new double[12]; for (int i = 0; i < hubMatrix.length; i++) { double[] values = hubMatrix[i]; double newValue = 0; for (int j = 0; j < values.length; j++) { newValue= newValue + weights[j]*values[j]; } // newWeights[i] = 0.85*newValue + 0.15*seeds[i]; newWeights[i] = newValue; System.out.print(i + ":" + newWeights[i] + " "); } System.out.println("\n"); weights = newWeights; } } }