package i5.las2peer.services.ocd.algorithms.utils;

import java.io.File;
import java.io.IOException;
import java.nio.file.Path;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.Map;

import org.apache.commons.math3.linear.Array2DRowRealMatrix;
import org.apache.commons.math3.linear.ArrayRealVector;
import org.apache.commons.math3.linear.RealMatrix;
import org.apache.commons.math3.linear.SingularValueDecomposition;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.similarities.DefaultSimilarity;
import org.apache.lucene.store.SimpleFSDirectory;
import org.apache.lucene.util.BytesRef;

import i5.las2peer.services.ocd.graphs.CustomGraph;
import y.base.Node;
import y.base.NodeCursor;

/**
 * Represents a TF-IDF weighted term-document matrix of a graph whose node content
 * has been indexed with Lucene. Rows correspond to nodes (documents), columns to
 * the words of the aggregated vocabulary (wordlist).
 */
public class Termmatrix {

	private Array2DRowRealMatrix matrix;
	private LinkedList<String> wordlist;
	private LinkedList<Node> nodelist;

	///////////////////
	////Constructor////
	///////////////////

	public Termmatrix(){
	}

	/**
	 * Builds the term-document matrix for the given graph. The TF-IDF values are
	 * read from the Lucene index located at graph.getPath().
	 */
	public Termmatrix(CustomGraph graph) throws OcdAlgorithmException {
		NodeCursor nodes = graph.nodes();
		Node node;
		String name;
		String key;
		int index;
		int row = 0;
		this.wordlist = new LinkedList<String>();
		this.setNodelist(new LinkedList<Node>());
		// TF-IDF values per document name; as a side effect this fills the wordlist
		HashMap<String,HashMap<String,Double>> indexMap = computeTFIDF(graph);
		HashMap<String,Double> valueMap;
		this.matrix = new Array2DRowRealMatrix(indexMap.size(), wordlist.size());
		while(nodes.ok()){
			node = nodes.node();
			this.addNode(node);
			name = graph.getNodeName(node);
			valueMap = indexMap.get(name);
			// use a fresh vector per node so values of the previous row cannot leak into this one
			ArrayRealVector vector = new ArrayRealVector(wordlist.size());
			for(Map.Entry<String,Double> entry : valueMap.entrySet()){
				key = entry.getKey();
				index = wordlist.indexOf(key);
				vector.setEntry(index, entry.getValue());
			}
			this.matrix.setRowVector(row, vector);
			row++;
			nodes.next();
		}
	}
	/////////////////////////
	////Getter and Setter////
	/////////////////////////

	public void setMatrix(Array2DRowRealMatrix matrix){
		this.matrix = matrix;
	}

	public Array2DRowRealMatrix getMatrix(){
		return matrix;
	}

	public void setWordlist(LinkedList<String> wordlist){
		this.wordlist = wordlist;
	}

	public LinkedList<String> getWordlist(){
		return wordlist;
	}

	public void setNodelist(LinkedList<Node> nodelist){
		this.nodelist = nodelist;
	}

	public LinkedList<Node> getNodeIdList(){
		return nodelist;
	}

	////////////////////////
	////Update Functions////
	////////////////////////

	public void addNode(Node node){
		nodelist.add(node);
	}

	public void addWord(String word){
		this.wordlist.add(word);
	}

	/////////////////////////////
	////Computation Functions////
	/////////////////////////////
	/**
	 * Counts how often the given word occurs in the given list.
	 */
	public int countWord(String word, LinkedList<String> list){
		int res = 0;
		for(Iterator<String> it = list.iterator(); it.hasNext();){
			if(word.equals(it.next())){
				res++;
			}
		}
		return res;
	}

	/**
	 * Returns a textual representation of the node list, the word list and the matrix.
	 */
	public String toString(CustomGraph graph){
		String res = "nodelist: ";
		for(Iterator<Node> it = this.nodelist.iterator(); it.hasNext();){
			res = res + graph.getNodeName(it.next()) + " ";
		}
		res = res + "\n" + "wordlist: ";
		for(Iterator<String> it1 = this.wordlist.iterator(); it1.hasNext();){
			res = res + it1.next() + " ";
		}
		res = res + "\n" + this.matrix.toString();
		return res;
	}

	/**
	 * Computes a singular value decomposition of the term-document matrix and
	 * returns the matrix U of left singular vectors.
	 */
	public RealMatrix SVD(){
		SingularValueDecomposition svd = new SingularValueDecomposition(matrix);
		return svd.getU();
	}

	/**
	 * Reads the Lucene index of the graph and computes the TF-IDF value of every
	 * term of every document. The result maps each document (node) name to a map
	 * from term to TF-IDF value. As a side effect the vocabulary of the matrix
	 * (wordlist) is extended by every term encountered.
	 */
	private HashMap<String,HashMap<String,Double>> computeTFIDF(CustomGraph graph){
		HashMap<String,HashMap<String,Double>> res = new HashMap<String,HashMap<String,Double>>();
		int noOfDocs = graph.nodes().size();
		String indexPath = graph.getPath();
		TermsEnum termEnum = null;
		TermsEnum idEnum = null;
		PostingsEnum docsEnum = null;
		try {
			Path f = new File(indexPath).toPath();
			IndexReader re = DirectoryReader.open(SimpleFSDirectory.open(f));
			for(int k = 0; k < noOfDocs; k++){
				// term vectors of the content and name field of the k-th document
				Terms contentTerms = re.getTermVector(k, "doccontent");
				Terms idTerms = re.getTermVector(k, "docid");
				// the name field is expected to hold exactly one term, the document/node name
				idEnum = idTerms.iterator();
				BytesRef idBytes = idEnum.next();
				String docName = idBytes.utf8ToString();
				HashMap<String,Double> termMap = new HashMap<String,Double>();
				if(contentTerms == null){
					// empty content term vector, document has no terms
					res.put(docName, termMap);
				}else{
					// iterate through the content term vector
					termEnum = contentTerms.iterator();
					long noOfTerms = contentTerms.size();
					DefaultSimilarity sim = new DefaultSimilarity();
					for(int i = 0; i < noOfTerms; i++){
						// add every term of the term vector to the wordlist of the matrix
						BytesRef termBytes = termEnum.next();
						String termStr = termBytes.utf8ToString();
						if(!wordlist.contains(termStr)){
							wordlist.add(termStr);
						}
						// enumerate the postings of the term, in this case only the current document
						docsEnum = termEnum.postings(null);
						while(docsEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS){
							// term frequency in the document
							int tf = docsEnum.freq();
							// inverse document frequency
							float idf = sim.idf(termEnum.docFreq(), re.numDocs());
							termMap.put(termStr, (double) (tf * idf));
						}
					}
					res.put(docName, termMap);
				}
			}
			return res;
		}catch(IOException e){
			e.printStackTrace();
			return null;
		}
	}

}
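/*
 * Minimal usage sketch (illustration only, not part of the service code). It assumes a
 * CustomGraph whose node content has already been written to the Lucene index at
 * graph.getPath() by the preprocessing of the OCD service:
 *
 *   Termmatrix tm = new Termmatrix(graph);          // builds the TF-IDF term-document matrix
 *   Array2DRowRealMatrix tfidf = tm.getMatrix();    // rows = nodes, columns = words of tm.getWordlist()
 *   RealMatrix u = tm.SVD();                        // left singular vectors, e.g. for an LSI-style reduction
 */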