/*
* Copyright 2015 Themistoklis Mavridis <themis.mavridis@issel.ee.auth.gr>.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.thesmartweb.swebrank;
import java.io.*;
import java.io.IOException;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.*;
import java.util.stream.Stream;
/**
* Class for LDA's results manipulation
* @author themis
*/
public class LDAtopicsWords {

    /**
     * Reads the "*twords" files produced by jgibblda (one per search engine) and
     * collects, for each engine, the high-probability words of its top LDA topics.
     * Topics are ranked by their total probability over all analyzed documents
     * (see {@link #getTopicsTotalProb}); only the best {@code nTopTopics} topics
     * are kept, and within them only words whose probability exceeds
     * {@code prob_threshold}.
     *
     * @param example_dir the directory to get the jgibblda output files from
     * @param prob_threshold the probability threshold a word must exceed to be kept
     * @param top_words the amount of top words per topic (retained for API compatibility; not used here)
     * @param nTopics the number of topics of LDA (passed through to {@link #getTopicsTotalProb})
     * @param nTopTopics the amount of top topics to keep per engine
     * @return engine name -&gt; (topic index -&gt; (word -&gt; probability));
     *         an empty map is returned on I/O failure
     */
    public HashMap<String, HashMap<Integer, HashMap<String, Double>>> readFile(String example_dir, Double prob_threshold, int top_words, int nTopics, int nTopTopics) {
        DataManipulation getfiles = new DataManipulation();
        Collection<File> inputfiles = getfiles.getinputfiles(example_dir, "twords");
        String[] twordsarray = new String[inputfiles.size()];
        int j = 0;
        for (File file : inputfiles) {
            twordsarray[j] = file.getPath();
            j++;
        }
        // hashmap to contain all the top topics and their top words for each search engine
        HashMap<String, HashMap<Integer, HashMap<String, Double>>> enginetopicwordprobmap = new HashMap<>();
        for (int i = 0; i < twordsarray.length; i++) {
            try {
                // infer the engine from the file path (last match wins, as in the original checks)
                String engine = "";
                String pathLowerCase = twordsarray[i].toLowerCase();
                if (pathLowerCase.contains("bing")) {
                    engine = "bing";
                }
                if (pathLowerCase.contains("google")) {
                    engine = "google";
                }
                if (pathLowerCase.contains("yahoo")) {
                    engine = "yahoo";
                }
                // the topics in descending order of their summed probability over the documents
                Map<Integer, Double> TopicsAvgProb = getTopicsTotalProb(example_dir, nTopics, engine);
                // keep only the first nTopTopics entries (the highest total probabilities)
                Iterator<Map.Entry<Integer, Double>> it = TopicsAvgProb.entrySet().iterator();
                int topTopicsCounter = 0;
                while (it.hasNext()) {
                    topTopicsCounter++;
                    it.next();
                    if (topTopicsCounter > nTopTopics) {
                        it.remove();
                    }
                }
                HashMap<Integer, HashMap<String, Double>> topicwordsmulti = new HashMap<>();
                HashMap<String, Double> wordprobmap = new HashMap<>();
                int topicindex = -1;
                int topicindexprev = -1;
                boolean flagfirstline = true;
                // read the file containing the top words for all the topics and keep
                // only the words from the top topics; try-with-resources guarantees
                // the reader is closed even on failure (the original leaked it)
                try (BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(twordsarray[i])))) {
                    String test_line;
                    while ((test_line = br.readLine()) != null) {
                        if (test_line.startsWith("Topic")) {
                            // topic header line, e.g. "Topic 3th:" -> topic index 3
                            String li = test_line.trim();
                            String index = li.split(" ")[1].trim();
                            topicindex = Integer.parseInt(index.split("th")[0].trim());
                            // flush the word map of the previous topic if it is a top topic
                            if (!flagfirstline && TopicsAvgProb.containsKey(topicindexprev)) {
                                topicwordsmulti.put(topicindexprev, wordprobmap);
                            }
                            topicindexprev = topicindex;
                            wordprobmap = new HashMap<>();
                            flagfirstline = false;
                        }
                        if (test_line.startsWith("\t")) {
                            // word line: "\t<word> <probability>"; skip tokens that are numbers
                            String li = test_line.trim();
                            String word = li.split(" ")[0].trim();
                            if (!checkIfNumber(word)) {
                                Double wordprobability = Double.parseDouble(li.split(" ")[1].trim());
                                // keep only sufficiently probable words belonging to a top topic
                                if (wordprobability.compareTo(prob_threshold) > 0 && TopicsAvgProb.containsKey(topicindex)) {
                                    wordprobmap.put(word, wordprobability);
                                }
                            }
                        }
                    }
                }
                // flush the last topic of the file: if it is one of the top topics,
                // its word map is added to the total as well
                if (TopicsAvgProb.containsKey(topicindex)) {
                    topicwordsmulti.put(topicindex, wordprobmap);
                }
                enginetopicwordprobmap.put(engine, topicwordsmulti);
            } catch (IOException ex) {
                Logger.getLogger(LDAtopicsWords.class.getName()).log(Level.SEVERE, null, ex);
                // original contract: return an empty map on I/O failure
                return new HashMap<>();
            }
        }
        return enginetopicwordprobmap;
    }

    /**
     * Method to check if a string is a number.
     *
     * @param in string to check
     * @return true if {@code in} parses as a double, false otherwise
     */
    public boolean checkIfNumber(String in) {
        try {
            Double.parseDouble(in);
        } catch (NumberFormatException ex) {
            return false;
        }
        return true;
    }

    /**
     * Sums, for the given engine, each topic's probability over all documents of
     * the matching jgibblda "theta" file(s) and returns the topics sorted by
     * descending total probability.
     *
     * @param example_dir the directory to get the theta files from
     * @param nTopics the number of LDA topics (retained for API compatibility; not used)
     * @param engine the engine whose theta file should be accumulated
     * @return topic index -&gt; total probability, iterating in descending-probability order
     */
    public Map<Integer, Double> getTopicsTotalProb(String example_dir, int nTopics, String engine) {
        Map<Integer, Double> TopicsTotalProb = new HashMap<>();
        DataManipulation datamanipulation = new DataManipulation();
        Collection<File> inputfiles = datamanipulation.getinputfiles(example_dir, "theta");
        String[] thetaArray = new String[inputfiles.size()];
        int j = 0;
        for (File file : inputfiles) {
            thetaArray[j] = file.getPath();
            j++;
        }
        for (int i = 0; i < thetaArray.length; i++) {
            if (thetaArray[i].contains(engine)) {
                // each theta line holds the per-topic probabilities of one document;
                // try-with-resources closes the reader (the original leaked it)
                try (BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(thetaArray[i])))) {
                    String test_line;
                    while ((test_line = br.readLine()) != null) {
                        String[] topicProbs = test_line.split(" ");
                        for (int k = 0; k < topicProbs.length; k++) {
                            // accumulate topic k's probability over all documents
                            TopicsTotalProb.merge(k, Double.parseDouble(topicProbs[k]), Double::sum);
                        }
                    }
                    TopicsTotalProb = sortByValue(TopicsTotalProb);
                } catch (IOException ex) {
                    // FileNotFoundException is an IOException, so one handler covers both
                    Logger.getLogger(LDAtopicsWords.class.getName()).log(Level.SEVERE, null, ex);
                }
            }
        }
        return TopicsTotalProb;
    }

    /**
     * Sorts a map by descending value using {@link ValueComparator}.
     * NOTE(review): TreeMap discards any entry whose key the comparator reports
     * as equal to an existing key, so the comparator must never return 0 for
     * two distinct keys. Kept raw-typed for API compatibility with callers.
     *
     * @param unsortedMap the map to sort
     * @return a TreeMap of the entries ordered by descending value
     */
    @SuppressWarnings({"unchecked", "rawtypes"})
    public static Map sortByValue(Map unsortedMap) {
        Map sortedMap = new TreeMap(new ValueComparator(unsortedMap));
        sortedMap.putAll(unsortedMap);
        return sortedMap;
    }
}
/**
 * Orders map keys by the descending order of their mapped values, so a TreeMap
 * built with this comparator iterates entries from highest to lowest value.
 * Keys with equal values are tie-broken by their natural key order: without the
 * tie-break, TreeMap would consider two distinct keys with equal values to be
 * duplicates and silently drop all but one entry.
 */
class ValueComparator implements Comparator {

    // the backing map whose values drive the ordering
    Map map;

    public ValueComparator(Map map) {
        this.map = map;
    }

    @Override
    @SuppressWarnings("unchecked")
    public int compare(Object keyA, Object keyB) {
        Comparable valueA = (Comparable) map.get(keyA);
        Comparable valueB = (Comparable) map.get(keyB);
        // primary order: descending by value
        int byValue = valueB.compareTo(valueA);
        if (byValue != 0) {
            return byValue;
        }
        // tie-break equal values by key so distinct keys are never collapsed
        if (keyA instanceof Comparable) {
            return ((Comparable) keyA).compareTo(keyB);
        }
        return 0;
    }
}