LDAtopicsWords.java example

Explorer

SWebRank-master
- src
  - main
    - java
      - com
        seomoz
        api
        authentication
        Authenticator.java
        Base64.java
        constants
        AnchorTextConstants.java
        LinksConstants.java
        TopPagesConstants.java
        URLMetricsConstants.java
        example
        Sample.java
        response
        AnchorTextResponse.java
        LinksResponse.java
        TopPagesResponse.java
        UrlResponse.java
        service
        AnchorTextService.java
        LinksService.java
        TopPagesService.java
        URLMetricsService.java
        util
        ConnectionUtil.java
        thesmartweb
        swebrank
        APIconn.java
        AnnotationClient.java
        BingResults.java
        CheckConvergence.java
        CombinationGenerator.java
        Combinations_Engine.java
        DBpediaSpotlightClient.java
        DandelionEntities.java
        DataManipulation.java
        Diffbot.java
        ElasticGetWordList.java
        GoogleResults.java
        JSONparsing.java
        LDAcall.java
        LDAsemStats.java
        LDAtopicsWords.java
        Lemmatizer.java
        LinksParseAnalysis.java
        Main.java
        Moz.java
        NWD_Analysis.java
        NWD_total.java
        PermutationGenerator.java
        PorterStemmer.java
        ReadInput.java
        Search_analysis.java
        Sensebot.java
        Sindice.java
        StHttpRequest.java
        StemmerSnow.java
        Stopwords.java
        TFIDF.java
        Total_analysis.java
        TwitterAnalysis.java
        VisibilityScore.java
        WebParser.java
        YahooConn.java
        YahooEntityCategory.java
        YahooResults.java
      - org
        tartarus
        snowball
        Among.java
        SnowballProgram.java
        SnowballStemmer.java
        TestApp.java
        ext
        englishStemmer.java

/* 
 * Copyright 2015 Themistoklis Mavridis <themis.mavridis@issel.ee.auth.gr>.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.thesmartweb.swebrank;

import java.io.*;
import java.io.IOException;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.*;
import java.util.stream.Stream;

/**
 * Class for LDA's results manipulation
 * @author themis
 */
public class LDAtopicsWords {

    private static final Logger LOGGER = Logger.getLogger(LDAtopicsWords.class.getName());

    /**
     * Method that gets the LDA results from the "twords" files produced by jgibblda.
     * <p>
     * For every twords file found, the search engine is derived from the file path, the topics of
     * that engine are ranked by their total probability (see {@link #getTopicsTotalProb}), only the
     * first {@code nTopTopics} topics are kept, and for each kept topic the non-numeric words whose
     * probability is strictly greater than {@code prob_threshold} are collected.
     * <p>
     * If several twords files resolve to the same engine, the result of the last one read replaces
     * the earlier ones.
     *
     * @param example_dir the directory to get the files from
     * @param prob_threshold the probability threshold to use in the selection of top words (must not be null)
     * @param top_words the amount of top words per topic to choose (currently not used by this method)
     * @param nTopics the number of topics of LDA (only forwarded to {@link #getTopicsTotalProb})
     * @param nTopTopics the amount of top topics to pick out of the corpora of documents analyzed with LDA
     * @return a hashmap with every engine's topics and words per topic, keyed by "bing", "google",
     *         "yahoo" or "" when no engine name occurs in the file path; an empty hashmap if any
     *         twords file could not be read
     */
    public HashMap<String,HashMap<Integer,HashMap<String,Double>>> readFile(String example_dir,Double prob_threshold,int top_words,int nTopics,int nTopTopics)  {
        DataManipulation getfiles=new DataManipulation();
        Collection<File> inputfiles = getfiles.getinputfiles(example_dir,"twords");
        //hashmap to contain all the top topics and their top words for each search engine
        HashMap<String,HashMap<Integer,HashMap<String,Double>>> enginetopicwordprobmap=new HashMap<>();
        for (File twordsFile : inputfiles) {
            String twordsPath = twordsFile.getPath();
            String engine = detectEngine(twordsPath);
            //the topics in descending order according to their sum of probability over the documents analyzed with LDA
            Map<Integer,Double> topTopics = getTopicsTotalProb(example_dir,nTopics,engine);
            //we keep a certain amount of top topics
            keepFirstEntries(topTopics, nTopTopics);
            HashMap<Integer, HashMap<String,Double>> topicwordsmulti = new HashMap<>();
            // NOTE(review): the file is decoded with the platform default charset, as before;
            // confirm which encoding the producer of the twords files uses.
            try (BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(twordsPath)))) {
                //word map of the topic currently being read; null while that topic is not a top topic
                HashMap<String,Double> wordprobmap = null;
                String test_line;
                //we read the file containing the top words for all the topics
                //and keep only the words from the top topics
                while ((test_line = br.readLine()) != null) {
                    if (test_line.startsWith("Topic")) {
                        //header line, e.g. "Topic 3th:" -> topic index 3
                        String index = test_line.trim().split(" ")[1].trim();
                        int topicindex = Integer.parseInt(index.split("th")[0].trim());
                        if (topTopics.containsKey(topicindex)) {
                            wordprobmap = new HashMap<>();
                            topicwordsmulti.put(topicindex, wordprobmap);
                        } else {
                            wordprobmap = null;
                        }
                    } else if (wordprobmap != null && test_line.startsWith("\t")) {
                        //word line: "<tab>word probability"
                        String[] parts = test_line.trim().split(" ");
                        if (parts.length < 2) {
                            continue;//malformed line without a probability, nothing to keep
                        }
                        String word = parts[0].trim();
                        if (checkIfNumber(word)) {
                            continue;//words that are numbers are ignored
                        }
                        Double wordprobability = Double.parseDouble(parts[1].trim());
                        if (wordprobability.compareTo(prob_threshold) > 0) {
                            wordprobmap.put(word, wordprobability);
                        }
                    }
                }
            } catch (IOException ex) {
                LOGGER.log(Level.SEVERE, null, ex);
                //a read failure discards everything gathered so far, callers get an empty result
                return new HashMap<>();
            }
            enginetopicwordprobmap.put(engine, topicwordsmulti);
        }
        return enginetopicwordprobmap;
    }

    /**
     * Derives the search engine from a file path (case-insensitively).
     * When several engine names occur in the path, "yahoo" wins over "google", which wins over "bing".
     * @param path the path to inspect
     * @return "yahoo", "google", "bing" or the empty string if none of them occurs in the path
     */
    private static String detectEngine(String path) {
        String lower = path.toLowerCase(Locale.ROOT);
        if (lower.contains("yahoo")) {
            return "yahoo";
        }
        if (lower.contains("google")) {
            return "google";
        }
        if (lower.contains("bing")) {
            return "bing";
        }
        return "";
    }

    /**
     * Removes every entry after the first {@code limit} ones, in the iteration order of the map.
     * @param map the map to trim in place
     * @param limit the amount of leading entries to keep
     */
    private static void keepFirstEntries(Map<Integer,Double> map, int limit) {
        Iterator<Map.Entry<Integer,Double>> it = map.entrySet().iterator();
        int seen = 0;
        while (it.hasNext()) {
            it.next();
            seen++;
            if (seen > limit) {
                it.remove();
            }
        }
    }

    /**
     * Method to check if a string is a number, i.e. if it is accepted by {@link Double#parseDouble(String)}
     * @param in string to check
     * @return true if the string can be parsed as a double, false otherwise (including for null)
     */
    public boolean checkIfNumber(String in) {
        if (in == null) {
            return false;
        }
        try {
            Double.parseDouble(in);
        } catch (NumberFormatException ex) {
            return false;
        }
        return true;
    }

    /**
     * Method that sums, per topic, the probabilities found in the "theta" files of an engine.
     * Every line of a theta file is read as a space separated list of probabilities, where the
     * k-th value is added to the total of topic k. Blank lines are skipped.
     * The first line of every matching file overwrites the totals of the topics it contains.
     * A file that cannot be read is logged and skipped.
     *
     * @param example_dir the directory to get the theta files from
     * @param nTopics the number of topics of LDA (currently not used by this method)
     * @param engine the engine name; only theta files whose path contains it (case-sensitively) are read
     * @return a map from topic index to total probability, iterating from the highest total to the lowest;
     *         empty if no theta file matched
     */
    public Map<Integer,Double> getTopicsTotalProb(String example_dir, int nTopics,String engine){
        Map<Integer,Double> TopicsTotalProb = new HashMap<>();
        DataManipulation datamanipulation=new DataManipulation();
        Collection<File> inputfiles = datamanipulation.getinputfiles(example_dir,"theta");
        for (File thetaFile : inputfiles) {
            String thetaPath = thetaFile.getPath();
            // NOTE(review): this match is case-sensitive while readFile detects the engine
            // case-insensitively; confirm the theta file names are lower case.
            if (!thetaPath.contains(engine)) {
                continue;
            }
            try (BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(thetaPath)))) {
                String test_line;
                boolean flagfirstline = true;
                while ((test_line = br.readLine()) != null) {
                    String trimmed = test_line.trim();
                    if (trimmed.isEmpty()) {
                        continue;
                    }
                    String[] topicProbs = trimmed.split(" ");
                    for (int k = 0; k < topicProbs.length; k++) {
                        double topicprob = Double.parseDouble(topicProbs[k]);
                        if (flagfirstline) {
                            TopicsTotalProb.put(k, topicprob);
                        } else {
                            //merge also copes with a row that is longer than the first one
                            TopicsTotalProb.merge(k, topicprob, Double::sum);
                        }
                    }
                    flagfirstline = false;
                }
                @SuppressWarnings("unchecked")//sortByValue keeps the key and value types of its input
                Map<Integer,Double> sorted = sortByValue(TopicsTotalProb);
                TopicsTotalProb = sorted;
            } catch (IOException ex) {
                LOGGER.log(Level.SEVERE, null, ex);
            }
        }
        return TopicsTotalProb;
    }

    /**
     * Method that sorts a map by its values in descending order.
     * All entries are kept: entries with equal values stay in the iteration order of the input map.
     * The values must be non-null and mutually {@link Comparable}.
     *
     * @param unsortedMap the map to sort, it is not modified
     * @return a new map with the same entries, iterating from the highest value to the lowest
     */
    @SuppressWarnings({"unchecked", "rawtypes"})
    public static Map sortByValue(Map unsortedMap) {
        List<Map.Entry> entries = new ArrayList<Map.Entry>(unsortedMap.entrySet());
        //List.sort is stable, so ties keep their original relative order
        entries.sort((a, b) -> ((Comparable) b.getValue()).compareTo(a.getValue()));
        Map sortedMap = new LinkedHashMap();
        for (Map.Entry entry : entries) {
            sortedMap.put(entry.getKey(), entry.getValue());
        }
        return sortedMap;
    }
}
/**
 * Comparator that orders keys by the values they are mapped to in a backing map, highest value first.
 * <p>
 * Keys whose values compare as equal are ordered by the keys themselves when both keys are
 * {@link Comparable} instances of the same class, so that a sorted map built with this comparator
 * does not merge distinct keys that happen to share a value. Keys without a value in the backing
 * map are ordered after all keys that have one instead of causing a NullPointerException.
 */
class ValueComparator implements Comparator {

    Map map;

    /**
     * @param map the backing map the values of the compared keys are looked up in
     */
    public ValueComparator(Map map) {
        this.map = map;
    }

    /**
     * Compares two keys by their values in the backing map, in descending value order.
     * @param keyA the first key
     * @param keyB the second key
     * @return a negative number if keyA sorts before keyB, a positive number if it sorts after,
     *         0 if the keys are equal or cannot be told apart
     */
    @Override
    @SuppressWarnings({"rawtypes", "unchecked"})
    public int compare(Object keyA, Object keyB) {
        Comparable valueA = (Comparable) map.get(keyA);
        Comparable valueB = (Comparable) map.get(keyB);
        int byValue;
        if (valueA == null || valueB == null) {
            // keys without a value go last
            byValue = (valueA == null ? 1 : 0) - (valueB == null ? 1 : 0);
        } else {
            byValue = valueB.compareTo(valueA);
        }
        if (byValue != 0) {
            return byValue;
        }
        // equal values: fall back to the keys so that distinct keys are not reported as equal
        if (keyA instanceof Comparable && keyB != null && keyA.getClass() == keyB.getClass()) {
            return ((Comparable) keyA).compareTo(keyB);
        }
        return 0;
    }
}