/*
 * Copyright 2015 Themistoklis Mavridis <themis.mavridis@issel.ee.auth.gr>.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.thesmartweb.swebrank;

/**
 * Main method of SWebRank.
 * It receives the input in a txt file in a structured format.
 * It passes all the input variables to the total analysis class.
 * It receives the wordlist of every iteration for each query and creates a wordlist for every domain.
 * It gets all the combinations and permutations of all the words in the wordlist.
 * It creates the new queries according to their Normalized Web Distance (using the Bing Search API).
 * It compares the wordlist of every iteration with the previous one using Normalized Mutual Information.
 * @author Themis Mavridis
 */
import java.io.*;
import java.util.*;
import java.util.List;
import org.apache.commons.io.FilenameUtils;
import java.nio.file.Path;
import java.nio.file.Paths;
import org.elasticsearch.action.index.IndexRequest;
import org.elasticsearch.action.index.IndexResponse;
import org.elasticsearch.client.Client;
import org.elasticsearch.client.transport.TransportClient;
import org.elasticsearch.common.settings.ImmutableSettings;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.transport.InetSocketTransportAddress;
import org.elasticsearch.node.Node;
import static org.elasticsearch.node.NodeBuilder.nodeBuilder;
import org.json.simple.JSONObject;

/**
 * Main class of SWebRank that gets the settings and gets the results of every iteration.
* It calls the process to create new queries and to check if we converge. * @author themis */ public class Main { /** * @param args the command line arguments */ public static void main(String[] args){ Path input_path=Paths.get("//mnt//var//DBs//inputsL10//nba//");//input directory String output_parent_directory="//mnt//var//DBs//outputsConfL10//nba//";//output directory String config_path="//mnt//var//DBs//config//";//input directory //---Disable apache log manually---- //System.setProperty("org.apache.commons.logging.Log","org.apache.commons.logging.impl.NoOpLog"); System.setProperty("org.apache.commons.logging.Log","org.apache.commons.logging.impl.Log4JLogger"); //--------------Domain that is searched---------- String domain=""; //------------------search engine related options---------------------- List<String> queries=null; int results_number = 0;//the number of results that are returned from each search engine List<Boolean> enginechoice = null; //list element #0. True/False Bing //list element #1. True/False Google //list element #2. True/False Yahoo! //list element #3. True/False Merged //-----------Moz options--------------------- List<Boolean> mozMetrics = null; //The list is going to contain the moz related input in the following order //list element #1. True/False, True we use Moz API, false not //list element #2. True if we use Domain Authority //list element #3. True if we use External MozRank //list element #4. True if we use MozRank //list element #5. True if we use MozTrust //list element #6. True if we use Subdomain MozRank //list element #7. 
True if we use Page Authority //only one is used (the first to be set to true) boolean moz_threshold_option = false;//set to true we use the threshold Double moz_threshold = 0.0;//if we want to have a threshold in moz int top_count_moz = 0;//if we want to get the moz top-something results //---------------Semantic Analysis method---------------- List<Boolean> ContentSemantics=null; int SensebotConcepts = 0;//define the amount of concepts that sensebot is going to recognize List<Double> SWebRankSettings=null; //------(string)directory is going to be used later----- String output_child_directory; //-------we get all the paths of the txt (input) files from the input directory------- DataManipulation getfiles=new DataManipulation();//class responsible for the extraction of paths Collection<File> inputs_files;//array to include the paths of the txt files inputs_files=getfiles.getinputfiles(input_path.toString(),"txt");//method to retrieve all the path of the input documents //------------read the txt files------------ for (File input : inputs_files) { ReadInput ri=new ReadInput();//function to read the input boolean check_reading_input=ri.perform(input); if(check_reading_input){ domain=ri.domain; //---------- queries=ri.queries; results_number=ri.results_number; enginechoice=ri.enginechoice; //------------ mozMetrics=ri.mozMetrics; moz_threshold_option=ri.moz_threshold_option; moz_threshold=ri.moz_threshold.doubleValue(); //--------------- ContentSemantics=ri.ContentSemantics; SWebRankSettings=ri.SWebRankSettings; } int top_visible=0;//option to set the amount of results you can get in the merged search engine //------if we choose to use a Moz metric or Visibility score for our ranking, we need to set the results_number for the search engines to its max which is 50 //-----we set the top results number for moz or Visibility rank---- if(mozMetrics.get(0)||enginechoice.get(3)){ if(mozMetrics.get(0)){top_count_moz=results_number;}//if moz is true, top_count_moz gets the 
value of result number if(enginechoice.get(3)){top_visible=results_number;}//if merged engine is true, top_visible gets the value of result number results_number=50;//this is the max amount of results that you can get from the search engine APIs } //-----if we want to use Moz we should check first if it works if(mozMetrics.get(0)){ Moz Moz = new Moz(); //---if it works, moz remains true, otherwise it is set to false mozMetrics.add(0,Moz.check(config_path)); //if it is false and we have chosen to use Visibility score with Moz, we reset back to the standard settings (ranking and not merged) //therefore, we reset the number of results from 50 to the top_count_moz which contained the original number of results if(!mozMetrics.get(0)){ if(!enginechoice.get(3)){results_number=top_count_moz;} } } //----------we set the wordLists that we are going to use--------------------- List<String> finalList = new ArrayList<String>();//finalList is going to contain all the content in the end Total_analysis ta = new Total_analysis();//we call total analysis int iteration_counter=0;//the iteration_counter is used in order to count the number of iterations of the algorithm and to be checked with perf_limit //this list of arraylists is going to contain all the wordLists that are produced for every term of the String[] query, //in order to calculate the NGD scores between every term of the wordList and the term that was used as query in order to produce the spesific wordList List<ArrayList<String>> array_wordLists = new ArrayList<>(); List<String> wordList_previous=new ArrayList<>(); List<String> wordList_new=new ArrayList<>(); double convergence=0;//we create the convergence percentage and initialize it String conv_percentages="";//string that contains all the convergence percentages DataManipulation wordsmanipulation=new DataManipulation();//method to manipulate various word data (String, list<String>, etc) do{ //if we run the algorithm for the 1st time we already have the query so we 
skip the loop below that produces the new array of query if(iteration_counter!=0){ wordList_previous = wordList_new; //we add the previous wordList to the finalList finalList=wordsmanipulation.AddAList(wordList_previous, finalList); List<String> query_new_list_total = new ArrayList<>(); int iteration_previous=iteration_counter-1; Combinations_Engine cn = new Combinations_Engine();//call the class to combine the terms produced for(String query:queries){ List<String> ids=new ArrayList<>(); if(enginechoice.get(0)){ String id=domain+"/"+query+"/bing"+"/"+iteration_previous; ids.add(id); } if(enginechoice.get(1)){ String id=domain+"/"+query+"/google"+"/"+iteration_previous; ids.add(id); } if(enginechoice.get(2)){ String id=domain+"/"+query+"/yahoo"+"/"+iteration_previous; ids.add(id); } ElasticGetWordList ESget=new ElasticGetWordList();//we call this class to get the wordlist from the Elastic Search List<String> maxWords = ESget.getMaxWords(ids, SWebRankSettings.get(9).intValue(),config_path);//we are going to get a max amount of words int query_index=queries.indexOf(query); int size_query_new = SWebRankSettings.get(10).intValue();//the amount of new queries we are willing to create //we create the new queries for every query of the previous round by combining the words produced from this query List<String> query_new_list = cn.perform(maxWords, SWebRankSettings.get(7), queries, SWebRankSettings.get(6), query_index, size_query_new, config_path); //we add the list of new queries to the total list that containas all the new queries query_new_list_total.addAll(query_new_list); System.out.println("query pointer=" + query_index + ""); } //---------------------the following cleans a list from null and duplicates query_new_list_total=wordsmanipulation.clearListString(query_new_list_total); //--------------we create the new directory that our files are going to be saved String txt_directory=FilenameUtils.getBaseName(input.getName()); 
output_child_directory=output_parent_directory+txt_directory+"_level_"+iteration_counter+"//"; //----------------append the wordlist to a file------------------ wordsmanipulation.AppendWordList(query_new_list_total, output_child_directory+"queries_"+iteration_counter+".txt"); if(query_new_list_total.size()<1){break;}//if we don't create new queries we end the while loop //total analysis' function is going to do all the work and return back what we need ta = new Total_analysis(); ta.perform(wordList_previous,iteration_counter,output_child_directory,domain,enginechoice, query_new_list_total, results_number, top_visible, mozMetrics, moz_threshold_option, moz_threshold.doubleValue(), top_count_moz, ContentSemantics, SensebotConcepts, SWebRankSettings, config_path); //we get the array of wordlists array_wordLists=ta.getarray_wordLists(); //get the wordlist that includes all the new queries wordList_new=ta.getwordList_total(); //---------------------the following cleans a list from null and duplicates------------- wordList_new=wordsmanipulation.clearListString(wordList_new); //----------------append the wordlist to a file-------------------- wordsmanipulation.AppendWordList(wordList_new, output_child_directory+ "wordList.txt"); //the concergence percentage of this iteration convergence = ta.getConvergence();//we are going to use convergence score to check the convergence //a string that contains all the convergence percentage for each round separated by \n character conv_percentages = conv_percentages + "\n" + convergence; //a file that is going to include the convergence percentages wordsmanipulation.AppendString(conv_percentages, output_child_directory+ "convergence_percentage.txt"); //we add the new wordList to the finalList finalList=wordsmanipulation.AddAList(wordList_new, finalList); //we set the query array to be equal to the query new total that we have created queries=query_new_list_total; //we increment the iteration_counter in order to count the iterations of 
the algorithm and to use the perf_limit iteration_counter++; } else{//the following source code is performed on the 1st run of the loop //------------we extract the parent path of the file String txt_directory=FilenameUtils.getBaseName(input.getName()); //----------we create a string that is going to be used for the corresponding directory of outputs output_child_directory=output_parent_directory+txt_directory+"_level_"+iteration_counter+"//"; //we call total analysis function performOld ta.perform(wordList_new,iteration_counter,output_child_directory,domain, enginechoice, queries, results_number, top_visible, mozMetrics, moz_threshold_option, moz_threshold.doubleValue(), top_count_moz, ContentSemantics, SensebotConcepts, SWebRankSettings, config_path); //we get the array of wordlists array_wordLists=ta.getarray_wordLists(); //get the wordlist that includes all the new queries wordList_new=ta.getwordList_total(); //---------------------the following cleans a list from null and duplicates wordList_new=wordsmanipulation.clearListString(wordList_new); //----------------append the wordlist to a file wordsmanipulation.AppendWordList(wordList_new, output_child_directory+"wordList.txt"); //----------------------------------------- iteration_counter++;//increase the iteration_counter that counts the iterations of the algorithm } }while(convergence<SWebRankSettings.get(5).doubleValue()&&iteration_counter<SWebRankSettings.get(8).intValue());//while the convergence percentage is below the limit and the iteration_counter below the performance limit if(iteration_counter==1){ finalList=wordsmanipulation.AddAList(wordList_new, finalList);} //--------------------content List---------------- if (!finalList.isEmpty()) { //---------------------the following cleans the final list from null and duplicates finalList=wordsmanipulation.clearListString(finalList); //write the keywords to a file boolean flag_file = false;//boolean flag to declare successful write to file 
flag_file=wordsmanipulation.AppendWordList(finalList, output_parent_directory+"total_content.txt"); if(!flag_file){ System.out.print("can not create the content file for: "+output_parent_directory+"total_content.txt"); } } //we are going to save the total content with its convergence on the ElasticSearch cluster in a separated index //Node node = nodeBuilder().client(true).clusterName("lshrankldacluster").node(); //Client client = node.client(); //get the elastic search indexes in a list List<String> elasticIndexes=ri.GetKeyFile(config_path, "elasticSearchIndexes"); Settings settings = ImmutableSettings.settingsBuilder() .put("cluster.name","lshrankldacluster").build(); Client client = new TransportClient(settings) .addTransportAddress(new InetSocketTransportAddress("localhost", 9300) ); JSONObject objEngineLevel = new JSONObject(); objEngineLevel.put("TotalContent", finalList);//we save the total content objEngineLevel.put("Convergences", conv_percentages);//we save the convergence percentages IndexRequest indexReq=new IndexRequest(elasticIndexes.get(0),"content",domain);//we save also the domain indexReq.source(objEngineLevel); IndexResponse indexRes = client.index(indexReq).actionGet(); //node.close(); client.close(); //----------------------convergence percentages writing to file--------------- //use the conv_percentages string if(conv_percentages.length()!=0){ boolean flag_file = false;//boolean flag to declare successful write to file flag_file=wordsmanipulation.AppendString(conv_percentages, output_parent_directory+"convergence_percentages.txt"); if(!flag_file){ System.out.print("can not create the convergence file for: "+output_parent_directory+"convergence_percentages.txt"); } } } } }