/*
* Copyright 2015 Themistoklis Mavridis <themis.mavridis@issel.ee.auth.gr>.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.thesmartweb.swebrank;
/**
* Main method of SWebRank.
* It receives the input from txt files in a structured format.
* It passes all the input variables to the total analysis class.
* It receives the wordlist of every iteration for each query and creates a wordlist for every domain.
* It gets all the combinations and permutations of all the words in the wordlist.
* It creates the new queries according to their Normalized Web Distance (using the Bing Search API).
* It compares the wordlist of every iteration with the previous one using Normalized Mutual Information.
* @author Themis Mavridis
*/
import java.io.*;
import java.util.*;
import java.util.List;
import org.apache.commons.io.FilenameUtils;
import java.nio.file.Path;
import java.nio.file.Paths;
import org.elasticsearch.action.index.IndexRequest;
import org.elasticsearch.action.index.IndexResponse;
import org.elasticsearch.client.Client;
import org.elasticsearch.client.transport.TransportClient;
import org.elasticsearch.common.settings.ImmutableSettings;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.transport.InetSocketTransportAddress;
import org.elasticsearch.node.Node;
import static org.elasticsearch.node.NodeBuilder.nodeBuilder;
import org.json.simple.JSONObject;
/**
 * Main class of SWebRank that gets the settings and gets the results of every iteration.
 * For every txt input file it runs a first analysis with the queries of the file and then keeps
 * creating new queries from the produced wordlists until the convergence limit or the iteration
 * limit of the settings is reached. The collected content and the convergence percentages are
 * written to files and indexed in Elasticsearch.
 * @author themis
 */
public class Main {

    /** The maximum amount of results that we request from the search engine APIs. */
    private static final int MAX_ENGINE_RESULTS = 50;

    /**
     * Runs SWebRank for every txt file found in the (hard-coded) input directory.
     * An input file that cannot be read is reported and skipped.
     * @param args the command line arguments (not used)
     */
    public static void main(String[] args){
        Path input_path=Paths.get("//mnt//var//DBs//inputsL10//nba//");//input directory
        String output_parent_directory="//mnt//var//DBs//outputsConfL10//nba//";//output directory
        String config_path="//mnt//var//DBs//config//";//config directory
        //---Disable apache log manually----
        //System.setProperty("org.apache.commons.logging.Log","org.apache.commons.logging.impl.NoOpLog");
        System.setProperty("org.apache.commons.logging.Log","org.apache.commons.logging.impl.Log4JLogger");
        int SensebotConcepts = 0;//define the amount of concepts that sensebot is going to recognize
        //-------we get all the paths of the txt (input) files from the input directory-------
        DataManipulation getfiles=new DataManipulation();//class responsible for the extraction of paths
        Collection<File> inputs_files=getfiles.getinputfiles(input_path.toString(),"txt");
        //------------read the txt files------------
        for (File input : inputs_files) {
            ReadInput ri=new ReadInput();//class that reads the input
            if(!ri.perform(input)){
                //without a successful read we have no settings for this file, so we must not
                //continue with null values or with the settings of the previous file
                System.err.println("can not read the input file: "+input.getPath());
                continue;
            }
            //--------------Domain that is searched----------
            String domain=ri.domain;
            //------------------search engine related options----------------------
            List<String> queries=ri.queries;
            int results_number=ri.results_number;//the number of results that are returned from each search engine
            //list element #0. True/False Bing
            //list element #1. True/False Google
            //list element #2. True/False Yahoo!
            //list element #3. True/False Merged
            List<Boolean> enginechoice=ri.enginechoice;
            //-----------Moz options---------------------
            //list element #0 states if we use the Moz API, the following elements choose the Moz metric
            List<Boolean> mozMetrics=ri.mozMetrics;
            boolean moz_threshold_option=ri.moz_threshold_option;//set to true we use the threshold
            double moz_threshold=ri.moz_threshold.doubleValue();//if we want to have a threshold in moz
            //---------------Semantic Analysis method and SWebRank settings----------------
            List<Boolean> ContentSemantics=ri.ContentSemantics;
            List<Double> SWebRankSettings=ri.SWebRankSettings;

            int top_count_moz=0;//if we want to get the moz top-something results (reset for every input file)
            int top_visible=0;//option to set the amount of results you can get in the merged search engine
            //------if we choose to use a Moz metric or Visibility score for our ranking,
            //------we need to set the results_number for the search engines to its max
            if(mozMetrics.get(0)||enginechoice.get(3)){
                if(mozMetrics.get(0)){top_count_moz=results_number;}//if moz is true, top_count_moz gets the value of result number
                if(enginechoice.get(3)){top_visible=results_number;}//if merged engine is true, top_visible gets the value of result number
                results_number=MAX_ENGINE_RESULTS;
            }
            //-----if we want to use Moz we should check first if it works
            if(mozMetrics.get(0)){
                Moz moz = new Moz();
                //---if it works, moz remains true, otherwise it is set to false
                //we overwrite element #0 (set) instead of inserting (add), so the other flags keep their positions
                mozMetrics.set(0,moz.check(config_path));
                //if it is false and we have not chosen the merged engine, we reset back to the standard settings,
                //therefore we reset the number of results to top_count_moz which contains the original number of results
                if(!mozMetrics.get(0)&&!enginechoice.get(3)){
                    results_number=top_count_moz;
                }
            }
            //----------we set the wordLists that we are going to use---------------------
            List<String> finalList = new ArrayList<String>();//finalList is going to contain all the content in the end
            Total_analysis ta = new Total_analysis();//we call total analysis
            int iteration_counter=0;//counts the iterations of the algorithm, it is checked against the performance limit
            //this list of arraylists contains all the wordLists that are produced for every query
            //NOTE(review): it is retrieved after every analysis but it is not used further in this method
            List<ArrayList<String>> array_wordLists;
            List<String> wordList_previous;
            List<String> wordList_new=new ArrayList<>();
            double convergence=0;//we create the convergence percentage and initialize it
            String conv_percentages="";//string that contains all the convergence percentages
            DataManipulation wordsmanipulation=new DataManipulation();//class that manipulates various word data (String, list<String>, etc)
            //the name of the input file (without extension) is used for the corresponding directories of outputs
            String txt_directory=FilenameUtils.getBaseName(input.getName());

            //----------1st run: we already have the queries, so we analyze them directly----------
            String output_child_directory=output_parent_directory+txt_directory+"_level_"+iteration_counter+"//";
            ta.perform(wordList_new,iteration_counter,output_child_directory,domain, enginechoice, queries, results_number, top_visible, mozMetrics, moz_threshold_option, moz_threshold, top_count_moz, ContentSemantics, SensebotConcepts, SWebRankSettings, config_path);
            array_wordLists=ta.getarray_wordLists();
            //get the wordlist that includes all the new words and clean it from null and duplicates
            wordList_new=wordsmanipulation.clearListString(ta.getwordList_total());
            //----------------append the wordlist to a file
            wordsmanipulation.AppendWordList(wordList_new, output_child_directory+"wordList.txt");
            iteration_counter++;

            //----------next runs: while the convergence percentage is below the limit (setting #5)
            //----------and the iteration_counter is below the performance limit (setting #8)
            while(convergence<SWebRankSettings.get(5).doubleValue()&&iteration_counter<SWebRankSettings.get(8).intValue()){
                wordList_previous = wordList_new;
                //we add the previous wordList to the finalList
                finalList=wordsmanipulation.AddAList(wordList_previous, finalList);
                List<String> query_new_list_total = new ArrayList<>();
                int iteration_previous=iteration_counter-1;
                Combinations_Engine cn = new Combinations_Engine();//call the class to combine the terms produced
                for(String query:queries){
                    List<String> ids=wordListIds(domain, query, enginechoice, iteration_previous);
                    ElasticGetWordList ESget=new ElasticGetWordList();//we call this class to get the wordlist from the Elastic Search
                    List<String> maxWords = ESget.getMaxWords(ids, SWebRankSettings.get(9).intValue(),config_path);//we are going to get a max amount of words
                    int query_index=queries.indexOf(query);
                    int size_query_new = SWebRankSettings.get(10).intValue();//the amount of new queries we are willing to create
                    //we create the new queries for every query of the previous round by combining the words produced from this query
                    List<String> query_new_list = cn.perform(maxWords, SWebRankSettings.get(7), queries, SWebRankSettings.get(6), query_index, size_query_new, config_path);
                    //we add the list of new queries to the total list that contains all the new queries
                    query_new_list_total.addAll(query_new_list);
                    System.out.println("query pointer=" + query_index + "");
                }
                //---------------------the following cleans a list from null and duplicates
                query_new_list_total=wordsmanipulation.clearListString(query_new_list_total);
                //--------------we create the new directory that our files are going to be saved
                output_child_directory=output_parent_directory+txt_directory+"_level_"+iteration_counter+"//";
                //----------------append the new queries to a file------------------
                wordsmanipulation.AppendWordList(query_new_list_total, output_child_directory+"queries_"+iteration_counter+".txt");
                if(query_new_list_total.size()<1){break;}//if we don't create new queries we end the while loop
                //total analysis' function is going to do all the work and return back what we need
                ta = new Total_analysis();
                ta.perform(wordList_previous,iteration_counter,output_child_directory,domain,enginechoice, query_new_list_total, results_number, top_visible, mozMetrics, moz_threshold_option, moz_threshold, top_count_moz, ContentSemantics, SensebotConcepts, SWebRankSettings, config_path);
                array_wordLists=ta.getarray_wordLists();
                //get the wordlist that includes all the new words and clean it from null and duplicates
                wordList_new=wordsmanipulation.clearListString(ta.getwordList_total());
                //----------------append the wordlist to a file--------------------
                wordsmanipulation.AppendWordList(wordList_new, output_child_directory+ "wordList.txt");
                //the convergence percentage of this iteration, it is used to check the convergence
                convergence = ta.getConvergence();
                //a string that contains all the convergence percentages for each round separated by \n character
                conv_percentages = conv_percentages + "\n" + convergence;
                //a file that is going to include the convergence percentages
                wordsmanipulation.AppendString(conv_percentages, output_child_directory+ "convergence_percentage.txt");
                //we add the new wordList to the finalList
                finalList=wordsmanipulation.AddAList(wordList_new, finalList);
                //the new queries become the queries of the next iteration
                queries=query_new_list_total;
                iteration_counter++;
            }
            if(iteration_counter==1){ finalList=wordsmanipulation.AddAList(wordList_new, finalList);}
            //--------------------content List----------------
            if (!finalList.isEmpty()) {
                //---------------------the following cleans the final list from null and duplicates
                finalList=wordsmanipulation.clearListString(finalList);
                //write the keywords to a file
                boolean content_written=wordsmanipulation.AppendWordList(finalList, output_parent_directory+"total_content.txt");
                if(!content_written){
                    System.out.print("can not create the content file for: "+output_parent_directory+"total_content.txt");
                }
            }
            //we are going to save the total content with its convergence on the ElasticSearch cluster in a separated index
            //get the elastic search indexes in a list
            List<String> elasticIndexes=ri.GetKeyFile(config_path, "elasticSearchIndexes");
            indexTotalContent(elasticIndexes.get(0), domain, finalList, conv_percentages);
            //----------------------convergence percentages writing to file---------------
            if(conv_percentages.length()!=0){
                boolean convergence_written=wordsmanipulation.AppendString(conv_percentages, output_parent_directory+"convergence_percentages.txt");
                if(!convergence_written){
                    System.out.print("can not create the convergence file for: "+output_parent_directory+"convergence_percentages.txt");
                }
            }
        }
    }

    /**
     * Creates the ids ("domain/query/engine/iteration") of the wordlists of a query, one for every chosen engine.
     * @param domain the domain that is searched
     * @param query the query that produced the wordlists
     * @param enginechoice the chosen engines (element #0 Bing, #1 Google, #2 Yahoo!)
     * @param iteration the iteration in which the wordlists were produced
     * @return the ids in the order bing, google, yahoo (only for the engines that are set to true)
     */
    private static List<String> wordListIds(String domain, String query, List<Boolean> enginechoice, int iteration){
        String[] engines={"bing","google","yahoo"};
        List<String> ids=new ArrayList<>();
        for(int i=0;i<engines.length;i++){
            if(enginechoice.get(i)){
                ids.add(domain+"/"+query+"/"+engines[i]+"/"+iteration);
            }
        }
        return ids;
    }

    /**
     * Saves the total content and the convergence percentages of a domain in Elasticsearch
     * (type "content", document id the domain). The client is closed even if the indexing fails.
     * @param index the name of the index to save to
     * @param domain the domain, used as the id of the document
     * @param totalContent the total content that was collected
     * @param convergences the convergence percentages of all the iterations
     */
    @SuppressWarnings("unchecked")//JSONObject is a raw map, we only put String keys in it
    private static void indexTotalContent(String index, String domain, List<String> totalContent, String convergences){
        Settings settings = ImmutableSettings.settingsBuilder()
                .put("cluster.name","lshrankldacluster").build();
        Client client = new TransportClient(settings)
                .addTransportAddress(new InetSocketTransportAddress("localhost", 9300));
        try{
            JSONObject objEngineLevel = new JSONObject();
            objEngineLevel.put("TotalContent", totalContent);//we save the total content
            objEngineLevel.put("Convergences", convergences);//we save the convergence percentages
            IndexRequest indexReq=new IndexRequest(index,"content",domain);//we save also the domain
            indexReq.source(objEngineLevel);
            client.index(indexReq).actionGet();
        }
        finally{
            client.close();
        }
    }
}