/*
 * Copyright 2015 Themistoklis Mavridis <themis.mavridis@issel.ee.auth.gr>.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.thesmartweb.swebrank;

/**
 * Main method of SWebRank.
 * It receives the input in a txt file in a structured format.
 * It passes all the input variables to the total analysis class.
 * It receives the wordlist of every iteration for each query and creates a wordlist for every domain.
 * It gets all the combinations and permutations of all the words in the wordlist.
 * It creates the new queries according to their Normalized Web Distance (using the Bing Search API).
 * It compares the wordlist of every iteration with the previous one using Normalized Mutual Information.
 * @author Themis Mavridis
 */
import java.io.*;
import java.util.*;
import java.util.List;
import org.apache.commons.io.FilenameUtils;
import java.nio.file.Path;
import java.nio.file.Paths;
import org.elasticsearch.action.index.IndexRequest;
import org.elasticsearch.action.index.IndexResponse;
import org.elasticsearch.client.Client;
import org.elasticsearch.client.transport.TransportClient;
import org.elasticsearch.common.settings.ImmutableSettings;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.transport.InetSocketTransportAddress;
import org.elasticsearch.node.Node;
import static org.elasticsearch.node.NodeBuilder.nodeBuilder;
import org.json.simple.JSONObject;

/**
 * Main class of SWebRank that gets the settings and gets the results of every iteration.
* It calls the process to create new queries and to check if we converge. * @author themis */ public class Main { /** * @param args the command line arguments */ public static void main(String[] args){ Path input_path=Paths.get("//mnt//var//DBs//inputsL10//nba//");//input directory String output_parent_directory="//mnt//var//DBs//outputsConfL10//nba//";//output directory String config_path="//mnt//var//DBs//config//";//input directory //---Disable apache log manually---- //System.setProperty("org.apache.commons.logging.Log","org.apache.commons.logging.impl.NoOpLog"); System.setProperty("org.apache.commons.logging.Log","org.apache.commons.logging.impl.Log4JLogger"); //--------------Domain that is searched---------- String domain=""; //------------------search engine related options---------------------- List<String> queries=null; int results_number = 0;//the number of results that are returned from each search engine List<Boolean> enginechoice = null; //list element #0. True/False Bing //list element #1. True/False Google //list element #2. True/False Yahoo! //list element #3. True/False Merged //-----------Moz options--------------------- List<Boolean> mozMetrics = null; //The list is going to contain the moz related input in the following order //list element #1. True/False, True we use Moz API, false not //list element #2. True if we use Domain Authority //list element #3. True if we use External MozRank //list element #4. True if we use MozRank //list element #5. True if we use MozTrust //list element #6. True if we use Subdomain MozRank //list element #7. 
True if we use Page Authority //only one is used (the first to be set to true) boolean moz_threshold_option = false;//set to true we use the threshold Double moz_threshold = 0.0;//if we want to have a threshold in moz int top_count_moz = 0;//if we want to get the moz top-something results //---------------Semantic Analysis method---------------- List<Boolean> ContentSemantics=null; int SensebotConcepts = 0;//define the amount of concepts that sensebot is going to recognize List<Double> SWebRankSettings=null; //------(string)directory is going to be used later----- String output_child_directory; //-------we get all the paths of the txt (input) files from the input directory------- DataManipulation getfiles=new DataManipulation();//class responsible for the extraction of paths Collection<File> inputs_files;//array to include the paths of the txt files inputs_files=getfiles.getinputfiles(input_path.toString(),"txt");//method to retrieve all the path of the input documents //------------read the txt files------------ for (File input : inputs_files) { ReadInput ri=new ReadInput();//function to read the input boolean check_reading_input=ri.perform(input); if(check_reading_input){ domain=ri.domain; //---------- queries=ri.queries; results_number=ri.results_number; enginechoice=ri.enginechoice; //------------ mozMetrics=ri.mozMetrics; moz_threshold_option=ri.moz_threshold_option; moz_threshold=ri.moz_threshold.doubleValue(); //--------------- ContentSemantics=ri.ContentSemantics; SWebRankSettings=ri.SWebRankSettings; } int top_visible=0;//option to set the amount of results you can get in the merged search engine //------if we choose to use a Moz metric or Visibility score for our ranking, we need to set the results_number for the search engines to its max which is 50 //-----we set the top results number for moz or Visibility rank---- if(mozMetrics.get(0)||enginechoice.get(3)){ if(mozMetrics.get(0)){top_count_moz=results_number;}//if moz is true, top_count_moz gets the 
value of result number if(enginechoice.get(3)){top_visible=results_number;}//if merged engine is true, top_visible gets the value of result number results_number=50;//this is the max amount of results that you can get from the search engine APIs } //-----if we want to use Moz we should check first if it works if(mozMetrics.get(0)){ Moz Moz = new Moz(); //---if it works, moz remains true, otherwise it is set to false mozMetrics.add(0,Moz.check(config_path)); //if it is false and we have chosen to use Visibility score with Moz, we reset back to the standard settings (ranking and not merged) //therefore, we reset the number of results from 50 to the top_count_moz which contained the original number of results if(!mozMetrics.get(0)){ if(!enginechoice.get(3)){results_number=top_count_moz;} } } //----------we set the wordLists that we are going to use--------------------- List<String> finalList = new ArrayList<String>();//finalList is going to contain all the content in the end Total_analysis ta = new Total_analysis();//we call total analysis int iteration_counter=0;//the iteration_counter is used in order to count the number of iterations of the algorithm and to be checked with perf_limit //this list of arraylists is going to contain all the wordLists that are produced for every term of the String[] query, //in order to calculate the NGD scores between every term of the wordList and the term that was used as query in order to produce the spesific wordList List<ArrayList<String>> array_wordLists = new ArrayList<>(); List<String> wordList_previous=new ArrayList<>(); List<String> wordList_new=new ArrayList<>(); double convergence=0;//we create the convergence percentage and initialize it String conv_percentages="";//string that contains all the convergence percentages DataManipulation wordsmanipulation=new DataManipulation();//method to manipulate various word data (String, list<String>, etc) do{ //if we run the algorithm for the 1st time we already have the query so we 
skip the loop below that produces the new array of query if(iteration_counter!=0){ wordList_previous = wordList_new; //we add the previous wordList to the finalList finalList=wordsmanipulation.AddAList(wordList_previous, finalList); List<String> query_new_list_total = new ArrayList<>(); int iteration_previous=iteration_counter-1; Combinations_Engine cn = new Combinations_Engine();//call the class to combine the terms produced for(String query:queries){ List<String> ids=new ArrayList<>(); if(enginechoice.get(0)){ String id=domain+"/"+query+"/bing"+"/"+iteration_previous; ids.add(id); } if(enginechoice.get(1)){ String id=domain+"/"+query+"/google"+"/"+iteration_previous; ids.add(id); } if(enginechoice.get(2)){ String id=domain+"/"+query+"/yahoo"+"/"+iteration_previous; ids.add(id); } ElasticGetWordList ESget=new ElasticGetWordList();//we call this class to get the wordlist from the Elastic Search List<String> maxWords = ESget.getMaxWords(ids, SWebRankSettings.get(9).intValue(),config_path);//we are going to get a max amount of words int query_index=queries.indexOf(query); int size_query_new = SWebRankSettings.get(10).intValue();//the amount of new queries we are willing to create //we create the new queries for every query of the previous round by combining the words produced from this query List<String> query_new_list = cn.perform(maxWords, SWebRankSettings.get(7), queries, SWebRankSettings.get(6), query_index, size_query_new, config_path); //we add the list of new queries to the total list that containas all the new queries query_new_list_total.addAll(query_new_list); System.out.println("query pointer=" + query_index + ""); } //---------------------the following cleans a list from null and duplicates query_new_list_total=wordsmanipulation.clearListString(query_new_list_total); //--------------we create the new directory that our files are going to be saved String txt_directory=FilenameUtils.getBaseName(input.getName()); 
output_child_directory=output_parent_directory+txt_directory+"_level_"+iteration_counter+"//"; //----------------append the wordlist to a file------------------ wordsmanipulation.AppendWordList(query_new_list_total, output_child_directory+"queries_"+iteration_counter+".txt"); if(query_new_list_total.size()<1){break;}//if we don't create new queries we end the while loop //total analysis' function is going to do all the work and return back what we need ta = new Total_analysis(); ta.perform(wordList_previous,iteration_counter,output_child_directory,domain,enginechoice, query_new_list_total, results_number, top_visible, mozMetrics, moz_threshold_option, moz_threshold.doubleValue(), top_count_moz, ContentSemantics, SensebotConcepts, SWebRankSettings, config_path); //we get the array of wordlists array_wordLists=ta.getarray_wordLists(); //get the wordlist that includes all the new queries wordList_new=ta.getwordList_total(); //---------------------the following cleans a list from null and duplicates------------- wordList_new=wordsmanipulation.clearListString(wordList_new); //----------------append the wordlist to a file-------------------- wordsmanipulation.AppendWordList(wordList_new, output_child_directory+ "wordList.txt"); //the concergence percentage of this iteration convergence = ta.getConvergence();//we are going to use convergence score to check the convergence //a string that contains all the convergence percentage for each round separated by \n character conv_percentages = conv_percentages + "\n" + convergence; //a file that is going to include the convergence percentages wordsmanipulation.AppendString(conv_percentages, output_child_directory+ "convergence_percentage.txt"); //we add the new wordList to the finalList finalList=wordsmanipulation.AddAList(wordList_new, finalList); //we set the query array to be equal to the query new total that we have created queries=query_new_list_total; //we increment the iteration_counter in order to count the iterations of 
the algorithm and to use the perf_limit iteration_counter++; } else{//the following source code is performed on the 1st run of the loop //------------we extract the parent path of the file String txt_directory=FilenameUtils.getBaseName(input.getName()); //----------we create a string that is going to be used for the corresponding directory of outputs output_child_directory=output_parent_directory+txt_directory+"_level_"+iteration_counter+"//"; //we call total analysis function performOld ta.perform(wordList_new,iteration_counter,output_child_directory,domain, enginechoice, queries, results_number, top_visible, mozMetrics, moz_threshold_option, moz_threshold.doubleValue(), top_count_moz, ContentSemantics, SensebotConcepts, SWebRankSettings, config_path); //we get the array of wordlists array_wordLists=ta.getarray_wordLists(); //get the wordlist that includes all the new queries wordList_new=ta.getwordList_total(); //---------------------the following cleans a list from null and duplicates wordList_new=wordsmanipulation.clearListString(wordList_new); //----------------append the wordlist to a file wordsmanipulation.AppendWordList(wordList_new, output_child_directory+"wordList.txt"); //----------------------------------------- iteration_counter++;//increase the iteration_counter that counts the iterations of the algorithm } }while(convergence<SWebRankSettings.get(5).doubleValue()&&iteration_counter<SWebRankSettings.get(8).intValue());//while the convergence percentage is below the limit and the iteration_counter below the performance limit if(iteration_counter==1){ finalList=wordsmanipulation.AddAList(wordList_new, finalList);} //--------------------content List---------------- if (!finalList.isEmpty()) { //---------------------the following cleans the final list from null and duplicates finalList=wordsmanipulation.clearListString(finalList); //write the keywords to a file boolean flag_file = false;//boolean flag to declare successful write to file 
flag_file=wordsmanipulation.AppendWordList(finalList, output_parent_directory+"total_content.txt"); if(!flag_file){ System.out.print("can not create the content file for: "+output_parent_directory+"total_content.txt"); } } //we are going to save the total content with its convergence on the ElasticSearch cluster in a separated index //Node node = nodeBuilder().client(true).clusterName("lshrankldacluster").node(); //Client client = node.client(); //get the elastic search indexes in a list List<String> elasticIndexes=ri.GetKeyFile(config_path, "elasticSearchIndexes"); Settings settings = ImmutableSettings.settingsBuilder() .put("cluster.name","lshrankldacluster").build(); Client client = new TransportClient(settings) .addTransportAddress(new InetSocketTransportAddress("localhost", 9300) ); JSONObject objEngineLevel = new JSONObject(); objEngineLevel.put("TotalContent", finalList);//we save the total content objEngineLevel.put("Convergences", conv_percentages);//we save the convergence percentages IndexRequest indexReq=new IndexRequest(elasticIndexes.get(0),"content",domain);//we save also the domain indexReq.source(objEngineLevel); IndexResponse indexRes = client.index(indexReq).actionGet(); //node.close(); client.close(); //----------------------convergence percentages writing to file--------------- //use the conv_percentages string if(conv_percentages.length()!=0){ boolean flag_file = false;//boolean flag to declare successful write to file flag_file=wordsmanipulation.AppendString(conv_percentages, output_parent_directory+"convergence_percentages.txt"); if(!flag_file){ System.out.print("can not create the convergence file for: "+output_parent_directory+"convergence_percentages.txt"); } } } } }