/* * Copyright 2015 Themistoklis Mavridis <themis.mavridis@issel.ee.auth.gr>. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.thesmartweb.swebrank; /** * * * @author Themis Mavridis */ import java.io.*; import java.io.IOException; import java.nio.file.Path; import java.nio.file.Paths; import java.util.logging.Level; import java.util.logging.Logger; import java.util.*; import java.util.List; import java.util.Scanner; /** * Class to read the input files * @author themis */ public class ReadInput { /** * the number of results that are returned from each search engine */ protected int results_number; /** * moz threshold */ protected Double moz_threshold; /** * if we want to have a threshold in moz or not */ protected boolean moz_threshold_option; /** * amount of Sensebot Concept (if we choose to use Sensebot) */ protected int SensebotConcepts; /** * List that has true 0 if we would like to use Moz * #1 Domain Authority * #2 External MozRank * #3 MozRank * #4 MozTrust * #5 Page Authority * #6 Subdomain MozRank */ protected List<Boolean> mozMetrics; /** * List that contains the queries that are defined in the input files */ protected List<String> queries; /** * Search engine choice, Bing is in 1st place, Google is in 2nd place, Yahoo 3rd */ protected List<Boolean> enginechoice; /** * List that contains which Semantic Analysis algorithm we choose * #1 Diffbotflag * #2 LDA * #3 Sensebotflag * #4 TF-IDF */ protected List<Boolean> ContentSemantics; /** * List that contains then SWebRank settings * #0 beta * #1 number of topics * #2 number of iterations * #3 number of top words * #4 probability * #5 nmi convergence limit * #6 nwd threshold * #7 combine limit * #8 performance limit * #9 new terms to combine from wordlist per query per round * #10 max new queries to generate per previous round query */ protected List<Double> SWebRankSettings; /** * Domain of queries */ protected String domain; /** * Initialize the values */ public ReadInput() { this.SensebotConcepts = 0; this.moz_threshold_option = false; this.moz_threshold = 0.0; this.results_number = 0; this.queries = new LinkedList<String>();//better in add(E element) this.enginechoice= new ArrayList<Boolean>();//lower complexity in get this.ContentSemantics=new ArrayList<Boolean>(); this.SWebRankSettings=new ArrayList<Double>(); this.mozMetrics=new ArrayList<Boolean>(); this.domain=""; } /** * Method to read SWebRank's settings * @param Input The file that contains the input settings to be read * @return True/False if everything was read correctly */ public boolean perform(File Input){ FileInputStream inputStream=null; Scanner sc=null; try{ inputStream=new FileInputStream(Input); sc=new Scanner(inputStream); if (sc.hasNextLine()) { domain = sc.nextLine().toString().split(":")[1].trim(); } if (sc.hasNextLine()) { int queries_number = Integer.parseInt(sc.nextLine().toString().split(":")[1].trim()); int j=0; while(j<queries_number){ String temp=sc.nextLine().toString().split(":")[1].trim(); boolean add = queries.add(temp); j++; } } if (sc.hasNextLine()) { results_number = Integer.parseInt(sc.nextLine().toString().split(":")[1].trim()); } //------------------- if (sc.hasNextLine()) {//Bing enginechoice.add(Boolean.parseBoolean(sc.nextLine().toString().split(":")[1].trim())); } if (sc.hasNextLine()) {//Google enginechoice.add(Boolean.parseBoolean(sc.nextLine().toString().split(":")[1].trim())); } if (sc.hasNextLine()) {//Yahoo enginechoice.add(Boolean.parseBoolean(sc.nextLine().toString().split(":")[1].trim())); } if (sc.hasNextLine()) {//Merged Engine Results enginechoice.add(Boolean.parseBoolean(sc.nextLine().toString().split(":")[1].trim())); } //-------------------- if (sc.hasNextLine()) {//0moz mozMetrics.add(Boolean.parseBoolean(sc.nextLine().toString().split(":")[1].trim())); } if (sc.hasNextLine()) {//1Domain Authority mozMetrics.add(Boolean.parseBoolean(sc.nextLine().toString().split(":")[1].trim())); } if (sc.hasNextLine()) {//2External MozRank mozMetrics.add(Boolean.parseBoolean(sc.nextLine().toString().split(":")[1].trim())); } if (sc.hasNextLine()) {//3MozRank mozMetrics.add(Boolean.parseBoolean(sc.nextLine().toString().split(":")[1].trim())); } if (sc.hasNextLine()) {//4MozTrust mozMetrics.add(Boolean.parseBoolean(sc.nextLine().toString().split(":")[1].trim())); } if (sc.hasNextLine()) {//5Page Authority mozMetrics.add(Boolean.parseBoolean(sc.nextLine().toString().split(":")[1].trim())); } if (sc.hasNextLine()) {//6Subdomain_MozRank mozMetrics.add(Boolean.parseBoolean(sc.nextLine().toString().split(":")[1].trim())); } if (sc.hasNextLine()) { moz_threshold_option = Boolean.parseBoolean(sc.nextLine().toString().split(":")[1].trim()); } //------if we are going to use the threshold then we insert the value to the moz_threshold variable //------otherwise, we just skip it if (sc.hasNextLine()) { if(moz_threshold_option){ moz_threshold = Double.parseDouble(sc.nextLine().toString().split(":")[1].trim()); } else{ sc.nextLine(); moz_threshold=-1.0; } } //-----the following is used to check if we have one and only one Moz option active if(mozMetrics.get(0)){ int k=1; int true_position=-1; //we search for the option set to true while(true_position<0&&k<mozMetrics.size()){ if(mozMetrics.get(k)){ true_position=k; } k++; } //if the user has set multiple options set true, we are keeping the first one if(true_position>0){ int mozMetricsSize=mozMetrics.size(); for(k=true_position+1;k<mozMetricsSize;k++){ mozMetrics.set(k,false); } } //if the user has not set any option to true, we are not going to use Moz else{ mozMetrics.set(0, false); } } //----------------------------- if (sc.hasNextLine()) {//0Diffbotflag ContentSemantics.add(Boolean.parseBoolean(sc.nextLine().toString().split(":")[1].trim())); } if (sc.hasNextLine()) {//1LDAflag ContentSemantics.add(Boolean.parseBoolean(sc.nextLine().toString().split(":")[1].trim())); } if (sc.hasNextLine()) {//2Sensebotflag ContentSemantics.add(Boolean.parseBoolean(sc.nextLine().toString().split(":")[1].trim())); } if (sc.hasNextLine()) {//3TFIDFflag ContentSemantics.add(Boolean.parseBoolean(sc.nextLine().toString().split(":")[1].trim())); } if(ContentSemantics.get(2).booleanValue()){ if (sc.hasNextLine()) { SensebotConcepts=Integer.parseInt(sc.nextLine().toString().split(":")[1].trim()); } } else { sc.nextLine(); SensebotConcepts=0; } //----------------------------------- //-----the following is used to check if we have one and only one Content Semantic option active int k=1; int true_position=-1; while(true_position<0&&k<ContentSemantics.size()){ if(ContentSemantics.get(k)){ true_position=k; } k++; } if(true_position>0){ int contentSemanticsSize=ContentSemantics.size(); for(k=true_position+1;k<contentSemanticsSize;k++){ ContentSemantics.set(k,false); } } //if the user has not set any option to true, we are not going to use LDA else{ ContentSemantics.set(1,true); } //----------------------------- if (sc.hasNextLine()) {//0beta SWebRankSettings.add(Double.parseDouble(sc.nextLine().toString().split(":")[1].trim())); } if (sc.hasNextLine()) {//1number of topics SWebRankSettings.add(Double.parseDouble(sc.nextLine().toString().split(":")[1].trim())); } if (sc.hasNextLine()) {//2number of iterations SWebRankSettings.add(Double.parseDouble(sc.nextLine().toString().split(":")[1].trim())); } if (sc.hasNextLine()) {//3number of top words SWebRankSettings.add(Double.parseDouble(sc.nextLine().toString().split(":")[1].trim())); } if (sc.hasNextLine()) {//4number of probability threshold SWebRankSettings.add(Double.parseDouble(sc.nextLine().toString().split(":")[1].trim())); } if (sc.hasNextLine()) {//5conversion limit SWebRankSettings.add(Double.parseDouble(sc.nextLine().toString().split(":")[1].trim())); } if (sc.hasNextLine()) {//6ngd threshold SWebRankSettings.add(Double.parseDouble(sc.nextLine().toString().split(":")[1].trim())); } if (sc.hasNextLine()) {//7combine limit SWebRankSettings.add(Double.parseDouble(sc.nextLine().toString().split(":")[1].trim())); } if (sc.hasNextLine()) {//8performance limit SWebRankSettings.add(Double.parseDouble(sc.nextLine().toString().split(":")[1].trim())); } if (sc.hasNextLine()) {//9amount of terms to get from each query from the wordlist of LDA to create the new queries per round SWebRankSettings.add(Double.parseDouble(sc.nextLine().toString().split(":")[1].trim())); } if (sc.hasNextLine()) {//10amount of queries to create for each query using NWD SWebRankSettings.add(Double.parseDouble(sc.nextLine().toString().split(":")[1].trim())); } if (sc.hasNextLine()) {//11amount of top topics to choose SWebRankSettings.add(Double.parseDouble(sc.nextLine().toString().split(":")[1].trim())); } if (sc.hasNextLine()) {//12dbpedia spotlight confidence SWebRankSettings.add(Double.parseDouble(sc.nextLine().toString().split(":")[1].trim())); } if (sc.hasNextLine()) {//13dbpedia spotlight support SWebRankSettings.add(Double.parseDouble(sc.nextLine().toString().split(":")[1].trim())); } if (sc.ioException() !=null){ return false; } } catch (IOException ex) { Logger.getLogger(ReadInput.class.getName()).log(Level.SEVERE, null, ex); return false; } finally { if (inputStream !=null){ try { inputStream.close(); } catch (IOException ex) { Logger.getLogger(ReadInput.class.getName()).log(Level.SEVERE, null, ex); return false; } } if (sc !=null){ sc.close(); return true; } } return false; } /** * Method to get the api credentials in a list from a directory that contains multiple txt files * @param config_path the directory to read * @param name the name of the file that we would like to read * @return a list with the credentials */ public List<String> GetKeyFile(String config_path,String name){ Path input_path=Paths.get(config_path); DataManipulation getfiles=new DataManipulation();//class responsible for the extraction of paths Collection<File> inputs_files;//array to include the paths of the txt files inputs_files=getfiles.getinputfiles(input_path.toString(),"txt");//method to retrieve all the path of the input documents List<String> linesList = new ArrayList<>(); ReadInput ri = new ReadInput(); for (File input : inputs_files) { if(input.getName().contains(name)){ linesList=ri.readLinesConfig(input); } } return linesList; } /** * Get the API credentials from a given file * @param Input the file to read * @return a List with the credentials in strings */ public List<String> readLinesConfig(File Input){ FileInputStream inputStream=null; Scanner sc=null; List<String> output = new ArrayList<>(); try{ inputStream=new FileInputStream(Input); sc=new Scanner(inputStream); while (sc.hasNextLine()) { output.add(sc.nextLine().trim()); } } catch (IOException ex) { Logger.getLogger(ReadInput.class.getName()).log(Level.SEVERE, null, ex); return output; } finally { if (inputStream !=null){ try { inputStream.close(); } catch (IOException ex) { Logger.getLogger(ReadInput.class.getName()).log(Level.SEVERE, null, ex); return output; } } if (sc !=null){ sc.close(); } } return output; } }