/*
* Copyright 2015 Themistoklis Mavridis <themis.mavridis@issel.ee.auth.gr>.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.thesmartweb.swebrank;
import com.snowtide.PDF;
import com.snowtide.pdf.Document;
import com.snowtide.pdf.OutputTarget;
import java.io.File;
import java.io.IOException;
import java.io.StringWriter;
import java.util.List;
import java.net.URL;
import java.util.logging.Level;
import java.util.logging.Logger;
import org.apache.commons.io.FileUtils;
import org.elasticsearch.ElasticsearchException;
import org.elasticsearch.action.index.IndexRequest;
import org.elasticsearch.action.index.IndexResponse;
import org.elasticsearch.client.Client;
import org.elasticsearch.client.transport.TransportClient;
import org.elasticsearch.common.settings.ImmutableSettings;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.transport.InetSocketTransportAddress;
import org.elasticsearch.node.Node;
import static org.elasticsearch.node.NodeBuilder.nodeBuilder;
import org.json.simple.JSONObject;
/**
* Class to analyze the URLs of the results returned by the Search Engine APIs
* @author Themistoklis Mavridis
*/
public class LinksParseAnalysis {
/**
* the url to analyze
*/
public String url_check;
/**
* the top words, when TF-IDF is used
*/
protected List<String> topWordsTFIDF;
/**
* Method that extracts the content from the urls provided, stores it in a specific index of our Elasticsearch cluster
* and calls the selected semantic analysis algorithm. So far the method extracts content from:
* -html
* -youtube videos
* -pdf files
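* <p>A minimal usage sketch (the paths, query, engine name and LDA settings below are hypothetical example values):</p>
* <pre>{@code
* LinksParseAnalysis lpa = new LinksParseAnalysis();
* String[] links = {"http://example.com/page.html", "http://example.com/paper.pdf"};
* String[] parsed = lpa.perform(links, "example.com", "bing", "/tmp/swebrank/", "example query",
*         10, 0.5, 0.1, 1000, 10, true, false, "/tmp/swebrank/config/");
* }</pre>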
* @param total_links All the links that we are going to analyze
* @param domain The domain that we analyze
* @param engine The search engine whose results we analyze
* @param example_dir The directory where the results of the analysis are saved
* @param quer The query that the urls were results for (used to build the document id in elasticsearch)
* @param nTopics The number of topics for Latent Dirichlet Allocation
* @param alpha The alpha value of LDA
* @param beta The beta value of LDA
* @param niters The number of iterations of LDA
* @param top_words The amount of top words per topic to keep for LDA
* @param LDAflag Flag if LDA is used
* @param TFIDFflag Flag if TFIDF is used
* @param config_path the path that contains the configuration files
* @return the parsed output for each url provided
*/
public String[] perform(String[] total_links, String domain, String engine, String example_dir, String quer, int nTopics, double alpha, double beta, int niters, int top_words, boolean LDAflag, boolean TFIDFflag, String config_path){
String[] parse_output = new String[total_links.length];
try {
System.gc();
WebParser web = new WebParser();//our web parser
APIconn apicon = new APIconn();//our instance to check the connection to a url
int counter_LDA_documents = 0;
Settings settings = ImmutableSettings.settingsBuilder()
.put("cluster.name","lshrankldacluster").build();
Client client = new TransportClient(settings)
.addTransportAddress(new InetSocketTransportAddress("localhost", 9300));
//Node node = nodeBuilder().client(true).clusterName("lshrankldacluster").node();//our elasticsearch node builder
//Client client = node.client();//the client for elasticsearch node
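//we connect as a TransportClient to an external Elasticsearch cluster named "lshrankldacluster" on localhost:9300;
//the commented-out lines above show the earlier embedded-node approach to joining the cluster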
for (int i = 0; i < (total_links.length); i++) {
parse_output[i]="";
if (total_links[i] != null) {
System.out.println("Link: "+total_links[i]+"\n");
DataManipulation dm = new DataManipulation();
boolean structuredFile = dm.StructuredFileCheck(total_links[i]);//we check if the url points to a structured document file type
if (!apicon.check_conn(total_links[i]).contains("ok-conn") || structuredFile || total_links[i].contains("http://www.youtube.com/watch?")) {
if (total_links[i].contains("http://www.youtube.com/watch?")) {//if the link is a youtube link we have to treat its JSON differently
String ventry = total_links[i].substring(31);//strip the "http://www.youtube.com/watch?v=" prefix (31 characters) to keep only the video id
JSONparsing ypr = new JSONparsing();
url_check=total_links[i];
File current_url = new File(example_dir+ engine +"/" + i + "/"+ "current_url.txt");
FileUtils.writeStringToFile(current_url, url_check);
parse_output[i] = ypr.GetYoutubeDetails(ventry).replace("\n", "").replace("\r", "");
System.gc();
if (parse_output[i]!=null) {
counter_LDA_documents++;
String directory = example_dir+ engine + "/" + i + "/";
File file_content_lda = new File(directory + "youtube_content.txt");
FileUtils.writeStringToFile(file_content_lda, parse_output[i]);
}
}
if (total_links[i].contains(".pdf")) {//if the link has a pdf we use Snowtide Pdf reader
url_check=total_links[i];
File current_url = new File(example_dir+ engine +"/" + i + "/"+ "current_url.txt");
FileUtils.writeStringToFile(current_url, url_check);
File current_pdf = new File(example_dir+ engine +"/" + i + "/"+ "current_pdf.txt");
URL URLlink = new URL(url_check);
FileUtils.copyURLToFile(URLlink, current_pdf);
Document pdf = PDF.open(current_pdf);
StringWriter buffer = new StringWriter();
pdf.pipe(new OutputTarget(buffer));
pdf.close();
parse_output[i] = buffer.toString().replace("\n", "").replace("\r", "");
Stopwords stopwords = new Stopwords();
parse_output[i] = stopwords.stop(parse_output[i]);
System.gc();
FileUtils.deleteQuietly(current_pdf);//we delete the temporary pdf file after we read it
if (parse_output[i]!=null) {
counter_LDA_documents++;
String directory = example_dir+ engine + "/" + i + "/";
File file_content_lda = new File(directory + "pdf_content.txt");
FileUtils.writeStringToFile(file_content_lda, parse_output[i]);
}
}
}
else {//if the link does not fall into any of the cases above, we parse it with WebParser
String directory = example_dir + engine + "/" + i + "/";
System.out.println("Link:"+total_links[i]+"\n");
url_check=total_links[i];
File current_url = new File(directory+"current_url.txt");
FileUtils.writeStringToFile(current_url, url_check);
System.gc();
parse_output[i] = web.Parse(url_check);//we call the parser
System.gc();
if (parse_output[i]!=null) {
counter_LDA_documents++;//we count the amount of documents, as it is needed for JGibbLDA as seen in http://jgibblda.sourceforge.net/#2.3._Input_Data_Format
directory = example_dir+ engine + "/" + i + "/";
//write the output from the html parsing
File file_content_lda = new File(directory + "html_parse_content.txt");
FileUtils.writeStringToFile(file_content_lda, parse_output[i]);
}
}
JSONObject obj = new JSONObject();//an object to save the parsed content in elasticsearch
obj.put("ParsedContent", parse_output[i]);
String id = domain + "/" + quer + "/" + engine + "/" + total_links[i];
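//hypothetical example of a resulting id: "example.com/some query/bing/http://example.com/page.html"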
ReadInput ri = new ReadInput();
List<String> elasticIndexes=ri.GetKeyFile(config_path, "elasticSearchIndexes");
IndexRequest indexReq=new IndexRequest(elasticIndexes.get(3),"content",id);
indexReq.source(obj);
IndexResponse indexRes = client.index(indexReq).actionGet();
}
}
//node.close();
client.close();
String output_string_content = Integer.toString(counter_LDA_documents);//the first line of the combined content is the number of documents, as required by the JGibbLDA input format
TwitterAnalysis tw=new TwitterAnalysis();//we are going to gather info from Twitter using Twitter4j
String twitter_txt=tw.perform(quer,config_path);
for (int i = 0; i < parse_output.length; i++) {//we combine all the parsed content into one document
if (parse_output[i]!=null) {
output_string_content = output_string_content + "\n" + parse_output[i];
}
}
if(!(twitter_txt.equalsIgnoreCase("fail"))){
output_string_content = output_string_content + "\n" + twitter_txt;//we add also the twitter content
}
String directory = example_dir + engine + "/";
//call LDA
File file_content_lda = new File(directory + "content_for_analysis.txt");//we are going to save the content also in txt format for backup and usage for LDA
FileUtils.writeStringToFile(file_content_lda, output_string_content);
if(LDAflag){
LDAcall ld = new LDAcall();//we call lda
ld.call(nTopics, alpha, beta, niters, top_words, directory);
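//LDAcall is expected to run LDA (JGibbLDA-style) over content_for_analysis.txt with nTopics topics, the given alpha/beta priors, niters Gibbs iterations, keeping top_words words per topic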
}
else if(TFIDFflag){
TFIDF tf=new TFIDF();//we call TFIDF
topWordsTFIDF=tf.compute(parse_output,top_words,example_dir);
}
return parse_output;
} catch (IOException | ElasticsearchException | ArrayIndexOutOfBoundsException ex) {
Logger.getLogger(LinksParseAnalysis.class.getName()).log(Level.SEVERE, null, ex);
return parse_output;
}
}
/**
* Method to return the top words from TF-IDF
* @return the top words from TF-IDF in a List
*/
public List<String> return_topWordsTFIDF(){return topWordsTFIDF;}
}