/*
* Copyright 2015 Themistoklis Mavridis <themis.mavridis@issel.ee.auth.gr>.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.thesmartweb.swebrank;
import com.snowtide.PDF;
import com.snowtide.pdf.Document;
import com.snowtide.pdf.OutputTarget;
import java.io.File;
import java.io.IOException;
import java.io.StringWriter;
import java.util.List;
import java.net.URL;
import java.util.logging.Level;
import java.util.logging.Logger;
import org.apache.commons.io.FileUtils;
import org.elasticsearch.ElasticsearchException;
import org.elasticsearch.action.index.IndexRequest;
import org.elasticsearch.action.index.IndexResponse;
import org.elasticsearch.client.Client;
import org.elasticsearch.client.transport.TransportClient;
import org.elasticsearch.common.settings.ImmutableSettings;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.transport.InetSocketTransportAddress;
import org.elasticsearch.node.Node;
import static org.elasticsearch.node.NodeBuilder.nodeBuilder;
import org.json.simple.JSONObject;
/**
* Class to analyze the URLs of the results returned by the Search Engine APIs
* @author Themistoklis Mavridis
*/
public class LinksParseAnalysis {
/**
* the url to analyze
*/
public String url_check;
/**
* the top words, when TF-IDF is used
*/
protected List<String> topWordsTFIDF;
/**
* Method that extracts the content from the urls provided, stores it in a specific index of our Elasticsearch cluster
* and calls the selected semantic analysis algorithm. So far the method extracts content from:
* -html
* -youtube videos
* -pdf files
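* <p>A minimal usage sketch (the paths, query, engine name and LDA settings below are hypothetical example values):</p>
* <pre>{@code
* LinksParseAnalysis lpa = new LinksParseAnalysis();
* String[] links = {"http://example.com/page.html", "http://example.com/paper.pdf"};
* String[] parsed = lpa.perform(links, "example.com", "bing", "/tmp/swebrank/", "example query",
*         10, 0.5, 0.1, 1000, 10, true, false, "/tmp/swebrank/config/");
* }</pre>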
* @param total_links All the links that we are going to analyze
* @param domain The domain that we analyze
* @param engine The search engine whose results we analyze
* @param example_dir The directory where the results of the analysis are saved
* @param quer The query that the urls were results for (used to build the document id in elasticsearch)
* @param nTopics The number of topics for Latent Dirichlet Allocation
* @param alpha The alpha value of LDA
* @param beta The beta value of LDA
* @param niters The number of iterations of LDA
* @param top_words The amount of top words per topic to keep for LDA
* @param LDAflag Flag if LDA is used
* @param TFIDFflag Flag if TFIDF is used
* @param config_path the path that contains the configuration files
* @return the parsed output for each url provided
*/
public String[] perform(String[] total_links, String domain, String engine, String example_dir, String quer, int nTopics, double alpha, double beta, int niters, int top_words, boolean LDAflag, boolean TFIDFflag, String config_path){
String[] parse_output = new String[total_links.length];
try {
System.gc();
WebParser web = new WebParser();//our web parser
APIconn apicon = new APIconn();//our instance to check the connection to a url
int counter_LDA_documents = 0;
Settings settings = ImmutableSettings.settingsBuilder()
.put("cluster.name","lshrankldacluster").build();
Client client = new TransportClient(settings)
.addTransportAddress(new InetSocketTransportAddress("localhost", 9300));
//Node node = nodeBuilder().client(true).clusterName("lshrankldacluster").node();//our elasticsearch node builder
//Client client = node.client();//the client for elasticsearch node
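//we connect as a TransportClient to an external Elasticsearch cluster named "lshrankldacluster" on localhost:9300;
//the commented-out lines above show the earlier embedded-node approach to joining the cluster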
for (int i = 0; i < (total_links.length); i++) {
parse_output[i]="";
if (total_links[i] != null) {
System.out.println("Link: "+total_links[i]+"\n");
DataManipulation dm = new DataManipulation();
boolean structuredFile = dm.StructuredFileCheck(total_links[i]);//we check if the url points to a structured document file type
if (!apicon.check_conn(total_links[i]).contains("ok-conn") || structuredFile || total_links[i].contains("http://www.youtube.com/watch?")) {
if (total_links[i].contains("http://www.youtube.com/watch?")) {//if the link is a youtube link we have to treat its JSON differently
String ventry = total_links[i].substring(31);//strip the "http://www.youtube.com/watch?v=" prefix (31 characters) to keep only the video id
JSONparsing ypr = new JSONparsing();
url_check=total_links[i];
File current_url = new File(example_dir+ engine +"/" + i + "/"+ "current_url.txt");
FileUtils.writeStringToFile(current_url, url_check);
parse_output[i] = ypr.GetYoutubeDetails(ventry).replace("\n", "").replace("\r", "");
System.gc();
if (parse_output[i]!=null) {
counter_LDA_documents++;
String directory = example_dir+ engine + "/" + i + "/";
File file_content_lda = new File(directory + "youtube_content.txt");
FileUtils.writeStringToFile(file_content_lda, parse_output[i]);
}
}
if (total_links[i].contains(".pdf")) {//if the link has a pdf we use Snowtide Pdf reader
url_check=total_links[i];
File current_url = new File(example_dir+ engine +"/" + i + "/"+ "current_url.txt");
FileUtils.writeStringToFile(current_url, url_check);
File current_pdf = new File(example_dir+ engine +"/" + i + "/"+ "current_pdf.txt");
URL URLlink = new URL(url_check);
FileUtils.copyURLToFile(URLlink, current_pdf);
Document pdf = PDF.open(current_pdf);
StringWriter buffer = new StringWriter();
pdf.pipe(new OutputTarget(buffer));
pdf.close();
parse_output[i] = buffer.toString().replace("\n", "").replace("\r", "");
Stopwords stopwords = new Stopwords();
parse_output[i] = stopwords.stop(parse_output[i]);
System.gc();
FileUtils.deleteQuietly(current_pdf);//we delete the temporary pdf file after we read it
if (parse_output[i]!=null) {
counter_LDA_documents++;
String directory = example_dir+ engine + "/" + i + "/";
File file_content_lda = new File(directory + "pdf_content.txt");
FileUtils.writeStringToFile(file_content_lda, parse_output[i]);
}
}
}
else {//if the link does not fall into any of the cases above, we parse it with WebParser
String directory = example_dir + engine + "/" + i + "/";
System.out.println("Link:"+total_links[i]+"\n");
url_check=total_links[i];
File current_url = new File(directory+"current_url.txt");
FileUtils.writeStringToFile(current_url, url_check);
System.gc();
parse_output[i] = web.Parse(url_check);//we call the parser
System.gc();
if (parse_output[i]!=null) {
counter_LDA_documents++;//we count the amount of documents, as it is needed for JGibbLDA as seen in http://jgibblda.sourceforge.net/#2.3._Input_Data_Format
directory = example_dir+ engine + "/" + i + "/";
//write the output from the html parsing
File file_content_lda = new File(directory + "html_parse_content.txt");
FileUtils.writeStringToFile(file_content_lda, parse_output[i]);
}
}
JSONObject obj = new JSONObject();//an object to save the parsed content in elasticsearch
obj.put("ParsedContent", parse_output[i]);
String id = domain + "/" + quer + "/" + engine + "/" + total_links[i];
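//hypothetical example of a resulting id: "example.com/some query/bing/http://example.com/page.html"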
ReadInput ri = new ReadInput();
List<String> elasticIndexes=ri.GetKeyFile(config_path, "elasticSearchIndexes");
IndexRequest indexReq=new IndexRequest(elasticIndexes.get(3),"content",id);
indexReq.source(obj);
IndexResponse indexRes = client.index(indexReq).actionGet();
}
}
//node.close();
client.close();
String output_string_content = Integer.toString(counter_LDA_documents);//the first line of the combined content is the number of documents, as required by the JGibbLDA input format
TwitterAnalysis tw=new TwitterAnalysis();//we are going to gather info from Twitter using Twitter4j
String twitter_txt=tw.perform(quer,config_path);
for (int i = 0; i < parse_output.length; i++) {//we combine all the parsed content into one document
if (parse_output[i]!=null) {
output_string_content = output_string_content + "\n" + parse_output[i];
}
}
if(!(twitter_txt.equalsIgnoreCase("fail"))){
output_string_content = output_string_content + "\n" + twitter_txt;//we add also the twitter content
}
String directory = example_dir + engine + "/";
//call LDA
File file_content_lda = new File(directory + "content_for_analysis.txt");//we are going to save the content also in txt format for backup and usage for LDA
FileUtils.writeStringToFile(file_content_lda, output_string_content);
if(LDAflag){
LDAcall ld = new LDAcall();//we call lda
ld.call(nTopics, alpha, beta, niters, top_words, directory);
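//LDAcall is expected to run LDA (JGibbLDA-style) over content_for_analysis.txt with nTopics topics, the given alpha/beta priors, niters Gibbs iterations, keeping top_words words per topic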
}
else if(TFIDFflag){
TFIDF tf=new TFIDF();//we call TFIDF
topWordsTFIDF=tf.compute(parse_output,top_words,example_dir);
}
return parse_output;
} catch (IOException | ElasticsearchException | ArrayIndexOutOfBoundsException ex) {
Logger.getLogger(LinksParseAnalysis.class.getName()).log(Level.SEVERE, null, ex);
return parse_output;
}
}
/**
* Method to return the top words from TF-IDF
* @return the top words from TF-IDF in a List
*/
public List<String> return_topWordsTFIDF(){return topWordsTFIDF;}
}