package at.chille.crawler.analysis;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import at.chille.crawler.database.model.Header;
import at.chille.crawler.database.model.HostInfo;
import at.chille.crawler.database.model.PageInfo;
public class AnalysisHeader extends Analysis
{
public AnalysisHeader()
{
super();
}
public AnalysisHeader(boolean showDetails)
{
super(showDetails);
}
public AnalysisHeader(long useCrawlingSessionID, boolean showDetails)
{
super(useCrawlingSessionID, showDetails);
}
@Override
public void init()
{
this.name = "Headers (Interactive Analysis)";
this.description = "Interactive Header browsing";
}
private Map<String, Map<String, List<PageInfo>>> availableHeaders = new HashMap<String, Map<String, List<PageInfo>>>();
@Override
public int analyze()
{
Collection<HostInfo> hostInfos = this.getHostsToAnalyze();
// CrawlingSession session = selectCrawlingSession();
availableHeaders.clear();
for (HostInfo host : hostInfos)
{
for (PageInfo page : host.getPages().values())
{
for (Header header : page.getHeaders())
{
if (!availableHeaders.containsKey(header.getName()))
{
availableHeaders.put(header.getName(),
new HashMap<String, List<PageInfo>>());
}
Map<String, List<PageInfo>> availableValues = availableHeaders
.get(header.getName());
if (!availableValues.containsKey(header.getValue()))
{
availableValues.put(header.getValue(),
new ArrayList<PageInfo>());
}
List<PageInfo> pages = availableValues.get(header
.getValue());
pages.add(page);
}
}
}
// Output for each Header-Key all possible Header Values
for (Map.Entry<String, Map<String, List<PageInfo>>> availableHeader : availableHeaders
.entrySet())
{
out.println(availableHeader.getKey());
int total = 0;
for (Map.Entry<String, List<PageInfo>> availableValue : availableHeader
.getValue().entrySet())
{
total += availableValue.getValue().size();
out.println(" -> " + availableValue.getKey() + " ("
+ availableValue.getValue().size() + ")");
if (this.showDetails)
{
for (PageInfo page : availableValue.getValue())
{
out.println(" url: " + page.getUrl());
}
}
}
out.println("Total number of Pages with this Header-Name: " + total);
}
return 0;
}
@Override
public String exportToFolder(String folder)
{
String style = "<link rel=\"stylesheet\" href=\"../style.css\" />";
try
{
File indexFile = new File(folder, "header.html");
FileWriter fw = new FileWriter(indexFile, false);
BufferedWriter index = new BufferedWriter(fw);
index.write("<html><body><h1>Headers & Co</h1><ul>");
index.newLine();
for (Map.Entry<String, Map<String, List<PageInfo>>> availableHeader : availableHeaders
.entrySet())
{
File detailFile = new File(folder, "header_"
+ availableHeader.getKey().replace("/", "-") + ".html");
FileWriter fw2 = new FileWriter(detailFile, false);
BufferedWriter detail = new BufferedWriter(fw2);
detail.write("<html><head>" + style + "</head><body>");
detail.newLine();
detail.write("<h1>Header: " + availableHeader.getKey() + "</h1>");
detail.newLine();
int total = 0;
for (Map.Entry<String, List<PageInfo>> availableValue : availableHeader
.getValue().entrySet())
{
total += availableValue.getValue().size();
detail.write("<span class=\"value\"><h2>" + availableValue.getKey() + " ("
+ availableValue.getValue().size() + ")</h2><ul>");
detail.newLine();
for (PageInfo page : availableValue.getValue())
{
detail.write("<li>" + page.getUrl() + "</li>");
detail.newLine();
}
detail.write("</ul></span>");
}
detail.newLine();
detail.write("Total values for this header: " + total);
detail.newLine();
detail.write("</body></html>");
detail.close();
fw2.close();
index.write("<li><a href=\"" + this.getRelativePath(detailFile, indexFile)
+ "\">" + availableHeader.getKey() + "</a> (" + total
+ ")</li>");
index.newLine();
}
index.write("</ul></body></html>");
index.close();
fw.close();
return indexFile.getCanonicalPath();
}
catch (Exception e)
{
e.printStackTrace();
}
return null;
}
}