package focusedCrawler.target;
import java.io.BufferedOutputStream;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.PrintWriter;
import java.util.HashSet;
import java.util.Locale;
import java.util.Scanner;
import focusedCrawler.target.model.Page;
/**
 * Appends per-page crawl metrics to tab-separated files under
 * {@code <dataPath>/data_monitor/}.
 *
 * <p>Four files are written, all opened in append mode with auto-flush:
 * <ul>
 *   <li>{@code crawledpages.csv} - {@code url \t timestamp}</li>
 *   <li>{@code relevantpages.csv} - {@code url \t probability \t timestamp}</li>
 *   <li>{@code nonrelevantpages.csv} - {@code url \t probability \t timestamp}</li>
 *   <li>{@code harvestinfo.csv} - {@code onTopicPages \t totalPages \t timestamp}</li>
 * </ul>
 *
 * <p>Counter updates and reads go through the instance monitor
 * ({@link #countPage} and {@link #getTotalOfPages} are synchronized).
 */
public class TargetStorageMonitor {

    /** Sub-directory (relative to the data path) holding all monitor files. */
    private static final String MONITOR_DIR = "/data_monitor/";

    /** Name of the file listing pages that were classified as relevant. */
    private static final String RELEVANT_PAGES_FILE = "relevantpages.csv";

    private PrintWriter fCrawledPages;
    private PrintWriter fRelevantPages;
    private PrintWriter fNonRelevantPages;
    private PrintWriter fHarvestInfo;

    int totalOnTopicPages = 0;
    private int totalOfPages = 0;

    /**
     * Creates the monitor directory if needed and opens all metric files for appending.
     *
     * @param dataPath base directory under which {@code data_monitor/} is created
     * @throws IllegalStateException if any of the metric files cannot be opened; writers
     *         that were already opened are closed before the exception propagates
     */
    public TargetStorageMonitor(String dataPath) {
        String monitorDir = dataPath + MONITOR_DIR;
        File directory = new File(monitorDir);
        if (!directory.exists()) {
            // A failure to create the directory surfaces below when the files are opened.
            directory.mkdirs();
        }
        try {
            fCrawledPages = createBufferedWriter(monitorDir + "crawledpages.csv");
            fRelevantPages = createBufferedWriter(monitorDir + RELEVANT_PAGES_FILE);
            fHarvestInfo = createBufferedWriter(monitorDir + "harvestinfo.csv");
            fNonRelevantPages = createBufferedWriter(monitorDir + "nonrelevantpages.csv");
        } catch (Exception e) {
            // Do not leak the file handles of the writers opened before the failure.
            closeWriters();
            throw new IllegalStateException("Problem while opening files to export target metrics", e);
        }
    }

    /**
     * Opens {@code file} in append mode behind a buffered, auto-flushing writer.
     *
     * @param file path of the file to open (created if it does not exist)
     * @return a writer that flushes after every {@code printf}/{@code println}
     * @throws FileNotFoundException if the file cannot be created or opened for writing
     */
    private PrintWriter createBufferedWriter(String file) throws FileNotFoundException {
        boolean append = true;
        BufferedOutputStream bos = new BufferedOutputStream(new FileOutputStream(file, append));
        boolean autoFlush = true;
        return new PrintWriter(bos, autoFlush);
    }

    /**
     * Records one crawled page in the metric files and updates the counters.
     *
     * <p>Both counters are updated before the harvest line is written, so each harvest
     * row reflects the state after the page was counted. Numbers are formatted with
     * {@link Locale#ROOT} so the output does not depend on the JVM's default locale.
     *
     * @param page       the crawled page; its URL is written to the files
     * @param isRelevant whether the page was classified as relevant (on-topic)
     * @param prob       value written, with 10 decimals, next to the URL in the
     *                   relevant / non-relevant file
     */
    public synchronized void countPage(Page page, boolean isRelevant, double prob) {
        long currentTime = System.currentTimeMillis();
        String url = page.getURL().toString();

        totalOfPages++;
        if (isRelevant) {
            totalOnTopicPages++;
        }

        fCrawledPages.printf(Locale.ROOT, "%s\t%d\n", url, currentTime);
        fHarvestInfo.printf(Locale.ROOT, "%d\t%d\t%d\n", totalOnTopicPages, totalOfPages, currentTime);

        PrintWriter classifiedPages = isRelevant ? fRelevantPages : fNonRelevantPages;
        classifiedPages.printf(Locale.ROOT, "%s\t%.10f\t%d\n", url, prob, currentTime);
    }

    /**
     * Returns the number of pages counted so far by this instance.
     *
     * @return total number of calls to {@link #countPage} on this instance
     */
    public synchronized int getTotalOfPages() {
        return totalOfPages;
    }

    /**
     * Reads the URLs stored in {@code <dataPath>/data_monitor/relevantpages.csv}.
     *
     * <p>Only lines with exactly three tab-separated fields are considered; the first
     * field is taken as the URL. Other lines (including blank ones) are ignored.
     *
     * @param dataPath base directory that contains {@code data_monitor/}
     * @return the set of distinct URLs found in the file
     * @throws RuntimeException if the file does not exist or cannot be opened; the
     *         underlying {@link FileNotFoundException} is attached as the cause
     */
    public static HashSet<String> readRelevantUrls(String dataPath) {
        String fileRelevantPages = dataPath + MONITOR_DIR + RELEVANT_PAGES_FILE;
        HashSet<String> relevantUrls = new HashSet<>();
        try (Scanner scanner = new Scanner(new File(fileRelevantPages))) {
            while (scanner.hasNextLine()) {
                String[] fields = scanner.nextLine().split("\t");
                if (fields.length == 3) {
                    relevantUrls.add(fields[0]);
                }
            }
            return relevantUrls;
        } catch (FileNotFoundException e) {
            throw new RuntimeException(
                    "Failed to load relevant URL from target monitor file: " + fileRelevantPages, e);
        }
    }

    /** Closes all metric files. */
    public void close() {
        closeWriters();
    }

    /** Closes every writer that has been opened; tolerates writers that are still null. */
    private void closeWriters() {
        closeIfOpen(fCrawledPages);
        closeIfOpen(fHarvestInfo);
        closeIfOpen(fNonRelevantPages);
        closeIfOpen(fRelevantPages);
    }

    /** Closes {@code writer} unless it is null (i.e. it was never opened). */
    private static void closeIfOpen(PrintWriter writer) {
        if (writer != null) {
            writer.close();
        }
    }
}