package focusedCrawler.integration;
import static org.hamcrest.CoreMatchers.is;
import static org.junit.Assert.assertThat;
import java.io.File;
import java.net.URL;
import java.net.URLDecoder;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Paths;
import org.junit.Rule;
import org.junit.Test;
import org.junit.rules.TemporaryFolder;
import focusedCrawler.Main;
import focusedCrawler.target.classifier.TargetClassifier;
import focusedCrawler.target.classifier.TargetClassifierFactory;
import focusedCrawler.target.model.Page;
/**
 * Integration test for the {@code buildModel} command: trains a page classifier
 * from the {@code build_model_test} resource folder, loads the resulting model,
 * and checks that one sample from each of the {@code positive} and
 * {@code negative} training folders is classified as expected.
 */
public class BuildModelTest {

    /** Scratch directory for the trained model; managed by the JUnit rule. */
    @Rule
    public TemporaryFolder tempFolder = new TemporaryFolder();

    /**
     * Trains a model via {@link Main#main(String[])}, loads it through
     * {@link TargetClassifierFactory}, and verifies that a positive training
     * page is classified as relevant and a negative one as not relevant.
     *
     * @throws Exception if training, model loading, or reading samples fails
     */
    @Test
    public void wekaFeaturesFileShouldBeGeneratedInTheProperFormat() throws Exception {
        URL trainingResource = BuildModelTest.class.getResource("build_model_test");
        assertThat("Test resource 'build_model_test' was not found next to BuildModelTest",
                trainingResource != null, is(true));

        // Convert through a URI rather than URL.getFile(): getFile() returns the
        // URL-encoded path (e.g. "%20" for spaces), which is not a valid file path.
        String trainingPath = Paths.get(trainingResource.toURI()).toString();
        String modelPath = tempFolder.newFolder().toString();

        // Train a page classifier model
        String[] args = {"buildModel", "-t", trainingPath, "-o", modelPath};
        Main.main(args);

        // Load model trained
        TargetClassifier tc = TargetClassifierFactory.create(modelPath);

        // Classify one example from training data just for sanity check
        Page samplePositivePage = readOnePageFromFolder(Paths.get(trainingPath, "positive").toString());
        Page sampleNegativePage = readOnePageFromFolder(Paths.get(trainingPath, "negative").toString());

        assertThat(tc.classify(samplePositivePage).isRelevant(), is(true));
        assertThat(tc.classify(sampleNegativePage).isRelevant(), is(false));
    }

    /**
     * Reads one sample page from a training folder.
     *
     * <p>The folder is expected to contain exactly six files. The name of the
     * chosen file is URL-decoded and used as the page URL, and the file content
     * (decoded as UTF-8) is used as the page content.
     *
     * @param folder path of the folder holding the training pages
     * @return a {@link Page} built from the first file listed in the folder
     * @throws Exception if the file cannot be read or its decoded name is not a valid URL
     */
    private Page readOnePageFromFolder(String folder) throws Exception {
        File[] pageFiles = new File(folder).listFiles();

        // listFiles() returns null when the path is missing or is not a directory;
        // fail with a clear message instead of a NullPointerException.
        assertThat("Training folder is missing or not a directory: " + folder,
                pageFiles != null, is(true));
        assertThat(pageFiles.length, is(6));

        // NOTE: listFiles() gives no ordering guarantee, so "first" is whichever
        // file the file system lists first.
        File sampleFile = pageFiles[0];

        // Decode explicitly as UTF-8 so the result does not depend on the platform charset.
        String fileContent = new String(Files.readAllBytes(sampleFile.toPath()), StandardCharsets.UTF_8);

        // The file name is the URL-encoded address of the page.
        URL pageUrl = new URL(URLDecoder.decode(sampleFile.getName(), "UTF-8"));
        return new Page(pageUrl, fileContent);
    }
}