package com.matrobot.gha.insights.filter;
import java.io.IOException;
import com.matrobot.gha.archive.repo.RepositoryRecord;
import com.matrobot.gha.insights.ml.Dataset;
import com.matrobot.gha.insights.ml.Sample;
/**
 * Builds a two-feature classification dataset from three repository archives.
 *
 * <p>The archives are addressed by index 0, 1 and 2 and are treated as the
 * previous, current and next period respectively. NOTE(review): this presumes
 * {@code addFromFile} assigns indices in call order - confirm against
 * {@code RepositoryArchiveList}.
 */
public class ClassifyRepositoryFilter {

    /** Push event count that both the previous and the current record must strictly exceed. */
    private static final int MIN_ACTIVITY = 5;

    private final RepositoryArchiveList datasets = new RepositoryArchiveList();

    /**
     * Loads the three archives that samples are derived from.
     *
     * @param firstPath  file later read as dataset 0 (previous period)
     * @param secondPath file later read as dataset 1 (current period)
     * @param thirdPath  file later read as dataset 2 (next period)
     * @throws IOException if any of the files cannot be loaded
     */
    public ClassifyRepositoryFilter(String firstPath, String secondPath, String thirdPath) throws IOException {
        datasets.addFromFile(firstPath);
        datasets.addFromFile(secondPath);
        datasets.addFromFile(thirdPath);
    }

    /**
     * Creates the training data, one sample per sufficiently active repository.
     *
     * <p>A repository from dataset 1 contributes a sample only when it is also
     * found in datasets 0 and 2 and its push event count exceeds
     * {@link #MIN_ACTIVITY} in both dataset 0 and dataset 1.
     *
     * @return dataset holding the generated samples; empty when no repository qualifies
     */
    public Dataset getDataset() {
        Dataset dataset = new Dataset(2);
        for (RepositoryRecord record : datasets.getDataset(1).values()) {
            RepositoryRecord nextRecord = datasets.findRepository(2, record.repoName);
            RepositoryRecord prevRecord = datasets.findRepository(0, record.repoName);

            // A repository missing from a neighbouring archive cannot produce a
            // feature vector or a label; skip it rather than fail the whole run.
            // NOTE(review): assumes findRepository returns null when not found - confirm.
            if (prevRecord == null || nextRecord == null) {
                continue;
            }

            if (record.pushEventCount > MIN_ACTIVITY && prevRecord.pushEventCount > MIN_ACTIVITY) {
                dataset.addSample(createSample(record, nextRecord, prevRecord));
            }
        }
        return dataset;
    }

    /**
     * Builds one sample from a repository's previous, current and next records.
     *
     * <ul>
     *   <li>{@code features[0]}: current push count divided by previous push count, minus 1
     *       (0 when the previous count is not positive)</li>
     *   <li>{@code features[1]}: current committer count minus previous committer count</li>
     *   <li>{@code output}: 1 when the next push count is greater than the current one, else 0</li>
     * </ul>
     *
     * @param record     current period record
     * @param nextRecord next period record, used only for the label
     * @param prevRecord previous period record, used as the baseline
     * @return the populated sample
     */
    private Sample createSample(RepositoryRecord record, RepositoryRecord nextRecord, RepositoryRecord prevRecord) {
        Sample sample = new Sample(2);

        // Widened to double so the ratio below is not an integer division.
        double prevActivity = prevRecord.pushEventCount;
        if (prevActivity > 0) {
            sample.features[0] = record.pushEventCount / prevActivity - 1;
        } else {
            // Avoid dividing by zero; report "no change" instead.
            sample.features[0] = 0;
        }

        sample.features[1] = record.committers.size() - prevRecord.committers.size();
        sample.output = (nextRecord.pushEventCount > record.pushEventCount) ? 1 : 0;
        return sample;
    }
}