package eu.europeana.creative.dataset.culturecam.v1.analysis;

import it.cnr.isti.feature.extraction.FeatureExtractionException;
import it.cnr.isti.feature.extraction.Image2Features;
import it.cnr.isti.vir.features.FeaturesCollectorArr;
import it.cnr.isti.vir.features.IFeaturesCollector;
import it.cnr.isti.vir.file.FeaturesCollectorsArchive;
import it.cnr.isti.vir.id.IDString;
import it.cnr.isti.vir.id.IHasID;
import it.cnr.isti.vir.readers.CoPhIRv2Reader;
import it.cnr.isti.vir.similarity.knn.IntDoubleString;
import it.cnr.isti.vir.similarity.metric.LireMetric;
import it.cnr.isti.vir.similarity.metric.Metric;
import it.cnr.isti.vir.util.Pivots;

import java.io.BufferedReader;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.List;
import java.util.Set;
import java.util.SortedSet;
import java.util.TreeSet;

import org.apache.log4j.Logger;

import eu.europeana.service.ir.image.IRConfiguration;
import eu.europeana.service.ir.image.IRConfigurationImpl;
import eu.europeana.service.ir.image.exceptions.TechnicalRuntimeException;

/**
 * Extracts image features for the thumbnails of a dataset, stores them in a
 * features archive and orders the dataset items by their distances to the
 * other items, as measured by the supplied {@link Metric}.
 *
 * <p>
 * Typical call order, as required by the code below: {@link #init()} creates
 * the feature extractor, {@link #extractDatasetFeatures(Set)} opens and fills
 * the archive, and the {@code generateOrder*} methods read the archived
 * features back.
 *
 * @param <F>
 *            type argument of the metric used to compare features
 */
public class DatasetAnalyzer<F> {

	/**
	 * Orders {@link IntDoubleString} entries by the reverse of their natural
	 * ordering.
	 * <p>
	 * NOTE(review): entries that the natural ordering reports as equal are
	 * treated as duplicates by the {@link TreeSet}s built with this comparator;
	 * confirm that {@code IntDoubleString.compareTo} breaks ties.
	 */
	private static final class DistanceIdComparator implements
			Comparator<IntDoubleString> {
		@Override
		public int compare(IntDoubleString o1, IntDoubleString o2) {
			return o2.compareTo(o1);
		}
	}

	/** Thumbnail files of exactly this length are skipped as placeholders. */
	private static final int PLACEHOLDER_SIZE = 3583;

	/**
	 * An existing archive file longer than this is not re-extracted (unless a
	 * reset was requested). NOTE(review): presumably the size of an empty
	 * archive header - confirm.
	 */
	private static final long MIN_POPULATED_ARCHIVE_LENGTH = 50;

	/** Upper bound on the matrix size used for the second debug average. */
	private static final int INTER_DIST_SAMPLE_SIZE = 50;

	private final Logger log = Logger.getLogger(getClass());

	private IRConfiguration configuration;
	private String dataset = null;
	private Image2Features img2ftx;
	private File datasetFCArchiveFile;
	boolean resetFeaturesArchive = false;

	FeaturesCollectorsArchive datasetFCArchive;
	List<IFeaturesCollector> datasetFeatures;
	private final Metric<F> comp;

	/**
	 * Creates an analyzer that lazily builds a default configuration.
	 *
	 * @param dataset
	 *            name of the dataset to analyze
	 * @param comp
	 *            metric used to compute distances between features
	 * @param resetFeaturesArchive
	 *            whether an existing features archive file must be deleted and
	 *            rebuilt
	 */
	public DatasetAnalyzer(String dataset, Metric<F> comp,
			boolean resetFeaturesArchive) {
		this(null, dataset, comp, resetFeaturesArchive);
	}

	/**
	 * Creates an analyzer.
	 *
	 * @param configuration
	 *            configuration to use; when {@code null} an
	 *            {@link IRConfigurationImpl} is created on first access
	 * @param dataset
	 *            name of the dataset to analyze
	 * @param comp
	 *            metric used to compute distances between features
	 * @param resetFeaturesArchive
	 *            whether an existing features archive file must be deleted and
	 *            rebuilt
	 */
	public DatasetAnalyzer(IRConfiguration configuration, String dataset,
			Metric<F> comp, boolean resetFeaturesArchive) {
		this.configuration = configuration;
		this.dataset = dataset;
		this.comp = comp;
		this.resetFeaturesArchive = resetFeaturesArchive;
	}

	/**
	 * Returns the configuration, creating a default {@link IRConfigurationImpl}
	 * when none was supplied.
	 *
	 * @return the configuration, never {@code null}
	 */
	public IRConfiguration getConfiguration() {
		if (configuration == null) {
			configuration = new IRConfigurationImpl();
		}
		return configuration;
	}

	/**
	 * Ensures the configuration exists and creates the feature extractor if it
	 * has not been created yet.
	 *
	 * @throws TechnicalRuntimeException
	 *             if the feature extractor cannot be instantiated
	 */
	public void init() {
		// ensure initialization of configuration attribute
		getConfiguration();

		try {
			if (img2ftx == null) {
				img2ftx = new Image2Features(getConfiguration()
						.getIndexConfFolder(getDataset()));
			}
		} catch (Exception e) {
			throw new TechnicalRuntimeException(
					"Cannot instantiate feature extractor!", e);
		}
	}

	/**
	 * Opens the features archive using the reset flag given at construction.
	 */
	protected void initDatasetFCArchive() {
		initDatasetFCArchive(resetFeaturesArchive);
	}

	/**
	 * Opens the features archive, creating its parent folder when the archive
	 * file does not exist and deleting the file first when a reset is
	 * requested.
	 *
	 * @param resetFile
	 *            whether an existing archive file must be deleted before the
	 *            archive is opened
	 * @throws TechnicalRuntimeException
	 *             if the archive cannot be instantiated
	 */
	protected void initDatasetFCArchive(boolean resetFile) {
		File archiveFile = getDatasetFCArchiveFile();

		if (!archiveFile.exists()) {
			// create file path if needed
			File parent = archiveFile.getParentFile();
			if (parent != null && !parent.exists() && !parent.mkdirs()) {
				log.warn("Cannot create folder for features archive: "
						+ parent.getAbsolutePath());
			}
		} else if (resetFile) {
			if (!archiveFile.delete()) {
				log.warn("Cannot delete existing features archive: "
						+ archiveFile.getAbsolutePath());
			}
		}

		try {
			datasetFCArchive = new FeaturesCollectorsArchive(archiveFile,
					new LireMetric().getRequestedFeaturesClasses(),
					IDString.class, FeaturesCollectorArr.class);
		} catch (Exception e) {
			throw new TechnicalRuntimeException(
					"Cannot instantiate (pivots) feature collection archive!",
					e);
		}
	}

	/**
	 * @return the name of the analyzed dataset
	 */
	public String getDataset() {
		return dataset;
	}

	/**
	 * @param dataset
	 *            the name of the dataset to analyze
	 */
	public void setDataset(String dataset) {
		this.dataset = dataset;
	}

	/**
	 * Returns the features archive file, resolving it from the configuration
	 * on first access and caching it afterwards.
	 *
	 * @return the features archive file of the dataset
	 */
	public File getDatasetFCArchiveFile() {
		if (datasetFCArchiveFile == null) {
			datasetFCArchiveFile = getConfiguration().getFeaturesArchiveFile(
					getDataset());
		}
		return datasetFCArchiveFile;
	}

	/**
	 * @return the features archive, or {@code null} if it has not been opened
	 *         yet
	 */
	public FeaturesCollectorsArchive getDatasetFCArchive() {
		return datasetFCArchive;
	}

	/**
	 * Extracts the features of the given thumbnails and stores them in the
	 * features archive, then closes the archive.
	 * <p>
	 * Extraction is skipped when no reset was requested and an archive file
	 * longer than {@value #MIN_POPULATED_ARCHIVE_LENGTH} bytes already exists.
	 * Thumbnails whose file is exactly {@value #PLACEHOLDER_SIZE} bytes long
	 * are skipped as placeholders.
	 *
	 * @param thumbnailIds
	 *            identifiers of the thumbnails to process
	 * @throws FeatureExtractionException
	 *             if reading a thumbnail, extracting its features or writing
	 *             the archive fails
	 */
	public void extractDatasetFeatures(Set<String> thumbnailIds)
			throws FeatureExtractionException {

		// init file
		initDatasetFCArchive();

		// TODO: move resetFeaturesArchive from constructor to this method
		File archiveFile = getDatasetFCArchiveFile();
		if (!resetFeaturesArchive && archiveFile.exists()
				&& archiveFile.length() > MIN_POPULATED_ARCHIVE_LENGTH) {
			return; // skip feature extraction
		}

		int cnt = 0;
		try {
			for (String thumbnailId : thumbnailIds) {
				log.debug("extracting features for item with ID: "
						+ thumbnailId);

				File thumbnailFile = getConfiguration().getImageFile(
						getDataset(), thumbnailId);
				if (thumbnailFile.length() == PLACEHOLDER_SIZE) {
					log.debug("Skip placeholder thumbnail: "
							+ thumbnailFile.getAbsolutePath());
					continue;
				}

				storeThumbnailFeatures(thumbnailId, thumbnailFile);

				cnt++;
				if (cnt % 1000 == 0) {
					log.debug("Features extracted for #pivots: " + cnt);
				}
			}

			// write index files and close
			getDatasetFCArchive().close();

		} catch (Exception e) {
			throw new FeatureExtractionException(
					"Cannot write pivot Features Archives!", e);
		}
	}

	/**
	 * Opens the thumbnail file, stores its features and always closes the file
	 * stream afterwards.
	 */
	private void storeThumbnailFeatures(String thumbnailId, File thumbnailFile)
			throws IOException, FeatureExtractionException {
		InputStream imageStream = new FileInputStream(thumbnailFile);
		try {
			storeImageFeatures(thumbnailId, imageStream);
		} finally {
			try {
				imageStream.close();
			} catch (IOException e) {
				// do not let a close failure hide an extraction failure
				log.warn("Cannot close thumbnail file: "
						+ thumbnailFile.getAbsolutePath(), e);
			}
		}
	}

	/**
	 * Orders the given thumbnails by the sum of the absolute distances between
	 * each thumbnail and all given thumbnails.
	 *
	 * @param thumbnailIds
	 *            identifiers of the thumbnails to order; each must be present
	 *            in the features archive
	 * @return entries holding the position of the thumbnail in the iteration
	 *         order of {@code thumbnailIds}, its distance sum and its
	 *         identifier, sorted by {@link DistanceIdComparator}
	 */
	public SortedSet<IntDoubleString> generateOrder(Set<String> thumbnailIds) {
		List<String> thumbnails = new ArrayList<String>(thumbnailIds);

		double[][] interDist = evalInterDistances(thumbnails);
		TreeSet<IntDoubleString> res = new TreeSet<IntDoubleString>(
				new DistanceIdComparator());

		logInterDistAverages(interDist);

		for (int i = 0; i < interDist.length; i++) {
			res.add(new IntDoubleString(i, absSum(interDist[i]), thumbnails
					.get(i)));
		}

		return res;
	}

	/**
	 * Orders all archived items by their average distance to all archived
	 * items, computing one row of distances at a time instead of the full
	 * distance matrix.
	 *
	 * @return entries holding the index of the item in the archived features
	 *         list, its average distance and its identifier, sorted by
	 *         {@link DistanceIdComparator}
	 */
	public SortedSet<IntDoubleString> generateOrderNoInterDist() {
		TreeSet<IntDoubleString> res = new TreeSet<IntDoubleString>(
				new DistanceIdComparator());

		List<IFeaturesCollector> allFeatures = getDatasetFeatures();
		final double datasetSize = (double) allFeatures.size();

		int i = 0;
		for (IFeaturesCollector features : allFeatures) {
			String stringId = ((FeaturesCollectorArr) features).getID()
					.toString();
			double avgDistance = evalDistSum(features) / datasetSize;
			res.add(new IntDoubleString(i, avgDistance, stringId));
			log.debug("adding item to set: " + stringId + " dist: "
					+ avgDistance);
			i++;
		}

		return res;
	}

	/**
	 * Sums the distances between the given features and every archived item.
	 */
	private double evalDistSum(IFeaturesCollector queryfeatures) {
		double sum = 0;
		for (double distance : evalDistances(queryfeatures)) {
			sum += distance;
		}
		return sum;
	}

	/**
	 * Orders all archived items by the sum of the absolute distances between
	 * each item and all archived items, using the full distance matrix.
	 *
	 * @return entries holding the index of the item in the archived features
	 *         list, its distance sum and its identifier, sorted by
	 *         {@link DistanceIdComparator}
	 */
	public SortedSet<IntDoubleString> generateOrderWithInterDist() {
		double[][] interDist = evalInterDistances();
		TreeSet<IntDoubleString> res = new TreeSet<IntDoubleString>(
				new DistanceIdComparator());

		logInterDistAverages(interDist);

		for (int i = 0; i < interDist.length; i++) {
			String stringId = ((IHasID) getDatasetFeatures().get(i)).getID()
					.toString();
			res.add(new IntDoubleString(i, absSum(interDist[i]), stringId));
		}

		return res;
	}

	/**
	 * Logs the averages reported by {@link Pivots} for the whole matrix and
	 * for at most its first {@value #INTER_DIST_SAMPLE_SIZE} entries.
	 */
	private void logInterDistAverages(double[][] interDist) {
		log.debug("Avg inter-dist before ordering: "
				+ Pivots.getTrMatrixAvg(interDist));
		log.debug("Avg inter-dist before ordering(50): "
				+ Pivots.getTrMatrixAvg(interDist,
						Math.min(interDist.length, INTER_DIST_SAMPLE_SIZE)));
	}

	/** Sums the absolute values of the given row. */
	private static double absSum(double[] row) {
		double sum = 0;
		for (double value : row) {
			sum += Math.abs(value);
		}
		return sum;
	}

	/**
	 * Computes the full distance matrix between the archived items with the
	 * given identifiers.
	 *
	 * @param ids
	 *            identifiers of the items to compare
	 * @return a square matrix where cell {@code [i][j]} is the distance
	 *         between the items identified by {@code ids.get(i)} and
	 *         {@code ids.get(j)}
	 * @throws RuntimeException
	 *             if an identifier has no features in the archive
	 */
	protected final double[][] evalInterDistances(List<String> ids) {
		getDatasetFeatures();

		final int size = ids.size();

		// resolve each id once; the lookup is a linear scan of the archive
		List<IFeaturesCollector> selected = new ArrayList<IFeaturesCollector>(
				size);
		for (String id : ids) {
			selected.add(getFeaturesForId(id));
		}

		double[][] distances = new double[size][size];
		for (int i = 0; i < size; i++) {
			for (int j = 0; j < size; j++) {
				distances[i][j] = comp.distance(selected.get(i),
						selected.get(j));
			}
		}

		return distances;
	}

	/**
	 * Computes the distances between the archived item with the given
	 * identifier and every archived item.
	 *
	 * @param id
	 *            identifier of the query item
	 * @return one distance per archived item, in archive list order
	 * @throws RuntimeException
	 *             if the identifier has no features in the archive
	 */
	protected final double[] evalDistances(String id) {
		getDatasetFeatures();
		final IFeaturesCollector features = getFeaturesForId(id);
		return evalDistances(features);
	}

	/**
	 * Computes the distances between the given features and every archived
	 * item.
	 *
	 * @param features
	 *            features of the query item
	 * @return one distance per archived item, in archive list order
	 */
	protected double[] evalDistances(final IFeaturesCollector features) {
		List<IFeaturesCollector> allFeatures = getDatasetFeatures();
		double[] distances = new double[allFeatures.size()];

		for (int j = 0; j < distances.length; j++) {
			distances[j] = comp.distance(features, allFeatures.get(j));
		}

		return distances;
	}

	/**
	 * Finds the first archived features collector whose identifier equals the
	 * given one.
	 *
	 * @param id
	 *            identifier to look up
	 * @return the matching features collector
	 * @throws RuntimeException
	 *             if no archived item has the given identifier
	 */
	protected IFeaturesCollector getFeaturesForId(String id) {
		for (IFeaturesCollector features : getDatasetFeatures()) {
			if (id.equals(((IHasID) features).getID().toString())) {
				return features;
			}
		}

		throw new RuntimeException(
				"Cannot find features colector for thumbnailId: " + id);
	}

	/**
	 * Computes the full distance matrix between all archived items.
	 *
	 * @return a square matrix where cell {@code [i][j]} is the distance
	 *         between the archived items at list positions {@code i} and
	 *         {@code j}
	 */
	protected final double[][] evalInterDistances() {
		List<IFeaturesCollector> allFeatures = getDatasetFeatures();
		final int size = allFeatures.size();

		double[][] distances = new double[size][size];
		for (int i = 0; i < size; i++) {
			for (int j = 0; j < size; j++) {
				distances[i][j] = comp.distance(allFeatures.get(i),
						allFeatures.get(j));
			}
			// progress trace, one message per completed row
			log.debug("i=" + i);
		}

		return distances;
	}

	/**
	 * Returns all features stored in the archive, loading them on first access
	 * and caching the list afterwards.
	 *
	 * @return the archived features
	 * @throws TechnicalRuntimeException
	 *             if the features cannot be read, including when the archive
	 *             has not been opened
	 */
	protected List<IFeaturesCollector> getDatasetFeatures() {
		if (datasetFeatures == null) {
			try {
				datasetFeatures = getDatasetFCArchive().getAll();
			} catch (Exception e) {
				throw new TechnicalRuntimeException(
						"Cannot get features from subset archive", e);
			}
		}
		return datasetFeatures;
	}

	/**
	 * Extracts the features of the given image and stores them in the archive
	 * under the given identifier. The stream is not closed by this method.
	 *
	 * @param thumbnailId
	 *            identifier to store the features under
	 * @param imageObj
	 *            stream providing the image content
	 * @throws FeatureExtractionException
	 *             if the features cannot be extracted or stored
	 */
	protected void storeImageFeatures(String thumbnailId, InputStream imageObj)
			throws FeatureExtractionException {
		String imgFeatures = img2ftx.extractFeatures(imageObj);
		storeImageFeatures(thumbnailId, imgFeatures);
	}

	/**
	 * Parses the textual features representation and adds the resulting
	 * features collector to the archive under the given identifier.
	 *
	 * @param docID
	 *            identifier to store the features under
	 * @param imgFeatures
	 *            textual features as produced by the feature extractor
	 * @throws FeatureExtractionException
	 *             if the features cannot be parsed or stored
	 */
	protected void storeImageFeatures(String docID, String imgFeatures)
			throws FeatureExtractionException {

		BufferedReader br = null;
		try {
			// read the text directly; no byte round-trip through the platform
			// charset
			br = new BufferedReader(new StringReader(imgFeatures));
			FeaturesCollectorArr features = CoPhIRv2Reader.getObj(br);
			features.setID(new IDString(docID));
			getDatasetFCArchive().add(features);

		} catch (Exception e) {
			throw new FeatureExtractionException(
					"Cannot store pivot features: " + docID, e);
		} finally {
			if (br != null) {
				try {
					br.close();
				} catch (IOException e) {
					// this exception should not occur
					// if it occurs nothing harmful should occur
					log.warn("warning: exception occured when closing buffered reader of image features for image "
							+ docID
							+ "\nError message"
							+ e.getLocalizedMessage());
				}
			}
		}
	}
}