package eu.europeana.creative.dataset.culturecam.bl;

import it.cnr.isti.feature.extraction.FeatureExtractionException;
import it.cnr.isti.vir.features.IFeaturesCollector;
import it.cnr.isti.vir.features.mpeg7.imageanalysis.ScalableColorPlusImpl.ImageType;
import it.cnr.isti.vir.similarity.knn.IntDoubleString;
import it.cnr.isti.vir.similarity.metric.LireMetric;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.util.Arrays;
import java.util.List;
import java.util.Map;
import java.util.SortedSet;

import org.junit.Test;

import eu.europeana.api.client.dataset.DatasetDescriptor;
import eu.europeana.api.client.exception.TechnicalRuntimeException;
import eu.europeana.api.client.thumbnails.processing.LargeThumbnailsetProcessing;
import eu.europeana.creative.dataset.culturecam.bl.analysis.SubsetAnalyserImpl;
import eu.europeana.creative.dataset.pt.classification.GrayScaleSepiaDetector;

public class BLSetAnalysisTest extends BaseBlTest {

	@Test
	public void performDatasetAggregation() throws IOException {
		String datasetName = "culturecam";
		setDataset(datasetName);

		File cvsFolder = new File(getBlCvsFolder(STEP_FILTERED_ORDERED));
		File[] collectionFiles = cvsFolder.listFiles();

		File outFolder = new File(getBlCvsFolder(STEP_AGGREGATED));
		File outFile = new File(outFolder, "BL_FLickr.csv");

		BufferedReader reader = null;
		String line = null;

		outFile.getParentFile().mkdirs();
		BufferedWriter datasetWriter = new BufferedWriter(new FileWriter(outFile));
		log.debug("Aggregating dataset: " + getDataset());

		for (int i = 0; i < collectionFiles.length; i++) {
			reader = new BufferedReader(new FileReader(collectionFiles[i]));
			boolean firstLine = true;
			while ((line = reader.readLine()) != null) {
				// log the header line of each collection file
				if (firstLine) {
					log.debug("Writing dataset header line: " + line);
					firstLine = false;
				}
				// append all data to the aggregated dataset file
				datasetWriter.write(line);
				datasetWriter.write("\n");
			}
			datasetWriter.flush();

			// close reader
			try {
				reader.close();
			} catch (IOException e) {
				System.out.println("cannot close reader for: " + collectionFiles[i]);
				e.printStackTrace();
			}
		}

		log.trace("Closing dataset file");
		datasetWriter.close();
	}

	//@Test
	public void analyseBlSets() throws Exception {
		for (String setId : blSets.keySet()) {
			log.debug("Analyzing set: " + setId);
			analyseBlSet(setId);
			// break;
		}
	}

	private void analyseBlSet(String setId) throws FileNotFoundException,
			IOException, FeatureExtractionException {
		String datasetName = "culturecam";
		setDataset(datasetName);
		final String subsetName = blSets.get(setId);
		DatasetDescriptor datasetDescriptor = new DatasetDescriptor(subsetName, setId);
		blockSize = 1000;

		String thumbnailsFile = getCvsFileForStep(datasetDescriptor, STEP_THUMBNAILS);
		File thumbnailsCvsFile = new File(thumbnailsFile);

		// order by average distance, descending
		SortedSet<IntDoubleString> order = generateOrder(datasetName, subsetName,
				datasetDescriptor, thumbnailsCvsFile);
		if (order.isEmpty()) {
			log.warn("No items in dataset. Processing stopped for dataset: " + datasetName);
			return;
		}

		// categorize by colorfulness
		File categorizationFile = categorizeThumbnails(datasetDescriptor, thumbnailsCvsFile);
		Map<String, String> categorizedMap = readThumbnailsMap(categorizationFile);

		// filter definition: grayscale images are excluded from the final subset
		String[] filter = new String[] { ImageType.GRAYSCALE.name() };
		List<String> filterOut = Arrays.asList(filter);

		// generate the filtered, ordered subset
		File filteredOrderedFile = writeFilteredOrderedSubset(datasetDescriptor, order,
				categorizedMap, filterOut);

		// read back the filtered, ordered map
		Map<String, String> filteredOrderedMap = readThumbnailsMap(filteredOrderedFile);
		log.debug(datasetDescriptor.toString() + " : " + filteredOrderedMap.size());
	}

	private File writeFilteredOrderedSubset(DatasetDescriptor datasetDescriptor,
			SortedSet<IntDoubleString> order, Map<String, String> categorizedMap,
			List<String> filterOut) {
		String filteredOrderedFileName = getCvsFileForStep(datasetDescriptor,
				STEP_FILTERED_ORDERED);
		File filteredOrderedFile = new File(filteredOrderedFileName);
		try {
			// create parent dirs
			filteredOrderedFile.getParentFile().mkdirs();
			log.warn("Existing files will be overwritten! " + filteredOrderedFile);
			BufferedWriter writer = new BufferedWriter(new FileWriter(filteredOrderedFile));
			writeCvsFileHeader(writer, datasetDescriptor.getImageSetName(), -1,
					datasetDescriptor.getClassifications());

			int count = 0;
			String thumbnailId;
			String category;
			String thumbnailData;
			for (IntDoubleString intDoubleString : order) {
				thumbnailId = intDoubleString.getStringId();
				thumbnailData = categorizedMap.get(thumbnailId);
				// the category is the last column of the classification CSV line
				category = thumbnailData.substring(thumbnailData.lastIndexOf(';') + 1);
				if (filterOut.contains(category.trim())) {
					log.debug("Filter out: " + thumbnailData + " : " + category);
					continue;
				}
				writer.write(thumbnailId);
				writer.write(";");
				writer.write(thumbnailData);
				writer.write(";");
				writer.write(intDoubleString.getDistance().toString());
				writer.write(";");
				writer.write("\n");
				count++;
				if (count % 1000 == 0)
					writer.flush();
			}
			writer.flush();
			writer.close();
		} catch (Exception e) {
			throw new TechnicalRuntimeException("cannot write csv file");
		}
		return filteredOrderedFile;
	}

	protected SortedSet<IntDoubleString> generateOrder(String datasetName,
			final String subsetName, DatasetDescriptor datasetDescriptor,
			File thumbnailsCvsFile) throws FileNotFoundException, IOException,
			FeatureExtractionException {
		Map<String, String> subsetMap = readThumbnailsMap(thumbnailsCvsFile);

		SubsetAnalyserImpl<IFeaturesCollector> analyser = new SubsetAnalyserImpl<IFeaturesCollector>(
				datasetName, subsetName, new LireMetric(), false);
		analyser.init();
		analyser.extractSubsetFeatures(subsetMap.keySet());
		SortedSet<IntDoubleString> order = analyser.generateOrder();

		String orderedThumbnailsFile = getCvsFileForStep(datasetDescriptor, STEP_ORDERED);
		File outFile = new File(orderedThumbnailsFile);
		writeOrderedSubset(datasetDescriptor, subsetMap, order, outFile);

		return order;
	}

	public File categorizeThumbnails(DatasetDescriptor datasetDescriptor,
			File thumbnailsFile) throws FileNotFoundException, IOException {
		String outFile = getCvsFileForStep(datasetDescriptor, STEP_CLASSIFIED);
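		// Process the thumbnail set in blocks; the GrayScaleSepiaDetector observer
		// inspects each image and writes its color classification to the output CSV.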
		LargeThumbnailsetProcessing datasetCategorization = new LargeThumbnailsetProcessing(
				thumbnailsFile);
		// String imageFolder = getConfiguration().getImageFolder(getDataset());
		String imageFolder = IMAGE_FOLDER_NAME;
		GrayScaleSepiaDetector observer = new GrayScaleSepiaDetector(new File(
				imageFolder), 85, 3);
		final File outputFile = new File(outFile);
		observer.setOutputFile(outputFile);
		datasetCategorization.addObserver(observer);
		datasetCategorization.processThumbnailset(start, limit, blockSize);
		System.out.println("Skipped items: " + datasetCategorization.getFailureCount());
		return outputFile;
	}

	private void writeOrderedSubset(DatasetDescriptor dataset,
			Map<String, String> subsetMap, SortedSet<IntDoubleString> order, File file) {
		try {
			// create parent dirs
			file.getParentFile().mkdirs();
			log.warn("Existing files will be overwritten! " + file);
			BufferedWriter writer = new BufferedWriter(new FileWriter(file));
			writeCvsFileHeader(writer, dataset.getImageSetName(), order.size(),
					dataset.getClassifications());

			int count = 0;
			String csvOrder;
			for (IntDoubleString intDoubleString : order) {
				csvOrder = intDoubleString.toString().replaceAll(" ", ";");
				writer.write(csvOrder);
				writer.write(";");
				writer.write("\n");
				count++;
				if (count % 1000 == 0)
					writer.flush();
			}
			writer.flush();
			writer.close();
		} catch (Exception e) {
			throw new TechnicalRuntimeException("cannot write csv file");
		}
	}

	protected void writeThumbnailsToCsvFile(DatasetDescriptor dataset,
			Map<String, String> thumbnails, File file, int fileWritePolicy)
			throws IOException {
		// no-op in this test
	}
}