package eu.europeana.creative.dataset.pd; import java.io.BufferedReader; import java.io.BufferedWriter; import java.io.File; import java.io.FileInputStream; import java.io.FileNotFoundException; import java.io.FileOutputStream; import java.io.FileReader; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.nio.channels.FileChannel; import java.util.LinkedHashMap; import java.util.Map; import org.junit.Before; import org.junit.Test; import eu.europeana.api.client.EuropeanaApi2Client; import eu.europeana.api.client.dataset.DatasetDescriptor; import eu.europeana.api.client.exception.EuropeanaApiProblem; import eu.europeana.api.client.model.EuropeanaApi2Results; import eu.europeana.api.client.model.search.Facet; import eu.europeana.api.client.model.search.FacetField; import eu.europeana.api.client.thumbnails.ThumbnailAccessorUtils; import eu.europeana.api.client.thumbnails.download.ThumbnailDownloader; import eu.europeana.api.client.thumbnails.processing.LargeThumbnailsetProcessing; import eu.europeana.creative.dataset.IRTestConfigurations; import eu.europeana.creative.dataset.culturecam.input.SelectionDescriptionEnum; import eu.europeana.creative.dataset.culturecam.input.SelectionDescriptionImpl; import eu.europeana.creative.dataset.pt.classification.GrayScaleSepiaDetector; import eu.europeana.service.ir.image.IRConfiguration; import eu.europeana.service.ir.image.IRConfigurationImpl; public class PdThumbnailMapsTest extends ThumbnailAccessorUtils implements IRTestConfigurations{ //private boolean overwriteThumbnails = false; String jsonInputFilename = "/collections/pd/selection/input/pd_collections_facets.json"; String collectionsInputFilename = "/selection/input/pd_collections.csv"; String selectedCollectionsInputFilename = "/selection/input/selected_pd_collections.csv"; //String designInputFilename = "/selection/input/design_v1.csv"; //String thumbnailMapFolder = "/selection/thumbnailmap"; final String STEP_THUMBNAILMAP = "THUMBNAILMAP"; final String STEP_SUBSET = "SUBSET"; private static final String STEP_CLASSIFIED = "CLASSIFIED"; private String processingStep = null; private boolean overwriteThumbnails = false; final String IMAGE_FOLDER = "/app/eucreative/imagesimilarityhome/pd/image/"; @Before public void init(){ String dataset = "pd"; setDataset(dataset); } //@Test public void buildCollectionsCvs() throws IOException, EuropeanaApiProblem{ String json = readJsonFile(jsonInputFilename); EuropeanaApi2Client client = new EuropeanaApi2Client(); EuropeanaApi2Results results = client.parseApiResponse(json); File pdCollectionFacets = new File(getCollectionsCvsFolder() + collectionsInputFilename); Facet facet = results.getFacets().get(0); Map<String, String> collectionsMap = new LinkedHashMap<String, String>(); String collection; String collectionData; //build Map for (FacetField field : facet.getFields()) { collection = field.getLabel(); collectionData = field.getCount()+";" + "http://www.europeana.eu/portal/search.html?query=europeana_collectionName:"; //escape bracket collectionData += collection.replace("(", "\\("); collectionData += "&profile=minimal&qf=RIGHTS%3Ahttp%3A%2F%2Fcreativecommons.org%2Fpublicdomain%2F*&qt=false"; //keep only collections with more than 100 objects if(field.getCount() > 100){ collectionsMap.put(collection, collectionData); System.out.println(collection + ":" + field.getCount()); }else break; } DatasetDescriptor descriptor = new DatasetDescriptor("facets", "pd-collection"); writeThumbnailsToCsvFile(descriptor, collectionsMap, pdCollectionFacets, POLICY_OVERWRITE_FILE ); } //@Test public void buildSelectedCollections() throws IOException, EuropeanaApiProblem{ File selectedPdCollections = new File(getCollectionsCvsFolder() + selectedCollectionsInputFilename); //we misuse the readThumbnailsMap as this is the same implementation as readCollectionsMap DatasetDescriptor descriptor; int missingThumbnails; int missingThumbnailsSum = 0; int expectedResultsTotal = 0; int expectedResults = 0; SelectionDescriptionImpl selectionDescription; Map<String, String> selectedCollections = readThumbnailsMap(selectedPdCollections); //#ID;Title;Portal link;Results;items;selection;dicriminator;Content selection comments for (Map.Entry<String, String> collection : selectedCollections.entrySet()) { selectionDescription = new SelectionDescriptionImpl(collection.getKey(), collection.getValue().split(";")); descriptor = buildDatasetDescriptor(selectionDescription); this.setProcessingStep(STEP_THUMBNAILMAP); expectedResults = selectionDescription.getIntFieldValue(SelectionDescriptionEnum.RESULT_COUNT); File thumbnailsMapFile = getCollectionCsvFile(descriptor, STEP_THUMBNAILMAP); if(thumbnailsMapFile.exists()){ log.info("Skip selected collection. Thumbnailsmap exists already :" + thumbnailsMapFile); continue; } missingThumbnails = createSubset(descriptor.getImageSetName(), descriptor.getCollectionName(), selectionDescription.getFieldValue(SelectionDescriptionEnum.PORTAL_LINK), 0, expectedResults); missingThumbnailsSum += missingThumbnails; expectedResultsTotal += expectedResults; if(missingThumbnails > 0) System.out.println("Missing thumbnails in dataset:" + descriptor + ": " + missingThumbnails); } //we expect no more than 10 missing Thumbnails log.info("Number of missing thumbnails: " + missingThumbnailsSum); log.info("Total expected results: " + expectedResultsTotal); } protected DatasetDescriptor buildDatasetDescriptor( SelectionDescriptionImpl selectionDescription) { DatasetDescriptor descriptor; String[] idParts; idParts = selectionDescription.getId().split("_", 2); descriptor = new DatasetDescriptor(idParts[0], idParts[1].replace('(', '-')); return descriptor; } //@Test public void downloadSelectedCollectionsThumbnails() throws FileNotFoundException, IOException { File selectedPdCollections = new File(getCollectionsCvsFolder() + selectedCollectionsInputFilename); //we misuse the readThumbnailsMap as this is the same implementation as readCollectionsMap Map<String, String> selectedCollections = readThumbnailsMap(selectedPdCollections); //#ID;Title;Portal link;Results;items;selection;dicriminator;Content selection comments DatasetDescriptor descriptor; SelectionDescriptionImpl selectionDescription; for (Map.Entry<String, String> collection : selectedCollections.entrySet()) { selectionDescription = new SelectionDescriptionImpl(collection.getKey(), collection.getValue().split(";")); descriptor = buildDatasetDescriptor(selectionDescription); System.out.println("Downloading thumbnails for collection: " + descriptor); downloadThumbnails(descriptor); } } private void downloadThumbnails(DatasetDescriptor descriptor) throws FileNotFoundException, IOException { //this.setProcessingStep(STEP_THUMBNAILMAP); //expectedResults = selectionDescription.getIntFieldValue(SelectionDescriptionEnum.RESULT_COUNT); File thumbnailsMapFile = getCollectionCsvFile(descriptor, STEP_THUMBNAILMAP); File downloadFolder = getConfig().getImageFolderAsFile(getDataset()); LargeThumbnailsetProcessing datasetDownloader = new LargeThumbnailsetProcessing(thumbnailsMapFile); ThumbnailDownloader observer = new ThumbnailDownloader(downloadFolder); observer.setSkipExistingFiles(!overwriteThumbnails); observer.setFilterThumbnails(true); datasetDownloader.addObserver(observer); datasetDownloader.processThumbnailset(0, -1, 1000); log.debug("Skipped items: " + datasetDownloader.getSkippedItemsCount()); log.warn("Failed downloads: " + datasetDownloader.getFailureCount()); log.info("Downloaded files: " + datasetDownloader.getItemsProcessed()); } //@Test public void categorizeSubsetThumbnails() throws FileNotFoundException, IOException { File selectedPdCollections = new File(getCollectionsCvsFolder() + selectedCollectionsInputFilename); //we misuse the readThumbnailsMap as this is the same implementation as readCollectionsMap Map<String, String> selectedCollections = readThumbnailsMap(selectedPdCollections); //#ID;Title;Portal link;Results;items;selection;dicriminator;Content selection comments DatasetDescriptor descriptor; SelectionDescriptionImpl selectionDescription; for (Map.Entry<String, String> collection : selectedCollections.entrySet()) { selectionDescription = new SelectionDescriptionImpl(collection.getKey(), collection.getValue().split(";")); descriptor = buildDatasetDescriptor(selectionDescription); System.out.println("performing thumbnail categorization for collection: " + descriptor); categorizeThumbnails(descriptor); } } public File categorizeThumbnails(DatasetDescriptor datasetDescriptor) throws FileNotFoundException, IOException { // String thumbnailsFile = getCvsFileForStep(datasetDescriptor, // STEP_THUMBNAILS); // new File(thumbnailsFile) File inputFile = getCollectionCsvFile(datasetDescriptor, STEP_THUMBNAILMAP); File outputFile = getCollectionCsvFile(datasetDescriptor, STEP_CLASSIFIED); LargeThumbnailsetProcessing datasetCategorization = new LargeThumbnailsetProcessing( inputFile); // String imageFolder = getConfiguration().getImageFolder(getDataset()); String imageFolder = IMAGE_FOLDER; GrayScaleSepiaDetector observer = new GrayScaleSepiaDetector(new File( imageFolder), 85, 3); //final File outputFile = new File(outFile); //Set<String> selectedCategories = new HashSet<String>(); //selectedCategories.add(ScalableColorPlusImpl.ImageType.COLOR.) observer.setOutputFile(outputFile); datasetCategorization.addObserver(observer); if(blockSize < 0) blockSize = 1000; datasetCategorization.processThumbnailset(start, limit, blockSize); System.out.println("Skipped items: " + datasetCategorization.getFailureCount()); return outputFile; } // protected Map<String, String> generateSubset( // Map<String, String> fullCollectionMap, int selectionCount) { // // Map<String, String> subsetMap = new HashMap<String, String>(selectionCount); // if(fullCollectionMap.size() < selectionCount) // throw new RuntimeException("Fullcollection has less items than the expected subset: " + selectionCount); // // Object[] keys = fullCollectionMap.keySet().toArray(); // int i; // Random random = new Random(); // // while(subsetMap.size() < selectionCount){ // i = random.nextInt(fullCollectionMap.size()); // subsetMap.put((String)keys[i], fullCollectionMap.get(keys[i])); // } // // return subsetMap; // } // protected String buildSubSetName( // SelectionDescriptionImpl selectionDescription) { // String subsetName = selectionDescription.getFieldValue(SelectionDescriptionEnum.TITLE); // subsetName = subsetName.substring(0, Math.min(subsetName.length(), 10)); // return subsetName; // } protected File getCollectionCsvFile(DatasetDescriptor dataset) { return getCollectionCsvFile(dataset, getProcessingStep()); } protected File getCollectionCsvFile(DatasetDescriptor dataset, final String processingStep) { if(processingStep != null){ String fileName = getCollectionsCvsFolder() + processingStep.toLowerCase() + "/" + dataset.getImageSetName() + "_" + encode(dataset.getCollectionName()) + ".csv"; return new File(fileName); }else{ return super.getCollectionCsvFile(dataset); } } @Test public void performDatasetAggregation() throws IOException { File cvsFolder = new File(getCollectionsCvsFolder() + STEP_CLASSIFIED.toLowerCase() + "/"); File[] collectionFiles = cvsFolder.listFiles(); BufferedReader reader = null; // String headerLine = null; String line = null; BufferedWriter datasetWriter = getDataSetFileWriter(false); log.debug("Aggregating dataset: " + getDataset()); boolean isColorful; for (int i = 0; i < collectionFiles.length; i++) { reader = new BufferedReader(new FileReader(collectionFiles[i])); boolean firstLine = true; while ((line = reader.readLine()) != null) { // write headers to sysout if (firstLine) { log.debug("Writting dataset headerline: " + line); firstLine = false; } isColorful = line.endsWith(";COLOR"); // write all data to dataset if(isColorful){ datasetWriter.write(line); datasetWriter.write("\n"); } } datasetWriter.flush(); // close reader try { reader.close(); } catch (IOException e) { System.out.println("cannot close reader for: " + collectionFiles[i]); e.printStackTrace(); } } log.trace("Closing dataset file"); datasetWriter.close(); } @Test public void copyUsedImages() throws FileNotFoundException, IOException{ File datasetFile = getConfig().getDatasetFile(getDataset()); Map<String, String> thumbnailsMap = readThumbnailsMap(datasetFile); File sourceFile, destFile; for (String id : thumbnailsMap.keySet()) { sourceFile = getConfig().getImageFile(getDataset(), id); destFile = new File(sourceFile.getAbsolutePath().replace("app", "tmp")); copyFile(sourceFile, destFile); } System.out.println("completed copying images!"); } protected IRConfiguration getConfig() { IRConfiguration config = new IRConfigurationImpl(); return config; } public File getDataSetFile(boolean urls) { IRConfiguration config = getConfig(); if (urls) return config.getDatasetUrlsFile(getDataset()); else return config.getDatasetFile(getDataset()); } protected String getCollectionsCvsFolder() { return getCollectionsCvsFolder(getDataset()); } public String getCollectionsCvsFolder(String dataset) { return IRTestConfigurations.COLLECTIONS_FOLDER + dataset + "/"; } public String getProcessingStep() { return processingStep; } public void setProcessingStep(String processingStep) { this.processingStep = processingStep; } protected void copyFile(File sourceFile, File destFile) throws IOException { if(!destFile.getParentFile().exists()) destFile.getParentFile().mkdirs(); if(!destFile.exists()) destFile.createNewFile(); FileChannel source = null; FileChannel destination = null; try { source = new FileInputStream(sourceFile).getChannel(); destination = new FileOutputStream(destFile).getChannel(); destination.transferFrom(source, 0, source.size()); } finally { if(source != null) { source.close(); } if(destination != null) { destination.close(); } } } private String readJsonFile(String testResource) throws IOException { BufferedReader reader = null; StringBuilder out = null; try { InputStream resourceAsStream = getClass().getResourceAsStream( testResource); reader = new BufferedReader(new InputStreamReader( resourceAsStream)); out = new StringBuilder(); String line; while ((line = reader.readLine()) != null) { out.append(line); } } finally { if(reader!= null) reader.close(); } return out.toString(); } }