package eu.europeana.creative.dataset; import it.cnr.isti.indexer.IndexHelper; import java.io.BufferedReader; import java.io.BufferedWriter; import java.io.File; import java.io.FileNotFoundException; import java.io.FileReader; import java.io.IOException; import java.util.List; import java.util.Map; import org.junit.Test; import eu.europeana.api.client.thumbnails.ThumbnailsAccessor; import eu.europeana.api.client.thumbnails.ThumbnailsForCollectionAccessorTest; import eu.europeana.service.ir.image.IRConfiguration; import eu.europeana.service.ir.image.IRConfigurationImpl; import eu.europeana.service.ir.image.api.ImageSearchingService; import eu.europeana.service.ir.image.api.ImageSearchingServiceImpl; import eu.europeana.service.ir.image.exceptions.ImageSearchingException; public class HistorianaDatasetBuilderTest extends ThumbnailsForCollectionAccessorTest implements IRTestConfigurations { public static String CLASS_WW1 = "ww1"; final String DATASET_HISTORIANA = "historiana"; private String dataset = null; private ImageSearchingService imageSearchingService; // @Test public void createDemoDataset() throws IOException { setDataset(DATASET_HISTORIANA); //http://www.europeana.eu/portal/search.html?query=*%3A*&start=1&rows=12&qf=PROVIDER%3A%22Europeana+1914+-+1918%22&qf=TYPE%3AIMAGE // int objects0 = buildImageSet("Europeana1914-1918", // null, new String[] { CLASS_WW1, // "ugc1914" }, null, null, null, "IMAGE", "Europeana 1914 - 1918"); // assertEquals(58723, objects0); //http://www.europeana.eu/portal/search.html?query=*:*&qf=1914-1918&qf=PROVIDER:%22EFG+-+The+European+Film+Gateway%22&qf=TYPE:IMAGE&rows=12 // int objects1 = buildImageSet("European Digital Film Gateway", // null, new String[] { CLASS_WW1, // "filmgateway" }, "1914-1918", null, null, "IMAGE", "EFG - The European Film Gateway"); // assertEquals(136, objects1); // 1 object without thumbnail //http://www.europeana.eu/portal/search.html?query=europeana_collectionName:9200168*&qf=TYPE:IMAGE&rows=24; // int objects2 = buildImageSet("Serbian National Library", // "9200168*", new String[] { CLASS_WW1, // "serbia" }, null, null, null, "IMAGE"); // assertEquals(206, objects2); // 1 object without thumbnail //http://www.europeana.eu/portal/search.html?query=DATA_PROVIDER%3A%22National+Library+of+the+Netherlands+-+Koninklijke+Bibliotheek%22&rows=12&qf=eerste+wereldoorlog // int objects3 = buildImageSet("Netherlands National Library", // null, new String[] { CLASS_WW1, // "netherlands" }, "eerste wereldoorlog", null, null, "IMAGE", null, "\"National Library of the Netherlands - Koninklijke Bibliotheek\"" ); // assertEquals(2780, objects3); //http://www.europeana.eu/portal/search.html?query=*:*&qf=PROVIDER:%22The+Great+War+Archive%2C+University+of+Oxford%22&qf=TYPE:IMAGE&rows=12 // int objects4 = buildImageSet("Th Great War Archive_University of Oxford", // null, new String[] { CLASS_WW1, // "uk-great war archive" }, null, null, null, "IMAGE", "\"The Great War Archive, University of Oxford\""); // assertEquals(6342, objects4); //http://www.europeana.eu/portal/search.html?query=DATA_PROVIDER%3A%22The+Wellcome+Library%22+Great+War+OR+First+World+War+OR+WW1+OR+1914-1918&rows=12 // int objects5 = buildImageSet("Welcome", // null, new String[] { CLASS_WW1, // "welcome" }, "Great War OR First World War OR WW1 OR 1914-1918", null, null, "IMAGE", null, "\"The Wellcome Library\""); // assertEquals(133, objects5); //13 videos skipped out // http://www.europeana.eu/portal/search.html?query=DATA_PROVIDER%3A%22Istituto+centrale+per+il+catalogo+unico%22&start=1&rows=24&qf=First+World+War // int objects6 = buildImageSet("Italy_Istituto_Centrale", // null, new String[] { CLASS_WW1, // "italy-iccu" }, "First World War", null, null, "IMAGE", null, "\"Istituto centrale per il catalogo unico\""); // assertEquals(149878, objects6); //13 videos skipped out performDatasetAggregation(); } private void performDatasetAggregation() throws IOException { File cvsFolder = new File(getCollectionsCvsFolder()); File[] collectionFiles = cvsFolder.listFiles(); BufferedReader reader = null; // String headerLine = null; String line = null; BufferedWriter datasetWriter = getDataSetFileWriter(false); for (int i = 0; i < collectionFiles.length; i++) { reader = new BufferedReader(new FileReader(collectionFiles[i])); boolean firstLine = true; while ((line = reader.readLine()) != null) { // write headers to sysout if (firstLine) { System.out.println(line); firstLine = false; } // write all data to dataset datasetWriter.write(line); datasetWriter.write("\n"); } datasetWriter.flush(); // close reader try { reader.close(); } catch (IOException e) { System.out.println("cannot close reader for: " + collectionFiles[i]); e.printStackTrace(); } } datasetWriter.close(); } // private BufferedWriter getDataSetFileWriter(boolean urls) // throws IOException { // File datasetFile = getDataSetFile(urls); // datasetFile.getParentFile().mkdirs(); // // return new BufferedWriter(new FileWriter(datasetFile)); // } public File getDataSetFile(boolean urls) { IRConfiguration config = getConfig(); if (urls) return config.getDatasetUrlsFile(getDataset()); else return config.getDatasetFile(getDataset()); } public void testGetThumbnailsForCollectionLimit() { // avoid execution } public void testGetThumbnailsForCollectionAll() { // avoid execution } public String getDataset() { return dataset; } public void setDataset(String dataset) { this.dataset = dataset; } /// @Test public void downloadThumbnails() throws FileNotFoundException, IOException { setDataset(DATASET_HISTORIANA); File datasetFile = getConfig().getDatasetFile(DATASET_HISTORIANA); IndexHelper ixHelper = new IndexHelper(); Map<String, String> thumbnailsMap = ixHelper .getThumbnailsMap(datasetFile); ThumbnailsAccessor ta = new ThumbnailsAccessor(); List<String> skippedItems = ta.copyThumbnails(thumbnailsMap, getConfig().getImageFolderAsFile(DATASET_HISTORIANA)); System.out.println("Skipped items: " + skippedItems.size()); for (String itemId : skippedItems) { System.out.println(itemId); } } protected IRConfiguration getConfig() { IRConfiguration config = new IRConfigurationImpl(); return config; } @Test public void buildIndexedUrlsFile() throws FileNotFoundException, IOException, ImageSearchingException { setDataset(DATASET_HISTORIANA); IRConfiguration config = getConfig(); File datasetFile = config.getDatasetFile(getDataset()); IndexHelper ixHelper = new IndexHelper(); Map<String, String> thumbnailsMap = ixHelper .getThumbnailsMap(datasetFile); BufferedWriter indexedUrlsWriter = getDataSetFileWriter(true); //EuropeanaId euId = new EuropeanaId(); int counter = 0; for (Map.Entry<String, String> thumbnail : thumbnailsMap.entrySet()) { //euId.setNewId(thumbnail.getKey()); try { getImageSearchingService().searchSimilar(thumbnail.getKey()); if (getImageSearchingService().getTotalResults() > 0) { // write to file indexedUrlsWriter.append(thumbnail.getKey()).append("; "); indexedUrlsWriter.append(thumbnail.getValue()).append("\n"); counter++; } else { // not indexed yet System.out.println("Skipped item: " + thumbnail.getKey()); } } catch (ImageSearchingException e) { System.out.println(e.getMessage()); } } System.out.println("correct items: " + counter); } public ImageSearchingService getImageSearchingService() { if (imageSearchingService == null) { imageSearchingService = new ImageSearchingServiceImpl(getDataset(), getConfig()); imageSearchingService.init(); } return imageSearchingService; } @Override public String getCollectionsCvsFolder(String dataset) { return COLLECTIONS_FOLDER + dataset + "/"; } @Override protected String getCollectionsCvsFolder() { return getCollectionsCvsFolder(getDataset()); } }