/*
 * Carrot2 project.
 *
 * Copyright (C) 2002-2016, Dawid Weiss, Stanisław Osiński.
 * All rights reserved.
 *
 * Refer to the full license file "carrot2.LICENSE"
 * in the root folder of the repository checkout or at:
 * http://www.carrot2.org/carrot2.LICENSE
 */

package org.carrot2.source.ambient;

import java.io.*;
import java.util.*;

import org.carrot2.core.Document;
import org.carrot2.core.ProcessingException;
import org.carrot2.util.*;
import org.carrot2.util.resource.ClassResource;
import org.carrot2.util.resource.IResource;

import org.carrot2.shaded.guava.common.base.Predicate;
import org.carrot2.shaded.guava.common.collect.*;

/**
 * Handles data of test collections developed by Fondazione Ugo Bordoni.
 * <p>
 * A collection is read from three tab-separated resources located under a common base
 * path: <code>STRel.txt</code> (result to subtopic mapping), <code>results.txt</code>
 * (the documents) and <code>subTopics.txt</code> (subtopic labels). The first line of
 * each resource is skipped.
 */
class FubTestCollection
{
    /**
     * The total number of Ambient topics, set while loading the subtopic mapping.
     */
    int topicCount;

    /**
     * Documents by topic id.
     */
    final Map<Integer, List<Document>> documentsByTopicId;

    /**
     * Numbers of results for each subtopic, keyed by the
     * <code>"&lt;topic&gt;.&lt;subtopic&gt;"</code> string.
     */
    final Map<String, Integer> subtopicSizes;

    /**
     * Human-readable labels of subtopics, keyed by the first column of the labels
     * resource.
     */
    final Map<String, String> subtopicLabels;

    /**
     * Loads the collection from resources resolved relative to
     * {@link AmbientDocumentSource}.
     *
     * @param basePath resource path prefix under which <code>STRel.txt</code>,
     *            <code>results.txt</code> and <code>subTopics.txt</code> are looked up
     */
    public FubTestCollection(String basePath)
    {
        /** [topicId][resultIndex] = subtopicId */
        final int [][] resultSubtopicIds = loadSubtopicMapping(new ClassResource(
            AmbientDocumentSource.class, basePath + "/STRel.txt"));

        documentsByTopicId = loadDocuments(new ClassResource(AmbientDocumentSource.class,
            basePath + "/results.txt"), resultSubtopicIds);

        subtopicSizes = prepareSubtopicSizes(resultSubtopicIds);

        subtopicLabels = loadSubtopicLabels(new ClassResource(
            AmbientDocumentSource.class, basePath + "/subTopics.txt"));
    }

    /**
     * Returns the number of topics found in the subtopic mapping.
     */
    protected int getTopicCount()
    {
        return topicCount;
    }

    /**
     * Returns documents of the given topic, filtered by subtopic size and, optionally,
     * by whether the document has a subtopic assigned.
     *
     * @param topicId id of the topic to fetch documents for
     * @param requestedResults maximum number of documents to return
     * @param minTopicSize minimum recorded size of a document's subtopic for the
     *            document to be included; a subtopic with no recorded size counts as
     *            size 0
     * @param includeDocumentsWithoutTopic if <code>false</code>, documents whose
     *            subtopic key ends with <code>".0"</code> are dropped
     * @return at most <code>requestedResults</code> matching documents; an empty list
     *         if no documents are known for <code>topicId</code>
     */
    protected List<Document> getDocumentsForTopic(int topicId, int requestedResults,
        final int minTopicSize, final boolean includeDocumentsWithoutTopic)
        throws ProcessingException
    {
        final List<Document> topicDocuments = documentsByTopicId.get(topicId);
        if (topicDocuments == null)
        {
            // Unknown topic: nothing to filter, avoid handing null to the filter.
            return Lists.newArrayList();
        }

        // Filter the results
        final List<Document> documents = Lists.newArrayList(Collections2.filter(
            topicDocuments, new Predicate<Document>()
            {
                @Override
                public boolean apply(Document document)
                {
                    // For now there is only one topic per document in Ambient
                    final String documentTopic = getTopic(document);

                    // Documents whose result index lies beyond the loaded mapping get
                    // subtopic 0 without being counted, so the size may be missing.
                    final Integer recordedSize = subtopicSizes.get(documentTopic);
                    final int subtopicSize = recordedSize != null ? recordedSize : 0;

                    return subtopicSize >= minTopicSize
                        && (includeDocumentsWithoutTopic || !documentTopic.endsWith(".0"));
                }
            }));

        if (documents.size() >= requestedResults)
        {
            return documents.subList(0, requestedResults);
        }
        else
        {
            return documents;
        }
    }

    /**
     * Collects the distinct values of the {@link Document#PARTITIONS} field across the
     * provided documents.
     */
    @SuppressWarnings("unchecked")
    protected Set<Object> getTopicIds(final List<Document> documents)
    {
        final Set<Object> topicIds = Sets.newHashSet();
        for (Document document : documents)
        {
            topicIds.addAll((Collection<? extends Object>) document
                .<Object> getField(Document.PARTITIONS));
        }
        return topicIds;
    }

    /**
     * Returns the first entry of the document's {@link Document#PARTITIONS} field.
     */
    @SuppressWarnings("unchecked")
    protected static String getTopic(Document document)
    {
        return ((List<String>) document.getField(Document.PARTITIONS)).get(0);
    }

    /**
     * Returns a human-readable label for a subtopic, or <code>null</code> if no label
     * was loaded for the given id.
     */
    String getTopicLabel(String topicId)
    {
        return subtopicLabels.get(topicId);
    }

    /**
     * Loads human-readable labels for subtopics. Each line after the first is split on
     * tabs; the first column becomes the key and the second the label. Lines with fewer
     * than two columns are ignored.
     */
    private static Map<String, String> loadSubtopicLabels(IResource subtopicLabelsResource)
    {
        final Map<String, String> labels = Maps.newHashMap();
        BufferedReader reader = null;
        try
        {
            reader = new BufferedReader(new InputStreamReader(subtopicLabelsResource
                .open(), "UTF-8"));

            String line = reader.readLine(); // discard first line
            while ((line = reader.readLine()) != null)
            {
                String [] split = line.split("\\t");
                if (split.length > 1)
                {
                    labels.put(split[0], split[1]);
                }
            }
        }
        catch (Exception e)
        {
            throw ExceptionUtils.wrapAsRuntimeException(e);
        }
        finally
        {
            if (reader != null)
            {
                CloseableUtils.close(reader);
            }
        }

        return labels;
    }

    /**
     * Prepares a map with subtopic sizes, keyed by subtopic string. Index 0 of both
     * array dimensions is skipped; result slots that hold 0 are counted under the
     * <code>"&lt;topic&gt;.0"</code> key.
     */
    private static Map<String, Integer> prepareSubtopicSizes(int [][] resultSubtopicIds)
    {
        final Map<String, Integer> map = Maps.newHashMap();
        for (int topic = 1; topic < resultSubtopicIds.length; topic++)
        {
            for (int result = 1; result < resultSubtopicIds[topic].length; result++)
            {
                MapUtils.increment(map,
                    buildTopicId(topic, resultSubtopicIds[topic][result]));
            }
        }
        return map;
    }

    /**
     * Loads all Ambient documents. Each line after the first is split on tabs: the
     * first column is <code>&lt;topic&gt;.&lt;resultIndex&gt;</code>, followed by the
     * URL, the title and an optional summary. Every document gets a single-element
     * {@link Document#PARTITIONS} list holding its subtopic key; result indices beyond
     * the mapping for the topic are assigned subtopic 0.
     *
     * @param resultsResource resource with the documents
     * @param resultSubtopicIds <code>[topicId][resultIndex] = subtopicId</code> mapping
     * @return documents grouped by topic id
     */
    private static Map<Integer, List<Document>> loadDocuments(IResource resultsResource,
        int [][] resultSubtopicIds)
    {
        final Map<Integer, List<Document>> documents = Maps.newHashMap();
        BufferedReader reader = null;
        try
        {
            reader = new BufferedReader(new InputStreamReader(resultsResource.open(),
                "UTF-8"));

            String line = reader.readLine(); // discard first line
            while ((line = reader.readLine()) != null)
            {
                final String [] split = line.split("\\t");

                final String [] topicSplit = split[0].split("\\.");
                final int topicId = Integer.parseInt(topicSplit[0]);
                final int resultIndex = Integer.parseInt(topicSplit[1]);

                // Build document
                final Document document = new Document();
                document.setField(Document.CONTENT_URL, split[1]);
                document.setField(Document.TITLE, split[2]);
                if (split.length > 3)
                {
                    document.setField(Document.SUMMARY, split[3]);
                }

                // Results with no entry in the mapping fall back to subtopic 0.
                final int subtopicId = resultSubtopicIds[topicId].length > resultIndex
                    ? resultSubtopicIds[topicId][resultIndex] : 0;
                document.setField(Document.PARTITIONS,
                    ImmutableList.of(buildTopicId(topicId, subtopicId)));

                // Add to list
                List<Document> topicList = documents.get(topicId);
                if (topicList == null)
                {
                    topicList = Lists.newArrayList();
                    documents.put(topicId, topicList);
                }
                topicList.add(document);
            }
        }
        catch (Exception e)
        {
            throw ExceptionUtils.wrapAsRuntimeException(e);
        }
        finally
        {
            // Guarded like the other loaders: opening the resource may have failed.
            if (reader != null)
            {
                CloseableUtils.close(reader);
            }
        }

        final List<Document> allDocuments = Lists.newArrayList();
        for (List<Document> docList : documents.values())
        {
            allDocuments.addAll(docList);
        }
        Document.assignDocumentIds(allDocuments);

        return documents;
    }

    /**
     * Builds the <code>"&lt;topic&gt;.&lt;subtopic&gt;"</code> key used for partitions
     * and subtopic sizes.
     */
    private static String buildTopicId(final int topic, final int subtopic)
    {
        return topic + "." + subtopic;
    }

    /**
     * Loads topic mapping and sets {@link #topicCount} to the number of distinct
     * topics. Each line after the first is split on tabs and dots; field 0 is the
     * topic id, field 1 the subtopic id and field 3 the result id.
     * <p>
     * The returned array is sized by the number of distinct topics and filled for
     * topics <code>1..topicCount</code>, so topic ids are assumed to be contiguous and
     * to start at 1.
     *
     * @return <code>[topicId][resultIndex] = subtopicId</code>, with 0 for results
     *         that have no mapping; index 0 of both dimensions is unused
     */
    private int [][] loadSubtopicMapping(IResource resultsMappingResource)
    {
        final Map<Integer, Map<Integer, Integer>> topics = Maps.newHashMap();
        BufferedReader reader = null;
        try
        {
            reader = new BufferedReader(new InputStreamReader(resultsMappingResource
                .open(), "UTF-8"));

            reader.readLine(); // discard first line
            String line;
            while ((line = reader.readLine()) != null)
            {
                final String [] split = line.split("[\\t.]");
                final int topicId = Integer.parseInt(split[0]);
                final int subtopicId = Integer.parseInt(split[1]);
                final int resultId = Integer.parseInt(split[3]);

                Map<Integer, Integer> topicMap = topics.get(topicId);
                if (topicMap == null)
                {
                    topicMap = Maps.newHashMap();
                    topics.put(topicId, topicMap);
                }
                topicMap.put(resultId, subtopicId);
            }
        }
        catch (Exception e)
        {
            throw ExceptionUtils.wrapAsRuntimeException(e);
        }
        finally
        {
            if (reader != null)
            {
                CloseableUtils.close(reader);
            }
        }

        this.topicCount = topics.size();

        int [][] resultSubtopicIds = new int [topics.size() + 1] [];
        for (int topic = 1; topic < resultSubtopicIds.length; topic++)
        {
            final Map<Integer, Integer> results = topics.get(topic);
            // One slot per result id up to the largest mapped one; slot 0 is unused.
            resultSubtopicIds[topic] = new int [Collections.max(results.keySet()) + 1];
            for (int result = 1; result < resultSubtopicIds[topic].length; result++)
            {
                Integer subtopic = results.get(result);
                if (subtopic != null)
                {
                    resultSubtopicIds[topic][result] = subtopic;
                }
            }
        }

        return resultSubtopicIds;
    }
}