package org.wikipedia.miner.util;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashSet;
import java.util.Random;
import java.util.Set;
import org.wikipedia.miner.annotation.preprocessing.PreprocessedDocument;
import org.wikipedia.miner.model.Article;
/**
 * A list of {@link Item}s, each pairing a preprocessed document with the set
 * of topic ids recorded for it.
 * <p>
 * The class extends {@link ArrayList}, so all the usual list operations are
 * available in addition to the convenience methods declared here.
 */
public class TopicIndexingSet extends ArrayList<TopicIndexingSet.Item> {

    /**
     * Creates an empty set.
     */
    public TopicIndexingSet() {
        super();
    }

    /**
     * Wraps the given document and topic ids in a new {@link Item} and
     * appends it to this list.
     *
     * @param doc      the document to add
     * @param topicIds the ids of the document's topics; stored by reference
     *                 (not copied). A {@code null} value is treated as
     *                 "no topics yet".
     */
    public void add(PreprocessedDocument doc, HashSet<Integer> topicIds) {
        add(new Item(doc, topicIds));
    }

    /**
     * Returns a new set containing {@code size} distinct items of this set,
     * chosen uniformly at random and in random order. This set is not
     * modified; the returned set shares its {@link Item} instances.
     *
     * @param size the number of items to select; values of zero or less
     *             yield an empty subset
     * @return a newly created set holding the selected items
     * @throws IllegalArgumentException if {@code size} exceeds the number of
     *                                  items in this set
     */
    public TopicIndexingSet getRandomSubset(int size) {
        final int total = size();
        if (size > total) {
            throw new IllegalArgumentException("requested size " + size + " is larger than " + total);
        }

        TopicIndexingSet subset = new TopicIndexingSet();
        if (size <= 0) {
            return subset;
        }

        // Partial Fisher-Yates shuffle over the index range: each pick costs
        // exactly one random draw. Redrawing on collision, by contrast, needs
        // more and more attempts as the requested size approaches the size of
        // this set.
        int[] indexes = new int[total];
        for (int i = 0; i < total; i++) {
            indexes[i] = i;
        }

        Random random = new Random();
        for (int i = 0; i < size; i++) {
            // Choose among the positions that have not been consumed yet.
            int j = i + random.nextInt(total - i);
            int picked = indexes[j];
            // Move the still-unused index at position i into the freed slot.
            indexes[j] = indexes[i];
            indexes[i] = picked;
            subset.add(get(picked));
        }

        return subset;
    }

    /**
     * A document together with the ids of its topics.
     * <p>
     * Note that this is an inner (non-static) class, so every item keeps a
     * reference to the {@link TopicIndexingSet} instance it was created from.
     */
    public class Item {

        /** The wrapped document. */
        PreprocessedDocument _doc;

        /** The topic ids recorded for the document; never {@code null} after construction. */
        HashSet<Integer> _topicIds;

        /**
         * Creates an item for the given document with no topics.
         *
         * @param doc the document to wrap
         */
        public Item(PreprocessedDocument doc) {
            _doc = doc;
            _topicIds = new HashSet<Integer>();
        }

        /**
         * Creates an item for the given document with the given topics.
         *
         * @param doc      the document to wrap
         * @param topicIds the ids of the document's topics. The set is stored
         *                 by reference, so later changes made through either
         *                 the caller's reference or this item are visible to
         *                 both. If {@code null}, a fresh empty set is used so
         *                 that the topic methods never fail on a missing set.
         */
        public Item(PreprocessedDocument doc, HashSet<Integer> topicIds) {
            _doc = doc;
            _topicIds = (topicIds != null) ? topicIds : new HashSet<Integer>();
        }

        /**
         * @return the document wrapped by this item
         */
        public PreprocessedDocument getDocument() {
            return _doc;
        }

        /**
         * Records the given article as a topic of the document, using the id
         * returned by {@code art.getId()}.
         *
         * @param art the article to record; must not be {@code null}
         */
        public void addTopic(Article art) {
            _topicIds.add(art.getId());
        }

        /**
         * Records the given id as a topic of the document.
         *
         * @param id the topic id to record
         */
        public void addTopic(int id) {
            _topicIds.add(id);
        }

        /**
         * Tests whether the given article has been recorded as a topic of the
         * document, using the id returned by {@code art.getId()}.
         *
         * @param art the article to look up; must not be {@code null}
         * @return {@code true} if the article's id is among the topic ids
         */
        public boolean isTopic(Article art) {
            return _topicIds.contains(art.getId());
        }

        /**
         * Tests whether the given id has been recorded as a topic of the
         * document.
         *
         * @param id the topic id to look up
         * @return {@code true} if the id is among the topic ids
         */
        public boolean isTopic(int id) {
            return _topicIds.contains(id);
        }

        /**
         * @return the live (not copied) set of topic ids of this item
         */
        public Set<Integer> getTopicIds() {
            return _topicIds;
        }
    }
}