package org.deri.vocidex.extract; import java.util.Collection; import java.util.Iterator; import java.util.NoSuchElementException; import org.deri.vocidex.SPARQLRunner; import org.deri.vocidex.VocidexDocument; import org.deri.vocidex.describers.LOVTermMetricsDescriber; import org.deri.vocidex.describers.LOVVocabularyDescriber; import com.hp.hpl.jena.query.Dataset; import com.hp.hpl.jena.rdf.model.Resource; /** * Extracts indexable {@link VocidexDocument} instances from a * dataset containing the LOV dump. Will create one doucment for * each vocabulary, and one document for every term defined in * those vocabularies. * * @author Richard Cyganiak */ public class LOVExtractor implements Extractor { private final Dataset dataset; private final SPARQLRunner source; private final LOVVocabularyDescriber vocabularyDescriber; private final LOVTermMetricsDescriber termMetricsDescriber; public LOVExtractor(Dataset dataset) { this.dataset = dataset; this.source = new SPARQLRunner(dataset); this.vocabularyDescriber = new LOVVocabularyDescriber(source); this.termMetricsDescriber = new LOVTermMetricsDescriber(source); } public Collection<Resource> listVocabularies() { return source.getURIs("list-lov-vocabularies.sparql", null, null, "vocab"); } public Collection<Resource> listDefinedTerms(Resource vocabulary) { return source.getURIs("lov-vocabulary-terms.sparql", "vocab", vocabulary, "term"); } private SPARQLRunner getSPARQLRunnerForVocabulary(Resource vocabulary) { return new SPARQLRunner(dataset.getNamedModel(vocabulary.getURI())); } /** * An iterator over all vocabularies, classes and properties in the dataset. For each * vocabulary, we first return a result representing the vocabulary itself. This * is done by creating a {@link VocidexDocument} around the {@link LOVVocabularyDescriber} * result. Then we return all terms defined in the vocabulary by using * {@link VocabularyTermExtractor} with an {@link LOVWrapper} around it. */ @Override public Iterator<VocidexDocument> iterator() { return new Iterator<VocidexDocument>() { private final Iterator<VocidexDocument> vocabIterator = new DescriberIterator(listVocabularies(), vocabularyDescriber); private VocidexDocument currentVocabularyDocument = null; private Iterator<VocidexDocument> currentDocIterator = null; @Override public boolean hasNext() { if (currentVocabularyDocument != null) return true; if (currentDocIterator != null && currentDocIterator.hasNext()) return true; if (!vocabIterator.hasNext()) return false; // Document for the vocabulary itself currentVocabularyDocument = vocabIterator.next(); Resource vocab = currentVocabularyDocument.getURI(); // Extractor for all terms mentioned in the vocabulary graph VocabularyTermExtractor ex = new VocabularyTermExtractor( getSPARQLRunnerForVocabulary(vocab), currentVocabularyDocument.getRoot().get("prefix").getTextValue()); // Keep only the documents actually defined in that vocabulary, // and enrich them with some extra vocabulary information currentDocIterator = new LOVWrapper( ex, listDefinedTerms(vocab), currentVocabularyDocument.getRoot(),termMetricsDescriber).iterator(); // At least the vocabulary document always exists, so return true return true; } @Override public VocidexDocument next() { // hasNext() prepares for the next vocabulary if necessary if (!hasNext()) throw new NoSuchElementException(); // Return vocabulary document first if (currentVocabularyDocument != null) { VocidexDocument result = currentVocabularyDocument; currentVocabularyDocument = null; return result; } // Then return documents from the iterator over its terms return currentDocIterator.next(); } @Override public void remove() { throw new UnsupportedOperationException(); } }; } }