package com.yahoo.glimmer.indexing;
/*
* Copyright (c) 2012 Yahoo! Inc. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License.
* You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0
* Unless required by applicable law or agreed to in writing, software distributed under the License is
* distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and limitations under the License.
* See accompanying LICENSE file.
*/
import it.unimi.dsi.io.FastBufferedReader;
import it.unimi.dsi.io.WordReader;
import it.unimi.dsi.lang.MutableString;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.semanticweb.yars.nx.BNode;
import org.semanticweb.yars.nx.Resource;
import org.semanticweb.yars.nx.namespace.RDF;
import com.yahoo.glimmer.indexing.RDFDocumentFactory.IndexType;
import com.yahoo.glimmer.indexing.RDFDocumentFactory.RdfCounters;
import com.yahoo.glimmer.indexing.RDFDocumentFactory.ResourceHashLookupException;
/**
 * An RDF document for the vertical index, where the terms of each indexed
 * predicate are collected into that predicate's own field.
 *
 * <p>
 * Parsing is delayed until it is actually necessary: {@link #content(int)}
 * triggers it, so operations that do not need the field contents do not
 * require parsing.
 */
class VerticalDocument extends RDFDocument {
    private static final Log LOG = LogFactory.getLog(VerticalDocument.class);

    /** One term list per field, addressed by the field index obtained from the factory. */
    private final List<List<String>> fields;

    /**
     * Creates a document with one empty term list per field declared by the
     * factory.
     *
     * @param factory
     *            the factory supplying the field count, field lookup,
     *            resource hash and counters.
     */
    protected VerticalDocument(VerticalDocumentFactory factory) {
        super(factory);
        final int fieldCount = factory.getFieldCount();
        fields = new ArrayList<List<String>>(fieldCount);
        for (int i = 0; i < fieldCount; i++) {
            fields.add(new ArrayList<String>());
        }
    }

    @Override
    public IndexType getIndexType() {
        return IndexType.VERTICAL;
    }

    /**
     * Rebuilds the per-field term lists from the given relations.
     *
     * <p>
     * Relations whose predicate is blacklisted, or for which the factory
     * reports no field (index -1), are counted and skipped. Resource and
     * blank-node objects are replaced by their id from the resources hash;
     * any other object is split into words and each word accepted by the
     * term processor is added.
     *
     * @param relations
     *            the relations of this document.
     * @throws IOException
     *             if reading the words of a literal object fails.
     */
    protected void ensureParsed_(Iterator<Relation> relations) throws IOException {
        // Start from empty fields so that a re-parse does not accumulate terms.
        for (List<String> field : fields) {
            field.clear();
        }

        while (relations.hasNext()) {
            final Relation relation = relations.next();
            final String predicate = relation.getPredicate().toString();

            // Check if the predicate is on the blacklist.
            if (RDFDocumentFactory.isOnPredicateBlacklist(predicate)) {
                factory.incrementCounter(RdfCounters.BLACKLISTED_TRIPLES, 1);
                continue;
            }

            // Determine whether we need to index, and the field.
            final int fieldIndex = factory.getFieldIndex(predicate);
            if (fieldIndex == -1) {
                factory.incrementCounter(RdfCounters.UNINDEXED_PREDICATE_TRIPLES, 1);
                continue;
            }
            final List<String> fieldForPredicate = fields.get(fieldIndex);

            final Object object = relation.getObject();
            final String objectString = object.toString();
            if (object instanceof Resource || object instanceof BNode) {
                if (!addResourceTerms(predicate, objectString, fieldForPredicate)) {
                    // Object unknown to the resources hash: not counted as indexed.
                    continue;
                }
            } else {
                addLiteralTerms(objectString, fieldForPredicate);
            }
            factory.incrementCounter(RdfCounters.INDEXED_TRIPLES, 1);
        }
    }

    /**
     * Adds the hashed id of a resource or blank-node object to the field and,
     * for RDF type predicates, the hashed ids of the ancestors the factory
     * returns for that object.
     *
     * @return false if the object itself is not in the resources hash (nothing
     *         is added), true otherwise.
     */
    private boolean addResourceTerms(String predicate, String objectString, List<String> fieldForPredicate) {
        // Encode the resource URI or bnode ID using the resources hash.
        final String objectId;
        try {
            objectId = factory.lookupResource(objectString, true);
        } catch (ResourceHashLookupException rhle) {
            factory.incrementCounter(RdfCounters.OBJECT_NOT_IN_HASH, 1);
            // Report the object that failed the lookup. The relation's context
            // was logged here before, which did not match the message and would
            // throw from this recovery path if a relation had no context.
            LOG.info("Object not in hash:" + objectString);
            return false;
        }
        fieldForPredicate.add(objectId);

        if (predicate.equals(RDF.TYPE.toString())) {
            // If the predicate is RDF type and the object is a Resource
            // we use the ontology (if set) to also index all super types.
            factory.incrementCounter(RdfCounters.RDF_TYPE_TRIPLES, 1);
            for (String ancestor : factory.getAncestors(objectString)) {
                final String ancestorId;
                try {
                    ancestorId = factory.lookupResource(ancestor, true);
                } catch (ResourceHashLookupException rhle) {
                    factory.incrementCounter(RdfCounters.ANCESTOR_OBJECT_NOT_IN_HASH, 1);
                    LOG.info("Ancestor(" + ancestor + ") of " + objectString
                            + " not in resources hash function!. Was the same ontology used with the PrepTool?");
                    continue;
                }
                if (ancestorId == null) {
                    throw new IllegalStateException("Resource lookup returned null for ancestor " + ancestor + " of " + objectString);
                }
                fieldForPredicate.add(ancestorId);
            }
        }
        return true;
    }

    /**
     * Splits a literal object into words and adds every non-empty word that
     * the term processor accepts to the field.
     */
    private void addLiteralTerms(String literal, List<String> fieldForPredicate) throws IOException {
        // Iterate over the words of the value.
        final FastBufferedReader reader = new FastBufferedReader(literal.toCharArray());
        try {
            final MutableString word = new MutableString();
            final MutableString nonWord = new MutableString();
            while (reader.next(word, nonWord)) {
                // processTerm is handed the mutable word; it is converted to a
                // String only after the processor has accepted it.
                if (word.length() != 0 && CombinedTermProcessor.getInstance().processTerm(word)) {
                    fieldForPredicate.add(word.toString());
                }
            }
        } finally {
            // Close the reader even when reading or term processing fails.
            reader.close();
        }
    }

    /**
     * Returns a reader over the terms collected for the given field, parsing
     * the document first if needed.
     *
     * @param field
     *            the index of the field; passed to the factory's
     *            ensureFieldIndex (presumably a bounds check; confirm in the
     *            factory) before use.
     * @return a word reader over the terms of the field.
     * @throws IOException
     *             if parsing the document fails.
     */
    @Override
    public WordReader content(final int field) throws IOException {
        factory.ensureFieldIndex(field);
        ensureParsed();
        return new WordArrayReader(fields.get(field));
    }
}