/* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.jena.hadoop.rdf.mapreduce.count.namespaces; import java.io.IOException; import org.apache.hadoop.io.LongWritable; import org.apache.hadoop.io.Text; import org.apache.hadoop.mapreduce.Mapper; import org.apache.jena.graph.Node ; import org.apache.jena.hadoop.rdf.mapreduce.TextCountReducer; import org.apache.jena.hadoop.rdf.types.AbstractNodeTupleWritable; import org.apache.jena.hadoop.rdf.types.NodeWritable; /** * Abstract mapper class for mappers which split node tuple values and extract * the namespace URIs they use and outputs pairs of namespaces keys with a long * value of 1. Can be used in conjunction with a {@link TextCountReducer} to * count the usages of each unique namespace. * * * * @param <TKey> * @param <TValue> * @param <T> */ public abstract class AbstractNodeTupleNamespaceCountMapper<TKey, TValue, T extends AbstractNodeTupleWritable<TValue>> extends Mapper<TKey, T, Text, LongWritable> { private LongWritable initialCount = new LongWritable(1); protected static final String NO_NAMESPACE = null; @Override protected void map(TKey key, T value, Context context) throws IOException, InterruptedException { NodeWritable[] ns = this.getNodes(value); for (NodeWritable n : ns) { String namespace = this.extractNamespace(n); if (namespace != null) { context.write(new Text(namespace), this.initialCount); } } } /** * Extracts the namespace from a node * <p> * Finds the URI for the node (if any) and then invokes * {@link #extractNamespace(String)} to extract the actual namespace URI. * </p> * <p> * Derived classes may override this to change the logic of how namespaces * are extracted. * </p> * * @param nw * Node * @return Namespace */ protected String extractNamespace(NodeWritable nw) { Node n = nw.get(); if (n.isBlank() || n.isVariable()) return NO_NAMESPACE; if (n.isLiteral()) { String dtUri = n.getLiteralDatatypeURI(); if (dtUri == null) return NO_NAMESPACE; return extractNamespace(dtUri); } return extractNamespace(n.getURI()); } /** * Extracts the namespace from a URI * <p> * First tries to extract a hash based namespace. If that is not possible it * tries to extract a slash based namespace, if this is not possible then * the full URI is returned. * </p> * <p> * Derived classes may override this to change the logic of how namespaces * are extracted. * </p> * * @param uri * URI * @return Namespace */ protected String extractNamespace(String uri) { if (uri.contains("#")) { // Extract hash namespace return uri.substring(0, uri.lastIndexOf('#') + 1); } else if (uri.contains("/")) { // Ensure that this is not immediately after the scheme component or // at end of URI int index = uri.lastIndexOf('/'); int schemeSepIndex = uri.indexOf(':'); if (index - schemeSepIndex <= 2 || index == uri.length() - 1) { // Use full URI return uri; } // Otherwise safe to extract slash namespace return uri.substring(0, uri.lastIndexOf('/') + 1); } else { // Use full URI return uri; } } /** * Gets the nodes of the tuple whose namespaces are to be counted * * @param tuple * Tuple * @return Nodes */ protected abstract NodeWritable[] getNodes(T tuple); }