/* Copyright 2012-2014 Fabian Steeg, hbz. Licensed under the Eclipse Public License 1.0 */

package org.lobid.lodmill.hadoop;

import static java.lang.String.format;

import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.StringReader;
import java.net.URI;
import java.util.ArrayList;
import java.util.List;
import java.util.Set;
import java.util.SortedSet;
import java.util.concurrent.TimeUnit;
import java.util.regex.Pattern;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.MapFile;
import org.apache.hadoop.io.MapFile.Reader;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.elasticsearch.action.admin.cluster.state.ClusterStateRequest;
import org.elasticsearch.action.admin.indices.delete.DeleteIndexRequest;
import org.elasticsearch.client.Client;
import org.elasticsearch.client.IndicesAdminClient;
import org.elasticsearch.client.Requests;
import org.elasticsearch.client.transport.TransportClient;
import org.elasticsearch.common.settings.ImmutableSettings;
import org.elasticsearch.common.transport.InetSocketTransportAddress;
import org.elasticsearch.hadoop.mr.EsOutputFormat;
import org.lobid.lodmill.JsonLdConverter;
import org.lobid.lodmill.JsonLdConverter.Format;
import org.lobid.lodmill.hadoop.CollectSubjects.CollectSubjectsMapper;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.google.common.collect.Sets;
import com.google.common.collect.SortedSetMultimap;
import com.google.common.collect.TreeMultimap;
import com.google.common.io.CharStreams;
import com.hp.hpl.jena.graph.Triple;
import com.hp.hpl.jena.rdf.model.Model;
import com.hp.hpl.jena.rdf.model.ModelFactory;

/**
 * Convert RDF represented as N-Triples to JSON-LD for elasticsearch indexing.
 *
 * @author Fabian Steeg (fsteeg)
 */
public class NTriplesToJsonLd implements Tool {

	private static final int NODES = 4; // e.g. 4 nodes in cluster
	private static final int SLOTS = 8; // e.g. 8 cores per node
	private static final String NEWLINE = "\n";
	static final String INDEX_NAME = "index.name";
	static final String INDEX_TYPE = "index.type";
	private static final Logger LOG = LoggerFactory
			.getLogger(NTriplesToJsonLd.class);

	private Configuration conf;
	private String indexName;
	private Client client = CLIENT;
	private String aliasSuffix = "-testing";
	private boolean update = false;

	private static final String ES_NODE = "193.30.112.171"; // TODO pass params
	private static final InetSocketTransportAddress NODE_1 =
			new InetSocketTransportAddress(ES_NODE, 9300);
	private static final InetSocketTransportAddress NODE_2 =
			new InetSocketTransportAddress("193.30.112.172", 9300);
	private static final TransportClient TC = new TransportClient(
			ImmutableSettings.settingsBuilder().put("cluster.name", "quaoar")
					.put("client.transport.sniff", false)
					.put("client.transport.ping_timeout", 20, TimeUnit.SECONDS).build());
	private static final Client CLIENT = TC.addTransportAddress(NODE_1)
			.addTransportAddress(NODE_2);

	/** JSON key to use internally to identify the ES parent during indexing. */
	public static final String INTERNAL_PARENT = "internal_parent";
	/** JSON key to use internally to identify the ES document during indexing. */
	public static final String INTERNAL_ID = "internal_id";

	/**
	 * Runs this tool via {@link ToolRunner} and exits the JVM with the tool's
	 * return code, or with 1 if the tool threw an exception.
	 *
	 * @param args Generic command-line arguments passed to {@link ToolRunner}.
	 */
	public static void main(final String[] args) {
		int res;
		try {
			res = ToolRunner.run(new NTriplesToJsonLd(), args);
		} catch (Exception e) {
			// Report the failure through the logger and signal it to the caller
			// with a non-zero exit status instead of silently falling off main.
			LOG.error("Conversion failed: " + e.getMessage(), e);
			res = 1;
		}
		System.exit(res);
	}

	/**
	 * Configures and runs the conversion job. Expects exactly five arguments:
	 * input path, index name, index type, target subjects prefix and index
	 * alias suffix. If the input path contains "update" (case-insensitive), the
	 * newest existing index is updated; otherwise the index is created if it
	 * does not exist yet.
	 *
	 * @param args The command-line arguments (see usage message)
	 * @return 0 if the job succeeded, 1 if it failed, -1 on wrong usage
	 * @throws Exception If setting up or running the job fails
	 */
	@Override
	public int run(String[] args) throws Exception {
		if (args.length != 5) {
			System.err
					.println("Usage: NTriplesToJsonLd"
							+ " <input path> <index name> <index type> <target subjects prefix> <index alias suffix>");
			// Return instead of killing the JVM: main passes the code to exit.
			return -1;
		}
		conf = getConf();
		indexName = args[1];
		final String indexType = args[2];
		aliasSuffix = args[4];
		update = args[0].toLowerCase().contains("update");
		if (update) {
			getNewestIndex();
		} else {
			createIndex();
		}
		final Job job = createJob(args, indexType);
		// Disable index refresh while bulk indexing; restored in finally below
		// so a failed or aborted job does not leave the index unrefreshed.
		setIndexRefreshInterval(CLIENT, "-1");
		boolean success = false;
		try {
			LOG.info(String.format("Process: index %s, type %s", indexName,
					indexType));
			success = job.waitForCompletion(true);
			if (success) {
				if (!aliasSuffix.equals("NOALIAS") && !update
						&& !aliasSuffix.toLowerCase().contains("test")) {
					updateAliases(indexName, aliasSuffix);
				}
				client.admin().indices().prepareRefresh(indexName).execute()
						.actionGet();
			}
		} finally {
			setIndexRefreshInterval(CLIENT, "1000");
		}
		return success ? 0 : 1;
	}

	/** Sets all job configuration values and creates the configured job. */
	private Job createJob(final String[] args, final String indexType)
			throws IOException {
		conf.setStrings("mapred.textoutputformat.separator", NEWLINE);
		conf.setStrings("target.subject.prefix", args[3]);
		conf.set(INDEX_TYPE, indexType);
		conf.setBoolean("mapred.map.tasks.speculative.execution", false);
		conf.setBoolean("mapred.reduce.tasks.speculative.execution", false);
		conf.set("es.nodes", ES_NODE + ":9200");
		conf.set(INDEX_NAME, indexName);
		conf.set("es.resource", indexName + "/" + indexType);
		conf.set("es.input.json", "yes");
		conf.set("es.mapping.id", INTERNAL_ID);
		if (indexType.equals("json-ld-lobid-item")) {
			conf.set("es.mapping.parent", INTERNAL_PARENT);
		}
		final String mapFileName = CollectSubjects.mapFileName(indexName);
		conf.setStrings("map.file.name", mapFileName);
		final Job job = Job.getInstance(conf);
		final Path mapFilePath = new Path(mapFileName);
		// The subjects map file is optional: only cache it if it exists.
		if (FileSystem.get(conf).exists(mapFilePath)) {
			job.addCacheFile(mapFilePath.toUri());
		}
		job.setNumReduceTasks(NODES * SLOTS);
		job.setJarByClass(NTriplesToJsonLd.class);
		job.setJobName("LobidToJsonLd");
		FileInputFormat.addInputPaths(job, args[0]);
		job.setOutputFormatClass(EsOutputFormat.class);
		job.setMapperClass(NTriplesToJsonLdMapper.class);
		job.setReducerClass(NTriplesToJsonLdReducer.class);
		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(Text.class);
		return job;
	}

	/**
	 * Replaces the index name with the newest (last in sort order) existing
	 * index whose name starts with the given index name stripped of everything
	 * from the first "20" on (presumably the timestamp suffix).
	 */
	private void getNewestIndex() {
		final String indexNameWithoutTimestamp = indexName.replaceAll("20.*", "");
		final SortedSetMultimap<String, String> indices = groupByIndexCollection();
		for (String prefix : indices.keySet()) {
			final SortedSet<String> indicesForPrefix = indices.get(prefix);
			final String newestIndex = indicesForPrefix.last();
			if (newestIndex.startsWith(indexNameWithoutTimestamp)) {
				indexName = newestIndex;
			}
		}
		LOG.info("Going to UPDATE existing index " + indexName);
	}

	/** Creates the index from the bundled config, unless it already exists. */
	private void createIndex() {
		final IndicesAdminClient adminClient = CLIENT.admin().indices();
		if (!adminClient.prepareExists(indexName).execute().actionGet().isExists()) {
			LOG.info("Going to CREATE new index " + indexName);
			adminClient.prepareCreate(indexName).setSource(config()).execute()
					.actionGet();
		}
	}

	/**
	 * Loads the index configuration from the classpath resource
	 * "index-config.json".
	 *
	 * @return The configuration as a string, or null if reading it failed
	 * @throws IllegalStateException If the resource is not on the classpath
	 */
	private static String config() {
		final InputStream config =
				Thread.currentThread().getContextClassLoader()
						.getResourceAsStream("index-config.json");
		if (config == null) {
			// Fail with a clear message rather than a bare NullPointerException.
			throw new IllegalStateException(
					"Resource 'index-config.json' not found on the classpath");
		}
		try (InputStreamReader reader = new InputStreamReader(config, "UTF-8")) {
			return CharStreams.toString(reader);
		} catch (IOException e) {
			LOG.error(e.getMessage(), e);
			return null;
		}
	}

	/** Sets the "index.refresh_interval" setting of the current index. */
	private void setIndexRefreshInterval(Client client, String setting) {
		client
				.admin()
				.indices()
				.prepareUpdateSettings(indexName)
				.setSettings(
						ImmutableSettings.settingsBuilder().put("index.refresh_interval",
								setting)).execute().actionGet();
	}

	/**
	 * For every index collection: points the alias (prefix + suffix) to the
	 * newest index and deletes old indices.
	 */
	private void updateAliases(final String name, final String suffix) {
		final SortedSetMultimap<String, String> indices = groupByIndexCollection();
		for (String prefix : indices.keySet()) {
			final SortedSet<String> indicesForPrefix = indices.get(prefix);
			final String newIndex = indicesForPrefix.last();
			final String newAlias = prefix + suffix;
			LOG.info(format("Prefix '%s', newest index: %s", prefix, newIndex));
			removeOldAliases(indicesForPrefix, newAlias);
			if (!name.equals(newAlias) && !newIndex.equals(newAlias)) {
				createNewAlias(newIndex, newAlias);
			}
			deleteOldIndices(name, indicesForPrefix);
		}
	}

	/**
	 * Groups all index names by their prefix, i.e. the part before the first
	 * hyphen that is followed by a digit.
	 */
	private SortedSetMultimap<String, String> groupByIndexCollection() {
		final SortedSetMultimap<String, String> indices = TreeMultimap.create();
		for (String index : client.admin().indices().prepareStats().execute()
				.actionGet().getIndices().keySet()) {
			final String[] nameAndTimestamp = index.split("-(?=\\d)");
			indices.put(nameAndTimestamp[0], index);
		}
		return indices;
	}

	/** Removes the given alias from every given index that carries it. */
	private void removeOldAliases(final SortedSet<String> indicesForPrefix,
			final String newAlias) {
		for (String name : indicesForPrefix) {
			final Set<String> aliases = aliases(name);
			for (String alias : aliases) {
				if (alias.equals(newAlias)) {
					LOG.info(format("Delete alias index,alias: %s,%s", name, alias));
					client.admin().indices().prepareAliases().removeAlias(name, alias)
							.execute().actionGet();
				}
			}
		}
	}

	/** Adds the given alias to the given index. */
	private void createNewAlias(final String newIndex, final String newAlias) {
		LOG.info(format("Create alias index,alias: %s,%s", newIndex, newAlias));
		client.admin().indices().prepareAliases().addAlias(newIndex, newAlias)
				.execute().actionGet();
	}

	/**
	 * If there are at least three indices, deletes all indices without aliases
	 * except the given one and the last two of the remaining (sorted) indices.
	 */
	private void deleteOldIndices(final String name,
			final SortedSet<String> allIndices) {
		if (allIndices.size() >= 3) {
			final List<String> list = new ArrayList<>(allIndices);
			list.remove(name);
			for (String indexToDelete : list.subList(0, list.size() - 2)) {
				if (aliases(indexToDelete).isEmpty()) {
					// Pass the name as an argument: it must not be interpreted as a
					// format string (a '%' in it would make format throw).
					LOG.info(format("Deleting index: %s", indexToDelete));
					client.admin().indices()
							.delete(new DeleteIndexRequest(indexToDelete)).actionGet();
				}
			}
		}
	}

	/** Returns the alias names in the cluster state requested for the index. */
	private Set<String> aliases(final String name) {
		final ClusterStateRequest clusterStateRequest =
				Requests.clusterStateRequest().nodes(true).indices(name);
		return Sets.newHashSet(client.admin().cluster().state(clusterStateRequest)
				.actionGet().getState().getMetaData().aliases().keysIt());
	}

	/**
	 * Map subject URIs of N-Triples to the triples.
	 *
	 * @author Fabian Steeg (fsteeg)
	 */
	static final class NTriplesToJsonLdMapper extends
			Mapper<LongWritable, Text, Text, Text> {

		private Reader reader;
		private String prefix;
		private Set<String> predicates;

		/**
		 * Reads the subject prefix from the configuration and opens the subjects
		 * map file if it was distributed as a cache file.
		 */
		@Override
		protected void setup(Context context) throws IOException,
				InterruptedException {
			super.setup(context);
			prefix = context.getConfiguration().get(CollectSubjects.PREFIX_KEY);
			predicates = CollectSubjects.PREDICATES;
			final String rawMapFile = context.getConfiguration().get("map.file.name");
			final URI mapFile = findMapFile(context.getCacheFiles(), rawMapFile);
			if (mapFile != null) {
				initMapFileReader(new Path(mapFile));
			} else {
				LOG.warn("No subjects cache files found!");
			}
		}

		/** Closes the map file reader opened in setup, if any. */
		@Override
		protected void cleanup(Context context) throws IOException,
				InterruptedException {
			try {
				if (reader != null) {
					reader.close();
				}
			} finally {
				super.cleanup(context);
			}
		}

		/**
		 * Returns the first cache file URI containing the map file name, or null
		 * if there is none (or if either argument is null).
		 */
		private static URI findMapFile(final URI[] localCacheFiles,
				final String rawMapFileName) {
			if (localCacheFiles == null || rawMapFileName == null) {
				return null;
			}
			for (URI uri : localCacheFiles) {
				if (uri.toString().contains(rawMapFileName)) {
					return uri;
				}
			}
			return null;
		}

		/** Opens the map file reader for the given path. */
		private void initMapFileReader(final Path mapFilePath) throws IOException,
				FileNotFoundException {
			LOG.info("Reading map file from: " + mapFilePath);
			reader = new MapFile.Reader(mapFilePath, CollectSubjects.MAP_FILE_CONFIG);
		}

		/**
		 * Parses one input line as a triple and emits it under its subject(s).
		 * Empty lines and lines that cannot be parsed as a triple are skipped.
		 */
		@Override
		public void map(final LongWritable key, final Text value,
				final Context context) throws IOException, InterruptedException {
			final String val = value.toString().trim();
			if (val.isEmpty()) {
				return;
			}
			final Triple triple = CollectSubjectsMapper.asTriple(val);
			if (triple != null) {
				mapSubjectsToTheirTriples(value, context, val, triple);
			}
		}

		/**
		 * Emits the triple under its own subject (if it is to be collected) and
		 * under any additional subjects found in the map file.
		 */
		private void mapSubjectsToTheirTriples(final Text value,
				final Context context, final String val, final Triple triple)
				throws IOException, InterruptedException {
			final String subject =
					triple.getSubject().isBlank() ? CollectSubjects.blankSubjectLabel(
							val, context.getInputSplit()) : triple.getSubject().toString();
			if (subjectIsUriToBeCollected(triple)
					&& !objectIsUnresolvedBlankNode(triple)) {
				context.write(new Text(wrapped(subject.trim())), value);
			}
			if (predicates.contains(triple.getPredicate().toString())
					&& reader != null) {
				writeAdditionalSubjects(subject, value, context);
			}
		}

		/**
		 * True if the subject is a URI that starts with the configured prefix
		 * (any URI if no prefix is set) and does not end with "/about".
		 */
		private boolean subjectIsUriToBeCollected(final Triple triple) {
			final String subjectString = triple.getSubject().toString();
			return triple.getSubject().isURI()
					&& (subjectString.startsWith(prefix == null ? "" : prefix) //
					&& !subjectString.endsWith("/about"));
		}

		/**
		 * True if the object is a blank node and the predicate is not among the
		 * predicates listed in CollectSubjects.TO_RESOLVE.
		 */
		private static boolean objectIsUnresolvedBlankNode(final Triple triple) {
			return triple.getObject().isBlank()
					&& !CollectSubjects.TO_RESOLVE.contains(triple.getPredicate()
							.toString());
		}

		/**
		 * Looks up the subject in the map file and emits the triple under every
		 * comma-separated subject stored for it.
		 */
		private void writeAdditionalSubjects(final String subject,
				final Text value, final Context context) throws IOException,
				InterruptedException {
			final Writable res = reader.get(new Text(subject), new Text());
			if (res != null) {
				for (String subj : res.toString().split(",")) {
					context.write(new Text(wrapped(subj.trim())), value);
				}
			}
		}

		/** Wraps the given string in angle brackets. */
		private static String wrapped(final String string) {
			return "<" + string + ">";
		}
	}

	/**
	 * Reduce all N-Triples with a common subject to a JSON-LD representation.
	 *
	 * @author Fabian Steeg (fsteeg)
	 */
	static final class NTriplesToJsonLdReducer extends
			Reducer<Text, Text, Text, Text> {

		/**
		 * Matches a quoted literal that starts with an HTTP(S) URL, optionally
		 * padded with whitespace inside the quotes; group 1 is the URL without
		 * the surrounding whitespace.
		 */
		private static final Pattern URI_LITERAL = Pattern
				.compile("\"\\s*(https?://[^\"]+?)\\s*\"");

		/**
		 * Converts all valid triples for the subject in the key to JSON-LD and
		 * writes the result with an empty key.
		 */
		@Override
		public void reduce(final Text key, final Iterable<Text> values,
				final Context context) throws IOException, InterruptedException {
			final String triples = concatTriples(values);
			// Strip the first and last character of the key (the mapper wraps
			// subjects in angle brackets).
			final String id =
					key.toString().substring(1, key.toString().length() - 1);
			final String parentProperty =
					context.getConfiguration().get("es.mapping.parent") == null ? null
							: CollectSubjects.PARENTS.iterator().next();
			final String jsonLd =
					new JsonLdConverter(Format.N_TRIPLE).toJsonLd(triples,
							parentProperty, id);
			context.write(new Text(""), new Text(jsonLd));
		}

		/**
		 * Joins all triples that pass validation, one per line. Triples that
		 * cannot be read are logged and skipped.
		 */
		private static String concatTriples(final Iterable<Text> values) {
			final StringBuilder builder = new StringBuilder();
			for (Text value : values) {
				final String triple = fixInvalidUriLiterals(value);
				try {
					validate(triple);
					builder.append(triple).append(NEWLINE);
				} catch (Exception e) {
					LOG.error(String.format("Could not read triple '%s': %s, skipping",
							triple, e.getMessage()), e);
				}
			}
			return builder.toString();
		}

		/** Parses the value as N-Triples; throws if it cannot be read. */
		private static void validate(final String val) {
			final Model model = ModelFactory.createDefaultModel();
			model.read(new StringReader(val), null, Format.N_TRIPLE.getName());
		}

		/**
		 * Rewrites quoted literals that contain an HTTP(S) URL to URI references
		 * in angle brackets, dropping whitespace around the URL.
		 */
		private static String fixInvalidUriLiterals(Text value) {
			return URI_LITERAL.matcher(value.toString()).replaceAll("<$1>");
		}
	}

	@Override
	public Configuration getConf() {
		return conf;
	}

	@Override
	public void setConf(Configuration conf) {
		this.conf = conf;
	}
}