/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.nutch.indexer.field;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Enumeration;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.Random;
import java.util.Set;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.GnuParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.OptionBuilder;
import org.apache.commons.cli.Options;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.ObjectWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.SequenceFileInputFormat;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.util.StringUtils;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.NutchJob;
/**
* Creates custom FieldWritable objects from a text file containing field
 * information including field name, value, and optional boost and field type
* (as needed by FieldWritable objects).
*
 * An input text file to CustomFields is tab-separated and looks similar to
 * this:
*
* <pre>
* http://www.apache.org\tlang\ten\t5.0\tCONTENT
* http://lucene.apache.org\tlang\tde
* </pre>
*
 * The columns are positional: url, field name, field value, an optional boost,
 * and an optional field type (defaulting to CONTENT). Only the url, name, and
 * value columns are required. Custom fields are configured through the
 * custom-fields.xml file in the classpath. The config file allows you to set
 * defaults for whether a field is indexed, stored, and tokenized, boosts on a
 * field, and whether a field can output multiple values under the same key.
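 *
 * A minimal sketch of a custom-fields.xml entry (the {@code field.lang} key
 * prefix here is an arbitrary example; the Converter only keys off of the
 * {@code .name}, {@code .indexed}, {@code .stored}, {@code .tokenized}, and
 * {@code .multi} suffixes, and the file uses the standard java.util.Properties
 * XML format):
 *
 * <pre>
 * &lt;?xml version="1.0" encoding="UTF-8"?&gt;
 * &lt;!DOCTYPE properties SYSTEM "http://java.sun.com/dtd/properties.dtd"&gt;
 * &lt;properties&gt;
 *   &lt;entry key="field.lang.name"&gt;lang&lt;/entry&gt;
 *   &lt;entry key="field.lang.indexed"&gt;true&lt;/entry&gt;
 *   &lt;entry key="field.lang.stored"&gt;true&lt;/entry&gt;
 *   &lt;entry key="field.lang.tokenized"&gt;false&lt;/entry&gt;
 *   &lt;entry key="field.lang.multi"&gt;false&lt;/entry&gt;
 * &lt;/properties&gt;
 * </pre>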
*
* The purpose of the CustomFields job is to allow better integration with
* technologies such as Hadoop Streaming. Streaming jobs can be created in any
* programming language, can output the text file needed by the CustomFields
* job, and those fields can then be included in the index.
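 *
 * For example, a hypothetical streaming job (the mapper script name and paths
 * are illustrative only) could produce the input files for this tool:
 *
 * <pre>
 * hadoop jar hadoop-streaming.jar \
 *   -input parsedata -output customdata \
 *   -mapper extract_lang_field.py
 * </pre>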
*
 * The concept of custom fields requires two separate pieces: an indexing piece
 * and a query piece. The indexing piece is handled by the CustomFields job.
 * The query piece is handled by the query-custom plugin.
*
 * <b>Important:</b><br> <i>Currently, because of the way the query plugin
 * architecture works, custom field names must be added to the fields parameter
 * in the query-custom plugin's plugin.xml file in order to be queried.</i>
*
* The CustomFields tool accepts one or more directories containing text files
* in the appropriate custom field format. These files are then turned into
* FieldWritable objects to be included in the index.
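 *
 * A sketch of a command-line invocation (the paths here are hypothetical):
 *
 * <pre>
 * bin/nutch org.apache.nutch.indexer.field.CustomFields \
 *   -input customdata1 -input customdata2 \
 *   -basicfields basicfields -output customfields
 * </pre>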
*/
public class CustomFields
extends Configured
implements Tool {
public static final Log LOG = LogFactory.getLog(CustomFields.class);
/**
* MapReduce job that converts text values into FieldWritable objects.
*
* @param inputs The directories with text files to convert.
* @param output The converter output directory.
*
* @throws IOException If an error occurs while converting.
*/
private void runConverter(Path[] inputs, Path output)
throws IOException {
JobConf converter = new NutchJob(getConf());
converter.setJobName("CustomFields Converter");
for (int i = 0; i < inputs.length; i++) {
FileInputFormat.addInputPath(converter, inputs[i]);
}
FileOutputFormat.setOutputPath(converter, output);
converter.setInputFormat(TextInputFormat.class);
converter.setMapperClass(Converter.class);
converter.setReducerClass(Converter.class);
converter.setMapOutputKeyClass(Text.class);
converter.setMapOutputValueClass(FieldWritable.class);
converter.setOutputKeyClass(Text.class);
converter.setOutputValueClass(FieldWritable.class);
converter.setOutputFormat(SequenceFileOutputFormat.class);
LOG.info("Starting converter job");
try {
JobClient.runJob(converter);
}
catch (IOException e) {
LOG.error(StringUtils.stringifyException(e));
throw e;
}
LOG.info("Finished converter job.");
}
/**
   * Aggregates multiple FieldWritable objects with the same name. Depending on
   * settings in the custom-fields.xml file, a field may allow one or more
   * values. This job aggregates fields and then collects them based on the
   * configuration settings.
   *
   * @param basicFields The path to the basicfields FieldWritable objects.
   * @param converted The path to the converted custom field objects.
* @param output The final output directory for custom field objects.
*
* @throws IOException If an error occurs while converting.
*/
private void runCollector(Path basicFields, Path converted, Path output)
throws IOException {
JobConf collector = new NutchJob(getConf());
collector.setJobName("CustomFields Collector");
FileInputFormat.addInputPath(collector, converted);
FileInputFormat.addInputPath(collector, basicFields);
FileOutputFormat.setOutputPath(collector, output);
collector.setInputFormat(SequenceFileInputFormat.class);
collector.setMapOutputKeyClass(Text.class);
collector.setMapOutputValueClass(ObjectWritable.class);
collector.setMapperClass(Collector.class);
collector.setReducerClass(Collector.class);
collector.setOutputKeyClass(Text.class);
collector.setOutputValueClass(FieldWritable.class);
collector.setOutputFormat(SequenceFileOutputFormat.class);
LOG.info("Starting collector job");
try {
JobClient.runJob(collector);
}
catch (IOException e) {
LOG.error(StringUtils.stringifyException(e));
throw e;
}
LOG.info("Finished collector job.");
}
/**
* Converts text values into FieldWritable objects.
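   * For example, the line {@code http://www.apache.org\tlang\ten\t5.0\tCONTENT}
   * would become a FieldWritable named "lang" with value "en", a 5.0 boost, and
   * type CONTENT, collected under the key "http://www.apache.org".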
*/
public static class Converter
extends Configured
implements Mapper<LongWritable, Text, Text, FieldWritable>,
Reducer<Text, FieldWritable, Text, FieldWritable> {
private JobConf conf;
    // per-field indexed/stored/tokenized flags, keyed by field name
    private Map<String, boolean[]> flagMap = new HashMap<String, boolean[]>();
    // names of fields allowed to emit multiple values for the same url
    private Set<String> multiFields = new HashSet<String>();
public Converter() {
}
public Converter(Configuration conf) {
setConf(conf);
}
public void configure(JobConf conf) {
try {
        // store the job configuration and locate the field configuration file
        // on the classpath
        this.conf = conf;
String configFile = conf.get("custom.fields.config",
"custom-fields.xml");
LOG.info("Reading configuration field configuration from " + configFile);
Properties customFieldProps = new Properties();
InputStream fis = conf.getConfResourceAsInputStream(configFile);
if (fis == null) {
throw new IOException("Was unable to open " + configFile);
}
// load the configuration file as properties
customFieldProps.loadFromXML(fis);
// loop through the properties setting field flags
Enumeration propKeys = customFieldProps.keys();
while (propKeys.hasMoreElements()) {
String prop = (String)propKeys.nextElement();
if (prop.endsWith(".name")) {
String propName = prop.substring(0, prop.length() - 5);
String name = customFieldProps.getProperty(prop);
String indexedProp = customFieldProps.getProperty(propName
+ ".indexed");
String storedProp = customFieldProps.getProperty(propName
+ ".stored");
String tokProp = customFieldProps.getProperty(propName
+ ".tokenized");
            // yes/true/on (case-insensitive) turn a flag on; comparing with
            // the literal first avoids a NullPointerException when a property
            // is missing, which then defaults to false
            boolean indexed = ("yes".equalsIgnoreCase(indexedProp)
              || "true".equalsIgnoreCase(indexedProp) || "on".equalsIgnoreCase(indexedProp));
            boolean stored = ("yes".equalsIgnoreCase(storedProp)
              || "true".equalsIgnoreCase(storedProp) || "on".equalsIgnoreCase(storedProp));
            boolean tokenized = ("yes".equalsIgnoreCase(tokProp)
              || "true".equalsIgnoreCase(tokProp) || "on".equalsIgnoreCase(tokProp));
            boolean[] flags = {indexed, stored, tokenized};
            flagMap.put(name, flags);
            String multiProp = customFieldProps.getProperty(propName + ".multi");
            boolean multi = ("yes".equalsIgnoreCase(multiProp)
              || "true".equalsIgnoreCase(multiProp) || "on".equalsIgnoreCase(multiProp));
if (multi) {
multiFields.add(name);
}
}
}
}
catch (Exception e) {
LOG.error("Error loading custom field properties:\n"
+ StringUtils.stringifyException(e));
}
}
public void map(LongWritable key, Text value,
OutputCollector<Text, FieldWritable> output, Reporter reporter)
throws IOException {
// split the file on tabs
String line = value.toString();
String[] fields = line.split("\t");
if (fields.length >= 3) {
        // columns are positional: url, name, value, then an optional boost
        // and an optional field type (defaulting to CONTENT)
String url = fields[0];
String fieldName = fields[1];
String fieldVal = fields[2];
String fieldScore = (fields.length > 3 ? fields[3] : null);
String fieldType = (fields.length > 4 ? fields[4] : "CONTENT").toUpperCase();
        // create the FieldWritable object and collect it under the url key,
        // skipping fields that are not configured in custom-fields.xml
boolean[] flags = flagMap.get(fieldName);
if (flags != null) {
FieldWritable field = null;
if (fieldScore != null) {
field = new FieldWritable(fieldName, fieldVal,
FieldType.valueOf(fieldType), Float.parseFloat(fieldScore),
flags[0], flags[1], flags[2]);
}
else {
field = new FieldWritable(fieldName, fieldVal,
FieldType.valueOf(fieldType), flags[0], flags[1], flags[2]);
}
output.collect(new Text(url), field);
}
}
}
public void reduce(Text key, Iterator<FieldWritable> values,
OutputCollector<Text, FieldWritable> output, Reporter reporter)
throws IOException {
      // if a field allows multiple values, collect all of them; otherwise
      // collect only the first value and ignore the rest
Set<String> multiSet = new HashSet<String>();
while (values.hasNext()) {
FieldWritable field = values.next();
String name = field.getName();
boolean isMulti = multiFields.contains(name);
        if (isMulti || !multiSet.contains(name)) {
output.collect(key, field);
multiSet.add(name);
}
else {
LOG.info("Ignoring multiple " + name + " fields for "
+ key.toString());
}
}
}
public void close() {
}
}
/**
   * Aggregates FieldWritable objects by name for the same URL. These objects
   * are then filtered for multiple values against configuration settings.
   * Custom fields for URLs with no corresponding basic fields are dropped.
*/
public static class Collector
extends Configured
implements Mapper<Text, Writable, Text, ObjectWritable>,
Reducer<Text, ObjectWritable, Text, FieldWritable> {
private JobConf conf;
public void configure(JobConf conf) {
this.conf = conf;
}
public void close() {
}
public void map(Text key, Writable value,
OutputCollector<Text, ObjectWritable> output, Reporter reporter)
throws IOException {
ObjectWritable objWrite = new ObjectWritable();
objWrite.set(value);
output.collect(key, objWrite);
}
public void reduce(Text key, Iterator<ObjectWritable> values,
OutputCollector<Text, FieldWritable> output, Reporter reporter)
throws IOException {
FieldsWritable basicFields = null;
List<FieldWritable> customFields = new ArrayList<FieldWritable>();
while (values.hasNext()) {
ObjectWritable objWrite = values.next();
Object obj = objWrite.get();
if (obj instanceof FieldWritable) {
customFields.add((FieldWritable)obj);
}
else if (obj instanceof FieldsWritable) {
basicFields = (FieldsWritable)obj;
}
}
      // emit custom fields only for URLs that also have basic fields
      if (basicFields != null && !customFields.isEmpty()) {
        for (FieldWritable customField : customFields) {
          output.collect(key, customField);
        }
      }
}
}
}
void createFields(Path basicFields, Path[] inputs, Path output)
throws IOException {
Configuration conf = getConf();
FileSystem fs = FileSystem.get(conf);
Path tempFields = new Path(output + "-"
+ Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));
runConverter(inputs, tempFields);
runCollector(basicFields, tempFields, output);
fs.delete(tempFields, true);
}
public static void main(String[] args)
throws Exception {
int res = ToolRunner.run(NutchConfiguration.create(), new CustomFields(),
args);
System.exit(res);
}
/**
* Runs the CustomFields job.
*/
public int run(String[] args)
throws Exception {
Options options = new Options();
Option helpOpts = OptionBuilder.withArgName("help").withDescription(
"show this help message").create("help");
Option outputOpts = OptionBuilder.withArgName("output").hasArg().withDescription(
"the output index directory").create("output");
Option inputOpts = OptionBuilder.withArgName("input").hasArgs().withDescription(
"the input directories with text field files").create("input");
Option basicFieldOpts = OptionBuilder.withArgName("basicfields").hasArg().withDescription(
"the basicfields to use").create("basicfields");
options.addOption(helpOpts);
options.addOption(inputOpts);
options.addOption(basicFieldOpts);
options.addOption(outputOpts);
CommandLineParser parser = new GnuParser();
try {
CommandLine line = parser.parse(options, args);
      if (line.hasOption("help") || !line.hasOption("input")
        || !line.hasOption("output") || !line.hasOption("basicfields")) {
HelpFormatter formatter = new HelpFormatter();
formatter.printHelp("CustomFields", options);
return -1;
}
String[] inputs = line.getOptionValues("input");
Path[] inputPaths = new Path[inputs.length];
for (int i = 0; i < inputs.length; i++) {
inputPaths[i] = new Path(inputs[i]);
}
String output = line.getOptionValue("output");
String basicFields = line.getOptionValue("basicfields");
createFields(new Path(basicFields), inputPaths, new Path(output));
return 0;
}
catch (Exception e) {
LOG.fatal("CustomFields: " + StringUtils.stringifyException(e));
return -2;
}
}
}