/**
* Copyright 2008 the original author or authors.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package net.sf.katta.indexing;

import java.io.File;
import java.io.IOException;
import java.util.Random;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.FileUtil;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapRunnable;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.SequenceFileInputFormat;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Field.Index;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriter.MaxFieldLength;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;

/**
* Illustrates how to implement an indexer as a Hadoop map-reduce job.
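* <p>
* The job reads sequence files with {@link LongWritable} keys and {@link Text} values, builds one
* Lucene index per map task in the task's local tmp directory and copies each finished index (shard)
* into the output directory.
* <p>
* A typical invocation could look like this (jar name and paths are only illustrative):
* <pre>
* hadoop jar katta-job.jar net.sf.katta.indexing.IndexerJob /data/documents /indexes/myIndex 4
* </pre>
* <p>
* A compatible input file could be written like this, given a FileSystem fs and a Configuration conf
* (the path is just an example):
* <pre>
* SequenceFile.Writer writer = SequenceFile.createWriter(fs, conf,
*     new Path("/data/documents/part-0"), LongWritable.class, Text.class);
* writer.append(new LongWritable(1), new Text("some document text"));
* writer.close();
* </pre>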
*/
public class IndexerJob {
public static void main(String[] args) throws IOException {
if (args.length != 3) {
String usage = "IndexerJob <in text file/dir> <out katta index dir> <numOfShards>";
System.out.println(usage);
System.exit(1);
}
IndexerJob indexerJob = new IndexerJob();
String input = args[0];
String output = args[1];
int numOfShards = Integer.parseInt(args[2]);
indexerJob.startIndexer(input, output, numOfShards);
}
public void startIndexer(String path, String finalDestination, int numOfShards) throws IOException {
// create the job conf with a class from the job jar so hadoop can locate and ship the jar.
JobConf jobConf = new JobConf(IndexerJob.class);
jobConf.setJobName("indexer");
jobConf.setMapRunnerClass(Indexer.class);
// alternatively, a text file with a TextInputFormat could be used.
jobConf.setInputFormat(SequenceFileInputFormat.class);
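// the sequence files must contain LongWritable keys and Text values to match the Indexer's input types.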
Path input = new Path(path);
FileInputFormat.setInputPaths(jobConf, input);
// we just set the output path to make hadoop happy.
FileOutputFormat.setOutputPath(jobConf, new Path(finalDestination));
// the folder the finished lucene indexes (shards) are copied to.
jobConf.set("finalDestination", finalDestination);
// important to switch speculative execution off,
// since we don't want duplicated shards.
jobConf.setSpeculativeExecution(false);
// The number of map tasks equals the number of input splits.
// By default the number of input splits equals the number of hdfs blocks
// of the input file(s). To end up with the desired number of shards we
// calculate a matching minimum input split size.
FileSystem fs = FileSystem.get(input.toUri(), jobConf);
FileStatus[] status = fs.globStatus(input);
long size = 0;
for (FileStatus fileStatus : status) {
size += fileStatus.getLen();
}
long optimalSplitSize = size / numOfShards;
jobConf.set("mapred.min.split.size", "" + optimalSplitSize);
// give more mem to lucene tasks.
jobConf.set("mapred.child.java.opts", "-Xmx2G");
jobConf.setNumMapTasks(1);
jobConf.setNumReduceTasks(0);
JobClient.runJob(jobConf);
}
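/**
* Indexes all records of one input split into a local Lucene index and copies the result to the
* configured final destination as one shard.
*/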
public static class Indexer implements MapRunnable<LongWritable, Text, Text, Text> {
private JobConf _conf;
public void configure(JobConf conf) {
_conf = conf;
}
@SuppressWarnings("deprecation")
public void run(RecordReader<LongWritable, Text> reader, OutputCollector<Text, Text> output, final Reporter report)
throws IOException {
LongWritable key = reader.createKey();
Text value = reader.createValue();
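// build the index in the task's local tmp directory; it is copied to hdfs once it is complete.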
String tmp = _conf.get("hadoop.tmp.dir");
long millis = System.currentTimeMillis();
String shardName = "" + millis + "-" + new Random().nextInt();
File file = new File(tmp, shardName);
report.progress();
// TODO sg this should be configurable
Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_CURRENT);
IndexWriter indexWriter = new IndexWriter(FSDirectory.open(file), analyzer, MaxFieldLength.UNLIMITED);
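// a very high merge factor avoids segment merges while documents are added;
// the merge work is deferred to optimize() below.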
indexWriter.setMergeFactor(100000);
report.setStatus("Adding documents...");
while (reader.next(key, value)) {
report.progress();
Document doc = new Document();
String text = "" + value.toString();
Field contentField = new Field("content", text, Store.YES, Index.ANALYZED);
doc.add(contentField);
indexWriter.addDocument(doc);
}
report.setStatus("Done adding documents.");
Thread t = new Thread() {
public boolean stop = false;
@Override
public void run() {
while (!stop) {
// make sure hadoop does not kill the task in case
// the optimization takes longer than the task timeout.
report.progress();
try {
sleep(10000);
} catch (InterruptedException e) {
// the interrupt from the map thread signals that indexing is done.
stop = true;
}
}
}
};
t.start();
report.setStatus("Optimizing index...");
indexWriter.optimize();
report.setStatus("Done optimizing!");
report.setStatus("Closing index...");
indexWriter.close();
report.setStatus("Closing done!");
FileSystem fileSystem = FileSystem.get(_conf);
report.setStatus("Starting copy to final destination...");
Path destination = new Path(_conf.get("finalDestination"));
fileSystem.copyFromLocalFile(new Path(file.getAbsolutePath()), destination);
report.setStatus("Copy to final destination done!");
report.setStatus("Deleting tmp files...");
FileUtil.fullyDelete(file);
report.setStatus("Deleting tmp files done!");
t.interrupt();
}
}
}