package com.mongodb.hadoop.examples.shakespeare;
import com.mongodb.DB;
import com.mongodb.MongoClient;
import com.mongodb.MongoClientURI;
import com.mongodb.gridfs.GridFS;
import com.mongodb.gridfs.GridFSInputFile;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import java.io.File;
import java.io.IOException;
import java.io.OutputStream;
import java.util.Scanner;
import java.util.regex.Pattern;
/**
 * A tool that splits Shakespeare's complete works into separate files
 * and uploads each to GridFS.
 *
 * <p>The input text is split wherever a line starts with a four-digit year.
 * The first chunk (legal notice/intro) is discarded; every following chunk is
 * stored as one GridFS file named after its first non-blank line. The target
 * database is dropped before anything is uploaded.</p>
 */
public class PrepareShakespeare implements Tool {

    /** Each work is dated with a year at the start of a line. */
    private static final Pattern WORK_DELIMITER =
        Pattern.compile("^\\d{4}", Pattern.MULTILINE);

    /**
     * Charset used both to decode the input file and to encode the uploaded
     * bytes, so the result does not depend on the platform default charset.
     */
    private static final String CHARSET_NAME = "UTF-8";

    /** Chunk size low enough that we get multiple chunks per work. */
    private static final int CHUNK_SIZE_BYTES = 1024 * 10;

    private Configuration conf;

    public PrepareShakespeare() {
        conf = new Configuration();
    }

    /** Prints the command-line synopsis to standard error. */
    private void printUsage() {
        // CHECKSTYLE:OFF
        System.err.println(
            "USAGE: hadoop jar mongo-hadoop-shakespeare.jar "
                + getClass().getName()
                + " <inputFile> <connection-string with database>");
        // CHECKSTYLE:ON
    }

    /**
     * Splits the input file into works and uploads each one to GridFS.
     *
     * @param args {@code args[0]} is the path of the input text file,
     *             {@code args[1]} is a MongoDB connection string that names
     *             the database to (re)create
     * @return 0 on success, 1 if the arguments are missing or the connection
     *         string does not name a database
     * @throws Exception if the input cannot be read, a work has no title, or
     *                   the upload fails
     */
    @Override
    public int run(final String[] args) throws Exception {
        if (args.length < 2) {
            printUsage();
            return 1;
        }
        String inputFilePath = args[0];
        MongoClientURI uri = new MongoClientURI(args[1]);
        if (null == uri.getDatabase()) {
            // The connection string must name the database to write to.
            printUsage();
            return 1;
        }

        // Counts only the works actually uploaded (the intro is not a work).
        int numWorks = 0;

        // Open the input first so a bad path fails before MongoDB is touched.
        Scanner scanner = new Scanner(new File(inputFilePath), CHARSET_NAME);
        try {
            scanner.useDelimiter(WORK_DELIMITER);
            MongoClient client = new MongoClient(uri);
            try {
                DB gridfsDB = client.getDB(uri.getDatabase());
                // Drop database before uploading anything. This happens before
                // the GridFS object is created so that anything its constructor
                // sets up in the database is not wiped by the drop.
                gridfsDB.dropDatabase();
                GridFS gridFS = new GridFS(gridfsDB);

                boolean introSkipped = false;
                while (scanner.hasNext()) {
                    String nextWork = scanner.next();
                    // Skip legal notice/intro.
                    if (!introSkipped) {
                        introSkipped = true;
                        continue;
                    }
                    uploadWork(gridFS, nextWork);
                    ++numWorks;
                }

                // Scanner hides read errors; surface them instead of silently
                // uploading a truncated set of works.
                IOException readError = scanner.ioException();
                if (null != readError) {
                    throw readError;
                }
            } finally {
                client.close();
            }
        } finally {
            scanner.close();
        }
        System.out.printf("Wrote %d works to GridFS.\n", numWorks);
        return 0;
    }

    /**
     * Stores one work as a GridFS file named after the work's title.
     *
     * @param gridFS the GridFS bucket to write to
     * @param work   the full text of the work
     * @throws IOException if the work has no title or the write fails
     */
    private static void uploadWork(final GridFS gridFS, final String work)
        throws IOException {
        GridFSInputFile file = gridFS.createFile(findTitle(work));
        // Set chunk size low enough that we get multiple chunks.
        file.setChunkSize(CHUNK_SIZE_BYTES);
        OutputStream os = file.getOutputStream();
        os.write(work.getBytes(CHARSET_NAME));
        // close() is what persists the file, so it is only called after a
        // successful write; a failed write must not commit a truncated work.
        os.close();
    }

    /**
     * Returns the title of a work, i.e. its first non-blank line.
     *
     * @param work the full text of the work
     * @return the first line that contains something other than whitespace,
     *         returned unmodified
     * @throws IOException if every line of the work is blank
     */
    private static String findTitle(final String work) throws IOException {
        Scanner titleScanner = new Scanner(work);
        try {
            while (titleScanner.hasNextLine()) {
                String line = titleScanner.nextLine();
                if (!line.trim().isEmpty()) {
                    return line;
                }
            }
        } finally {
            titleScanner.close();
        }
        throw new IOException("Could not find a title!");
    }

    @Override
    public void setConf(final Configuration conf) {
        this.conf = conf;
    }

    @Override
    public Configuration getConf() {
        return conf;
    }

    public static void main(final String[] args) throws Exception {
        System.exit(ToolRunner.run(new PrepareShakespeare(), args));
    }
}