package storm.cookbook.tfidf.functions;
import java.io.BufferedReader;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.URL;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.sax.BodyContentHandler;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.xml.sax.ContentHandler;
import au.com.bytecode.opencsv.CSVReader;
import backtype.storm.tuple.Values;
import storm.trident.operation.BaseFunction;
import storm.trident.operation.TridentCollector;
import storm.trident.operation.TridentOperationContext;
import storm.trident.tuple.TridentTuple;
/**
 * Trident function that reads the "url" field of each incoming tuple and emits
 * the text of the document found at that URL.
 *
 * <p>Each emitted tuple carries three values: the document text, the trimmed
 * URL, and the literal string "twitter".
 *
 * <p>When the topology configuration has {@code TOPOLOGY_DEBUG} set to true the
 * function runs in test mode: no network access is performed and document
 * bodies are looked up in the {@code docs.csv} classpath resource instead
 * (first column is the URL, second column is the document text).
 */
public class DocumentFetchFunction extends BaseFunction {
    private static final long serialVersionUID = 1L;

    /** Static so the logger is not serialized together with the function instance. */
    private static final Logger LOG = LoggerFactory.getLogger(DocumentFetchFunction.class);

    /** Upper bound on the number of characters the body content handler will collect. */
    private static final int MAX_BODY_CHARS = 10 * 1024 * 1024;

    /** MIME types whose documents are emitted; everything else is dropped. */
    private final List<String> mimeTypes;

    /** URL to document text, populated from docs.csv when running in test mode. */
    private final Map<String, String> testData = new HashMap<String, String>();

    private boolean testMode = false;

    /**
     * @param supportedMimeTypes MIME types (for example "text/html") that should be
     *        emitted; documents of any other type are silently dropped
     */
    public DocumentFetchFunction(String[] supportedMimeTypes) {
        mimeTypes = Arrays.asList(supportedMimeTypes);
    }

    /**
     * Switches the function into test mode and loads the canned documents when
     * the topology is configured with {@code TOPOLOGY_DEBUG} set to true.
     *
     * @param conf    topology configuration
     * @param context Trident operation context (unused)
     */
    @Override
    public void prepare(Map conf, TridentOperationContext context) {
        // Boolean.TRUE.equals(...) tolerates a missing (null) debug entry.
        if (!Boolean.TRUE.equals(conf.get(backtype.storm.Config.TOPOLOGY_DEBUG))) {
            return;
        }
        testMode = true;
        loadTestData();
    }

    /**
     * Emits the document for the tuple's "url" field. Nothing is emitted when
     * the URL is null, the document cannot be obtained, or its MIME type is
     * not one of the supported types.
     *
     * @param tuple     input tuple; must contain a "url" string field
     * @param collector collector receiving (text, trimmed url, "twitter")
     */
    @Override
    public void execute(TridentTuple tuple, TridentCollector collector) {
        String url = tuple.getStringByField("url");
        if (url == null) {
            LOG.debug("Ignoring tuple with a null url");
            return;
        }
        if (testMode) {
            emitTestDocument(url, collector);
        } else {
            fetchAndEmit(url, collector);
        }
    }

    /** Reads docs.csv from the classpath into {@link #testData}. */
    private void loadTestData() {
        try {
            InputStream resource = DocumentFetchFunction.class.getResourceAsStream("docs.csv");
            if (resource == null) {
                LOG.error("Test resource docs.csv was not found on the classpath");
                return;
            }
            CSVReader reader = new CSVReader(new BufferedReader(new InputStreamReader(resource)));
            try {
                List<String[]> rows = reader.readAll();
                for (String[] row : rows) {
                    // Skip blank or malformed rows instead of aborting the whole load.
                    if (row.length >= 2) {
                        testData.put(row[0], row[1]);
                    }
                }
            } finally {
                reader.close();
            }
        } catch (Exception e) {
            LOG.error("Unable to load test documents from docs.csv", e);
        }
    }

    /** Emits the canned document registered for the given URL, if there is one. */
    private void emitTestDocument(String url, TridentCollector collector) {
        LOG.debug("Generating fake document for testing");
        String contents = testData.get(url);
        if (contents != null) {
            collector.emit(new Values(contents, url.trim(), "twitter"));
        }
    }

    /** Downloads and parses the document at the URL and emits it when its MIME type is supported. */
    private void fetchAndEmit(String url, TridentCollector collector) {
        InputStream stream = null;
        try {
            Parser parser = new AutoDetectParser();
            Metadata metadata = new Metadata();
            ParseContext parseContext = new ParseContext();
            ContentHandler handler = new BodyContentHandler(MAX_BODY_CHARS);

            // openStream() always yields an InputStream, whereas getContent() may
            // return some other object depending on the content type.
            stream = new URL(url).openStream();
            parser.parse(stream, handler, metadata, parseContext);

            String contentType = metadata.get("Content-Type");
            if (contentType == null) {
                LOG.debug("No Content-Type detected for " + url + ", skipping");
                return;
            }
            // The value may carry parameters, e.g. "text/html; charset=UTF-8".
            String[] mimeDetails = contentType.split(";");
            if ((mimeDetails.length > 0) && mimeTypes.contains(mimeDetails[0].trim())) {
                collector.emit(new Values(handler.toString(), url.trim(), "twitter"));
            }
        } catch (Exception e) {
            // Best effort: a document that cannot be fetched or parsed is dropped,
            // but the failure is recorded rather than silently swallowed.
            LOG.warn("Failed to fetch or parse document at " + url, e);
        } finally {
            if (stream != null) {
                try {
                    stream.close();
                } catch (Exception closeFailure) {
                    LOG.debug("Failed to close stream for " + url, closeFailure);
                }
            }
        }
    }
}