package storm.cookbook.tfidf;

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.StopAnalyzer;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.util.Version;

import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.sax.BodyContentHandler;

import org.tartarus.snowball.ext.PorterStemmer;
import org.xml.sax.ContentHandler;
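
/**
 * Standalone sample that crawls a directory tree, extracts the body text and
 * metadata of every file with Apache Tika, and, for PDF documents, prints the
 * stop-word-filtered, Porter-stemmed terms produced by a Lucene analysis
 * chain.
 */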
public class TikaSample {

    private static final PorterStemmer stemmer = new PorterStemmer();

    public static void main(String[] args) throws Exception {
        // Parse out the directory that we want to crawl
        if (args.length != 1) {
            showUsageAndExit();
        }
        File directory = new File(args[0]);
        if (!directory.isDirectory()) {
            showUsageAndExit();
        }
        parseAllFilesInDirectory(directory);
    }
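
    /**
     * Recursively walks {@code directory}. Each regular file is parsed with
     * Tika's AutoDetectParser, which sniffs the content type and delegates to
     * the matching format-specific parser; the extracted metadata is printed
     * for every file.
     */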
    private static void parseAllFilesInDirectory(File directory) throws Exception {
        File[] files = directory.listFiles();
        if (files == null) {
            return; // unreadable directory or I/O error
        }
        for (File file : files) {
            if (file.isDirectory()) {
                parseAllFilesInDirectory(file);
            } else {
                Parser parser = new AutoDetectParser();
                Metadata metadata = new Metadata();
                ParseContext parseContext = new ParseContext();
                // Cap the extracted body text at 10 MB so a single huge
                // document cannot exhaust the heap
                ContentHandler handler = new BodyContentHandler(10 * 1024 * 1024);
                // try-with-resources closes the stream even if parsing fails
                try (FileInputStream stream = new FileInputStream(file)) {
                    parser.parse(stream, handler, metadata, parseContext);
                }
                System.out.println("-------------------------------------------------------");
                System.out.println("File: " + file);
                for (String name : metadata.names()) {
                    System.out.println("metadata: " + name + " - " + metadata.get(name));
                }
if (metadata.get("Content-Type").contains("pdf")) {
// printTerms(handler.toString());
for (String name : metadata.names()) {
System.out.println("metadata: " + name + " - " + metadata.get(name));
}
// System.out.println("Content: " + handler.toString());
}
}
}
}
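
    /**
     * Tokenizes the document body with Lucene's StandardTokenizer, drops
     * English stop words, then lower-cases and Porter-stems each remaining
     * term before printing it to stdout.
     */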
    private static void printTerms(String documentContents) {
        try {
            // Tokenize with StandardTokenizer and drop common English stop words
            TokenStream ts = new StopFilter(Version.LUCENE_30,
                    new StandardTokenizer(Version.LUCENE_30, new StringReader(documentContents)),
                    StopAnalyzer.ENGLISH_STOP_WORDS_SET);
            TermAttribute termAtt = ts.getAttribute(TermAttribute.class);
            try {
                ts.reset();
                while (ts.incrementToken()) {
                    // Lower-case and Porter-stem each surviving token
                    stemmer.setCurrent(termAtt.term().toLowerCase());
                    stemmer.stem();
                    System.out.println(stemmer.getCurrent());
                }
            } finally {
                ts.close();
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
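
    /** Prints the expected command line to stderr and exits with status 1. */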
    private static void showUsageAndExit() {
        System.err.println("Usage: java storm.cookbook.tfidf.TikaSample <directory to crawl>");
        System.exit(1);
    }
}