package lia.tika; /** * Copyright Manning Publications Co. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific lan */ import java.io.FileInputStream; import java.io.InputStream; import java.io.File; import java.io.IOException; import java.util.Date; import java.util.Set; import java.util.HashSet; import java.util.Iterator; import java.util.List; import java.util.ArrayList; import java.util.Collections; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.tika.metadata.Metadata; import org.apache.tika.parser.AutoDetectParser; import org.apache.tika.sax.BodyContentHandler; import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.Parser; import org.apache.tika.config.TikaConfig; import org.xml.sax.ContentHandler; import lia.meetlucene.Indexer; // From chapter 7 public class TikaIndexer extends Indexer { private boolean DEBUG = false; //1 static Set<String> textualMetadataFields //2 = new HashSet<String>(); //2 static { //2 textualMetadataFields.add(Metadata.TITLE); //2 textualMetadataFields.add(Metadata.AUTHOR); //2 textualMetadataFields.add(Metadata.COMMENTS); //2 textualMetadataFields.add(Metadata.KEYWORDS); //2 textualMetadataFields.add(Metadata.DESCRIPTION); //2 textualMetadataFields.add(Metadata.SUBJECT); //2 } public static void main(String[] args) throws Exception { if (args.length != 2) { throw new IllegalArgumentException("Usage: java " + TikaIndexer.class.getName() + " <index dir> <data dir>"); } TikaConfig config = TikaConfig.getDefaultConfig(); //3 List<String> parsers = new ArrayList<String>(config.getParsers().keySet()); //3 Collections.sort(parsers); //3 Iterator<String> it = parsers.iterator(); //3 System.out.println("Mime type parsers:"); //3 while(it.hasNext()) { //3 System.out.println(" " + it.next()); //3 } //3 System.out.println(); //3 String indexDir = args[0]; String dataDir = args[1]; long start = new Date().getTime(); TikaIndexer indexer = new TikaIndexer(indexDir); int numIndexed = indexer.index(dataDir, null); indexer.close(); long end = new Date().getTime(); System.out.println("Indexing " + numIndexed + " files took " + (end - start) + " milliseconds"); } public TikaIndexer(String indexDir) throws IOException { super(indexDir); } protected Document getDocument(File f) throws Exception { Metadata metadata = new Metadata(); metadata.set(Metadata.RESOURCE_NAME_KEY, f.getName()); // 4 // If you know content type (eg because this document // was loaded from an HTTP server), then you should also // set Metadata.CONTENT_TYPE // If you know content encoding (eg because this // document was loaded from an HTTP server), then you // should also set Metadata.CONTENT_ENCODING InputStream is = new FileInputStream(f); // 5 Parser parser = new AutoDetectParser(); // 6 ContentHandler handler = new BodyContentHandler(); // 7 ParseContext context = new ParseContext(); // 8 context.set(Parser.class, parser); // 8 try { parser.parse(is, handler, metadata, // 9 new ParseContext()); // 9 } finally { is.close(); } Document doc = new Document(); doc.add(new Field("contents", handler.toString(), // 10 Field.Store.NO, Field.Index.ANALYZED)); // 10 if (DEBUG) { System.out.println(" all text: " + handler.toString()); } for(String name : metadata.names()) { //11 String value = metadata.get(name); if (textualMetadataFields.contains(name)) { doc.add(new Field("contents", value, //12 Field.Store.NO, Field.Index.ANALYZED)); } doc.add(new Field(name, value, Field.Store.YES, Field.Index.NO)); //13 if (DEBUG) { System.out.println(" " + name + ": " + value); } } if (DEBUG) { System.out.println(); } doc.add(new Field("filename", f.getCanonicalPath(), //14 Field.Store.YES, Field.Index.NOT_ANALYZED)); return doc; } } /* #1 Change to true to see all text #2 Which metadata fields are textual #3 List all mime types handled by Tika #4 Create Metadata for the file #5 Open the file #6 Automatically determines file type #7 Extracts metadata and body text #8 Setup ParseContext #9 Does all the work! #10 Index body content #11 Index metadata fields #12 Append to contents field #13 Separately store metadata fields #14 Index file path */