// TikaIndexer.java example
//
// Explorer
// l4ia-master
// - src
//   - lia
package lia.tika;

/**
 * Copyright Manning Publications Co.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
*/

import java.io.FileInputStream;
import java.io.InputStream;
import java.io.File;
import java.io.IOException;
import java.util.Date;
import java.util.Set;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.ArrayList;
import java.util.Collections;

import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.sax.BodyContentHandler;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.config.TikaConfig;

import org.xml.sax.ContentHandler;

import lia.meetlucene.Indexer;

// From chapter 7
/**
 * An {@link Indexer} that uses Tika's {@link AutoDetectParser} to pull the
 * body text and metadata out of each file and turns them into a Lucene
 * {@link Document}.
 *
 * <p>Body text and the metadata fields listed in
 * {@link #textualMetadataFields} are indexed (not stored) under the
 * {@code contents} field; every metadata field is additionally stored (not
 * indexed) under its own name; the file's canonical path is stored and
 * indexed verbatim under {@code filename}.
 */
public class TikaIndexer extends Indexer {

  /** Change to true to print all extracted text and metadata. */
  private static final boolean DEBUG = false;        //1

  /** Names of metadata fields whose values are appended to "contents". */
  static Set<String> textualMetadataFields           //2
        = new HashSet<String>();                     //2
  static {                                           //2
    textualMetadataFields.add(Metadata.TITLE);       //2
    textualMetadataFields.add(Metadata.AUTHOR);      //2
    textualMetadataFields.add(Metadata.COMMENTS);    //2
    textualMetadataFields.add(Metadata.KEYWORDS);    //2
    textualMetadataFields.add(Metadata.DESCRIPTION); //2
    textualMetadataFields.add(Metadata.SUBJECT);     //2
  }

  /**
   * Command-line entry point: lists the mime types Tika's default
   * configuration has parsers for, then indexes a data directory.
   *
   * @param args {@code args[0]} is the index directory,
   *             {@code args[1]} is the data directory
   * @throws IllegalArgumentException if exactly two arguments are not given
   * @throws Exception if indexing fails
   */
  public static void main(String[] args) throws Exception {
    if (args.length != 2) {
      throw new IllegalArgumentException("Usage: java " + TikaIndexer.class.getName()
        + " <index dir> <data dir>");
    }

    TikaConfig config = TikaConfig.getDefaultConfig();  //3
    List<String> parsers = new ArrayList<String>(config.getParsers().keySet());  //3
    Collections.sort(parsers);                         //3
    System.out.println("Mime type parsers:");          //3
    for (String mimeType : parsers) {                  //3
      System.out.println("  " + mimeType);             //3
    }                                                  //3
    System.out.println();                              //3

    String indexDir = args[0];
    String dataDir = args[1];

    long start = System.currentTimeMillis();
    TikaIndexer indexer = new TikaIndexer(indexDir);
    int numIndexed;
    try {
      numIndexed = indexer.index(dataDir, null);
    } finally {
      // Always release the index, even when indexing fails part-way.
      indexer.close();
    }
    long end = System.currentTimeMillis();

    System.out.println("Indexing " + numIndexed + " files took "
      + (end - start) + " milliseconds");
  }

  /**
   * Creates an indexer writing to the given index directory.
   *
   * @param indexDir path of the index directory, passed to the superclass
   * @throws IOException if the superclass constructor fails
   */
  public TikaIndexer(String indexDir) throws IOException {
    super(indexDir);
  }

  /**
   * Builds the Lucene document for one file by parsing it with Tika.
   *
   * @param f the file to parse
   * @return a document holding the extracted body text, the metadata
   *         fields, and the file's canonical path
   * @throws Exception if the file cannot be read or Tika fails to parse it
   */
  protected Document getDocument(File f) throws Exception {

    Metadata metadata = new Metadata();
    metadata.set(Metadata.RESOURCE_NAME_KEY, f.getName());   // 4

    // If you know content type (eg because this document
    // was loaded from an HTTP server), then you should also
    // set Metadata.CONTENT_TYPE

    // If you know content encoding (eg because this
    // document was loaded from an HTTP server), then you
    // should also set Metadata.CONTENT_ENCODING

    Parser parser = new AutoDetectParser();       // 6
    ContentHandler handler = new BodyContentHandler(); // 7
    ParseContext context = new ParseContext();   // 8
    context.set(Parser.class, parser);           // 8

    // Open the stream last so nothing can throw between the open and the
    // try block that guarantees it is closed.
    InputStream is = new FileInputStream(f);      // 5
    try {
      // Pass the configured context (not a fresh, empty one) so the
      // parser registered above is visible during parsing.
      parser.parse(is, handler, metadata, context);  // 9
    } finally {
      is.close();
    }

    Document doc = new Document();

    String bodyText = handler.toString();
    doc.add(new Field("contents", bodyText,                     // 10
                      Field.Store.NO, Field.Index.ANALYZED));   // 10

    if (DEBUG) {
      System.out.println("  all text: " + bodyText);
    }

    for (String name : metadata.names()) {        //11
      String value = metadata.get(name);
      if (value == null) {
        // A Field cannot be built from a null value; skip such entries.
        continue;
      }

      if (textualMetadataFields.contains(name)) {
        doc.add(new Field("contents", value,      //12
                          Field.Store.NO, Field.Index.ANALYZED));
      }

      doc.add(new Field(name, value, Field.Store.YES, Field.Index.NO)); //13

      if (DEBUG) {
        System.out.println("  " + name + ": " + value);
      }
    }

    if (DEBUG) {
      System.out.println();
    }

    doc.add(new Field("filename", f.getCanonicalPath(),     //14
             Field.Store.YES, Field.Index.NOT_ANALYZED));

    return doc;
  }
}

/*
  #1 Change to true to see all text
  #2 Which metadata fields are textual
  #3 List all mime types handled by Tika
  #4 Create Metadata for the file
  #5 Open the file
  #6 Automatically determines file type
  #7 Extracts metadata and body text
  #8 Set up ParseContext
  #9 Does all the work!
  #10 Index body content
  #11 Index metadata fields
  #12 Append to contents field
  #13 Separately store metadata fields
  #14 Index file path
*/