/*
* Copyright Robert Newson
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.github.rnewson.couchdb.lucene;
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;
import org.apache.lucene.document.Document;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.DublinCore;
import org.apache.tika.metadata.HttpHeaders;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.Property;
import java.io.IOException;
import java.io.InputStream;
import static com.github.rnewson.couchdb.lucene.util.Utils.text;
/**
 * Singleton helper that runs Apache Tika over an attachment stream and adds
 * the extracted body text, plus any Dublin Core metadata Tika reports, to a
 * Lucene {@link Document}.
 *
 * <p>This class shares its simple name with {@code org.apache.tika.Tika},
 * which is why the Tika facade is referred to by its fully-qualified name
 * below.
 */
public final class Tika {

    /** The only instance of this class; the constructor is private. */
    public static final Tika INSTANCE = new Tika();

    /** Prefix prepended to the name of every Dublin Core field added to a document. */
    private static final String DC = "_dc.";

    private static final Logger log = LogManager.getLogger(Tika.class);

    private final org.apache.tika.Tika tika = new org.apache.tika.Tika();

    private Tika() {
        // Per the Tika facade documentation, -1 disables the limit on the
        // length of the string returned by parseToString, so long attachments
        // are not truncated.
        tika.setMaxStringLength(-1);
    }

    /**
     * Extracts the text of {@code in} and adds it to {@code doc} under
     * {@code fieldName}, followed by one field per Dublin Core metadata value
     * found, each named {@code "_dc." + <property name>}.
     *
     * <p>Extraction is best-effort: an {@link IOException} or
     * {@link TikaException} raised while extracting the text is logged at WARN
     * level and the method returns without adding the body text or any
     * metadata fields.
     *
     * <p>NOTE(review): Tika documents {@code parseToString} as closing the
     * supplied stream; callers should not expect to reuse {@code in}.
     *
     * @param in          the attachment content
     * @param contentType the content type recorded as a hint in the metadata
     *                    handed to Tika
     * @param fieldName   name of the field that receives the extracted text
     * @param doc         the document to add fields to
     * @throws IOException kept in the signature for existing callers; failures
     *                     raised by Tika during extraction are caught and
     *                     logged here rather than propagated
     */
    public void parse(final InputStream in, final String contentType, final String fieldName, final Document doc)
            throws IOException {
        final Metadata md = new Metadata();
        md.set(HttpHeaders.CONTENT_TYPE, contentType);

        try {
            // Add body text.
            final String body = tika.parseToString(in, md);
            doc.add(text(fieldName, body, false));
        } catch (final IOException e) {
            log.warn("Failed to index an attachment.", e);
            return;
        } catch (final TikaException e) {
            log.warn("Failed to parse an attachment.", e);
            return;
        }

        // Add DC attributes.
        addDublinCoreAttributes(md, doc);
    }

    /**
     * Adds one field named {@code namespace + attributeName} for every value
     * the metadata holds under {@code attributeName}; adds nothing when the
     * attribute is absent.
     */
    private void addAttribute(final String namespace, final String attributeName, final Metadata md, final Document doc) {
        addValues(namespace + attributeName, md.getValues(attributeName), doc);
    }

    /**
     * Adds one field named {@code namespace + property.getName()} for every
     * value the metadata holds for {@code property}; adds nothing when the
     * property is absent.
     */
    private void addAttribute(final String namespace, final Property property, final Metadata md, final Document doc) {
        addValues(namespace + property.getName(), md.getValues(property), doc);
    }

    /**
     * Adds each non-null entry of {@code values} to {@code doc} as a separate
     * field called {@code fieldName}.
     *
     * <p>All values are indexed rather than only the first: {@code Metadata.get}
     * returns just the first value of a multi-valued property, which would
     * drop, for example, every creator or subject after the first one.
     */
    private void addValues(final String fieldName, final String[] values, final Document doc) {
        if (values == null) {
            return;
        }
        for (final String value : values) {
            if (value != null) {
                doc.add(text(fieldName, value, false));
            }
        }
    }

    /** Copies every Dublin Core property present in {@code md} into {@code doc}. */
    private void addDublinCoreAttributes(final Metadata md, final Document doc) {
        addAttribute(DC, DublinCore.CONTRIBUTOR, md, doc);
        addAttribute(DC, DublinCore.COVERAGE, md, doc);
        addAttribute(DC, DublinCore.CREATOR, md, doc);
        addAttribute(DC, DublinCore.DATE, md, doc);
        addAttribute(DC, DublinCore.DESCRIPTION, md, doc);
        addAttribute(DC, DublinCore.FORMAT, md, doc);
        addAttribute(DC, DublinCore.IDENTIFIER, md, doc);
        addAttribute(DC, DublinCore.LANGUAGE, md, doc);
        addAttribute(DC, DublinCore.MODIFIED, md, doc);
        addAttribute(DC, DublinCore.PUBLISHER, md, doc);
        addAttribute(DC, DublinCore.RELATION, md, doc);
        addAttribute(DC, DublinCore.RIGHTS, md, doc);
        addAttribute(DC, DublinCore.SOURCE, md, doc);
        addAttribute(DC, DublinCore.SUBJECT, md, doc);
        addAttribute(DC, DublinCore.TITLE, md, doc);
        addAttribute(DC, DublinCore.TYPE, md, doc);
    }
}