/**
*
* Copyright 2009-2013 The MITRE Corporation.
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not
* use this file except in compliance with the License. You may obtain a copy of
* the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations under
* the License.
*
* **************************************************************************
* NOTICE This software was produced for the U. S. Government under Contract No.
* W15P7T-12-C-F600, and is subject to the Rights in Noncommercial Computer
* Software and Noncommercial Computer Software Documentation Clause
* 252.227-7014 (JUN 1995)
*
* (c) 2012 The MITRE Corporation. All Rights Reserved.
* **************************************************************************
*/
package org.opensextant.xtext.converters;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.html.BoilerpipeContentHandler;
import org.apache.tika.parser.html.HtmlParser;
import org.apache.tika.sax.BodyContentHandler;
import org.opensextant.util.TextUtils;
import org.opensextant.xtext.ConvertedDocument;
import org.xml.sax.ContentHandler;
import net.htmlparser.jericho.StartTag;
import org.slf4j.LoggerFactory;
import org.slf4j.Logger;
/**
 * A Tika HTML parser that reduces large amounts of empty lines in converted
 * HTML text.
 *
 * <p>Text is extracted with Tika's {@link HtmlParser} (optionally through a
 * {@link BoilerpipeContentHandler} to keep only the article body), and selected
 * {@code <meta>} tags are back-filled by re-reading the file with Jericho.
 *
 * @author Marc C. Ubaldino, MITRE, ubaldino at mitre dot org
 */
public class TikaHTMLConverter extends ConverterAdapter {

    /** Default write limit handed to the Tika body handler. */
    public static final int MAX_HTML_FILE_SIZE = 0x80000; // 0.5 MB

    private static final Logger log = LoggerFactory.getLogger(TikaHTMLConverter.class);

    /** Reusable Tika parser instance; created once per converter. */
    HtmlParser parser = new HtmlParser();

    /** When true, content is routed through Boilerpipe to extract the article text only. */
    private final boolean scrubHTMLArticle;

    /** Write limit passed to {@link BodyContentHandler} for each conversion. */
    private final int maxHTMLDocumentSize;

    /**
     * Decides whether an HTML meta tag name is worth keeping.
     * Initially useful metadata is just dates and times pushed through meta tags
     * or meta-equiv tags, plus a few generic descriptive fields.
     * Override as needed.
     *
     * <p>Matching is case-insensitive and locale-independent. The checks run in
     * order, so a date/time field is kept even when it carries a
     * {@code twitter:} or {@code fb:} prefix.
     *
     * @param m metadata key; {@code null} is treated as not useful
     * @return if field is generally useful.
     */
    public static boolean isUsefulMeta(String m) {
        if (m == null) {
            return false;
        }

        // Locale.ROOT: under a Turkish/Azeri default locale "TIME" would lower-case
        // to a dotless i and silently fail the contains() checks below.
        String tag = m.toLowerCase(Locale.ROOT);

        if (tag.contains("date") || tag.contains("time")) {
            return true;
        }

        /* Ignore duplicative tags */
        if (tag.startsWith("twitter:") || tag.startsWith("fb:")) {
            return false;
        }

        /* Other generic metadata worth tracking. */
        if (tag.contains("description") || tag.contains("subject") || tag.contains("keywords")) {
            return true;
        }
        if (tag.startsWith("article:")) {
            return true;
        }

        log.debug("HTTP meta tag found meta={}", m);
        // NOTE: http-equiv, and certain other RDF/tagging mechanisms will be dropped here.
        // For the same reason Tika does not support them. Too many wild metadata schemes.
        return false;
    }

    /**
     * Initialize a reusable HTML parser using the default document size limit,
     * {@link #MAX_HTML_FILE_SIZE}.
     *
     * @param article_only
     *            true if you want to scrub HTML
     * @throws IOException
     *             on err
     */
    public TikaHTMLConverter(boolean article_only) throws IOException {
        this(article_only, MAX_HTML_FILE_SIZE);
    }

    /**
     * Initialize a reusable HTML parser.
     *
     * @param article_only
     *            true if you want to scrub HTML
     * @param docSize
     *            a maximum raw HTML document size
     * @throws IOException
     *             on err
     */
    public TikaHTMLConverter(boolean article_only, int docSize) throws IOException {
        scrubHTMLArticle = article_only;
        maxHTMLDocumentSize = docSize;
    }

    /**
     * A barebones HTML parser. The input stream is always closed before returning.
     *
     * <pre>
     * TODO: mis-encoded HTML entities are not decoded
     * properly. E.g., finding "–" (82xx range is dashes, quotes) for
     * example, does not decode correctly unless the page encoding is declared as UTF-8.
     * </pre>
     *
     * @param input stream of raw HTML; closed by this method
     * @param doc   file the stream came from; when non-null it is re-read to collect meta tags
     * @return the converted document carrying text, title, encoding and properties
     * @throws IOException wrapping any failure raised while parsing
     */
    @Override
    protected ConvertedDocument conversionImplementation(InputStream input, File doc)
            throws IOException {
        Metadata metadata = new Metadata();
        Map<String, String> moreMetadata = new HashMap<>();

        // HTML Conversion here is simply not resetting its internal buffers
        // Its just accumulating and error out when it reaches MAX
        ContentHandler handler = new BodyContentHandler(maxHTMLDocumentSize);
        BoilerpipeContentHandler scrubbingHandler = null;
        if (scrubHTMLArticle) {
            scrubbingHandler = new BoilerpipeContentHandler(handler);
        }

        try {
            parser.parse(input, (scrubHTMLArticle ? scrubbingHandler : handler), metadata,
                    new ParseContext());
            if (doc != null) {
                parseHTMLMetadata(doc, moreMetadata);
            }
        } catch (Exception xerr) {
            throw new IOException("Unable to parse content", xerr);
        } finally {
            input.close();
        }

        ConvertedDocument textdoc = new ConvertedDocument(doc);
        textdoc.addTitle(metadata.get(TikaCoreProperties.TITLE));

        String text;
        if (scrubHTMLArticle) {
            text = scrubbingHandler.getTextDocument().getText(true, false);
        } else {
            text = handler.toString();
        }
        textdoc.setText(TextUtils.reduce_line_breaks(text));

        // -- Improve CHAR SET encoding answer.
        // Encode explicitly as UTF-8: with the platform default charset, unmappable
        // characters may be replaced by '?' and the text wrongly reported as ASCII.
        byte[] data = textdoc.buffer.getBytes(StandardCharsets.UTF_8);
        if (TextUtils.isASCII(data)) {
            textdoc.setEncoding("ASCII");
        } else {
            // Okay, okay... let Tika name whatever encoding it found or guessed
            // at.
            textdoc.setEncoding(metadata.get(Metadata.CONTENT_ENCODING));
        }

        // Indicate if we tried to filter the article at all.
        textdoc.addProperty("filtered", scrubHTMLArticle);
        textdoc.addProperty("converter", TikaHTMLConverter.class.getName());

        // Copy over whatever meta tags were back-filled from the raw file.
        for (Map.Entry<String, String> entry : moreMetadata.entrySet()) {
            textdoc.addUserProperty(entry.getKey(), entry.getValue());
        }
        return textdoc;
    }

    /**
     * Heuristics for pulling in metadata that Tika neglects for various reasons.
     * This adds found meta tags to given metadata. For each {@code <meta>} tag the
     * key is its {@code property} attribute, falling back to {@code name}; the value
     * is its {@code content} attribute. Tags without a key, without content, or
     * rejected by {@link #isUsefulMeta(String)} are skipped.
     *
     * TODO: InputStream is difficult to reset after tika parser reads it. So just using the file object,
     * Jericho reads the raw file again.
     *
     * @param doc file object for document
     * @param md metadata map to backfill
     * @throws IOException propagated from reading the file
     */
    private void parseHTMLMetadata(File doc, Map<String, String> md) throws IOException {
        net.htmlparser.jericho.Source htmlDoc = new net.htmlparser.jericho.Source(doc);
        List<net.htmlparser.jericho.StartTag> tags = htmlDoc.getAllStartTags("meta");

        for (StartTag t : tags) {
            String n = t.getAttributeValue("name");
            String p = t.getAttributeValue("property");
            if (p == null && n == null) {
                log.debug("Unmatched metadata in HTML {}", t);
                continue;
            }

            // Prefer the "property" attribute over "name" when both are present.
            String key = p != null ? p : n;
            if (!isUsefulMeta(key)) {
                continue;
            }

            /* hopefully value is in content field */
            String v = t.getAttributeValue("content");
            if (v == null) {
                continue;
            }
            md.put(key, v);
        }
    }
}