/*
* Copyright 2013 Eolya Consulting - http://www.eolya.fr/
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package fr.eolya.extraction.tika;
import java.io.BufferedReader;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.UnsupportedEncodingException;
import java.io.Writer;
import java.net.MalformedURLException;
import java.util.HashMap;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.TransformerConfigurationException;
import javax.xml.transform.sax.SAXTransformerFactory;
import javax.xml.transform.sax.TransformerHandler;
import javax.xml.transform.stream.StreamResult;
import org.apache.commons.io.FileUtils;
import org.apache.tika.detect.DefaultDetector;
import org.apache.tika.detect.Detector;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.PasswordProvider;
import org.apache.tika.parser.html.BoilerpipeContentHandler;
import org.apache.tika.sax.ExpandedTitleContentHandler;
import org.apache.tika.sax.BodyContentHandler;
import org.xml.sax.ContentHandler;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.safety.Cleaner;
import org.jsoup.safety.Whitelist;
import org.jsoup.select.Elements;
import de.jetwick.snacktory.ArticleTextExtractor;
import de.jetwick.snacktory.JResult;
import de.jetwick.snacktory.OutputFormatter;
import de.l3s.boilerpipe.extractors.ArticleExtractor;
import de.l3s.boilerpipe.extractors.CanolaExtractor;
import de.l3s.boilerpipe.extractors.DefaultExtractor;
import com.developpez.adiguba.shell.Shell;
import fr.eolya.extraction.htmlformater.IHtmlFormater;
/**
* Wraps Apache Tika library in order to allow a simple usage and add or improve some features.
*
* @author Eolya Consulting - http://www.eolya.fr/
*/
public class TikaWrapper {

    /** Output format identifiers accepted by the constructor. */
    public static final String OUTPUT_FORMAT_XML = "xml";
    public static final String OUTPUT_FORMAT_HTML = "html";
    public static final String OUTPUT_FORMAT_TEXT = "text";
    public static final String OUTPUT_FORMAT_TEXT_MAIN = "text_main";
    public static final String OUTPUT_FORMAT_TEXT_MAIN_SNACKTORY = "text_main_snacktory";
    public static final String OUTPUT_FORMAT_TEXT_MAIN_BOILERPIPE_DEFAULT = "text_main_boilerpipe_default";
    public static final String OUTPUT_FORMAT_TEXT_MAIN_BOILERPIPE_ARTICLE = "text_main_boilerpipe_article";
    public static final String OUTPUT_FORMAT_TEXT_MAIN_BOILERPIPE_CANOLA = "text_main_boilerpipe_canola";

    /** Content types that route the input to a dedicated external converter. */
    public static final String CONTENT_TYPE_PDF = "application/pdf";
    public static final String CONTENT_TYPE_SWF = "application/x-shockwave-flash";
    public static final String CONTENT_TYPE_HTML = "text/html";
    // BUG FIX: the value previously carried a trailing space ("image/vnd.djvu ")
    // so useDjVuText() could never match a real content-type.
    public static final String CONTENT_TYPE_DJVU = "image/vnd.djvu";

    // Keys used in the metadata maps returned by getMetas().
    private static final String META_TITLE = "title";
    private static final String META_AUTHOR = "Author";
    private static final String META_CREATED = "Creation-Date";
    private static final String META_MODIFIED = "modified";
    private static final String META_CONTENTTYPE = "Content-Type";
    private static final String META_CONTENTSIZE = "Content-Size";

    // pdftotext -htmlmeta emits CreationDate either as "20130322143113Z00'00'"
    // (UTC) or "20130322143113+02'00'" (zone offset). Compiled once: Pattern is
    // immutable and thread-safe.
    private static final Pattern CREATION_DATE_UTC = Pattern.compile("[0-9]{14}Z[0-9]{2}'[0-9]{2}'");
    private static final Pattern CREATION_DATE_OFFSET = Pattern.compile("[0-9]{14}\\+[0-9]{2}'[0-9]{2}'");

    /**
     * Strategy object: each output format knows how to build the SAX content
     * handler that receives Tika's parse events.
     */
    private class OutputType {
        public void process(InputStream input, OutputStream output, Metadata metadata) throws Exception {
            Parser p = parser;
            ContentHandler handler = getContentHandler(output, metadata);
            p.parse(input, handler, metadata, context);
        }
        protected ContentHandler getContentHandler(OutputStream output, Metadata metadata) throws Exception {
            throw new UnsupportedOperationException();
        }
    }

    /** Serializes the full document as XHTML. */
    private final OutputType XML = new OutputType() {
        @Override
        protected ContentHandler getContentHandler(OutputStream output, Metadata metadata) throws Exception {
            return getTransformerHandler(output, "xml", encoding, prettyPrint);
        }
    };

    /** Serializes the full document as HTML (title handling per TIKA-725). */
    private final OutputType HTML = new OutputType() {
        @Override
        protected ContentHandler getContentHandler(OutputStream output, Metadata metadata) throws Exception {
            return new ExpandedTitleContentHandler(getTransformerHandler(output, "html", encoding, prettyPrint));
        }
    };

    /** Emits the plain body text. */
    private final OutputType TEXT = new OutputType() {
        @Override
        protected ContentHandler getContentHandler(OutputStream output, Metadata metadata) throws Exception {
            return new BodyContentHandler(getOutputWriter(output, encoding));
        }
    };

    /** Emits only the main (boilerplate-free) text, via boilerpipe. */
    private final OutputType TEXT_MAIN = new OutputType() {
        @Override
        protected ContentHandler getContentHandler(OutputStream output, Metadata metadata) throws Exception {
            return new BoilerpipeContentHandler(getOutputWriter(output, encoding));
        }
    };

    /**
     * Returns a output writer with the given encoding.
     *
     * @see <a href="https://issues.apache.org/jira/browse/TIKA-277">TIKA-277</a>
     * @param output output stream
     * @param encoding output encoding,
     * or <code>null</code> for the platform default
     * @return output writer
     * @throws UnsupportedEncodingException
     * if the given encoding is not supported
     */
    private static Writer getOutputWriter(OutputStream output, String encoding)
            throws UnsupportedEncodingException {
        if (encoding != null) {
            return new OutputStreamWriter(output, encoding);
        } else if (System.getProperty("os.name").toLowerCase().startsWith("mac os x")) {
            // TIKA-324: Override the default encoding on Mac OS X
            return new OutputStreamWriter(output, "UTF-8");
        } else {
            return new OutputStreamWriter(output);
        }
    }

    /**
     * Returns a transformer handler that serializes incoming SAX events
     * to XHTML or HTML (depending the given method) using the given output
     * encoding.
     *
     * @see <a href="https://issues.apache.org/jira/browse/TIKA-277">TIKA-277</a>
     * @param output output stream
     * @param method "xml" or "html"
     * @param encoding output encoding,
     * or <code>null</code> for the platform default
     * @return transformer handler writing to {@code output}
     * @throws TransformerConfigurationException
     * if the transformer can not be created
     */
    private static TransformerHandler getTransformerHandler(OutputStream output, String method, String encoding, boolean prettyPrint)
            throws TransformerConfigurationException {
        SAXTransformerFactory factory = (SAXTransformerFactory) SAXTransformerFactory.newInstance();
        TransformerHandler handler = factory.newTransformerHandler();
        handler.getTransformer().setOutputProperty(OutputKeys.METHOD, method);
        handler.getTransformer().setOutputProperty(OutputKeys.INDENT, prettyPrint ? "yes" : "no");
        if (encoding != null) {
            handler.getTransformer().setOutputProperty(OutputKeys.ENCODING, encoding);
        }
        handler.setResult(new StreamResult(output));
        return handler;
    }

    private ParseContext context;
    private Parser parser;
    private boolean prettyPrint = true;
    private Detector detector;
    // Selected per process() call; this class is NOT thread-safe.
    private OutputType type = null;
    private String outputFormat;
    private IHtmlFormater formater;
    // Optional paths; when set (and content type matches) the corresponding
    // external tool is used instead of Tika.
    private String tmpPath = null;
    private String pdfToTextPath = null;
    private String swfToHtmlPath = null;
    private String djVuTextPath = null;
    private String contentType;
    // Results of the last process() call: either Tika's (metadata/output) or
    // the external/alternate path's (meta2/text).
    private Metadata metadata;
    private ByteArrayOutputStream output;
    private HashMap<String, String> meta;
    private HashMap<String, String> meta2;
    private String text;

    /**
     * Output character encoding, or <code>null</code> for platform default
     */
    private String encoding = null;

    /**
     * Password for opening encrypted documents, or <code>null</code>.
     */
    private String password = null;

    /**
     * Creates a wrapper producing the given output format.
     *
     * @param outputFormat one of the OUTPUT_FORMAT_* constants
     * @param outputEncoding output encoding; null or empty defaults to UTF-8
     */
    public TikaWrapper(String outputFormat, String outputEncoding) throws Exception {
        encoding = outputEncoding;
        if (encoding == null || "".equals(encoding)) encoding = "UTF-8";
        context = new ParseContext();
        detector = new DefaultDetector();
        parser = new AutoDetectParser(detector);
        this.outputFormat = outputFormat;
        this.formater = null;
        context.set(Parser.class, parser);
        context.set(PasswordProvider.class, new PasswordProvider() {
            public String getPassword(Metadata metadata) {
                return password;
            }
        });
    }

    /** Creates a wrapper producing the given output format, encoded in UTF-8. */
    public TikaWrapper(String outputFormat) throws Exception {
        this(outputFormat, "UTF-8");
    }

    /** Processes the stream with an auto-detected content type. */
    public void process(InputStream input) throws MalformedURLException {
        process(input, null);
    }

    /**
     * Extracts text and metadata from the stream. Results are exposed through
     * {@link #getText()} and {@link #getMetas()}; on failure both are null
     * (best effort, no exception is propagated).
     *
     * @param input document stream
     * @param contentType declared content type, or null to auto-detect
     */
    public void process(InputStream input, String contentType) throws MalformedURLException {
        try {
            this.contentType = contentType;
            if (OUTPUT_FORMAT_XML.equals(outputFormat)) {
                type = XML;
            } else if (OUTPUT_FORMAT_HTML.equals(outputFormat)) {
                type = HTML;
            } else if (OUTPUT_FORMAT_TEXT.equals(outputFormat)) {
                type = TEXT;
            } else if (OUTPUT_FORMAT_TEXT_MAIN.equals(outputFormat)) {
                type = TEXT_MAIN;
            } else {
                // The snacktory/boilerpipe formats only make sense for HTML input.
                if (contentType == null || "".equals(contentType))
                    throw new Exception("Incoherent parameters (missing content-type)");
                if (!CONTENT_TYPE_HTML.equals(contentType) && isMainTextExtractionFormat()) {
                    throw new Exception("Incoherent parameters (text/html content-type expected)");
                }
            }
            // Reset results from any previous call.
            text = null;
            meta2 = null;
            metadata = null;
            meta = null;
            if (usePdfToText()) {
                processWithPdfToText(input);
            } else if (useSwfToHtml()) {
                processWithSwfToHtml(input);
            } else if (useDjVuText()) {
                processWithDjVuText(input);
            } else if (useAlternateHtmlParser()) {
                htmlToText(input);
            } else {
                metadata = new Metadata();
                processWithTika(TikaInputStream.get(input));
            }
        } catch (Exception e) {
            // Best effort: extraction failures leave text/metadata null.
            e.printStackTrace();
        }
    }

    /** Runs the configured Tika parser over the stream, closing it afterwards. */
    private void processWithTika(InputStream input) {
        try {
            output = new ByteArrayOutputStream();
            try {
                type.process(input, output, metadata);
            } finally {
                input.close();
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * HTML "main text" extraction path (snacktory or boilerpipe), bypassing
     * Tika. Fills {@link #text} and {@link #meta2}.
     */
    private void htmlToText(InputStream input) {
        String rawData = convertStreamToString(input);
        try {
            Document doc = Jsoup.parse(rawData);
            meta2 = new HashMap<String, String>();
            if (OUTPUT_FORMAT_TEXT_MAIN_SNACKTORY.equals(outputFormat)) {
                ArticleTextExtractor extractor = new ArticleTextExtractor();
                OutputFormatter outputFormater = new OutputFormatter(10);
                outputFormater.setNodesToKeepCssSelector("p,h1,h2,h3,h4,h5,h6");
                extractor.setOutputFormatter(outputFormater);
                JResult res = extractor.extractContent(rawData);
                text = res.getText();
                meta2.put(META_TITLE, res.getTitle());
            } else {
                if (OUTPUT_FORMAT_TEXT_MAIN_BOILERPIPE_ARTICLE.equals(outputFormat))
                    text = ArticleExtractor.INSTANCE.getText(rawData);
                if (OUTPUT_FORMAT_TEXT_MAIN_BOILERPIPE_DEFAULT.equals(outputFormat))
                    text = DefaultExtractor.INSTANCE.getText(rawData);
                if (OUTPUT_FORMAT_TEXT_MAIN_BOILERPIPE_CANOLA.equals(outputFormat))
                    text = CanolaExtractor.INSTANCE.getText(rawData);
                if (doc != null) {
                    meta2.put(META_TITLE, doc.select("title").text());
                }
            }
            if (doc != null) {
                String author = getMetaContent(doc, "Author");
                if (author != null && !"".equals(author)) meta2.put(META_AUTHOR, author);
                String created = formatCreationDate(getMetaContent(doc, "CreationDate"));
                if (created != null) meta2.put(META_CREATED, created);
            }
            meta2.put(META_CONTENTSIZE, String.valueOf(rawData.length()));
            meta2.put(META_CONTENTTYPE, CONTENT_TYPE_HTML);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Normalizes a pdftotext-style CreationDate into ISO form:
     * "20130322143113Z00'00'" or "20130322143113+02'00'" -> "2013-03-22T14:31:13Z".
     * NOTE: as in the original implementation, a numeric zone offset is dropped
     * and the result is labeled "Z".
     *
     * @param creationDate raw value, may be null
     * @return formatted date, or null when absent or unrecognized
     */
    private static String formatCreationDate(String creationDate) {
        if (creationDate == null) return null;
        boolean recognized = CREATION_DATE_UTC.matcher(creationDate).find()
                || CREATION_DATE_OFFSET.matcher(creationDate).find();
        if (!recognized) return null;
        return String.format("%1$s-%2$s-%3$sT%4$s:%5$s:%6$sZ",
                creationDate.substring(0, 4), creationDate.substring(4, 6), creationDate.substring(6, 8),
                creationDate.substring(8, 10), creationDate.substring(10, 12), creationDate.substring(12, 14));
    }

    /**
     * Reads the whole stream into a string, one line per iteration.
     * BUG FIX: the previous version concatenated lines with no separator,
     * gluing the last word of a line to the first word of the next, and used
     * the platform-default charset; it now keeps line breaks and reads UTF-8
     * (the wrapper's default encoding).
     */
    private static String convertStreamToString(InputStream input) {
        try {
            BufferedReader br = new BufferedReader(new InputStreamReader(input, "UTF-8"));
            StringBuilder sb = new StringBuilder();
            String read = br.readLine();
            while (read != null) {
                sb.append(read).append('\n');
                read = br.readLine();
            }
            return sb.toString();
        } catch (Exception e) {
            e.printStackTrace();
            return null;
        }
    }

    /** Returns the extracted text of the last processed document, or null. */
    public String getText() {
        if (output != null) return output.toString();
        return text;
    }

    public String getMetaAuthor() {
        return getMetas() != null ? getMetas().get(META_AUTHOR) : null;
    }

    public String getMetaCreated() {
        return getMetas() != null ? getMetas().get(META_CREATED) : null;
    }

    public String getMetaTitle() {
        return getMetas() != null ? getMetas().get(META_TITLE) : null;
    }

    public String getMetaModified() {
        return getMetas() != null ? getMetas().get(META_MODIFIED) : null;
    }

    /** Content type without any ";charset=..." suffix, or null. */
    public String getMetaContentType() {
        if (getMetas() == null) return null;
        String value = getMetas().get(META_CONTENTTYPE);
        if (value != null && value.indexOf(";") != -1) value = value.substring(0, value.indexOf(";")).trim();
        return value;
    }

    /** The part after ";" in the content type (e.g. "charset=utf-8"), or null. */
    public String getMetaCharSet() {
        if (getMetas() == null) return null;
        String value = getMetas().get(META_CONTENTTYPE);
        if (value != null && value.indexOf(";") != -1)
            value = value.substring(value.indexOf(";") + 1).trim();
        else
            value = null;
        return value;
    }

    /**
     * Returns the metadata of the last processed document: either the map built
     * by an external/alternate path, or a lazily-built copy of Tika's metadata.
     * For multi-valued Tika entries the last value wins.
     */
    public Map<String, String> getMetas() {
        if (meta2 != null) return meta2;
        if (meta == null && metadata != null) {
            meta = new HashMap<String, String>();
            String[] names = metadata.names();
            for (String name : names) {
                for (String value : metadata.getValues(name)) {
                    meta.put(name, value);
                }
            }
        }
        return meta;
    }

    /** Directory for temporary files used by the external converters. */
    public void setTempPath(String tempPath) {
        this.tmpPath = tempPath;
    }

    /** Path to the pdftotext binary; enables the PDF fast path. */
    public void setPdfToTextPath(String pdfToTextPath) {
        this.pdfToTextPath = pdfToTextPath;
    }

    private boolean usePdfToText() {
        return (pdfToTextPath != null && !"".equals(pdfToTextPath) && CONTENT_TYPE_PDF.equals(contentType));
    }

    /** Path to the swf2html binary; enables the Flash fast path. */
    public void setSwfToHtmlPath(String swfToHtmlPath) {
        this.swfToHtmlPath = swfToHtmlPath;
    }

    private boolean useSwfToHtml() {
        return (swfToHtmlPath != null && !"".equals(swfToHtmlPath) && CONTENT_TYPE_SWF.equals(contentType));
    }

    /** Path to the djvutxt binary; enables the DjVu fast path. */
    public void setDjVuTextPath(String djVuTextPath) {
        this.djVuTextPath = djVuTextPath;
    }

    private boolean useDjVuText() {
        return (djVuTextPath != null && !"".equals(djVuTextPath) && CONTENT_TYPE_DJVU.equals(contentType));
    }

    /** Optional custom HTML-to-plain-text formatter for the SWF path. */
    public void setHtmlFormater(IHtmlFormater formater) {
        this.formater = formater;
    }

    /** True when the requested format is one of the HTML "main text" extractors. */
    private boolean isMainTextExtractionFormat() {
        return OUTPUT_FORMAT_TEXT_MAIN_SNACKTORY.equals(outputFormat)
                || OUTPUT_FORMAT_TEXT_MAIN_BOILERPIPE_DEFAULT.equals(outputFormat)
                || OUTPUT_FORMAT_TEXT_MAIN_BOILERPIPE_ARTICLE.equals(outputFormat)
                || OUTPUT_FORMAT_TEXT_MAIN_BOILERPIPE_CANOLA.equals(outputFormat);
    }

    /** True when HTML input should bypass Tika for a main-text extractor. */
    public boolean useAlternateHtmlParser() {
        return CONTENT_TYPE_HTML.equals(contentType) && isMainTextExtractionFormat();
    }

    /** Returns the content of {@code <meta name=...>}, or null when absent. */
    private String getMetaContent(Document doc, String metaName) {
        Elements e = doc.select("meta[name=" + metaName + "]");
        if (e == null || e.first() == null) return null;
        return e.first().attr("content");
    }

    /**
     * Copies the stream to the given file, closing both ends even on failure
     * (the previous version leaked the output stream when a read/write threw).
     * On failure the partially-written file is deleted.
     *
     * @return true on success
     */
    private boolean writeToFile(File tempFile, InputStream input) {
        OutputStream out = null;
        try {
            out = new FileOutputStream(tempFile);
            byte[] buf = new byte[1024];
            int len;
            while ((len = input.read(buf)) > 0)
                out.write(buf, 0, len);
            return true;
        } catch (Exception e) {
            if (tempFile != null && tempFile.exists()) tempFile.delete();
            e.printStackTrace();
            return false;
        } finally {
            try { if (out != null) out.close(); } catch (IOException ignored) { /* best effort */ }
            try { input.close(); } catch (IOException ignored) { /* best effort */ }
        }
    }

    /**
     * PDF fast path: runs pdftotext with -htmlmeta and parses its HTML output
     * for the text and the Title/Author/CreationDate metadata.
     */
    private void processWithPdfToText(InputStream input) {
        File tempFile = null;
        File tempFile2 = null;
        try {
            if (input != null && pdfToTextPath != null && !"".equals(pdfToTextPath)) {
                // Get a local copy of the file so the external tool can read it.
                tempFile = createTempFile("tmp", ".pdf", tmpPath);
                if (!writeToFile(tempFile, input)) return;
                meta2 = new HashMap<String, String>();
                meta2.put(META_CONTENTSIZE, String.valueOf(tempFile.length()));
                tempFile2 = createTempFile("tmp", ".html", tmpPath);
                Shell sh = new Shell();
                // pdftotext -enc UTF-8 -raw -q -htmlmeta -eol unix in.pdf out.html
                sh.exec(pdfToTextPath, "-enc", "UTF-8", "-raw", "-q", "-htmlmeta", "-eol", "unix",
                        tempFile.getAbsolutePath(), tempFile2.getAbsolutePath()).consumeAsString();
                tempFile.delete();
                // Reload the HTML and inject an explicit charset declaration right
                // after </head> so downstream parsing decodes it correctly.
                // (readLine()!=null replaces the previous ready() loop, which
                // could stop before end of file; the reader is now always closed.)
                StringBuilder sb = new StringBuilder();
                BufferedReader br1 = new BufferedReader(
                        new InputStreamReader(new FileInputStream(tempFile2), "UTF-8"));
                try {
                    String line;
                    while ((line = br1.readLine()) != null) {
                        sb.append(line).append("\n");
                        if ("</head>".equals(line)) {
                            sb.append("<meta http-equiv='Content-Type' content='text/html; charset=utf-8'>").append("\n");
                        }
                    }
                } finally {
                    br1.close();
                }
                tempFile2.delete();
                meta2.put(META_CONTENTTYPE, CONTENT_TYPE_PDF);
                text = sb.toString();
                Document doc = Jsoup.parse(text);
                if (doc != null) {
                    meta2.put(META_TITLE, doc.select("title").text());
                    meta2.put(META_AUTHOR, getMetaContent(doc, "Author"));
                    String created = formatCreationDate(getMetaContent(doc, "CreationDate"));
                    if (created != null) meta2.put(META_CREATED, created);
                    if (OUTPUT_FORMAT_TEXT.equals(outputFormat)) {
                        // Strip the markup down to plain body text.
                        Document doc2 = new Cleaner(Whitelist.basic()).clean(doc);
                        text = doc2.body().text();
                    }
                }
            }
        } catch (Exception e) {
            if (tempFile != null && tempFile.exists()) tempFile.delete();
            if (tempFile2 != null && tempFile2.exists()) tempFile2.delete();
            e.printStackTrace();
            text = null;
            meta2 = null;
        }
    }

    /**
     * Flash fast path: runs the configured swf2html tool, then optionally
     * flattens its HTML output to plain text.
     */
    public void processWithSwfToHtml(InputStream input) {
        File tempFile = null;
        File tempFile2 = null;
        try {
            if (input != null && swfToHtmlPath != null && !"".equals(swfToHtmlPath)) {
                // Get a local copy of the file so the external tool can read it.
                tempFile = File.createTempFile("tmp", ".swf");
                if (!writeToFile(tempFile, input)) return;
                // Convert with SWF2HTML
                tempFile2 = File.createTempFile("tmp", ".html");
                Shell sh = new Shell();
                sh.exec(swfToHtmlPath, "-o", tempFile2.getAbsolutePath(), tempFile.getAbsolutePath()).consumeAsString();
                tempFile.delete();
                String data = FileUtils.readFileToString(tempFile2, "UTF-8");
                tempFile2.delete();
                meta2 = new HashMap<String, String>();
                meta2.put(META_CONTENTSIZE, String.valueOf(data.length()));
                meta2.put(META_CONTENTTYPE, CONTENT_TYPE_SWF);
                if (OUTPUT_FORMAT_TEXT.equals(outputFormat)) {
                    if (formater != null) {
                        data = formater.getPlainText(data);
                    } else {
                        data = Jsoup.parse(data).body().text();
                    }
                }
                text = data;
            }
        } catch (Exception e) {
            if (tempFile != null && tempFile.exists()) tempFile.delete();
            if (tempFile2 != null && tempFile2.exists()) tempFile2.delete();
            e.printStackTrace();
        }
    }

    /**
     * DjVu fast path: runs djvutxt (djvutxt inputdjvufile outputtxtfile) and
     * stores its plain-text output. See
     * http://djvu.sourceforge.net/doc/man/djvutxt.html
     */
    private void processWithDjVuText(InputStream input) {
        File tempFile = null;
        File tempFile2 = null;
        try {
            if (input != null && djVuTextPath != null && !"".equals(djVuTextPath)) {
                // Get a local copy of the file so the external tool can read it.
                // NOTE(review): the ".pdf" suffix is cosmetic only (djvutxt does
                // not care), kept as-is.
                tempFile = createTempFile("tmp", ".pdf", tmpPath);
                if (!writeToFile(tempFile, input)) return;
                // Convert with djvutxt
                tempFile2 = File.createTempFile("tmp", ".txt");
                Shell sh = new Shell();
                sh.exec(djVuTextPath, tempFile.getAbsolutePath(), tempFile2.getAbsolutePath()).consumeAsString();
                tempFile.delete();
                String data = FileUtils.readFileToString(tempFile2, "UTF-8");
                tempFile2.delete();
                text = data;
            }
        } catch (Exception e) {
            if (tempFile != null && tempFile.exists()) tempFile.delete();
            if (tempFile2 != null && tempFile2.exists()) tempFile2.delete();
            e.printStackTrace();
        }
    }

    /**
     * Creates a temp file in the given directory when it exists, otherwise in
     * the system default temp directory.
     */
    private static File createTempFile(String prefix, String suffix, String directory) throws IOException {
        File tmpFile = null;
        if (directory == null)
            directory = "";
        if (!"".equals(directory))
            tmpFile = new File(directory);
        if (tmpFile == null || !tmpFile.exists() || !tmpFile.isDirectory())
            return File.createTempFile(prefix, suffix);
        else
            return File.createTempFile(prefix, suffix, tmpFile);
    }
}