package org.juxtasoftware.util;

import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStreamWriter;
import java.io.Reader;
import java.util.Iterator;
import java.util.List;

import javax.xml.bind.JAXBException;

import org.apache.commons.io.IOUtils;
import org.apache.commons.lang.StringEscapeUtils;
import org.apache.tika.detect.DefaultDetector;
import org.apache.tika.exception.TikaException;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.HttpHeaders;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.sax.BodyContentHandler;
import org.docx4j.convert.in.xhtml.XHTMLImporter;
import org.docx4j.openpackaging.exceptions.Docx4JException;
import org.docx4j.openpackaging.packages.WordprocessingMLPackage;
import org.docx4j.openpackaging.parts.WordprocessingML.NumberingDefinitionsPart;
import org.juxtasoftware.Constants;
import org.juxtasoftware.model.PageMark;
import org.restlet.data.MediaType;
import org.restlet.engine.header.ContentType;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;

import eu.interedition.text.Range;

/**
 * Wrapper around Apache Tika, used to extract the main body text from files of
 * many different types. Supported: doc, docx, rtf, html, pdf, odt, epub
 *
 * @author loufoster
 */
public class ConversionUtils {
    protected static final Logger LOG = LoggerFactory.getLogger( Constants.WS_LOGGER_NAME );

    /**
     * Convert the plain text witness stream to HTML, including markup for page
     * breaks and line numbers.
     *
     * @param reader reader over the witness plain text
     * @param range  character range to convert; null converts the full text
     * @param marks  page break / line number marks to render into the HTML
     * @return a temporary file containing the generated HTML
     * @throws IOException
     */
    public static File witnessToHtml(Reader reader, Range range, List<PageMark> marks) throws IOException {
        File out = File.createTempFile("wit", "dat");
        out.deleteOnExit();
        OutputStreamWriter osw = new OutputStreamWriter(new FileOutputStream(out), "UTF-8");
        if ( range == null ) {
            range = new Range(0, Integer.MAX_VALUE);
        }

        // Get the page number marks and select the first one that is
        // within the requested range.
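        // (Illustrative note, offsets hypothetical: given marks at offsets 10, 50
        // and 90 and a range starting at 40, the loop below skips the mark at 10
        // and leaves currMark on the mark at 50, the first one inside the range.)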
        Iterator<PageMark> markItr = marks.iterator();
        PageMark currMark = null;
        if (markItr.hasNext()) {
            currMark = markItr.next();
            while (currMark != null && currMark.getOffset() < range.getStart()) {
                if (markItr.hasNext()) {
                    currMark = markItr.next();
                } else {
                    // every mark falls before the range; render none of them
                    currMark = null;
                }
            }
        }

        // Stream witness text from the db into the file, including line number
        // and page break markup
        long pos = 0;
        StringBuilder line = new StringBuilder();
        while (pos <= range.getEnd()) {
            int data = reader.read();
            if (data == -1) {
                break;
            } else {
                if (pos >= range.getStart() && (pos + 1) <= range.getEnd()) {
                    if (currMark != null && currMark.getOffset() == pos) {
                        line.append(currMark.toHtml());
                        currMark = null;
                        if (markItr.hasNext()) {
                            currMark = markItr.next();
                        }
                    }

                    if (data == '\n') {
                        line.append("<br/>");
                        osw.write(line.toString());
                        line = new StringBuilder();
                    } else {
                        line.append(StringEscapeUtils.escapeHtml(Character.toString((char) data)));
                    }
                }
                pos++;
            }
        }

        // Flush any text buffered after the last newline
        if (line.length() > 0) {
            osw.write(line.toString());
        }

        IOUtils.closeQuietly(osw);
        IOUtils.closeQuietly(reader);
        return out;
    }

    /**
     * Use Tika auto-detection to determine the media type of the given file.
     */
    public static MediaType determineMediaType( File f ) {
        TikaInputStream tis = null;
        String mimeType = null;
        try {
            tis = TikaInputStream.get(f);
            Metadata md = new Metadata();
            ParseContext context = new ParseContext();
            DefaultDetector detector = new DefaultDetector();
            Parser parser = new AutoDetectParser(detector);
            context.set(Parser.class, parser);
            parser.parse(tis, new DefaultHandler(), md, context);
            mimeType = md.get(HttpHeaders.CONTENT_TYPE);
        } catch (Exception e) {
            // Detection failed; fall through with a null MIME type
            LOG.warn("Unable to determine media type of " + f, e);
        } finally {
            IOUtils.closeQuietly(tis);
        }
        return ContentType.readMediaType(mimeType);
    }

    /**
     * Convert XHTML from the reader into a new docx file.
     */
    public static File convertHtmlToDocx( Reader htmlReader ) throws IOException, JAXBException, Docx4JException {
        File outFile = File.createTempFile("edition", "docx");
        outFile.deleteOnExit();

        WordprocessingMLPackage wordMLPackage = WordprocessingMLPackage.createPackage();
        NumberingDefinitionsPart ndp = new NumberingDefinitionsPart();
        wordMLPackage.getMainDocumentPart().addTargetPart(ndp);
        ndp.unmarshalDefaultNumbering();

        // Convert the XHTML and add it into the empty docx created above
        wordMLPackage.getMainDocumentPart().getContent().addAll(
            XHTMLImporter.convert(htmlReader, null, wordMLPackage) );

        wordMLPackage.save( outFile );
        return outFile;
    }

    /**
     * Auto-detect the content type of the input stream and extract the main
     * body content as plain text. Results are streamed into a temporary file.
     *
     * @param srcInputStream stream over the source document
     * @return a temporary UTF-8 text file containing the extracted content
     * @throws IOException
     * @throws TikaException
     * @throws SAXException
     */
    public static File convertToText( InputStream srcInputStream ) throws IOException, SAXException, TikaException {
        File txtFile = null;
        OutputStreamWriter osw = null;
        try {
            // create the UTF-8 temp file to hold the extracted text content
            txtFile = File.createTempFile("txt", "dat");
            txtFile.deleteOnExit();
            osw = new OutputStreamWriter(new FileOutputStream( txtFile ), "UTF-8" );

            // Set up Tika
            Metadata md = new Metadata();
            ParseContext context = new ParseContext();
            DefaultDetector detector = new DefaultDetector();
            Parser parser = new AutoDetectParser(detector);
            context.set(Parser.class, parser);
            BodyContentHandler handler = new BodyContentHandler(osw);

            // Convert the input to a Tika-friendly stream and extract the body
            // text; results wind up in the temp file
            TikaInputStream tis = TikaInputStream.get(srcInputStream);
            parser.parse(tis, handler, md, context);
        } finally {
            IOUtils.closeQuietly(osw);
        }

        try {
            EncodingUtils.stripUnknownUTF8(txtFile);
        } catch (IOException e) {
            LOG.warn("Unable to strip unknown UTF8 characters from auto-transformed source", e);
        }
        return txtFile;
    }
}
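/*
 * Usage sketch (illustrative only; the file name, variable names and the
 * surrounding try/finally are assumptions, not part of this class):
 *
 *   File src = new File("witness.docx");
 *   MediaType mediaType = ConversionUtils.determineMediaType(src);
 *   InputStream in = new FileInputStream(src);
 *   try {
 *       File txt = ConversionUtils.convertToText(in);  // temp UTF-8 text file
 *       // ... read or persist the extracted text ...
 *   } finally {
 *       IOUtils.closeQuietly(in);
 *   }
 */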