package com.neumino.pdftounusualhtml; import java.awt.image.BufferedImage; import java.io.BufferedWriter; import java.io.FileOutputStream; import java.io.FileWriter; import java.io.IOException; import java.sql.ResultSet; import java.sql.SQLException; import java.util.ArrayList; import java.util.List; import org.apache.pdfbox.exceptions.CryptographyException; import org.apache.pdfbox.exceptions.InvalidPasswordException; import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.pdmodel.PDPage; import org.apache.pdfbox.pdmodel.common.PDStream; import org.apache.pdfbox.pdmodel.font.PDFont; import org.apache.pdfbox.pdmodel.font.PDFontDescriptor; import org.apache.pdfbox.util.PDFImageWriter; import org.apache.pdfbox.util.PDFOperator; import org.apache.pdfbox.util.PDFTextStripper; import org.apache.pdfbox.util.TextPosition; import org.apache.commons.lang3.StringEscapeUtils; import com.google.gson.Gson; public class Pdf2Json extends PDFTextStripper{ // parameters private float zoom = (float) 1.5f; private String pathToImagemagick; private int marginTopBackground = 0; private int resolution = 72; //default resolution // group by line private StringBuffer currentLine = new StringBuffer(); private int lineMarginTop = 0; private int lineMarginLeft = 0; private int lineCurrentWidth = 0; private int lineHeight = 0; private int wordMarginLeft = 0; private int wordCurrentWidth = 0; private int currentFontSizePx = 0; private String currentFontString = ""; private Structure structure = new Structure(); private int idLine = 0; public Pdf2Json(float zoom, String pathToImagemagick) throws IOException { this.zoom = zoom; this.pathToImagemagick = pathToImagemagick; } /** * Convert a PDF file to HTML * * @param fileName Path to the file * * @throws IOException If there is an error processing the operation. */ public void convert(String pathToPdf) throws Exception { int positionDotPdf = pathToPdf.lastIndexOf(".pdf"); if (positionDotPdf == -1) { System.err.println("File doesn't have .pdf extension"); System.exit(1); } int positionLastSlash = pathToPdf.lastIndexOf("/"); String pathToDirectory; if (positionLastSlash == -1) { positionLastSlash = 0; pathToDirectory = ""; } else { positionLastSlash++; pathToDirectory = pathToPdf.substring(0, positionLastSlash); } String fileName = pathToPdf.substring(positionLastSlash, positionDotPdf); PDDocument document = null; try { document = PDDocument.load(pathToPdf); if(document.isEncrypted()){ try { document.decrypt(""); } catch( CryptographyException e ) { System.err.println( "\n\n Error: Document is encrypted with a password.\n" ); return ; } catch( IOException e ) { System.err.println( "\n\n Error: Document is encrypted with a password.\n" ); return ; } catch( InvalidPasswordException e ) { System.err.println( "\n\n Error: Document is encrypted with a password.\n" ); return ; } } List allPages = document.getDocumentCatalog().getAllPages(); int nbPage = allPages.size(); int exitVal = 0; int density = (int) (resolution*zoom); String imageName = fileName+".png"; if (nbPage == 1) { imageName = fileName+"-0.png"; } String command = pathToImagemagick+" -density "+density+" "+pathToPdf+" "+pathToDirectory+imageName; try { exitVal = ProcessTimeout.exec(command); } catch(IOException e) { System.err.println(e.toString()); e.printStackTrace(); exitVal = 1; //System.exit(1); } if (exitVal == -2) { System.err.println("Time out"); } else if (exitVal != 0) { System.err.println("Error, could not generate images"); } else if (exitVal == 0) { structure = new Structure(); for( int i=0; i<nbPage; i++ ) { idLine = 0; lineMarginTop = 0; lineMarginLeft = 0; lineCurrentWidth = 0; lineHeight = 0; wordMarginLeft = 0; wordCurrentWidth = 0; currentFontSizePx = 0; PDPage page = (PDPage)allPages.get( i ); BufferedImage image = page.convertToImage(BufferedImage.TYPE_INT_RGB, resolution); Page newPage = new Page((int) (zoom*image.getWidth()), (int) (zoom*image.getHeight()), marginTopBackground); marginTopBackground += (int) (zoom*image.getHeight()); structure.add(newPage); PDStream contents = page.getContents(); if( contents != null ) { try { this.processStream( page, page.findResources(), page.getContents().getStream() ); } catch (NumberFormatException e) { //trigerred once... System.err.println("\n\n Warning: NumberFormatException\n"); e.printStackTrace(); } } Word newWord = new Word(StringEscapeUtils.escapeJava(currentLine.toString()), wordMarginLeft, wordCurrentWidth); structure.addWordToLastPage(newWord); structure.updateLastLine(lineCurrentWidth); } //Save the structure in the file Gson gson = new Gson(); String json = gson.toJson(structure); try{ FileWriter fstream = new FileWriter(pathToDirectory+fileName+"_words.txt"); BufferedWriter out = new BufferedWriter(fstream); out.write(json); out.close(); } catch (Exception e){//Catch exception if any System.err.println("Error: " + e.getMessage()); } } } catch(IOException e) { System.err.println( "Could not open file"); } finally { if( document != null ) { document.close(); } } } /** * A method provided as an event interface to allow a subclass to perform * some specific functionality when text needs to be processed. * * @param text The text to be processed */ protected void processTextPosition( TextPosition text ) { try { int marginLeft = (int)((text.getXDirAdj())*zoom); int fontSizePx = Math.round(text.getFontSizeInPt()/72*resolution*zoom); int marginTop; marginTop = (int)((text.getYDirAdj())*zoom-fontSizePx*0.75); // 0.75 is purely experimental int height = fontSizePx; int width = (int) (text.getWidthDirAdj()*zoom); String fontString = ""; PDFont font = text.getFont(); PDFontDescriptor fontDescriptor = font.getFontDescriptor(); if (fontDescriptor != null) { fontString = fontDescriptor.getFontName(); } int widthSpace = (int) (text.getWidthOfSpace()*zoom); String charToAdd = text.getCharacter().replace("<", "<").replace(">", ">").replace("'", "\'"); try { processChar(charToAdd, marginLeft, marginTop, height, width, fontSizePx, fontString, widthSpace); } catch (SQLException e) { e.printStackTrace(); } } catch (IOException e) { e.printStackTrace(); } } private void processChar(String charToAdd, int marginLeft, int marginTop, int height, int width, int fontSizePx, String fontString, int widthSpace) throws IOException, SQLException { // We have a new line if ((lineMarginTop != marginTop) || (lineHeight != height) || (currentFontSizePx != fontSizePx) || ((marginLeft+width)-lineMarginLeft < 0)) { boolean display = true; if ((currentLine.equals("")) || (currentLine.equals(" "))) { // if there is nothing to display, we do not need to add a bloc display = false; } if (lineMarginTop != 0) { if (display) { Word newWord = new Word(StringEscapeUtils.escapeJava(currentLine.toString()), wordMarginLeft, wordCurrentWidth); structure.addWordToLastPage(newWord); } structure.updateLastLine(lineCurrentWidth); } Line newLine = new Line(idLine, marginLeft, marginTop, 0, height); idLine++; structure.add(newLine); lineMarginTop = marginTop; lineMarginLeft = marginLeft; lineCurrentWidth = width; lineHeight = height; wordCurrentWidth = width; wordMarginLeft = marginLeft; currentFontSizePx = fontSizePx; currentFontString = fontString; currentLine = new StringBuffer(); } else if ((charToAdd.equals(" ")) || (marginLeft-(wordMarginLeft+wordCurrentWidth) > widthSpace-3) || (marginLeft+width-wordMarginLeft < 0) || (currentFontString != fontString)) { // if we are on the same line but with a new word if (lineMarginTop != 0) { // condition could be skipped, there shouldn't be a character on the top left of the pdf currentLine.append(" "); } boolean display = true; if ((currentLine.equals("")) || (currentLine.equals(" "))) { // if there is nothing to display, we do not need to add a bloc display = false; } if (display) { Word newWord = new Word(StringEscapeUtils.escapeJava(currentLine.toString()), wordMarginLeft, wordCurrentWidth); structure.addWordToLastPage(newWord); } lineCurrentWidth = (marginLeft+width)-lineMarginLeft; wordCurrentWidth = width; wordMarginLeft = marginLeft; currentFontSizePx = fontSizePx; currentFontString = fontString; currentLine = new StringBuffer(); } else { lineCurrentWidth = (marginLeft+width)-lineMarginLeft; wordCurrentWidth = (marginLeft+width)-wordMarginLeft; } if (!charToAdd.equals(" ")) { currentLine.append(charToAdd); } }}