/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.pdfbox.util; import java.io.ByteArrayOutputStream; import java.io.IOException; import java.io.OutputStreamWriter; import java.io.Writer; import java.util.regex.Matcher; import java.util.regex.Pattern; import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.pdmodel.PDPage; /** * Highlighting of words in a PDF document with an XML file. * * @author slagraulet (slagraulet@cardiweb.com) * @author <a href="mailto:ben@benlitchfield.com">Ben Litchfield</a> * @version $Revision: 1.7 $ * * @see <a href="http://partners.adobe.com/public/developer/en/pdf/HighlightFileFormat.pdf"> * Adobe Highlight File Format</a> */ public class PDFHighlighter extends PDFTextStripper { private Writer highlighterOutput = null; //private Color highlightColor = Color.YELLOW; private String[] searchedWords; private ByteArrayOutputStream textOS = null; private Writer textWriter = null; private static final String ENCODING = "UTF-16"; /** * Default constructor. * * @throws IOException If there is an error constructing this class. */ public PDFHighlighter() throws IOException { super(ENCODING); super.setLineSeparator( "" ); super.setPageSeparator( "" ); super.setWordSeparator( "" ); super.setShouldSeparateByBeads( false ); super.setSuppressDuplicateOverlappingText( false ); } /** * Generate an XML highlight string based on the PDF. * * @param pdDocument The PDF to find words in. * @param highlightWord The word to search for. * @param xmlOutput The resulting output xml file. * * @throws IOException If there is an error reading from the PDF, or writing to the XML. */ public void generateXMLHighlight(PDDocument pdDocument, String highlightWord, Writer xmlOutput ) throws IOException { generateXMLHighlight( pdDocument, new String[] { highlightWord }, xmlOutput ); } /** * Generate an XML highlight string based on the PDF. * * @param pdDocument The PDF to find words in. * @param sWords The words to search for. * @param xmlOutput The resulting output xml file. * * @throws IOException If there is an error reading from the PDF, or writing to the XML. */ public void generateXMLHighlight(PDDocument pdDocument, String[] sWords, Writer xmlOutput ) throws IOException { highlighterOutput = xmlOutput; searchedWords = sWords; highlighterOutput.write("<XML>\n<Body units=characters " + //color and mode are not implemented by the highlight spec //so don't include them for now //" color=#" + getHighlightColorAsString() + //" mode=active " + */ " version=2>\n<Highlight>\n"); textOS = new ByteArrayOutputStream(); textWriter = new OutputStreamWriter( textOS, ENCODING); writeText(pdDocument, textWriter); highlighterOutput.write("</Highlight>\n</Body>\n</XML>"); highlighterOutput.flush(); } /** * {@inheritDoc} */ protected void endPage( PDPage pdPage ) throws IOException { textWriter.flush(); String page = new String( textOS.toByteArray(), ENCODING ); textOS.reset(); //page = page.replaceAll( "\n", "" ); //page = page.replaceAll( "\r", "" ); //page = CCRStringUtil.stripChar(page, '\n'); //page = CCRStringUtil.stripChar(page, '\r'); // Traitement des listes � puces (caract�res sp�ciaux) if (page.indexOf("a") != -1) { page = page.replaceAll("a[0-9]{1,3}", "."); } for (int i = 0; i < searchedWords.length; i++) { Pattern pattern = Pattern.compile(searchedWords[i], Pattern.CASE_INSENSITIVE); Matcher matcher = pattern.matcher(page); while( matcher.find() ) { int begin = matcher.start(); int end = matcher.end(); highlighterOutput.write(" <loc " + "pg=" + (getCurrentPageNo()-1) + " pos=" + begin + " len="+ (end - begin) + ">\n"); } } } /** * Command line application. * * @param args The command line arguments to the application. * * @throws IOException If there is an error generating the highlight file. */ public static void main(String[] args) throws IOException { PDFHighlighter xmlExtractor = new PDFHighlighter(); PDDocument doc = null; try { if( args.length < 2 ) { usage(); } String[] highlightStrings = new String[ args.length - 1]; System.arraycopy( args, 1, highlightStrings, 0, highlightStrings.length ); doc = PDDocument.load( args[0] ); xmlExtractor.generateXMLHighlight( doc, highlightStrings, new OutputStreamWriter( System.out ) ); } finally { if( doc != null ) { doc.close(); } } } private static void usage() { System.err.println( "usage: java " + PDFHighlighter.class.getName() + " <pdf file> word1 word2 word3 ..." ); System.exit( 1 ); } /** * Get the color to highlight the strings with. Default is Color.YELLOW. * * @return The color to highlight strings with. */ /*public Color getHighlightColor() { return highlightColor; }**/ /** * Get the color to highlight the strings with. Default is Color.YELLOW. * * @param color The color to highlight strings with. */ /*public void setHighlightColor(Color color) { this.highlightColor = color; }**/ /** * Set the highlight color using HTML like rgb string. The string must be 6 characters long. * * @param color The color to use for highlighting. Should be in the format of "FF0000". */ /*public void setHighlightColor( String color ) { highlightColor = Color.decode( color ); }**/ /** * Get the highlight color as an HTML like string. This will return a string of six characters. * * @return The current highlight color. For example FF0000 */ /*public String getHighlightColorAsString() { //BJL: kudos to anyone that has a cleaner way of doing this! String red = Integer.toHexString( highlightColor.getRed() ); String green = Integer.toHexString( highlightColor.getGreen() ); String blue = Integer.toHexString( highlightColor.getBlue() ); return (red.length() < 2 ? "0" + red : red) + (green.length() < 2 ? "0" + green : green) + (blue.length() < 2 ? "0" + blue : blue); }**/ }