PDFHandler.java example

Explorer
platform2-master
- platform2-BRANCH_PLATFORM_2
  - src
package com.idega.block.websearch.business;

import java.util.zip.InflaterInputStream;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.io.*;

/**
 * <p><code>PDFHandler</code> Content handler for PDF documents. </p>
 * This class is a part of the websearch webcrawler and search engine block. <br>
 * It is based on the <a href="http://lucene.apache.org">Lucene</a> Java search engine from the Apache group and loosely <br>
 * on the work of David Duddleston of i2a.com.<br>
 *
 * @copyright Idega Software 2002
 * @author <a href="mailto:eiki@idega.is">Eirikur Hrafnsson</a>
 */

import java.util.List;


/**
 * Content handler that scrapes metadata and text out of a PDF byte stream.
 * <p>
 * This is a line-oriented scanner, not a PDF object parser. It looks at the
 * beginning of every line for one of a fixed set of keywords
 * (<code>/Author</code>, <code>/CreationDate</code>, <code>/Keywords</code>,
 * <code>/Subject</code>, <code>/Title</code>, <code>&lt;&lt;</code>,
 * <code>stream</code>, <code>endstream</code>) and reacts to the ones it
 * knows. Text is taken from streams whose dictionary is short and carries an
 * indirect <code>/Length</code>; everything between <code>(</code> and
 * <code>)</code> in such a stream is appended to the contents.
 * <p>
 * All parse state lives in unsynchronized instance fields, so one instance
 * must not parse two documents at the same time.
 */
public class PDFHandler implements ContentHandler {

    /** Read-ahead limit, in bytes, requested whenever a "&lt;&lt;" is marked. */
    private static final int MARK_LIMIT = 10000;

    /**
     * Longest dictionary body (the bytes between "&lt;&lt;" and "&gt;&gt;")
     * that is still treated as the header of a text stream.
     */
    private static final int MAX_PARAMS_LENGTH = 38;

    /** Document read by {@link #main(String[])} when no path is supplied. */
    private static final String DEFAULT_TEST_FILE = "/Users/eiki/Desktop/documents.pdf";

    /** Buffered view of the document being parsed; set by {@link #parse(InputStream)}. */
    private InputStream in;

    /** Number of calls made to {@link #read()} for the current document. */
    private int currentPosition;

    // Parses the digit part of a PDF date, e.g. 19960710150856.
    SimpleDateFormat dateFormatter;

    // Content data
    private String author;
    private long published;
    private String keywords;
    private String description;
    private String title;
    private StringBuffer contents;

    // Flags
    /** True once the dictionary in front of a "stream" keyword has been examined. */
    private boolean streamHit = false;
    /** True when the examined dictionary looked like a text stream header. */
    private boolean parseNextStream = false;

    // Compression
    private static final int NONE = 0;
    private static final int FLATE = 1;
    private static final int LZW = 2;
    private int compression = NONE;

    // Tokens. No token is a prefix of another one; findToken() relies on that.
    private static final char[] AUTHOR = "/Author".toCharArray();
    private static final char[] CREATIONDATE = "/CreationDate".toCharArray();
    private static final char[] ENDSTREAM = "endstream".toCharArray();
    private static final char[] KEYWORDS = "/Keywords".toCharArray();
    private static final char[] STREAM = "stream".toCharArray();
    private static final char[] SUBJECT = "/Subject".toCharArray();
    private static final char[] TITLE = "/Title".toCharArray();
    private static final char[] PARAMSTART = {'<', '<'};

    private static final char[][] tokens = {
        AUTHOR, CREATIONDATE, ENDSTREAM, KEYWORDS, STREAM, SUBJECT,
        TITLE, PARAMSTART
    };

    /**
     * Creates a handler with empty contents and no publication date.
     */
    public PDFHandler() {
        this.contents = new StringBuffer();
        this.published = -1;

        // 19960710150856
        this.dateFormatter = new SimpleDateFormat("yyyyMMddHHmmss");
    }

    /**
     * Tries to match one of the known tokens at the current input position.
     * <p>
     * Input is compared character by character against every token that has
     * not been ruled out yet. The token is returned as soon as its last
     * character has been read, so nothing after the token is consumed.
     * An end-of-line byte can never be part of a token; when one is read the
     * match simply starts over on the next line, which keeps CRLF pairs and
     * blank lines from hiding the line that follows them.
     *
     * @return the matching token constant (compare by identity), or
     *         <code>null</code> if the line does not start with a token or
     *         the input is exhausted; on a mismatch the mismatching byte has
     *         been consumed
     * @throws IOException if reading the document fails
     */
    private char[] findToken() throws IOException {
        // rejected[i] becomes true once tokens[i] disagrees with the input
        boolean[] rejected = new boolean[tokens.length];
        int remaining = tokens.length;
        int offset = 0;

        while (true) {
            int b = read();
            if (b == -1) {
                return null;
            }
            char ch = (char) b;

            if (isNewLineChar(ch)) {
                // the line ended before any token completed: start afresh
                for (int i = 0; i < rejected.length; i++) {
                    rejected[i] = false;
                }
                remaining = tokens.length;
                offset = 0;
                continue;
            }

            for (int i = 0; i < tokens.length; i++) {
                if (rejected[i]) {
                    continue;
                }
                char[] token = tokens[i];
                // a surviving token is always longer than offset, because it
                // would have been returned when its last character matched
                if (token[offset] != ch) {
                    rejected[i] = true;
                    remaining--;
                } else if (offset == token.length - 1) {
                    return token;
                }
            }
            if (remaining == 0) {
                return null;
            }
            offset++;
        }
    }

    /**
     * Returns the value found after <code>/Author</code>.
     *
     * @return the author, or <code>null</code> if none was found
     */
    public String getAuthor() {
        return this.author;
    }

    /**
     * Categories are not extracted from PDF documents.
     *
     * @return always <code>null</code>
     */
    public String getCategories() {
        return null;
    }

    /**
     * Returns the text collected from the parsed streams.
     *
     * @return the collected text, possibly empty, never <code>null</code>
     */
    public String getContents() {
        return this.contents.toString();
    }

    /**
     * Returns the value found after <code>/Subject</code>.
     *
     * @return the description, or <code>null</code> if none was found
     */
    public String getDescription() {
        return this.description;
    }

    /**
     * A HREF is not extracted from PDF documents.
     *
     * @return always <code>null</code>
     */
    public String getHREF() {
        return null;
    }

    /**
     * Returns the value found after <code>/Keywords</code>.
     *
     * @return the keywords, or <code>null</code> if none were found
     */
    public String getKeywords() {
        return this.keywords;
    }

    /**
     * Links are not extracted from PDF documents.
     *
     * @return always <code>null</code>
     */
    public List getLinks() {
        return null;
    }

    /**
     * Returns the time parsed from <code>/CreationDate</code>.
     *
     * @return milliseconds since the epoch, or -1 if no date was found or it
     *         could not be parsed
     */
    public long getPublished() {
        return this.published;
    }

    /**
     * Tells the crawler whether links should be followed.
     *
     * @return always <code>false</code>
     */
    public boolean getRobotFollow() {
        return false;
    }

    /**
     * Tells the crawler whether the document should be indexed.
     *
     * @return always <code>true</code>
     */
    public boolean getRobotIndex() {
        return true;
    }

    /**
     * Returns the value found after <code>/Title</code>.
     *
     * @return the title, or <code>null</code> if none was found
     */
    public String getTitle() {
        return this.title;
    }

    /**
     * Tells whether a character is a carriage return or a line feed.
     *
     * @param ch the character to test
     * @return <code>true</code> for '\n' and '\r'
     */
    private static boolean isNewLineChar(char ch) {
        return ch == '\n' || ch == '\r';
    }

    /**
     * Manual test driver: parses one PDF file and prints what was extracted.
     *
     * @param args optional; <code>args[0]</code> is the path of the file to
     *             parse, a built-in default path is used when it is missing
     */
    public static void main(String[] args) {
        String path = DEFAULT_TEST_FILE;
        if (args != null && args.length > 0) {
            path = args[0];
        }

        InputStream source = null;
        try {
            source = new FileInputStream(path);
            PDFHandler p = new PDFHandler();
            p.parse(source);
            System.out.println("Title: " + p.getTitle());
            System.out.println("Author: " + p.getAuthor());
            System.out.println("Published " + p.getPublished());
            System.out.println("Keywords: " + p.getKeywords());
            System.out.println("Description: " + p.getDescription());
            System.out.println("Content: " + p.getContents());
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            if (source != null) {
                try {
                    source.close();
                } catch (IOException ignored) {
                    // nothing useful can be done about a failed close here
                }
            }
        }
    }

    /**
     * Consumes input up to and including the next end-of-line byte.
     *
     * @return <code>true</code> if an end-of-line byte was found,
     *         <code>false</code> if the input ended first
     * @throws IOException if reading the document fails
     */
    private boolean nextLine() throws IOException {
        int b;
        while ((b = read()) != -1) {
            if (isNewLineChar((char) b)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Parses a PDF document and stores what was found in this handler.
     * <p>
     * Results of an earlier parse are discarded first. Any exception raised
     * while parsing is printed and swallowed; whatever was extracted up to
     * that point stays available through the getters. The stream is not
     * closed here.
     *
     * @param in the document bytes
     */
    public void parse(InputStream in) {
        try {
            reset();
            // mark()/reset() on the stream are needed to re-read dictionaries
            this.in = new BufferedInputStream(in);
            parseContent();
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Scans the document line by line and dispatches on the token found at
     * the start of each line.
     * <p>
     * Scanning stops at the end of the input, or before the next line when
     * the current thread has been interrupted (the interrupt flag is left
     * set for the caller).
     *
     * @throws IOException if reading the document fails
     */
    private void parseContent() throws IOException {
        Thread worker = Thread.currentThread();
        do {
            if (worker.isInterrupted()) {
                return; // isInterrupted() does not clear the flag
            }

            // tokens are shared constants, so identity comparison is enough;
            // a null token matches none of the branches
            char[] token = findToken();
            if (token == AUTHOR) {
                this.author = parseData();
            } else if (token == CREATIONDATE) {
                this.published = parseDate();
            } else if (token == KEYWORDS) {
                this.keywords = parseData();
            } else if (token == SUBJECT) {
                this.description = parseData();
            } else if (token == TITLE) {
                this.title = parseData();
            } else if (token == PARAMSTART) {
                // remember where the dictionary starts in case a stream follows
                this.in.mark(MARK_LIMIT);
            } else if (token == STREAM) {
                handleStreamKeyword();
            }
        } while (nextLine());
    }

    /**
     * Reacts to a <code>stream</code> keyword.
     * <p>
     * The first time the keyword is seen the input is rewound to the last
     * marked "&lt;&lt;" so the dictionary can be examined; scanning then
     * reaches the same keyword a second time, and that second hit reads the
     * stream data if the dictionary qualified.
     *
     * @throws IOException if reading the document fails
     */
    private void handleStreamKeyword() throws IOException {
        if (this.streamHit) {
            if (this.parseNextStream) {
                this.contents.append(parseDataStream());
                this.parseNextStream = false;
            }
            this.streamHit = false;
            return;
        }

        try {
            this.in.reset();
        } catch (IOException noUsableMark) {
            // no "<<" was marked, or the mark is too far back: this stream
            // cannot be examined, keep scanning instead of giving up
            return;
        }
        this.streamHit = true;
        parseDataParams();
    }

    /**
     * Reads the next PDF literal string, i.e. the text enclosed in
     * parentheses.
     * <p>
     * Everything up to the opening parenthesis is skipped. Balanced nested
     * parentheses are kept as text and backslash escapes are decoded, so the
     * value ends at the parenthesis that really closes the string.
     *
     * @return the decoded string, built with the platform default charset;
     *         empty if the input ends before an opening parenthesis
     * @throws IOException if reading the document fails
     */
    private String parseData() throws IOException {
        ByteArrayOutputStream text = new ByteArrayOutputStream();

        // look for start '('
        int b = read();
        while (b != -1 && b != '(') {
            b = read();
        }
        if (b == -1) {
            return "";
        }

        int depth = 1;
        b = read();
        while (b != -1) {
            if (b == '\\') {
                b = readEscape(text);
                continue; // b already holds the next unprocessed byte
            }
            if (b == '(') {
                depth++;
            } else if (b == ')') {
                depth--;
                if (depth == 0) {
                    break;
                }
            }
            text.write(b);
            b = read();
        }

        return new String(text.toByteArray());
    }

    /**
     * Decodes the escape sequence that follows a backslash inside a literal
     * string and appends the result to <code>sink</code>.
     * <p>
     * Handles the named escapes (n, r, t, b, f), one to three octal digits,
     * and a backslash in front of an end-of-line (which produces nothing).
     * Any other escaped byte, including parentheses and the backslash
     * itself, is appended unchanged.
     *
     * @param sink receives the decoded byte, if any
     * @return the first byte after the escape sequence, or -1 at end of input
     * @throws IOException if reading the document fails
     */
    private int readEscape(ByteArrayOutputStream sink) throws IOException {
        int b = read();

        if (b >= '0' && b <= '7') {
            int value = 0;
            for (int digits = 0; digits < 3 && b >= '0' && b <= '7'; digits++) {
                value = (value << 3) | (b - '0');
                b = read();
            }
            sink.write(value); // write(int) keeps only the low eight bits
            return b;
        }

        switch (b) {
            case -1 :
                return -1;
            case 'n' :
                sink.write('\n');
                break;
            case 'r' :
                sink.write('\r');
                break;
            case 't' :
                sink.write('\t');
                break;
            case 'b' :
                sink.write('\b');
                break;
            case 'f' :
                sink.write('\f');
                break;
            case '\r' :
                // line continuation; a CR LF pair counts as one line end
                b = read();
                return (b == '\n') ? read() : b;
            case '\n' :
                // line continuation
                break;
            default :
                sink.write(b);
                break;
        }
        return read();
    }

    /**
     * Reads a dictionary body up to the closing "&gt;&gt;" and decides
     * whether the stream that follows should be parsed for text.
     * <p>
     * The stream qualifies when the body is properly terminated, is at most
     * {@link #MAX_PARAMS_LENGTH} bytes long and contains both "0 R" and
     * "/Length ". In that case {@link #parseNextStream} is set and
     * {@link #compression} records a <code>/FlateDecode</code> or
     * <code>/LZWDecode</code> filter. A single "&gt;" is dropped from the
     * returned text.
     *
     * @return the dictionary text that was read
     * @throws IOException if reading the document fails
     */
    private String parseDataParams() throws IOException {
        ByteArrayOutputStream body = new ByteArrayOutputStream();
        boolean terminated = false;

        int b;
        while ((b = read()) != -1) {
            if (b == '>') {
                b = read();
                if (b == '>') {
                    terminated = true;
                    break;
                }
                if (b == -1) {
                    break; // input ended inside the dictionary
                }
            }
            body.write(b);
        }

        String params = new String(body.toByteArray());
        if (terminated
                && params.length() <= MAX_PARAMS_LENGTH
                && params.indexOf("0 R") != -1
                && params.indexOf("/Length ") != -1) {
            if (params.indexOf("/FlateDecode") != -1) {
                this.compression = FLATE;
            }
            if (params.indexOf("/LZWDecode") != -1) {
                this.compression = LZW;
            }
            this.parseNextStream = true;
        }

        return params;
    }

    /**
     * Reads the data of the stream whose keyword was just matched and
     * returns the text found in it.
     * <p>
     * The CR LF or LF that follows the keyword is skipped, then bytes are
     * collected until <code>endstream</code> or the end of the input.
     * Flate data is inflated first; LZW data cannot be decoded here and
     * yields no text. {@link #compression} is set back to NONE.
     *
     * @return the text enclosed in parentheses within the stream, built with
     *         the platform default charset; possibly empty
     * @throws IOException if reading the document fails or the Flate data is
     *                     corrupt
     */
    private String parseDataStream() throws IOException {
        ByteArrayOutputStream raw = new ByteArrayOutputStream();
        // bytes that match the beginning of "endstream" and may turn out to
        // be either the keyword or ordinary data
        ByteArrayOutputStream pending = new ByteArrayOutputStream(ENDSTREAM.length);

        // the end-of-line after the keyword is not part of the data
        int b = read();
        if (b == '\r') {
            b = read();
        }
        if (b == '\n') {
            b = read();
        }

        int matched = 0;
        boolean terminated = false;
        while (b != -1) {
            if (b == ENDSTREAM[matched]) {
                pending.write(b);
                matched++;
                if (matched == ENDSTREAM.length) {
                    terminated = true;
                    break;
                }
            } else {
                // false alarm: what looked like the keyword is data
                pending.writeTo(raw);
                pending.reset();
                matched = 0;
                if (b == ENDSTREAM[0]) {
                    pending.write(b);
                    matched = 1;
                } else {
                    raw.write(b);
                }
            }
            b = read();
        }
        if (!terminated) {
            // input ended first: a partial match is data as well
            pending.writeTo(raw);
        }

        int filter = this.compression;
        this.compression = NONE;

        if (filter == LZW) {
            // no LZW decoder available; compressed bytes contain no usable text
            return "";
        }

        byte[] data = raw.toByteArray();
        if (filter == FLATE) {
            data = inflate(data);
        }
        return extractText(data);
    }

    /**
     * Inflates zlib/deflate compressed data.
     *
     * @param deflated the compressed bytes
     * @return the decompressed bytes
     * @throws IOException if the data is corrupt or truncated
     */
    private static byte[] inflate(byte[] deflated) throws IOException {
        ByteArrayOutputStream out = new ByteArrayOutputStream();
        InflaterInputStream zin = new InflaterInputStream(new ByteArrayInputStream(deflated));
        try {
            byte[] chunk = new byte[1024];
            int n;
            while ((n = zin.read(chunk, 0, chunk.length)) != -1) {
                out.write(chunk, 0, n);
            }
        } finally {
            zin.close(); // releases the inflater created by the stream
        }
        return out.toByteArray();
    }

    /**
     * Concatenates everything that stands between a '(' and the next ')'.
     * Formatting operators outside the parentheses are dropped.
     *
     * @param data decoded stream data
     * @return the concatenated text, built with the platform default charset
     */
    private static String extractText(byte[] data) {
        ByteArrayOutputStream text = new ByteArrayOutputStream();
        boolean inside = false;
        for (int i = 0; i < data.length; i++) {
            byte c = data[i];
            if (!inside) {
                if (c == '(') {
                    inside = true;
                }
            } else if (c == ')') {
                inside = false;
            } else {
                text.write(c);
            }
        }
        return new String(text.toByteArray());
    }

    /**
     * Reads the next literal string and interprets it as a PDF date.
     * <p>
     * A leading "D:" is removed when present; the rest is handed to
     * {@link #dateFormatter}.
     *
     * @return milliseconds since the epoch, or -1 if the value cannot be
     *         parsed as a date
     * @throws IOException if reading the document fails
     */
    private long parseDate() throws IOException {
        String date = parseData();
        if (date.startsWith("D:")) {
            date = date.substring(2);
        }
        try {
            return this.dateFormatter.parse(date).getTime();
        } catch (ParseException e) {
            e.printStackTrace();
            return -1;
        }
    }

    /**
     * Reads one byte from the document and counts the call.
     *
     * @return the byte as a value from 0 to 255, or -1 at end of input
     * @throws IOException if reading the document fails
     */
    private final int read() throws IOException {
        ++this.currentPosition;
        return this.in.read();
    }

    /**
     * Clears everything left over from a previous parse.
     */
    private void reset() {
        // Content
        this.title = null;
        this.description = null;
        this.keywords = null;
        this.author = null;
        this.contents.setLength(0);
        this.published = -1;

        // Flags
        this.streamHit = false;
        this.parseNextStream = false;
        this.compression = NONE;

        this.currentPosition = 0;
    }
}