package com.idega.block.websearch.business; import java.util.zip.InflaterInputStream; import java.text.ParseException; import java.text.SimpleDateFormat; import java.io.*; /** * <p><code>PDFHandler</code> Content handler for PDF documents. </p> * This class is a part of the websearch webcrawler and search engine block. <br> * It is based on the <a href="http://lucene.apache.org">Lucene</a> java search engine from the Apache group and loosly <br> * from the work of David Duddleston of i2a.com.<br> * * @copyright Idega Software 2002 * @author <a href="mailto:eiki@idega.is">Eirikur Hrafnsson</a> */ import java.util.List; public class PDFHandler implements ContentHandler { private InputStream in; /* * Input cache. This is much faster than calling down to a synchronized * method of BufferedReader for each byte. Measurements done 5/30/97 * show that there's no point in having a bigger buffer: Increasing * the buffer to 8192 had no measurable impact for a program discarding * one character at a time (reading from an http URL to a local machine). */ private byte buf[] = new byte[256]; private int pos; private int len; /* tracks position relative to the beginning of the document. */ private int currentPosition; // 1996.07.10 15:08:56 PST SimpleDateFormat dateFormatter; // Content Data private String author; private long published; private String keywords; private String description; private String title; private StringBuffer contents; // Flags private boolean streamHit = false; private boolean parseNextStream = false; // Compression private static final int NONE = 0; private static final int FLATE = 1; private static final int LZW = 2; private int compression = NONE; // TOKENS private static final char[] AUTHOR = "/Author".toCharArray(); private static final char[] CREATIONDATE = "/CreationDate".toCharArray(); private static final char[] ENDSTREAM = "endstream".toCharArray(); private static final char[] KEYWORDS = "/Keywords".toCharArray(); private static final char[] STREAM = "stream".toCharArray(); private static final char[] SUBJECT = "/Subject".toCharArray(); private static final char[] TITLE = "/Title".toCharArray(); private static final char[] NEWLINE = {'\n'}; private static final char[] RETURN = {'\r'}; private static final char[] PARAMSTART = {'<','<'}; private static final char[][] tokens = { AUTHOR, CREATIONDATE, ENDSTREAM, KEYWORDS, STREAM, SUBJECT, TITLE, PARAMSTART }; /** * PdfParser constructor comment. */ public PDFHandler() { this.contents = new StringBuffer(); this.published = -1; // 19960710150856 this.dateFormatter = new SimpleDateFormat("yyyyMMddHHmmss"); } /** * Look for tokens. This is not effiecent. * Should use low, hi method with ordered array. NEED TO RECODE */ private char[] findToken() throws IOException { // flags if token still matches. boolean[] match = new boolean[tokens.length]; for (int i = 0; i < match.length; i++) { match[i] = true; } // how many tokens still match; int matchCount = tokens.length; // current position to look for char match in tokens int charPosition = 0; // look for matching tokens. while (true) { int b = read(); if (b == -1 ) { break; } char ch = (char)b; // loop through all tokens for (int i = 0; i < tokens.length; i++) { // check to see if match flag is true for this token if (match[i] == true) { // get the token char[] token = tokens[i]; // check if char array of token is in bounds if (charPosition >= token.length) { // out of bounds, check to see if other tokens still match if (matchCount >= 2) { // other tokens still match, set this one to false. match[i] = false; matchCount--; } else { // last matching token; return token; } // token is in bounds, check for match on char at charPosition. } else { if (token[charPosition] != ch) { // did not match, set match to false; match[i] = false; matchCount--; } } } } if (matchCount <= 0 ) { break; } charPosition++; } return null; } /** * Parse Content. [24] 320:1 */ public String getAuthor() { return this.author; } /** * Return categories (from META tags) */ public String getCategories() { return null; } /** * Parse Content. [24] 320:1 */ public String getContents() { return this.contents.toString(); } /** * Parse Content. [24] 320:1 */ public String getDescription() { return this.description; } /** * Return META HREF */ public String getHREF() { return null; } /** * Parse Content. [24] 320:1 */ public String getKeywords() { return this.keywords; } /** * Return links */ public List getLinks() { return null; } /** * Parse Content. [24] 320:1 */ public long getPublished() { return this.published; } /** * Return boolean true if links are to be followed */ public boolean getRobotFollow() { return false; } /** * Return boolean true it this is to be indexed */ public boolean getRobotIndex() { return true; } /** * Parse Content. [24] 320:1 */ public String getTitle() { return this.title; } /** * Check for new line chars */ private boolean isNewLineChar(char ch) { switch (ch) { case '\n' : return true; case '\r' : return true; default : return false; } } /** * Insert the method's description here. * Creation date: (2/21/2001 7:50:24 PM) * @param args java.lang.String[] */ public static void main(String[] args) { //System.out.println("test"); try { String path = "/Users/eiki/Desktop/documents.pdf"; PDFHandler p = new PDFHandler(); p.parse(new FileInputStream(path)); System.out.println("Title: " + p.getTitle()); System.out.println("Author: " + p.getAuthor()); System.out.println("Published " + p.getPublished()); System.out.println("Keywords: " + p.getKeywords()); System.out.println("Description: " + p.getDescription()); System.out.println("Content: " + p.getContents()); } catch (Exception e) {e.printStackTrace();} } /** * Parse Content. [24] 320:1 */ private boolean nextLine() throws IOException { //System.out.println("look for new line"); while (true) { int b = read(); if (b == -1 ) { return false; } if (isNewLineChar((char)b)) { return true; } } } /** * Parse Content. */ public void parse(InputStream in) { //System.out.println("mark supported" + in.markSupported()); try { this.in = new BufferedInputStream(in); reset(); parseContent(); //System.out.println("Title: " + getTitle()); //System.out.println("Author: " + getAuthor()); //System.out.println("Published " + getPublished()); //System.out.println("Keywords: " + getKeywords()); //System.out.println("Description: " + getDescription()); //System.out.println("Content: " + getContents()); //int b; //while ((b = in.read()) != -1) { //System.out.print((byte)b + "."); //System.out.print((char)b + "*"); //} } catch (Exception e) {e.printStackTrace();} } /** * Parse Content. [24] 320:1 */ private void parseContent() throws IOException { Thread curThread = Thread.currentThread(); while (true) { if (curThread.isInterrupted()) { curThread.interrupt(); // resignal the interrupt break; } char[] token; while (true) { token = findToken(); if (token != null) { //System.out.println("found a token : " + token); if (token == AUTHOR) { this.author = parseData(); } else if (token == CREATIONDATE) { this.published = parseDate(); } else if (token == KEYWORDS) { this.keywords = parseData(); } else if (token == SUBJECT) { this.description = parseData(); } else if (token == TITLE) { this.title = parseData(); } else if (token == PARAMSTART) { //System.out.println("param set mark"); this.in.mark(10000); //parseDataParams(); } else if (token == STREAM) { if (!this.streamHit) { //System.out.println("new stream hit"); // first time this stream has been hit // go back and parseDataParams. this.in.reset(); this.streamHit = true; parseDataParams(); } else { //System.out.println("second stream hit"); if (this.parseNextStream) { this.contents.append(parseDataStream()); this.parseNextStream = false; } this.streamHit = false; } } } if (!nextLine()) { //System.out.println("no new line"); break; } //System.out.println("new line"); } //System.out.println("hello"); break; } } /** * Look for tokens. This is not effiecent. * Should use low, hi method with ordered array. NEED TO RECODE */ private String parseData() throws IOException { ByteArrayOutputStream temp = new ByteArrayOutputStream(); // look for start '(' while (true) { int b = read(); if (b == -1 ) { break; } char ch = (char)b; if (ch == '(') { break; } } while (true) { int b = read(); if (b == -1 ) { break; } char ch = (char)b; if (ch == ')') { break; } temp.write(b); } return new String(temp.toByteArray()); } /** * Look for tokens. This is not effiecent. * Should use low, hi method with ordered array. NEED TO RECODE */ private String parseDataParams() throws IOException { ByteArrayOutputStream temp = new ByteArrayOutputStream(); boolean end = false; int b = read(); while (true) { // check to see if new line; if ((char)b == '>') { b = read(); if ((char)b == '>') { end = true; break; } else { temp.write(b); } } else { temp.write(b); } if (end) { break; } b = read(); } String params = new String(temp.toByteArray()); //System.out.println(params.length()); //System.out.println(params); if (params.length() < 38 && params.indexOf("0 R") != -1 && params.indexOf("/Length ") != -1) { if (params.indexOf("/FlateDecode") != -1) { this.compression = FLATE; } if (params.indexOf("/LZWDecode") != -1) { this.compression = LZW; } this.parseNextStream = true; //System.out.println(); //System.out.println(params); } return new String(temp.toByteArray()); } /** * Look for tokens. This is not effiecent. * Should use low, hi method with ordered array. NEED TO RECODE */ private String parseDataStream() throws IOException { ByteArrayOutputStream temp = new ByteArrayOutputStream(); ByteArrayOutputStream tmp = new ByteArrayOutputStream(ENDSTREAM.length); boolean endstream = false; int b = read(); char ch = (char)b; while (true) { // check to see if new line; if (isNewLineChar(ch)) { // check to see if it is endstream tmp.reset(); boolean notMatch = false; for (int i = 0; i < ENDSTREAM.length; i++) { b = read(); tmp.write(b); if ((char)b != ENDSTREAM[i]) { // not endsteam break.. notMatch = true; tmp.writeTo(temp); break; } } if (!notMatch) { endstream = true; } } else { // not new line append byte temp.write(b); b = read(); ch = (char)b; } if (endstream) { break; // endstream found } } // Uncompress if flateDecode is used if (this.compression == FLATE) { //System.out.println("FlateDecode = " +flateDecode); ByteArrayInputStream bis = new ByteArrayInputStream(temp.toByteArray()); InflaterInputStream iin = new InflaterInputStream(bis); temp.reset(); while ((b = iin.read()) != -1) { temp.write(b); } } //System.out.println(temp.size()); //System.out.println(new String(temp.toByteArray())); // parse content out from formating data. Content is wrapped in a // bunch of () // look for start '(' ByteArrayInputStream bis = new ByteArrayInputStream(temp.toByteArray()); tmp.reset(); boolean end = false; while (true) { b = bis.read(); if (b == -1 ) { break; } if ((char)b == '(') { while (true) { b = bis.read(); if (b == -1 ) {end = true; break;} // look for end ')' if ((char)b == ')') { break; } tmp.write(b); } } if (end) { break; } } // reset flateDecode flag this.compression = NONE; //System.out.println(tmp.size()); //System.out.println(new String(tmp.toByteArray())); return new String(tmp.toByteArray()); } /** * Look for tokens. This is not effiecent. * Should use low, hi method with ordered array. NEED TO RECODE */ private long parseDate() throws IOException { try { String date = parseData(); return this.dateFormatter.parse(date.substring(2, date.length())).getTime(); } catch(ParseException e) { e.printStackTrace(); return -1; } } private final int read() throws IOException { ++this.currentPosition; return this.in.read(); //return in.read(); /* if (pos >= len) { // This loop allows us to ignore interrupts if the flag // says so for (;;) { try { len = in.read(buf); System.out.println("next"); break; } catch (InterruptedIOException ex) { throw ex; } } if (len <= 0) { return -1; // eof } pos = 0; } ++currentPosition; return buf[pos++]; */ } private final char readCh() throws IOException { ++this.currentPosition; return (char)this.in.read(); /* if (pos >= len) { // This loop allows us to ignore interrupts if the flag // says so for (;;) { try { len = in.read(buf); System.out.println("next"); break; } catch (InterruptedIOException ex) { throw ex; } } if (len <= 0) { return -1; // eof } pos = 0; } ++currentPosition; return buf[pos++]; */ } /** * Return contents */ private void reset() { // Content this.title = null; this.description = null; this.keywords = null; this.author = null; this.contents.setLength(0); this.published = -1; // Flags this.streamHit = false; this.parseNextStream = false; this.compression = NONE; //buf[] = new byte[256]; //pos = 0; //len = 0; //currentPosition = 0; } }