package uk.co.mmscomputing.imageio.pdf; import java.io.*; import java.util.*; import javax.imageio.ImageIO; import javax.imageio.stream.ImageInputStream; public class PDFScanner implements PDFConstants{ private static int[] charTable; private static Dictionary keywords; private ImageInputStream in; private int c; private int sym; public int intval; public double realval; public String name=""; public String str; public int symbol; private long length; private PDFBody body; public PDFScanner(PDFBody body,ImageInputStream in)throws IOException{ this.body = body; this.in = in; c=' '; symbol=0; length=-1; } public PDFIndirectObject getIndirectObject(int on,int gn){ return body.getIndirectObject(on,gn); } public void seek(long pos)throws IOException{ in.seek(pos); c=' '; symbol=0; rindex=0; scan(); } public long getLength()throws IOException{ if(length==-1){ length = in.length(); if(length==-1){ // we need to go to the end of the file byte[] buf=new byte[32*1024]; length=0; int count; while((count=in.read(buf,0,buf.length))!=-1){ length+=count; } } } return length; } public void find(int token)throws IOException{ do{ scan(); if(symbol==T_EOF){ throw new IOException(getClass().getName()+".find\n\t Cannot find token: "+token+". Unexpected EOF."); } }while(symbol!=token); } private void read()throws IOException{ c=in.read(); // System.err.println(""+c+" "+Integer.toHexString(c)+" "+(char)c); sym=(c!=-1)?charTable[c]:-1; } public String readLine()throws IOException{ int b; String line = ""; while((b=in.read())!=-1){ // System.err.println(""+b+" "+Integer.toHexString(b)+" "+(char)b); if((b=='\n')||(b=='\r')){break;} } return line; } public void scanEOL()throws IOException{ while((c!='\r')&&(c!='\n')&&(c!=-1)){ read(); } if(c=='\r'){ // msdos: \r\n 0D0A read(); if(c=='\n'){ read(); } }else if(c=='\n'){ // unix: \n 0A read(); } } public byte[] scanStream(byte[] buf)throws IOException{ scanEOL(); if(c==-1){ throw new IOException(getClass().getName()+".readStream:\n\t Unexpected EOF."); } buf[0]=(byte)c; int count = in.read(buf,1,buf.length-1)+1; // read stream of bytes if(count!=buf.length){ throw new IOException(getClass().getName()+".readStream:\n\t Not enough bytes in inputstream! ["+count+" != "+buf.length+"]"); } read(); // read next character; should be eol symbol or (e)ndstream return buf; } public byte[] scanStream(int len)throws IOException{ return scanStream(new byte[len]); } public String scanComment()throws IOException{ str=""; read(); while((c!='\n')&&(c!='\r')&&(c!=-1)){ str+=(char)c; read(); } if(c=='\r'){ // msdos: \r\n 0D0A read(); if(c=='\n'){ read(); } }else if(c=='\n'){ // unix: \n 0A read(); } return str; } private void scanName()throws IOException{ str=""; read(); // first character must be letter while((sym==T_CHAR)||(sym==T_INTEGER)){ str+=(char)c; read(); } symbol=T_NAME; // System.err.println("Name = "+str); } private void scanString()throws IOException{ str=""; read(); while(sym!=T_STRING_END){ if((' '<=c)&&(c<=127)){ if(c=='\\'){ // escape character read(); if((c=='\n')||(c=='\r')){ // todo: \r\n read(); continue; }else if(c=='t'){ c='\t'; }else if(c=='b'){ c='\b'; }else if(c=='f'){ c='\f'; }else if(c=='\\'){ c='\\'; }else if(c=='('){ c='('; }else if(c==')'){ c=')'; }else if(('0'<=c)&&(c<='7')){ // todo: octal \d or \dd or \ddd i.e. \245 } } str+=(char)c; }else{ System.err.println(getClass().getName()+".scanLiteral:\n\tInvalid character[0x"+Integer.toHexString(c)+"] in literal."); } read(); } read(); symbol=T_STRING; } private void scanHexString()throws IOException{ char b; int h,l; str = ""; while(c!='>'){ h=0;l=0; if(c==-1){ System.err.println(getClass().getName()+".scanHexString:\n\tMissing '>'"); break; } if(('0'<=c)&&(c<='9')){ h=c-'0'; }else if(('A'<=c)&&(c<='F')){ h=c-'A'+10; }else if(('a'<=c)&&(c<='f')){ h=c-'a'+10; }else{ System.err.println(getClass().getName()+".scanHexString:\n\tWrong character '"+((char)c)+"'"); } read(); if(c==-1){ System.err.println(getClass().getName()+".scanHexString:\n\tMissing '>'"); break; } if(c=='>'){ l=0; }else{ if(('0'<=c)&&(c<='9')){ l=c-'0'; }else if(('A'<=c)&&(c<='F')){ l=c-'A'+10; }else if(('a'<=c)&&(c<='f')){ l=c-'a'+10; }else{ System.err.println(getClass().getName()+".scanHexString:\n\tWrong character '"+((char)c)+"'"); } read(); } b = ((char)((h<<4)|l)); str+=b; } read(); symbol=T_STRING; } private void scanIdentifier()throws IOException{ str=""; while((sym==T_CHAR)||(sym==T_INTEGER)){ str+=(char)c; read(); } name=str; Integer val=(Integer)keywords.get(name); if(val!=null){ symbol=val.intValue(); }else{ symbol=T_NAME; } } private void scanReal()throws IOException{ symbol=T_REAL; realval=intval; scanNumber(); } private void scanNumber()throws IOException{ int i,j; int d; int[] dig=new int[11]; boolean hex; hex=false; dig[0]=c-0x30; i=0; while(true){ if(i==(dig.length-1)){System.err.println(getClass().getName()+".scanNumber:\n\tToo many digits["+i+"] in number.");break;} read(); i+=1; if(c=='.'){break;} if(c<'0'){break;} if(c<='9'){ dig[i]=c-0x30; }else if(('a'<=c)&&(c<='f')){ hex=true;dig[i]=c-'a'+10; }else if(('A'<=c)&&(c<='F')){ hex=true;dig[i]=c-'A'+10; }else{ break; } } if(c=='.'){ read(); scanReal(); return; } if((c=='H')||(c=='h')){ // hexadecimal read(); j=0; while(!((i-j==8)||(i-j==4)||(i-j==2))&&(j<i-1)&&(dig[j]==0)){ j+=1; } if(i>8){j=i-8;} intval=dig[j]; if(intval>=8){intval-=0x10;} j+=1; while(j<i){ intval=intval*0x10+dig[j];j+=1; } symbol=T_INTEGER; }else if(!hex){ // decimal j=0; while((j<i-1)&&(dig[j]==0)){ j+=1; } intval=0; do{ intval=intval*10+dig[j];j+=1; }while(j<i); if(c=='X'){ // char read(); if((intval<0)||(255<intval)){ System.err.println(getClass().getName()+".scanNumber:\n\tChar ordinal["+intval+"] out of range."); } symbol=T_CHAR; }else{ symbol=T_INTEGER; } }else{ System.err.println(getClass().getName()+".scanNumber:\n\tInvalid number."); intval=0; symbol=0; } intval=intval; } public void scanStartXRefNumber()throws IOException{ symbol=0; while(sym==T_WHITE){read();} if(sym!=T_INTEGER){ throw new IOException(getClass().getName()+".scanStartXRefNumber:\n\tMissing startXRef offset."); } scanNumber(); } private void scan1()throws IOException{ symbol=0; while(sym==T_WHITE){read();} switch(sym){ case T_CHAR: scanIdentifier(); break; case T_INTEGER: scanNumberR(); break; case T_NAME: scanName(); break; // /Name case T_STRING_START: scanString(); break; // ( string ) case T_LSS: read(); if(sym==T_LSS){ // << symbol=T_DICTIONARY_START; read(); }else{ // <string as hex> scanHexString(); } break; case T_COMMENT: // %...EOL scanComment(); scan1(); break; case T_GTR: read(); if(sym==T_GTR){ // >> symbol=T_DICTIONARY_END; read(); } break; default: symbol=sym; read(); break; } } private int rmax = 3; private int rindex = 0; public int[] rvalues = new int[rmax]; private int[] rsymbols = new int[rmax]; public int objectNumber,generationNumber; private void scanNumberR1()throws IOException{ if(rindex<rmax){ while(sym==T_WHITE){read();} if(sym==T_INTEGER){ scanNumberR(); }else{ scan1(); rsymbols[rindex++]=symbol; } } } private void scanNumberR()throws IOException{ scanNumber(); rsymbols[rindex] = symbol; rvalues [rindex] = intval; rindex++; scanNumberR1(); } private void remove(){ intval = rvalues[0]; symbol = rsymbols[0]; rindex--; for(int i=0;i<rindex;i++){ rvalues[i]=rvalues[i+1]; rsymbols[i]=rsymbols[i+1]; } } public void scan()throws IOException{ if(rindex>0){ if(rsymbols[rindex-1]==T_INTEGER){ scanNumberR1(); if(symbol==T_R){ // System.err.println("a rindex = "+rindex); rindex-=2; generationNumber = rvalues[rindex]; rindex-=1; objectNumber = rvalues[rindex]; rsymbols[rindex++] = symbol; } } remove(); }else{ scan1(); if(symbol==T_R){ // System.err.println("b rindex = "+rindex); rindex-=2; generationNumber = rvalues[rindex]; rindex-=1; objectNumber = rvalues[rindex]; }else if(rindex>0){ remove(); } } } static{ int i; charTable = new int[256]; for(i=0;i<256;i++){ charTable[i]=T_WHITE; } for(i='!';i<127;i++){ charTable[i]=T_CHAR; } for(i='0';i<='9';i++){ charTable[i]=T_INTEGER; } charTable['%']=T_COMMENT; charTable['(']=T_STRING_START; charTable[')']=T_STRING_END; charTable['<']=T_LSS; charTable['>']=T_GTR; charTable['[']=T_ARRAY_START; charTable[']']=T_ARRAY_END; charTable['{']=T_LBRACE; charTable['}']=T_RBRACE; charTable['/']=T_NAME; keywords = new Hashtable(); keywords.put("null", new Integer(T_NULL)); keywords.put("false", new Integer(T_FALSE)); keywords.put("true", new Integer(T_TRUE)); keywords.put("stream", new Integer(T_STREAM)); keywords.put("endstream", new Integer(T_ENDSTREAM)); keywords.put("obj", new Integer(T_OBJ)); keywords.put("endobj", new Integer(T_ENDOBJ)); keywords.put("R", new Integer(T_R)); keywords.put("xref", new Integer(T_XREF)); keywords.put("trailer", new Integer(T_TRAILER)); keywords.put("startxref", new Integer(T_STARTXREF)); keywords.put("n", new Integer(T_N)); keywords.put("f", new Integer(T_F)); } public static void main(String[] argv){ try{ String test = "335566\n%%EOF\n [1] [1 2 ] [1 2 3] [1 2 3 4] [1 2 3 4 5] [1 2 R] [0 1 2 R] [0 1 R 2 3 R] [0 1 2 3 4 R] [0 1 2 3 4 5 R] [0 1 0 R 2 0 R 3 0][0] [0 1] (Klasse wie geht's\\t?\\\n Hallo) true \\ << 101 10.2 >> [false null] /NA***;_ME<41> %comment\n <4142303961>"; InputStream in = new ByteArrayInputStream(test.getBytes()); ImageInputStream iin = ImageIO.createImageInputStream(in); PDFScanner s = new PDFScanner(null,iin); while(s.symbol!=T_EOF){ s.scan(); switch(s.symbol){ case T_STRING: System.err.println("string = "+s.str); break; case T_NAME: System.err.println("name = "+s.str); break; case T_TRUE: System.err.println("true = "+s.str); break; case T_FALSE: System.err.println("false = "+s.str); break; case T_NULL: System.err.println("null = "+s.str); break; case T_INTEGER: System.err.println("int = "+s.intval); break; case T_REAL: System.err.println("real = "+s.realval); break; case T_ARRAY_START: System.err.println("array start"); break; case T_ARRAY_END: System.err.println("array end"); break; case T_DICTIONARY_START: System.err.println("dict start"); break; case T_DICTIONARY_END: System.err.println("dict end"); break; case T_R: System.err.println("R "+s.objectNumber+" "+s.generationNumber); break; default: System.err.println("symbol = "+s.symbol); break; } } }catch(Exception e){ e.printStackTrace(); } } }