PRTParser.java example

Explorer
tika-master
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.parser.prt;

import java.io.IOException;
import java.io.InputStream;
import java.io.UnsupportedEncodingException;
import java.util.Collections;
import java.util.Set;

import org.apache.poi.util.IOUtils;
import org.apache.tika.exception.TikaException;
import org.apache.tika.io.EndianUtils;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AbstractParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.sax.XHTMLContentHandler;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;

import static java.nio.charset.StandardCharsets.US_ASCII;

/**
 * A basic text extracting parser for the CADKey PRT (CAD Drawing)
 *  format. It outputs text from note entries.
 */

public class PRTParser extends AbstractParser {

    /** Serial version UID */
    private static final long serialVersionUID = 4659638314375035178L;

    private static final Set<MediaType> SUPPORTED_TYPES = Collections.singleton(MediaType.application("x-prt"));
    public static final String PRT_MIME_TYPE = "application/x-prt";

    public Set<MediaType> getSupportedTypes(ParseContext context) {
       return SUPPORTED_TYPES;
    }

    /**
     * How long do we allow a text run to claim to be, before we
     * decide we're confused and it's not really text after all?
     */
    private static final int MAX_SANE_TEXT_LENGTH = 0x0800;
    
    /*
     * Text types:
     *   00 00 00 00 f0 [3b]f sz sz TEXT     *view name*
     *   00 00 00 00 f0 3f 00 00 00 00 00 00 00 00 sz sz TEXT  *view name*
     *   (anything)  e0 3f sz sz TEXT    *view name*
     *   3x 33 33 33 33 33 e3 3f 0x 00 00 0x 00 00 0x 0x 1f sz sz TEXT    *note entries* 
     *   
     *  Note - all text is null terminated
     */
      
    public void parse(InputStream stream, ContentHandler handler, Metadata metadata, 
          ParseContext context) throws IOException, SAXException, TikaException {
       
       XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
       Last5 l5 = new Last5();
       int read;
       
       // Try to get the creation date, which is YYYYMMDDhhmm
       byte[] header = new byte[30];
       IOUtils.readFully(stream, header);
       byte[] date = new byte[12];
       IOUtils.readFully(stream, date);
       
       String dateStr = new String(date, US_ASCII);
       if(dateStr.startsWith("19") || dateStr.startsWith("20")) {
          String formattedDate = dateStr.substring(0, 4) + "-" + dateStr.substring(4,6) +
             "-" + dateStr.substring(6,8) + "T" + dateStr.substring(8,10) + ":" +
             dateStr.substring(10, 12) + ":00";
          metadata.set(TikaCoreProperties.CREATED, formattedDate);
          // TODO Metadata.DATE is used as modified, should it be here?
          metadata.set(Metadata.DATE, formattedDate);
       }
       metadata.set(Metadata.CONTENT_TYPE, PRT_MIME_TYPE);
       
       // The description, if set, is the next up-to-500 bytes
       byte[] desc = new byte[500];
       IOUtils.readFully(stream, desc);
       String description = extractText(desc, true);
       if(description.length() > 0) {
          metadata.set(TikaCoreProperties.DESCRIPTION, description);
       }
       
       // Now look for text
       while( (read = stream.read()) > -1) {
          if(read == 0xe0 || read == 0xe3 || read == 0xf0) {
             int nread = stream.read();
             if(nread == 0x3f || nread == 0xbf) {
                // Looks promising, check back for a suitable value
                if(read == 0xe3 && nread == 0x3f) {
                   if(l5.is33()) {
                      // Bingo, note text
                      handleNoteText(stream, xhtml);
                   }
                } else if(l5.is00()) {
                   // Likely view name
                   handleViewName(read, nread, stream, xhtml, l5);
                }
             }
          } else {
             l5.record(read);
          }
       }
    }
    
    private void handleNoteText(InputStream stream, XHTMLContentHandler xhtml) 
    throws IOException, SAXException, TikaException {
       // Ensure we have the right padding text
       int read;
       for(int i=0; i<10; i++) {
          read = stream.read();
          if(read >= 0 && read <= 0x0f) {
             // Promising
          } else {
             // Wrong, false detection
             return;
          }
       }
       read = stream.read();
       if(read != 0x1f) {
          // Wrong, false detection
          return;
       }
       
       int length = EndianUtils.readUShortLE(stream);
       if(length <= MAX_SANE_TEXT_LENGTH) {
          // Length sanity check passed
          handleText(length, stream, xhtml);
       }
    }
    
    private void handleViewName(int typeA, int typeB, InputStream stream, 
          XHTMLContentHandler xhtml, Last5 l5) 
    throws IOException, SAXException, TikaException {
       // Is it 8 byte zero padded?
       int maybeLength = EndianUtils.readUShortLE(stream);
       if(maybeLength == 0) {
          // Check the next 6 bytes too
          for(int i=0; i<6; i++) {
             int read = stream.read();
             if(read >= 0 && read <= 0x0f) {
                // Promising
             } else {
                // Wrong, false detection
                return;
             }
          }
          
          byte[] b2 = new byte[2];
          IOUtils.readFully(stream, b2);
          int length = EndianUtils.getUShortLE(b2);
          if(length > 1 && length <= MAX_SANE_TEXT_LENGTH) {
             // Length sanity check passed
             handleText(length, stream, xhtml);
          } else {
             // Was probably something else
             l5.record(b2[0]);
             l5.record(b2[1]);
          }
       } else if(maybeLength > 0 && maybeLength < MAX_SANE_TEXT_LENGTH) {
          // Looks like it's straight into the text
          handleText(maybeLength, stream, xhtml);
       }
    }
    
    private void handleText(int length, InputStream stream, XHTMLContentHandler xhtml) 
    throws IOException, SAXException, TikaException {
       byte[] str = new byte[length];
       IOUtils.readFully(stream, str);
       if(str[length-1] != 0) {
          // Not properly null terminated, must be wrong
          return;
       }
       
       String text = extractText(str, false);
       
       xhtml.startElement("p");
       xhtml.characters(text);
       xhtml.endElement("p");
    }
    
    /**
     * Does our best to turn the bytes into text
     */
    private String extractText(byte[] data, boolean trim) throws TikaException {
       // The text is always stored null terminated, but sometimes
       //  may have extra null padding too
       int length = data.length - 1;
       if(trim) {
          for(int i=0; i<data.length; i++) {
             if(data[i] == 0) {
                length = i;
                break;
             }
          }
       }
       
       // We believe that the text is basically stored as CP437
       // That said, there are a few characters slightly wrong for that...
       String text;
       try {
          text = new String(data, 0, length, "cp437");
       } catch(UnsupportedEncodingException e) {
          throw new TikaException("JVM Broken, core codepage CP437 missing!");
       }
       
       // Fix up the known character issues
       text = text.replace("\u03C6","\u00D8");

       // All done, as best as we can!
       return text;
    }
    
    /**
     * Provides a view on the previous 5 bytes
     */
    private static class Last5 {
       byte[] data = new byte[5];
       int pos = 0;
       
       private void record(int b) {
          data[pos] = (byte)b;
          pos++;
          if(pos >= data.length) {
             pos = 0;
          }
       }
       
       private byte[] get() {
          byte[] ret = new byte[5];
          for(int i=0; i<ret.length; i++) {
             int p = pos - i;
             if(p < 0) { p += ret.length; }
             ret[i] = data[p];
          }
          return ret;
       }
       
       private boolean is33() {
          byte[] last5 = get();
          for(byte b : last5) {
             if(b != 0x33) return false;
          }
          return true;
       }
       
       private boolean is00() {
          byte[] last5 = get();
          for(byte b : last5) {
             if(b != 0x00) return false;
          }
          return true;
       }
    }
}