FormatModule.java example

Explorer
ontopia-master
/*
 * #!
 * Ontopia Classify
 * #-
 * Copyright (C) 2001 - 2013 The Ontopia Project
 * #-
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * 
 *      http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 * !#
 */

package net.ontopia.topicmaps.classify;

import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
import net.ontopia.utils.OntopiaRuntimeException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * INTERNAL: Composite format module. It holds an ordered list of
 * concrete format modules and delegates reading to the first one that
 * recognizes the given content, first by content and then by
 * identifier. The class also hosts static helpers for extension
 * matching, byte order mark detection and byte-prefix matching.
 */
public class FormatModule implements FormatModuleIF {

  // Define a logging category.
  static Logger log = LoggerFactory.getLogger(FormatModule.class.getName());
  
  /** Registered format modules, consulted in list order. */
  protected List<FormatModuleIF> modules;
  /** Module returned by detectFormat when no registered module matches. */
  protected FormatModuleIF fallout_module;
  
  /**
   * Registers the built-in format modules. The plain text module is
   * registered last and is also used as the fallout module.
   */
  public FormatModule() {
    modules = new ArrayList<FormatModuleIF>();
    modules.add(new XMLFormatModule());
    modules.add(new HTMLFormatModule());
    modules.add(new PDFFormatModule());
    modules.add(new WordFormatModule());
    modules.add(new PowerPointFormatModule());
    modules.add(new OOXMLWordFormatModule());
    modules.add(new OOXMLPowerpointFormatModule());
    fallout_module = new PlainTextFormatModule();
    modules.add(fallout_module);
  }
  
  /**
   * Always returns true; the actual decision is made per delegate in
   * detectFormat.
   */
  public boolean matchesContent(ClassifiableContentIF cc) {
    return true;
  }

  /**
   * Always returns true; the actual decision is made per delegate in
   * detectFormat.
   */
  public boolean matchesIdentifier(ClassifiableContentIF cc) {
    return true;
  }
  
  /**
   * Detects the format of the given content and lets the detected
   * module read it into the handler.
   */
  public void readContent(ClassifiableContentIF cc, TextHandlerIF handler) {
    detectFormat(cc).readContent(cc, handler);
  }
  
  /**
   * Selects the module to use for the given content. All registered
   * modules are first asked, in order, whether they match the content;
   * if none does, they are asked, in order, whether they match the
   * identifier. If there is still no match the fallout module is
   * returned.
   *
   * @param cc the content to classify
   * @return the first matching module, or the fallout module
   */
  public FormatModuleIF detectFormat(ClassifiableContentIF cc) {

    // auto-detect by looking at document content
    for (FormatModuleIF fm : modules) {
      if (fm.matchesContent(cc)) {
        log.debug("Match content: {}, format: {}", cc.getIdentifier(), fm);
        return fm;
      }
    }

    // auto-detect by looking at document identifier
    for (FormatModuleIF fm : modules) {
      if (fm.matchesIdentifier(cc)) {
        log.debug("Match uri: {}, format: {}", cc.getIdentifier(), fm);
        return fm;
      }
    }
    
    return fallout_module;
  }
  
  // --------------------------------------------------------------------------
  // extension matching
  // --------------------------------------------------------------------------

  /**
   * Tests whether the lower-cased uri ends with any of the given
   * extensions. The extensions themselves are compared as given, so
   * they are expected to be lower case already.
   *
   * @param uri the identifier to test; null never matches
   * @param extensions the candidate suffixes; null never matches
   * @return true if the uri ends with at least one extension
   */
  public static boolean matchesExtension(String uri, String[] extensions) {
    if (uri == null || extensions == null) return false;
    // Locale.ROOT keeps the case mapping independent of the default
    // locale (e.g. the Turkish dotless i).
    String luri = uri.toLowerCase(Locale.ROOT);
    for (String extension : extensions) {
      if (extension != null && luri.endsWith(extension)) return true;
    }
    return false;
  }

  // --------------------------------------------------------------------------
  // content matching
  // --------------------------------------------------------------------------

  // Byte order marks; the index of an entry is the "charSet" value
  // used by detectCharSet, getCharSetName and getOffset.
  private static final byte[][] boms = new byte[][] {
    new byte[] { (byte)0xEF, (byte)0xBB, (byte)0xBF }, // UTF-8
    new byte[] { (byte)0xFE, (byte)0xFF }, // UTF-16 Big Endian
    new byte[] { (byte)0xFF, (byte)0xFE }, // UTF-16 Little Endian
    new byte[] { (byte)0x00, (byte)0x00, (byte)0xFE, (byte)0xFF }, // UTF-32 Big Endian
    new byte[] { (byte)0xFF, (byte)0xFE, (byte)0x00, (byte)0x00 } // UTF-32 Little Endian
  };
  
  // Charset names, parallel to boms.
  private static final String[] bomnames = new String[] {
    "UTF-8", "UTF-16BE", "UTF-16LE", "UTF-32BE", "UTF-32LE"
  };

  /**
   * Returns the charset name for an index returned by detectCharSet.
   *
   * @param charSet index into the table of known byte order marks
   * @return the charset name
   * @throws ArrayIndexOutOfBoundsException if the index is outside the
   *         table, which includes the -1 "not detected" value
   */
  public static String getCharSetName(int charSet) {
    return bomnames[charSet];
  }

  /**
   * Returns the length of the byte order mark for the given index,
   * minus one.
   *
   * NOTE(review): the "minus one" is kept as-is because callers are not
   * visible here; confirm whether they expect the index of the last BOM
   * byte or the number of bytes to skip.
   *
   * @param charSet index into the table of known byte order marks
   * @throws ArrayIndexOutOfBoundsException if the index is outside the
   *         table, which includes the -1 "not detected" value
   */
  public static int getOffset(int charSet) {
    return boms[charSet].length-1;
  }
  
  /**
   * Detects the charset of the content from its byte order mark.
   *
   * @param content the raw bytes; null yields -1
   * @return the index of the matching byte order mark, or -1 if the
   *         content does not start with a known one
   */
  public static int detectCharSet(byte[] content) {
    // The UTF-16LE mark (FF FE) is a prefix of the UTF-32LE mark
    // (FF FE 00 00), so the longest matching mark must win; returning
    // the first match would make UTF-32LE undetectable.
    int best = -1;
    for (int i = 0; i < boms.length; i++) {
      if (startsWith(content, boms[i])
          && (best < 0 || boms[i].length > boms[best].length)) {
        best = i;
      }
    }
    return best;
  }

  /**
   * Encodes the string as UTF-8.
   *
   * @throws OntopiaRuntimeException if the UTF-8 encoding is unavailable
   */
  public static byte[] getBytes(String s) {
    try {
      return s.getBytes("UTF-8");
    } catch (UnsupportedEncodingException e) {
      throw new OntopiaRuntimeException(e);
    }
  }

  /**
   * Encodes each string as UTF-8, preserving array order.
   *
   * @throws OntopiaRuntimeException if the UTF-8 encoding is unavailable
   */
  public static byte[][] getBytes(String[] s) {
    byte[][] b = new byte[s.length][];
    for (int i = 0; i < s.length; i++) {
      b[i] = getBytes(s[i]);
    }
    return b;
  }

  /**
   * Tests whether content begins with the bytes of s.
   *
   * @return false if content is null or shorter than s
   */
  public static boolean startsWith(byte[] content, byte[] s) {
    if (content == null) return false;
    return matchesAt(content, 0, s);
  }

  /**
   * Tests whether content, after its leading whitespace, NUL and byte
   * order mark bytes have been skipped, begins with any of the given
   * byte sequences.
   *
   * @param content the raw bytes; null never matches
   * @param ss the candidate prefixes; null never matches
   * @return true if at least one candidate matches at the first
   *         non-skipped position
   */
  public static boolean startsWithSkipWhitespace(byte[] content, byte[][] ss) {
    if (content == null || ss == null) return false;
    int offset = getLeadingWhitespace(content);
    for (byte[] s : ss) {
      // Every candidate is compared from its own first byte against
      // the content at the skip offset.
      if (matchesAt(content, offset, s)) return true;
    }
    return false;
  }

  /**
   * Tests whether content, after its leading whitespace, NUL and byte
   * order mark bytes have been skipped, begins with the bytes of s.
   *
   * @param content the raw bytes; null never matches
   * @return false if the remaining content is shorter than s
   */
  public static boolean startsWithSkipWhitespace(byte[] content, byte[] s) {
    if (content == null) return false;
    return matchesAt(content, getLeadingWhitespace(content), s);
  }

  // Bounds-checked comparison of s against content starting at offset.
  private static boolean matchesAt(byte[] content, int offset, byte[] s) {
    if (content.length - offset < s.length) return false;
    for (int i = 0; i < s.length; i++) {
      if (content[offset + i] != s[i]) return false;
    }
    return true;
  }

  // Counts the leading bytes that are whitespace, NUL, or one of the
  // byte values occurring in the byte order marks.
  private static int getLeadingWhitespace(byte[] content) {
    int offset = 0;
    while (offset < content.length) {
      // Mask to an unsigned value: a plain (char) cast sign-extends
      // negative bytes (0xFF becomes '\uFFFF'), so the byte order mark
      // values below would never be recognized.
      int b = content[offset] & 0xFF;
      if (!Character.isWhitespace((char) b) && !isSkippableByte(b)) break;
      offset++;
    }
    return offset;
  }

  // True for NUL and for the byte values 0xFF, 0xFE, 0xEF, 0xBB, 0xBF.
  private static boolean isSkippableByte(int b) {
    switch (b) {
      case 0x00:
      case 0xFF:
      case 0xFE:
      case 0xEF:
      case 0xBB:
      case 0xBF:
        return true;
      default:
        return false;
    }
  }
  
}