/* * #! * Ontopia Classify * #- * Copyright (C) 2001 - 2013 The Ontopia Project * #- * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * !# */ package net.ontopia.topicmaps.classify; import java.io.UnsupportedEncodingException; import java.util.ArrayList; import java.util.List; import net.ontopia.utils.OntopiaRuntimeException; import org.slf4j.Logger; import org.slf4j.LoggerFactory; /** * INTERNAL: */ public class FormatModule implements FormatModuleIF { // Define a logging category. static Logger log = LoggerFactory.getLogger(FormatModule.class.getName()); protected List<FormatModuleIF> modules; protected FormatModuleIF fallout_module; public FormatModule() { modules = new ArrayList<FormatModuleIF>(); modules.add(new XMLFormatModule()); modules.add(new HTMLFormatModule()); modules.add(new PDFFormatModule()); modules.add(new WordFormatModule()); modules.add(new PowerPointFormatModule()); modules.add(new OOXMLWordFormatModule()); modules.add(new OOXMLPowerpointFormatModule()); fallout_module = new PlainTextFormatModule(); modules.add(fallout_module); } public boolean matchesContent(ClassifiableContentIF cc) { return true; } public boolean matchesIdentifier(ClassifiableContentIF cc) { return true; } public void readContent(ClassifiableContentIF cc, TextHandlerIF handler) { // detect document format FormatModuleIF fm = detectFormat(cc); // read document fm.readContent(cc, handler); } public FormatModuleIF detectFormat(ClassifiableContentIF cc) { // auto-detect by looking at document content for (int i=0; i < modules.size(); i++) { FormatModuleIF fm = modules.get(i); if (fm.matchesContent(cc)) { log.debug("Match content: " + cc.getIdentifier() + ", format: " + fm); return fm; } } // auto-detect by looking at document identifier for (int i=0; i < modules.size(); i++) { FormatModuleIF fm = modules.get(i); if (fm.matchesIdentifier(cc)) { log.debug("Match uri: " + cc.getIdentifier() + ", format: " + fm); return fm; } } return fallout_module; } // -------------------------------------------------------------------------- // extension matching // -------------------------------------------------------------------------- public static boolean matchesExtension(String uri, String[] extensions) { if (extensions == null) return false; String luri = uri.toLowerCase(); for (int i=0; i < extensions.length; i++) { if (luri.endsWith(extensions[i])) return true; } return false; } // -------------------------------------------------------------------------- // content matching // -------------------------------------------------------------------------- private static byte[][] boms = new byte[][] { new byte[] { (byte)0xEF, (byte)0xBB, (byte)0xBF }, // UTF-8 new byte[] { (byte)0xFE, (byte)0xFF }, // UTF-16 Big Endian new byte[] { (byte)0xFF, (byte)0xFE }, // UTF-16 Little Endian new byte[] { (byte)0x00, (byte)0x00, (byte)0xFE, (byte)0xFF }, // UTF-32 Big Endian new byte[] { (byte)0xFF, (byte)0xFE, (byte)0x00, (byte)0x00 } // UTF-32 Little Endian }; private static String[] bomnames = new String[] { "UTF-8", "UTF-16BE", "UTF-16LE", "UTF-32BE", "UTF-32LE" }; public static String getCharSetName(int charSet) { return bomnames[charSet]; } public static int getOffset(int charSet) { return boms[charSet].length-1; } public static int detectCharSet(byte[] content) { // check byte order mark for (int i=0; i < boms.length; i++) { byte[] bom = boms[i]; if (startsWith(content, bom)) return i; } return -1; } public static byte[] getBytes(String s) { try { return s.getBytes("UTF-8"); } catch (UnsupportedEncodingException e) { throw new OntopiaRuntimeException(e); } } public static byte[][] getBytes(String[] s) { try { byte[][] b = new byte[s.length][]; for (int i=0; i < s.length; i++) { b[i] = s[i].getBytes("UTF-8"); } return b; } catch (UnsupportedEncodingException e) { throw new OntopiaRuntimeException(e); } } public static boolean startsWith(byte[] content, byte[] s) { if (content == null || content.length < s.length) return false; for (int i=0; i < s.length; i++) { if (content[i] != s[i]) return false; } return true; } public static boolean startsWithSkipWhitespace(byte[] content, byte[][] ss) { int offset = getLeadingWhitespace(content); outer: for (int i=0; i < ss.length; i++) { byte[] s = ss[i]; for (int o=offset; o < s.length; o++) { if (content[o+offset] != s[o]) continue outer; } return true; } return false; } public static boolean startsWithSkipWhitespace(byte[] content, byte[] s) { int offset = getLeadingWhitespace(content); for (int i=0; i < s.length; i++) { if (content[i+offset] != s[i]) return false; } return true; } private static int getLeadingWhitespace(byte[] content) { // skip leading white space int offset = 0; for (int i=0; i < content.length; i++) { char c = (char)content[i]; if (Character.isWhitespace(c) || c == '\u0000' || c == '\u00ff' || c == '\u00fe' || c == '\u00ef' || c == '\u00ef' || c == '\u00bb' || c == '\u00bf') offset++; else break; } return offset; } }