/*
* CATMA Computer Aided Text Markup and Analysis
*
* Copyright (C) 2009 University Of Hamburg
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package de.catma.document.source;
import java.io.IOException;
import java.net.URLConnection;
import java.util.HashMap;
import java.util.Locale;
import java.util.Map;

import org.mozilla.universalchardet.UniversalDetector;

import de.catma.document.source.contenthandler.DOCContentHandler;
import de.catma.document.source.contenthandler.DOCXContentHandler;
import de.catma.document.source.contenthandler.HTMLContentHandler;
import de.catma.document.source.contenthandler.PDFContentHandler;
import de.catma.document.source.contenthandler.RTFContentHandler;
import de.catma.document.source.contenthandler.SourceContentHandler;
import de.catma.document.source.contenthandler.StandardContentHandler;
import de.catma.document.source.contenthandler.TEIContentHandler;
import de.catma.document.source.contenthandler.XMLContentHandler;
/**
* Handles the creation of {@link SourceDocument}s and offers helper methods
* for determining the mime type and the character encoding of a file.
*
* @author marco.petris@web.de
*
*/
public class SourceDocumentHandler {
// mapping of file types -> source content handlers
private Map<FileType, Class<? extends SourceContentHandler>> typeHandlerMap;
/**
* Setup.
*/
public SourceDocumentHandler() {
typeHandlerMap = new HashMap<FileType, Class<? extends SourceContentHandler>>();
registerSourceContentHandler(
FileType.TEXT, StandardContentHandler.class );
registerSourceContentHandler(
FileType.RTF, RTFContentHandler.class );
registerSourceContentHandler(
FileType.PDF, PDFContentHandler.class );
registerSourceContentHandler(
FileType.XPDF, PDFContentHandler.class );
registerSourceContentHandler(
FileType.HTML, HTMLContentHandler.class );
registerSourceContentHandler(
FileType.HTM, HTMLContentHandler.class );
registerSourceContentHandler(
FileType.DOC, DOCContentHandler.class );
registerSourceContentHandler(
FileType.DOCX, DOCXContentHandler.class);
registerSourceContentHandler(
FileType.XML, XMLContentHandler.class);
registerSourceContentHandler(
FileType.TEI, TEIContentHandler.class);
}
/**
* Retrieves a mime type for the specified file
* @param fileName the name of the file
* @param urlConnection a link to the file's raw data
* @param defaultMimeType a default mime type if detection fails
* @return a detected mime type or the default mime type
*/
public String getMimeType(String fileName, URLConnection urlConnection, String defaultMimeType) {
String contentType = urlConnection.getContentType();
return getMimeType(fileName, contentType, defaultMimeType);
}
/**
* Retrieves a mime type for the specified file
* @param fileName the name of the file
* @param contentType the content type of the file
* @param defaultMimeType a default mime type if detection fails
* @return a detected mime type or the default mime type
*/
public String getMimeType(String fileName, String contentType, String defaultMimeType) {
String mimeType = null;
if ((contentType != null) && (!contentType.equals("content/unknown"))) {
String[] contentTypeAttributes = contentType.split(";");
if (contentTypeAttributes.length > 0) {
mimeType = contentTypeAttributes[0];
}
}
if (mimeType == null) {
mimeType = URLConnection.getFileNameMap().getContentTypeFor(fileName);
if (mimeType == null) {
mimeType = defaultMimeType;
}
}
return mimeType;
}
/**
* Retrieves a mime type for the specified file
* @param fileName the name of the file
* @param defaultMimeType a default mime type if detection fails
* @return a detected mime type or the default mime type
*/
public String getMimeType(String fileName, String defaultMimeType) {
String mimeType = URLConnection.getFileNameMap().getContentTypeFor(fileName);
if (mimeType == null) {
mimeType = defaultMimeType;
}
return mimeType;
}
/**
* Tries to detect the encoding of the specified file.
* @param urlConnection a link to the file's raw data
* @param rawData the raw data
* @param defaultEncoding a default encoding
* @return the detected encoding or the default encoding
*/
public String getEncoding(URLConnection urlConnection, byte[] rawData, String defaultEncoding) {
String encoding = urlConnection.getContentEncoding();
if (encoding==null) {
return getEncoding
(encoding, urlConnection.getContentType(), rawData, defaultEncoding);
}
return encoding;
}
/**
* Tries to detect the encoding of the specified file if the given encoding
* is null.
* @param encoding the given encoding
* @param rawData the raw data
* @param defaultEncoding a default encoding
* @return the fiven endcoding, the detected encoding or the default encoding
*/
public String getEncoding(
String encoding, String contentType, byte[] rawData, String defaultEncoding) {
if (encoding==null) {
if (contentType.contains("charset")) {
String[] contentTypeAttributes = contentType.split(";");
String charsetAttribute = null;
for (String attribute : contentTypeAttributes) {
if (attribute.trim().startsWith("charset")) {
charsetAttribute = attribute;
}
}
if (charsetAttribute != null) {
encoding = charsetAttribute.trim().substring(
charsetAttribute.indexOf("=")).toUpperCase();
}
if (encoding.startsWith("=")) {
encoding = encoding.substring(1);
}
}
if (encoding == null) {
UniversalDetector detector = new UniversalDetector(null);
detector.handleData(rawData, 0, rawData.length);
encoding = detector.getDetectedCharset();
if (encoding == null) {
encoding = defaultEncoding;
}
}
}
return encoding;
}
/**
* Registers the {@link SourceContentHandler} with the givent {@link FileType}.
* @param type the type we want to register a handler for
* @param contentHandlerClass The class of the content handler.
* <b> SourceContentHandler need to have a default no arg constructor!</b>
*/
public void registerSourceContentHandler(
FileType type, Class<? extends SourceContentHandler> contentHandlerClass ) {
typeHandlerMap.put( type, contentHandlerClass );
}
/**
* Constructs a Source Document.
* @param id the identifier of the source document
* @param sourceDocumentInfo the meta data of the source document
* @return the source document instance
* @throws IOException access failure
* @throws InstantiationException {@link SourceContentHandler} instantiation failure
* @throws IllegalAccessException {@link SourceContentHandler} instantiation failure
*/
public SourceDocument loadSourceDocument(
String id, SourceDocumentInfo sourceDocumentInfo)
throws IOException, InstantiationException, IllegalAccessException {
FileType fileType =
sourceDocumentInfo.getTechInfoSet().getFileType();
if( fileType == null ) {
throw new IllegalStateException(
"I don't know the type of this file!" );
}
SourceContentHandler handler =
typeHandlerMap.get( fileType ).newInstance();
handler.setSourceDocumentInfo(sourceDocumentInfo);
SourceDocument document =
new SourceDocument(id, handler);
return document;
}
public SourceDocument loadSourceDocument(
String id, SourceContentHandler handler)
throws IOException, InstantiationException, IllegalAccessException {
SourceDocument document =
new SourceDocument(id, handler);
return document;
}
}