/*
* (C) Copyright 2006-2013 Nuxeo SA (http://nuxeo.com/) and others.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* Contributors:
* Olivier Grisel
* Florent Guillaume
*/
package org.nuxeo.ecm.platform.filemanager.service.extension;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.charset.CharacterCodingException;
import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CodingErrorAction;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.nuxeo.ecm.core.api.Blob;
import org.nuxeo.ecm.core.api.DocumentModel;
import org.nuxeo.ecm.core.api.NuxeoException;
import com.ibm.icu.text.CharsetDetector;
import com.ibm.icu.text.CharsetMatch;
/**
* Imports the string content of a blob as text for the content of the "note" field of a new Note document.
* <p>
* If an existing document with the same title is found the existing Note document is updated instead.
*/
public class NoteImporter extends AbstractFileImporter {

    private static final Log log = LogFactory.getLog(NoteImporter.class);

    private static final String NOTE_TYPE = "Note";

    private static final String NOTE_SCHEMA = "note";

    private static final String NOTE_FIELD = "note";

    private static final String MT_FIELD = "mime_type";

    /** Name of the MIME type parameter that declares the charset, including the '=' sign. */
    private static final String CHARSET_PARAM = "charset=";

    private static final long serialVersionUID = 1L;

    /**
     * Returns the document type created by this importer.
     *
     * @return the "Note" document type name
     */
    @Override
    public String getDefaultDocType() {
        return NOTE_TYPE;
    }

    /**
     * Tells the importer framework that an existing document should be looked up by title.
     *
     * @return always {@code true}
     */
    @Override
    public boolean isOverwriteByTitle() {
        return true;
    }

    /**
     * Refuses to update documents that do not carry the "note" schema, and otherwise delegates to the parent
     * implementation.
     *
     * @param doc the candidate document
     * @param content the blob being imported
     * @return {@code false} if the document lacks the "note" schema, otherwise whatever the parent implementation
     *         returns
     */
    @Override
    public boolean updateDocumentIfPossible(DocumentModel doc, Blob content) {
        if (!doc.hasSchema(NOTE_SCHEMA)) {
            log.warn("Schema '" + NOTE_SCHEMA + "' is not available for document " + doc);
            return false;
        }
        return super.updateDocumentIfPossible(doc, content);
    }

    /**
     * Stores the text of the blob in the note field of the document, together with the blob MIME type.
     *
     * @param doc the document to fill
     * @param content the blob holding the text
     * @throws NuxeoException if reading the blob fails with an {@link IOException}
     */
    @Override
    public void updateDocument(DocumentModel doc, Blob content) {
        String string;
        try {
            string = getString(content);
        } catch (IOException e) {
            throw new NuxeoException(e);
        }
        doc.setProperty(NOTE_SCHEMA, NOTE_FIELD, string);
        // read the MIME type only after getString(): decoding may have stripped a charset parameter from it
        doc.setProperty(NOTE_SCHEMA, MT_FIELD, content.getMimeType());
    }

    /**
     * Returns the text content of the blob, decoded with a guessed charset when possible.
     *
     * @param blob the blob to read
     * @return the text of the blob
     * @throws IOException if the blob content cannot be read
     */
    protected String getString(Blob blob) throws IOException {
        String s = guessEncoding(blob);
        if (s == null) {
            // no guess was made: let the blob do its own conversion
            // NOTE(review): presumably this uses the blob encoding if set, else a default charset - confirm
            s = blob.getString();
        }
        return s;
    }

    /**
     * Decodes the bytes of a text blob using the most plausible charset.
     * <p>
     * Despite its name, this method returns the decoded text, not a charset name. Candidate charsets are tried in
     * this order, each with strict decoding (malformed or unmappable input rejects the candidate):
     * <ol>
     * <li>the charset declared by a {@code charset=} parameter of the MIME type (matched case-insensitively), or, if
     * there is none, the charset detected from the bytes,</li>
     * <li>utf-8,</li>
     * <li>iso-8859-1.</li>
     * </ol>
     * A leading byte order mark character is removed from the decoded text.
     * <p>
     * Side effect: when the MIME type declares a charset, the MIME type of the blob is replaced by its bare form,
     * without any parameter.
     *
     * @param blob the blob to decode
     * @return the decoded text, or {@code null} if the blob already has an encoding, has no MIME type, is not a
     *         {@code text/*} or {@code application/xhtml*} blob, or could not be decoded with any candidate
     * @throws IOException if the bytes of the blob cannot be read
     */
    protected static String guessEncoding(Blob blob) throws IOException {
        // encoding already known?
        if (blob.getEncoding() != null) {
            return null;
        }

        // bad mime type?
        String mimeType = blob.getMimeType();
        if (mimeType == null) {
            return null;
        }
        if (!mimeType.startsWith("text/") && !mimeType.startsWith("application/xhtml")) {
            // not a text file, we shouldn't be in the Note importer
            return null;
        }

        byte[] bytes = blob.getByteArray();

        List<String> charsets = new ArrayList<>(Arrays.asList("utf-8", "iso-8859-1"));

        int paramIndex = indexOfIgnoreCase(mimeType, CHARSET_PARAM);
        if (paramIndex > 0) {
            // charset specified in MIME type
            blob.setMimeType(bareMimeType(mimeType, paramIndex));
            charsets.add(0, declaredCharset(mimeType, paramIndex));
        } else {
            // charset detected from the actual bytes
            CharsetMatch charsetMatch = new CharsetDetector().setText(bytes).detect();
            if (charsetMatch != null) {
                charsets.add(0, charsetMatch.getName());
            }
        }

        // now convert the string according to the charset, and fallback on others if not possible
        for (String charset : charsets) {
            String decoded = decodeStrictly(bytes, charset);
            if (decoded != null) {
                return decoded;
            }
        }
        // nothing worked, let the caller fall back on the blob's own conversion
        return null;
    }

    /** Returns the index of the first case-insensitive occurrence of {@code needle} in {@code text}, or -1. */
    private static int indexOfIgnoreCase(String text, String needle) {
        int lastStart = text.length() - needle.length();
        for (int start = 0; start <= lastStart; start++) {
            if (text.regionMatches(true, start, needle, 0, needle.length())) {
                return start;
            }
        }
        return -1;
    }

    /** Returns the media type found before the charset parameter, stripped of any other parameter. */
    private static String bareMimeType(String mimeType, int charsetIndex) {
        String head = mimeType.substring(0, charsetIndex);
        // cut at the first separator so that parameters preceding the charset are not glued to the type
        int separator = head.indexOf(';');
        if (separator >= 0) {
            head = head.substring(0, separator);
        }
        return head.trim();
    }

    /** Returns the value of the charset parameter starting at {@code charsetIndex}, without quotes or blanks. */
    private static String declaredCharset(String mimeType, int charsetIndex) {
        String value = mimeType.substring(charsetIndex + CHARSET_PARAM.length());
        int separator = value.indexOf(';');
        if (separator >= 0) {
            value = value.substring(0, separator);
        }
        // drop the quotes first so that blanks inside a quoted value are trimmed as well
        return value.replace("\"", "").trim();
    }

    /** Decodes the bytes with the named charset, or returns {@code null} if the name or the bytes are rejected. */
    private static String decodeStrictly(byte[] bytes, String charsetName) {
        try {
            CharsetDecoder decoder = Charset.forName(charsetName)
                                            .newDecoder()
                                            .onMalformedInput(CodingErrorAction.REPORT)
                                            .onUnmappableCharacter(CodingErrorAction.REPORT);
            CharBuffer chars = decoder.decode(ByteBuffer.wrap(bytes));
            if (chars.length() != 0 && chars.charAt(0) == '\ufeff') {
                // remove BOM
                chars = chars.subSequence(1, chars.length());
            }
            return chars.toString();
        } catch (IllegalArgumentException e) {
            // illegal or unsupported charset: best effort, the caller tries the next candidate
            log.debug("Ignoring unusable charset '" + charsetName + "'", e);
            return null;
        } catch (CharacterCodingException e) {
            // could not decode: best effort, the caller tries the next candidate
            log.debug("Content cannot be decoded as '" + charsetName + "'", e);
            return null;
        }
    }

}