/** * Copyright 2012 Manning Publications Co. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.manning.cmis.theblend.install; import java.io.File; import java.io.IOException; import java.math.BigDecimal; import java.math.BigInteger; import java.text.DateFormatSymbols; import java.text.ParseException; import java.text.SimpleDateFormat; import java.util.ArrayList; import java.util.Arrays; import java.util.Date; import java.util.List; import java.util.Locale; import java.util.Map; import org.apache.chemistry.opencmis.client.api.ObjectType; import org.apache.chemistry.opencmis.client.api.Session; import org.apache.chemistry.opencmis.commons.PropertyIds; import org.apache.chemistry.opencmis.commons.definitions.PropertyDefinition; import org.apache.chemistry.opencmis.commons.enums.BaseTypeId; import org.apache.chemistry.opencmis.commons.enums.Cardinality; import org.apache.chemistry.opencmis.commons.exceptions.CmisObjectNotFoundException; import org.apache.tika.exception.TikaException; import org.apache.tika.io.TikaInputStream; import org.apache.tika.metadata.Metadata; import org.apache.tika.parser.AutoDetectParser; import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.Parser; import org.xml.sax.ContentHandler; import org.xml.sax.SAXException; import org.xml.sax.helpers.DefaultHandler; public class TikaProperties { private static String DATE_FORMAT = "yyyy-MM-dd'T'HH:mm:ss"; private Metadata metadata; private ObjectType docType; public TikaProperties(File file) throws IOException, SAXException, TikaException { TikaInputStream tikaStream = TikaInputStream.get(file); metadata = new Metadata(); ContentHandler handler = new DefaultHandler(); Parser parser = new AutoDetectParser(); ParseContext context = new ParseContext(); try { parser.parse(tikaStream, handler, metadata, context); } finally { try { tikaStream.close(); } catch (Exception ie) { // ignore } } } /** * Sets the document type. */ public void setDocumentType(ObjectType docType) { this.docType = docType; } /** * Identifies the document type from content. */ public ObjectType findDocumentType(Session session) { // get the type id for the MIME type String newTypeId = TikaMappingService .getRepositoryTypeIdFromMIMEType(getMIMEType()); // check if type exists in the repository try { docType = session.getTypeDefinition(newTypeId); } catch (CmisObjectNotFoundException e) { // type not found -> fall back to cmis:document docType = session.getTypeDefinition(BaseTypeId.CMIS_DOCUMENT .value()); } return docType; } /** * Returns the MIME type. */ public String getMIMEType() { String mimetype = metadata.get(Metadata.CONTENT_TYPE); if (mimetype == null) { mimetype = "application/octet-stream"; } return mimetype; } /** * Adds the extracted metadata to the properties. */ public void enrichProperties(Session session, Map<String, Object> properties) { if (docType == null) { findDocumentType(session); } // set document type properties.put(PropertyIds.OBJECT_TYPE_ID, docType.getId()); // iterate over the metadata that Tika extracted and add it to // the document, if the document type supports them for (String metadataName : metadata.names()) { String propertyId = TikaMappingService .getPropertyIdFromTikaMetadata(metadataName); if (propertyId == null) { // there is no mapping for this property continue; } PropertyDefinition<?> propertyDef = docType .getPropertyDefinitions().get(propertyId); if (propertyDef == null) { // the document type doen't support this property continue; } try { switch (propertyDef.getPropertyType()) { case STRING: case ID: case HTML: case URI: if (propertyDef.getCardinality() == Cardinality.SINGLE) { properties.put(propertyId, metadata.get(metadataName)); } else { properties.put(propertyId, Arrays.asList(metadata.getValues(metadataName))); } break; case INTEGER: if (propertyDef.getCardinality() == Cardinality.SINGLE) { properties.put(propertyId, convertInteger(metadata.get(metadataName))); } else { List<BigInteger> list = new ArrayList<BigInteger>(); for (String v : metadata.getValues(metadataName)) { list.add(convertInteger(v)); } properties.put(propertyId, list); } break; case DECIMAL: if (propertyDef.getCardinality() == Cardinality.SINGLE) { properties.put(propertyId, convertDecimal(metadata.get(metadataName))); } else { List<BigDecimal> list = new ArrayList<BigDecimal>(); for (String v : metadata.getValues(metadataName)) { list.add(convertDecimal(v)); } properties.put(propertyId, list); } break; case BOOLEAN: if (propertyDef.getCardinality() == Cardinality.SINGLE) { properties.put(propertyId, convertBoolean(metadata.get(metadataName))); } else { List<Boolean> list = new ArrayList<Boolean>(); for (String v : metadata.getValues(metadataName)) { list.add(convertBoolean(v)); } properties.put(propertyId, list); } break; case DATETIME: if (propertyDef.getCardinality() == Cardinality.SINGLE) { properties.put(propertyId, convertDate(metadata.get(metadataName))); } else { List<Date> list = new ArrayList<Date>(); for (String v : metadata.getValues(metadataName)) { list.add(convertDate(v)); } properties.put(propertyId, list); } break; } } catch (Exception e) { // Tika provided a value that doesn't match the property // definition -> ignore } } } private BigInteger convertInteger(String s) { int dot = s.indexOf('.'); if (dot > -1) { s = s.substring(0, dot); } return new BigInteger(s); } private BigDecimal convertDecimal(String s) { return new BigDecimal(s); } private Boolean convertBoolean(String s) { return Boolean.valueOf(s); } private Date convertDate(String s) throws ParseException { SimpleDateFormat sdf = new SimpleDateFormat(DATE_FORMAT, new DateFormatSymbols(Locale.US)); return sdf.parse(s); } }