/*
* Copyright (C) 2011 - 2012 Interactive Media Management
* Copyright (C) 2015 Allan Lykke Christensen
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package dk.i2m.converge.ejb.services;
import dk.i2m.converge.core.metadata.extract.CannotExtractMetaDataException;
import com.google.common.base.Splitter;
import com.xuggle.xuggler.*;
import dk.i2m.converge.core.ConfigurationKey;
import dk.i2m.converge.core.DataNotFoundException;
import dk.i2m.converge.core.EnrichException;
import dk.i2m.converge.core.content.catalogue.MediaItemRendition;
import dk.i2m.converge.core.metadata.*;
import dk.i2m.converge.core.metadata.extract.ImageInfoMetaDataExtractor;
import dk.i2m.converge.core.metadata.extract.IptcMetaDataExtractor;
import dk.i2m.converge.core.metadata.extract.MetaDataExtractor;
import dk.i2m.converge.core.metadata.extract.Mp3MetaDataExtractor;
import dk.i2m.converge.core.metadata.extract.XmpMetaDataExtractor;
import dk.i2m.converge.ejb.facades.MetaDataFacadeLocal;
import java.io.File;
import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.*;
import java.util.logging.Level;
import java.util.logging.Logger;
import javax.ejb.EJB;
import javax.ejb.Stateless;
import net.sf.json.JSONObject;
import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.HttpStatus;
import org.apache.commons.httpclient.methods.PostMethod;
import org.apache.commons.httpclient.methods.StringRequestEntity;
import org.apache.pdfbox.cos.COSDocument;
import org.apache.pdfbox.pdfparser.PDFParser;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.util.PDFTextStripper;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.extractor.WordExtractor;
/**
* Service bean used for extracting meta data from files.
*
* @author Allan Lykke Christensen
*/
@Stateless
public class MetaDataService implements MetaDataServiceLocal {
private static final Logger LOG = Logger.getLogger(MetaDataService.class.
getName());
/** URL to the OpenCalais service. */
private static final String OPEN_CALAIS_URL =
"http://api.opencalais.com/tag/rs/enrich";
@EJB private ConfigurationServiceLocal cfgService;
@EJB private MetaDataFacadeLocal metaDataFacade;
/** {@inheritDoc } */
@Override
public Map<String, String> extract(String location) {
Map<String, String> metaData = new HashMap<String, String>();
File file = new File(location);
try {
MetaDataExtractor mp3 = new Mp3MetaDataExtractor();
metaData.putAll(mp3.extract(file));
} catch (CannotExtractMetaDataException ex) {
LOG.log(Level.FINE, ex.getMessage());
LOG.log(Level.FINEST, "", ex);
}
try {
MetaDataExtractor xmp = new XmpMetaDataExtractor();
metaData.putAll(xmp.extract(file));
} catch (CannotExtractMetaDataException ex) {
LOG.log(Level.FINE, ex.getMessage());
LOG.log(Level.FINEST, "", ex);
}
try {
MetaDataExtractor iptc = new IptcMetaDataExtractor();
metaData.putAll(iptc.extract(file));
} catch (CannotExtractMetaDataException ex) {
LOG.log(Level.FINE, ex.getMessage());
LOG.log(Level.FINEST, "", ex);
}
try {
MetaDataExtractor imageInfo = new ImageInfoMetaDataExtractor();
metaData.putAll(imageInfo.extract(file));
} catch (CannotExtractMetaDataException ex) {
LOG.log(Level.FINE, ex.getMessage());
LOG.log(Level.FINEST, "", ex);
}
try {
metaData.putAll(extractMediaContainer(location));
} catch (CannotExtractMetaDataException ex) {
LOG.log(Level.FINE, ex.getMessage());
LOG.log(Level.FINEST, "", ex);
}
return metaData;
}
public Map<String, String> extractMediaContainer(String location) throws
CannotExtractMetaDataException {
Map<String, String> properties = new HashMap<String, String>();
try {
IContainer container = IContainer.make();
// Open up the container
if (container.open(location, IContainer.Type.READ, null) < 0) {
throw new CannotExtractMetaDataException("could not open file: "
+ location);
}
if (container.queryStreamMetaData() < 0) {
throw new CannotExtractMetaDataException(
"couldn't query stream meta data for some reason...");
}
for (int i = 0; i < container.getNumProperties(); i++) {
IProperty prop = container.getPropertyMetaData(i);
properties.put(prop.getName(),
container.getPropertyAsString(prop.getName()));
}
properties.put("streams", String.valueOf(container.getNumStreams()));
if (container.getDuration() == Global.NO_PTS) {
properties.put("duration", String.valueOf(
container.getDuration()));
} else {
properties.put("duration", String.valueOf(container.getDuration()
/ 1000));
}
if (container.getStartTime() == Global.NO_PTS) {
properties.put("startTime", String.valueOf(container.
getStartTime()));
} else {
properties.put("startTime", String.valueOf(container.
getStartTime() / 1000));
}
properties.put("bitrate", String.valueOf(container.getBitRate()));
for (String meta : container.getMetaData().getKeys()) {
properties.put("container." + meta, container.getMetaData().
getValue(meta));
}
for (int i = 0; i < container.getNumStreams(); i++) {
IStream stream = container.getStream(i);
// Get the pre-configured decoder that can decode this stream
IStreamCoder coder = stream.getStreamCoder();
for (String meta : stream.getMetaData().getKeys()) {
properties.put("stream." + i + ".meta." + meta, stream.
getMetaData().getValue(meta));
}
properties.put("stream." + i + ".type", coder.getCodecType().
name());
properties.put("stream." + i + ".codec",
coder.getCodecID().name());
properties.put("stream." + i + ".duration",
String.valueOf(stream.getDuration()));
if (coder.getCodecType() == ICodec.Type.CODEC_TYPE_AUDIO) {
properties.put("stream." + i + ".sampleRate",
String.valueOf(coder.getSampleRate()));
properties.put("stream." + i + ".channels",
String.valueOf(coder.getChannels()));
properties.put("stream." + i + ".format", coder.
getSampleFormat().name());
} else if (coder.getCodecType() == ICodec.Type.CODEC_TYPE_VIDEO) {
properties.put("stream." + i + ".width",
String.valueOf(coder.getWidth()));
properties.put("stream." + i + ".height",
String.valueOf(coder.getHeight()));
properties.put("stream." + i + ".format",
coder.getPixelType().name());
properties.put("stream." + i + ".frameRate",
String.valueOf(coder.getFrameRate().getDouble()));
}
}
} catch (UnsatisfiedLinkError ex) {
LOG.log(Level.SEVERE, "Could not extract meta data. {0}", ex.
getMessage());
LOG.log(Level.FINEST, "", ex);
} catch (NoClassDefFoundError ex) {
LOG.log(Level.SEVERE, "Could not extract meta data. {0}", ex.
getMessage());
LOG.log(Level.FINEST, "", ex);
}
return properties;
}
/** {@inheritDoc } */
@Override
public List<Concept> enrich(String story) throws EnrichException {
// OpenCalais support max 100KB per call
int chunkSize = 100000;
Iterable<String> chunks = Splitter.fixedLength(chunkSize).split(story);
List<Concept> concepts = new ArrayList<Concept>();
for (Iterator<String> i = chunks.iterator(); i.hasNext();) {
String chunk = i.next();
concepts.addAll(enrichChunk(chunk));
}
Set<Concept> set = new HashSet<Concept>(concepts);
concepts = new ArrayList<Concept>(set);
return concepts;
}
private List<Concept> enrichChunk(String chunk) throws EnrichException {
if (chunk.trim().isEmpty()) {
return new ArrayList<Concept>();
}
List<Concept> concepts = new ArrayList<Concept>();
PostMethod method = new PostMethod(OPEN_CALAIS_URL);
method.setRequestHeader("x-calais-licenseID",
cfgService.getString(ConfigurationKey.OPEN_CALAIS_API_KEY));
method.setRequestHeader("Content-Type", "text/raw; charset=UTF-8");
method.setRequestHeader("Accept", "application/json");
method.setRequestHeader("enableMetadataType", "SocialTags");
method.setRequestEntity(new StringRequestEntity(chunk));
boolean fail = false;
EnrichException exception = new EnrichException();
try {
HttpClient client = new HttpClient();
int returnCode = client.executeMethod(method);
if (returnCode == HttpStatus.SC_NOT_IMPLEMENTED) {
LOG.log(Level.WARNING, "The Post method is not implemented by this URI");
// still consume the response body
method.getResponseBodyAsString();
} else if (returnCode == HttpStatus.SC_OK) {
JSONObject response = JSONObject.fromObject(method.
getResponseBodyAsString());
List<OpenCalaisMapping> mappings = metaDataFacade.
getOpenCalaisMappings();
for (Object key : response.keySet()) {
String sKey = (String) key;
if (sKey.startsWith("http://d.opencalais.com/")) {
JSONObject entity = response.getJSONObject(sKey);
String typeGroup = (String) entity.get("_typeGroup");
String fieldValue = "";
// Mapping existing concepts
boolean mappingOccured = false;
for (OpenCalaisMapping mapping : mappings) {
try {
fieldValue = (String) entity.get(mapping.
getField());
;
} catch (Exception ex) {
fieldValue = "";
}
if (mapping.getTypeGroup().equals(typeGroup)
&& entity.containsKey(mapping.getField())
&& fieldValue.equals(mapping.getValue())) {
concepts.add(mapping.getConcept());
mappingOccured = true;
}
}
if (!mappingOccured) {
if (((String) entity.get("_typeGroup")).
equalsIgnoreCase("entities")) {
String conceptType =
(String) entity.get("_type");
String conceptName = (String) entity.get("name");
Concept match = null;
try {
match = metaDataFacade.findConceptByName(
conceptName);
} catch (DataNotFoundException dnfe) {
}
if (entity.containsKey("_type")) {
if (conceptType.equalsIgnoreCase("company")
|| conceptType.equalsIgnoreCase(
"organization")) {
if (match == null
|| (!(match instanceof Organisation))) {
match = new Organisation(conceptName,
"");
match = metaDataFacade.create(match);
}
if (match instanceof Organisation) {
concepts.add(match);
}
} else if (conceptType.equalsIgnoreCase(
"person")) {
if (match == null
|| (!(match instanceof Person))) {
match = new Person(conceptName, "");
match = metaDataFacade.create(match);
}
if (match instanceof Person) {
concepts.add(match);
}
} else if (conceptType.equalsIgnoreCase(
"city") || conceptType.
equalsIgnoreCase("country")
|| conceptType.equalsIgnoreCase(
"continent") || conceptType.
equalsIgnoreCase("ProvinceOrState")
|| conceptType.equalsIgnoreCase(
"region")) {
if (match == null
|| (!(match instanceof GeoArea))) {
match = new GeoArea(conceptName, "");
match = metaDataFacade.create(match);
}
if (match instanceof GeoArea) {
concepts.add(match);
}
} else if (conceptType.equalsIgnoreCase(
"facility")) {
if (match == null
|| (!(match instanceof PointOfInterest))) {
match = new PointOfInterest(
conceptName, "");
match = metaDataFacade.create(match);
}
if (match instanceof PointOfInterest) {
concepts.add(match);
}
}
}
}
}
}
}
} else {
LOG.log(Level.WARNING,
"Invalid response received from OpenCalais [{0}] {1}",
new Object[]{returnCode,
method.getResponseBodyAsString()});
}
} catch (Exception e) {
fail = true;
exception = new EnrichException(e);
LOG.log(Level.FINEST, "", e);
} finally {
method.releaseConnection();
}
if (fail) {
throw exception;
}
return concepts;
}
/** {@inheritDoc } */
@Override
public String extractContent(MediaItemRendition mir) {
String contentType = mir.getContentType();
String story = "";
if (contentType == null) {
LOG.log(Level.WARNING, "Content type is null");
return story;
}
if (contentType.equals("application/pdf")) {
// Extract text in PDF
try {
URL originalFile = new URL(mir.getAbsoluteFilename());
PDDocument doc = null;
try {
// Read PDF
PDFParser parser = new PDFParser(originalFile.openStream());
parser.parse();
COSDocument cosDoc = parser.getDocument();
PDDocument pdDoc = new PDDocument(cosDoc);
PDFTextStripper stripper = new PDFTextStripper();
story = stripper.getText(pdDoc);
} catch (IOException ex) {
LOG.log(Level.SEVERE, ex.getMessage());
LOG.log(Level.FINEST, "", ex);
} finally {
if (doc != null) {
try {
doc.close();
} catch (IOException ex) {
LOG.log(Level.SEVERE, ex.getMessage());
LOG.log(Level.FINEST, "", ex);
}
}
}
} catch (MalformedURLException ex) {
}
} else if (contentType.equals("application/msword")
|| contentType.equals(
"application/vnd.openxmlformats-officedocument.wordprocessingml.document")) {
try {
URL originalFile = new URL(mir.getAbsoluteFilename());
HWPFDocument doc = new HWPFDocument(originalFile.openStream());
WordExtractor extractor = new WordExtractor(doc);
story = extractor.getText();
} catch (IOException ex) {
LOG.log(Level.SEVERE, ex.getMessage());
LOG.log(Level.FINEST, "", ex);
}
}
return story;
}
}