/**
* Licensed to The Apereo Foundation under one or more contributor license
* agreements. See the NOTICE file distributed with this work for additional
* information regarding copyright ownership.
*
*
* The Apereo Foundation licenses this file to you under the Educational
* Community License, Version 2.0 (the "License"); you may not use this file
* except in compliance with the License. You may obtain a copy of the License
* at:
*
* http://opensource.org/licenses/ecl2.txt
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations under
* the License.
*
*/
package org.opencastproject.caption.converters;
import org.opencastproject.caption.api.Caption;
import org.opencastproject.caption.api.CaptionConverter;
import org.opencastproject.caption.api.CaptionConverterException;
import org.opencastproject.caption.api.IllegalTimeFormatException;
import org.opencastproject.caption.api.Time;
import org.opencastproject.caption.impl.CaptionImpl;
import org.opencastproject.caption.impl.TimeImpl;
import org.opencastproject.caption.util.TimeUtil;
import org.apache.commons.io.IOUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.Attributes;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.util.ArrayList;
import java.util.LinkedList;
import java.util.List;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerConfigurationException;
import javax.xml.transform.TransformerException;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
/**
 * Converter for DFXP, an XML based caption format. A DOM parser is used for both caption importing and exporting,
 * while a SAX parser is used for determining which languages are present (DFXP can contain multiple languages).
 * <p>
 * Parsers that read caller-supplied streams are configured so that external entities and external DTDs are never
 * resolved; the input is treated as untrusted.
 */
public class DFXPCaptionConverter implements CaptionConverter {

  /** logging utility */
  private static final Logger logger = LoggerFactory.getLogger(DFXPCaptionConverter.class);

  /** File extension reported for this caption format. */
  private static final String EXTENSION = "dfxp.xml";

  /** Parser feature: resolve external general entities (switched off for untrusted input). */
  private static final String FEATURE_EXTERNAL_GENERAL_ENTITIES
          = "http://xml.org/sax/features/external-general-entities";

  /** Parser feature: resolve external parameter entities (switched off for untrusted input). */
  private static final String FEATURE_EXTERNAL_PARAMETER_ENTITIES
          = "http://xml.org/sax/features/external-parameter-entities";

  /** Parser feature: fetch the external DTD subset even when not validating (switched off for untrusted input). */
  private static final String FEATURE_LOAD_EXTERNAL_DTD
          = "http://apache.org/xml/features/nonvalidating/load-external-dtd";

  /**
   * {@inheritDoc} Parser used for parsing XML document is DOM parser. Language parameter will determine which language
   * is searched for and parsed. If there is no matching language, empty collection is returned. If language parameter
   * is <code>null</code> first language found is parsed.
   * <p>
   * Captions whose start time is negative, whose stop time is not strictly after the start time, or whose time format
   * cannot be parsed are skipped with a warning.
   *
   * @param in
   *          stream containing the DFXP document
   * @param language
   *          value of the <code>xml:lang</code> attribute to look for, or <code>null</code> to take the first
   *          {@code <div>} element
   * @return list of parsed captions, possibly empty
   * @throws CaptionConverterException
   *           if the document cannot be read or parsed
   * @see org.opencastproject.caption.api.CaptionConverter#importCaption(java.io.InputStream, java.lang.String)
   */
  @Override
  public List<Caption> importCaption(InputStream in, String language) throws CaptionConverterException {
    List<Caption> collection = new ArrayList<Caption>();

    Document doc;
    try {
      DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
      // The stream comes from the caller: never let the parser reach out to external entities or DTDs (XXE).
      factory.setFeature(javax.xml.XMLConstants.FEATURE_SECURE_PROCESSING, true);
      factory.setFeature(FEATURE_EXTERNAL_GENERAL_ENTITIES, false);
      factory.setFeature(FEATURE_EXTERNAL_PARAMETER_ENTITIES, false);
      factory.setFeature(FEATURE_LOAD_EXTERNAL_DTD, false);
      DocumentBuilder builder = factory.newDocumentBuilder();
      doc = builder.parse(in);
      doc.getDocumentElement().normalize();
    } catch (ParserConfigurationException e) {
      throw new CaptionConverterException("Could not parse captions", e);
    } catch (SAXException e) {
      throw new CaptionConverterException("Could not parse captions", e);
    } catch (IOException e) {
      throw new CaptionConverterException("Could not parse captions", e);
    }

    // get all <div> elements since they contain information about language
    NodeList divElements = doc.getElementsByTagName("div");
    Element targetDiv = null;
    if (language != null) {
      // find first <div> element with matching language
      for (int i = 0; i < divElements.getLength(); i++) {
        Element n = (Element) divElements.item(i);
        if (n.getAttribute("xml:lang").equals(language)) {
          targetDiv = n;
          break;
        }
      }
    } else {
      if (divElements.getLength() > 1) {
        // more than one existing <div> element, no language specified
        logger.warn("More than one <div> element available. Parsing first one...");
      }
      if (divElements.getLength() != 0) {
        targetDiv = (Element) divElements.item(0);
      }
    }

    // check if we found node
    if (targetDiv == null) {
      logger.warn("No suitable <div> element found for language {}", language);
      return collection;
    }

    // Lower bound used to reject captions with a negative start time.
    final Time zero;
    try {
      zero = new TimeImpl(0, 0, 0, 0);
    } catch (IllegalTimeFormatException e) {
      // A zero timestamp being rejected is a programming error; fail loudly instead of continuing with null.
      throw new IllegalStateException("Unable to create zero start time", e);
    }

    NodeList pElements = targetDiv.getElementsByTagName("p");
    for (int i = 0; i < pElements.getLength(); i++) {
      try {
        Caption caption = parsePElement((Element) pElements.item(i));
        // check time
        if (caption.getStartTime().compareTo(zero) < 0
                || caption.getStopTime().compareTo(caption.getStartTime()) <= 0) {
          logger.warn("Caption with invalid time encountered. Skipping...");
          continue;
        }
        collection.add(caption);
      } catch (IllegalTimeFormatException e) {
        logger.warn("Caption with invalid time format encountered. Skipping...");
      }
    }
    return collection;
  }

  /**
   * Parse {@code <p>} element which contains one caption.
   *
   * @param p
   *          {@code <p>} element to be parsed
   * @return new {@link Caption} object
   * @throws IllegalTimeFormatException
   *           if time format does not match with expected format for DFXP
   */
  private Caption parsePElement(Element p) throws IllegalTimeFormatException {
    Time begin = TimeUtil.importDFXP(p.getAttribute("begin").trim());
    Time end = TimeUtil.importDFXP(p.getAttribute("end").trim());
    // FIXME add logic for duration if end is absent

    // get text inside p; one array entry per line
    String[] textArray = getTextCore(p).split("\n");
    return new CaptionImpl(begin, end, textArray);
  }

  /**
   * Returns caption text stripped of all tags. Text nodes are copied verbatim, {@code <br>} elements become line
   * breaks and every other child node is processed recursively.
   * <p>
   * Note that the result is trimmed at every recursion level, so leading and trailing whitespace inside nested
   * elements is dropped as well.
   *
   * @param p
   *          {@code <p>} element (or one of its descendants) to be parsed
   * @return Caption text with \n as new line character
   */
  private String getTextCore(Node p) {
    StringBuilder captionText = new StringBuilder();
    NodeList children = p.getChildNodes();
    for (int i = 0; i < children.getLength(); i++) {
      Node child = children.item(i);
      if (child.getNodeType() == Node.TEXT_NODE) {
        captionText.append(child.getTextContent());
      } else if ("br".equals(child.getNodeName())) {
        captionText.append("\n");
      } else {
        captionText.append(getTextCore(child));
      }
    }
    return captionText.toString().trim();
  }

  /**
   * {@inheritDoc} DOM parser is used to parse template from which whole document is then constructed.
   * <p>
   * A new {@code <div>} carrying the given language (or <code>und</code> if <code>language</code> is
   * <code>null</code>) is appended to the template's {@code <body>}, with one {@code <p>} per caption. A caption
   * without any text lines produces an empty {@code <p>}. The document is written as UTF-8 and the output stream is
   * closed before this method returns.
   *
   * @param outputStream
   *          stream the DFXP document is written to; closed by this method
   * @param captions
   *          captions to export
   * @param language
   *          language of the captions, may be <code>null</code>
   * @throws IOException
   *           if writing to the output stream fails
   */
  @Override
  public void exportCaption(OutputStream outputStream, List<Caption> captions, String language) throws IOException {
    // get document builder factory and parse template
    DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
    Document doc = null;
    InputStream is = null;
    try {
      DocumentBuilder builder = factory.newDocumentBuilder();
      // load dfxp template from file
      is = DFXPCaptionConverter.class.getResourceAsStream("/templates/template.dfxp.xml");
      doc = builder.parse(is);
    } catch (ParserConfigurationException e) {
      // should not happen
      throw new RuntimeException(e);
    } catch (SAXException e) {
      // should not happen unless template is invalid
      throw new RuntimeException(e);
    } catch (IOException e) {
      // should not happen
      throw new RuntimeException(e);
    } finally {
      IOUtils.closeQuietly(is);
    }

    // retrieve body element
    Node bodyNode = doc.getElementsByTagName("body").item(0);

    // create new div element with specified language
    Element divNode = doc.createElement("div");
    divNode.setAttribute("xml:lang", language != null ? language : "und");
    bodyNode.appendChild(divNode);

    // update document
    for (Caption caption : captions) {
      Element newNode = doc.createElement("p");
      newNode.setAttribute("begin", TimeUtil.exportToDFXP(caption.getStartTime()));
      newNode.setAttribute("end", TimeUtil.exportToDFXP(caption.getStopTime()));

      // text part: lines separated by <br>; tolerate captions that carry no text lines at all
      String[] captionText = caption.getCaption();
      if (captionText != null) {
        for (int i = 0; i < captionText.length; i++) {
          if (i > 0) {
            newNode.appendChild(doc.createElement("br"));
          }
          newNode.appendChild(doc.createTextNode(captionText[i]));
        }
      }
      divNode.appendChild(newNode);
    }

    // initialize stream writer
    OutputStreamWriter osw = new OutputStreamWriter(outputStream, "UTF-8");
    StreamResult result = new StreamResult(osw);
    DOMSource source = new DOMSource(doc);
    TransformerFactory tfactory = TransformerFactory.newInstance();
    Transformer transformer;
    try {
      transformer = tfactory.newTransformer();
      transformer.transform(source, result);
      osw.flush();
    } catch (TransformerConfigurationException e) {
      // should not happen
      throw new RuntimeException(e);
    } catch (TransformerException e) {
      // should not happen
      throw new RuntimeException(e);
    } finally {
      // closing the writer also closes the underlying output stream
      IOUtils.closeQuietly(osw);
    }
  }

  /**
   * {@inheritDoc} Uses SAX parser to quickly read the document and retrieve available languages.
   * <p>
   * Each distinct <code>xml:lang</code> value found on a {@code <div>} element is reported once, in document order.
   * {@code <div>} elements without the attribute are ignored with a warning.
   *
   * @param input
   *          stream containing the DFXP document
   * @return languages found in the document, possibly empty
   * @throws CaptionConverterException
   *           if the document cannot be read or parsed
   * @see org.opencastproject.caption.api.CaptionConverter#getLanguageList(java.io.InputStream)
   */
  @Override
  public String[] getLanguageList(InputStream input) throws CaptionConverterException {
    // create lang list
    final List<String> langList = new LinkedList<String>();

    // get SAX parser
    SAXParserFactory factory = SAXParserFactory.newInstance();
    try {
      // The stream comes from the caller: never let the parser reach out to external entities or DTDs (XXE).
      factory.setFeature(javax.xml.XMLConstants.FEATURE_SECURE_PROCESSING, true);
      factory.setFeature(FEATURE_EXTERNAL_GENERAL_ENTITIES, false);
      factory.setFeature(FEATURE_EXTERNAL_PARAMETER_ENTITIES, false);
      factory.setFeature(FEATURE_LOAD_EXTERNAL_DTD, false);
      SAXParser parser = factory.newSAXParser();

      // create handler
      DefaultHandler handler = new DefaultHandler() {
        @Override
        public void startElement(String uri, String localName, String qName, Attributes attributes)
                throws SAXException {
          if ("div".equals(qName)) {
            // we found div tag - let's make a lookup for language
            String lang = attributes.getValue("xml:lang");
            if (lang == null) {
              // should never happen
              logger.warn("Missing xml:lang attribute for div element.");
            } else if (langList.contains(lang)) {
              logger.warn("Multiple div elements with same language.");
            } else {
              langList.add(lang);
            }
          }
        }
      };

      // parse stream
      parser.parse(input, handler);
    } catch (ParserConfigurationException e) {
      // should not happen
      throw new RuntimeException(e);
    } catch (SAXException e) {
      throw new CaptionConverterException("Could not parse captions", e);
    } catch (IOException e) {
      // reading the caller's stream failed; report it the same way importCaption does
      throw new CaptionConverterException("Could not parse captions", e);
    }
    return langList.toArray(new String[0]);
  }

  /**
   * {@inheritDoc}
   *
   * @see org.opencastproject.caption.api.CaptionConverter#getExtension()
   */
  @Override
  public String getExtension() {
    return EXTENSION;
  }
}