/*
* Copyright (C) 2015 Jan Pokorsky
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package cz.cas.lib.proarc.common.export.cejsh;
import cz.cas.lib.proarc.common.export.ExportUtils;
import cz.cas.lib.proarc.common.export.mets.ValidationErrorHandler;
import cz.cas.lib.proarc.common.fedora.DigitalObjectException;
import cz.cas.lib.proarc.common.fedora.FoxmlUtils;
import cz.cas.lib.proarc.common.mods.custom.ModsConstants;
import cz.cas.lib.proarc.common.object.DescriptionMetadata;
import cz.cas.lib.proarc.common.object.DigitalObjectElement;
import cz.cas.lib.proarc.common.object.MetadataHandler;
import cz.cas.lib.proarc.common.xml.ProarcXmlUtils;
import cz.cas.lib.proarc.common.xml.SimpleNamespaceContext;
import cz.cas.lib.proarc.common.xml.TransformErrorListener;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.StringReader;
import java.io.Writer;
import java.util.ArrayList;
import java.util.Date;
import java.util.GregorianCalendar;
import java.util.List;
import java.util.Properties;
import java.util.TimeZone;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Pattern;
import javax.xml.XMLConstants;
import javax.xml.bind.DatatypeConverter;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Result;
import javax.xml.transform.Source;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerConfigurationException;
import javax.xml.transform.TransformerException;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import javax.xml.transform.stream.StreamSource;
import javax.xml.validation.Schema;
import javax.xml.validation.SchemaFactory;
import javax.xml.validation.Validator;
import javax.xml.xpath.XPath;
import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathExpression;
import javax.xml.xpath.XPathExpressionException;
import javax.xml.xpath.XPathFactory;
import net.lingala.zip4j.core.ZipFile;
import net.lingala.zip4j.exception.ZipException;
import net.lingala.zip4j.model.ZipParameters;
import net.lingala.zip4j.util.Zip4jConstants;
import org.apache.commons.io.Charsets;
import org.w3c.dom.DOMException;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
/**
* Builds the cejsh package of articles.
*
* @author Jan Pokorsky
* @see <a href='https://trac.icm.edu.pl/yadda/Public:BWmeta_format'>BWmeta format</a>
*/
class CejshBuilder {
/** Name of the properties file written into the package folder. */
static final String IMPORT_PROPERTIES_FILENAME = "import.properties";
/** Name of the package subfolder that receives the bwmeta document. */
static final String IMPORTS_NEW_FILENAME = "imports_new";
/**
* The Cejsh namespace.
*/
public static final String NS_BWMETA105 = "http://yadda.icm.edu.pl/bwmeta-1.0.5.xsd";
/** Resource name of the bwmeta XML schema; resolved relative to this class. */
static final String XSD_FILENAME = "bwmeta-1.0.5.xsd";
/** File name of the generated bwmeta document. */
static final String P0XML_FILENAME = "p0.xml";
// Keys of the entries stored in the import properties file.
static final String PROP_IMPORT_BWMETA_FILES = "import.import_bwmeta_files";
static final String PROP_IMPORT_CONTENT_FILES = "import.content_files";
static final String PROP_IMPORT_OBJECTS = "import.import_objects";
static final String PROP_IMPORT_INFODATE = "import.info.date";
/** Time zone of the calendar used to print the import date. */
static final TimeZone UTC = TimeZone.getTimeZone("UTC");
/** The bwmeta schema; created on first use by {@link #getBwSchema()}. */
private static Schema SCHEMA_BWMETA;
/** Matches characters not permitted in file name fragments; compiled on first use by {@link #safeFilename}. */
private static Pattern SAFE_FILENAME_RE;
private static final Logger LOG = Logger.getLogger(CejshBuilder.class.getName());
/** Transformer created from the XSL referenced by the configuration. */
private final Transformer bwmetaXsl;
/** Error listener registered with {@link #bwmetaXsl} in the constructor. */
private final TransformErrorListener tranformationErrorHandler;
/** Validator of bwmeta documents; created on first use by {@link #validateCejshXml}. */
private Validator bwValidator;
/** Namespace aware DOM builder used to parse MODS and to create the modsCollection document. */
private final DocumentBuilder db;
// XPath expressions compiled in the constructor; the prefix "m" is bound to the MODS namespace.
private final XPathExpression issnPath;
private final XPathExpression partNumberPath;
private final XPathExpression dateIssuedPath;
private final XPathExpression reviewedArticlePath;
/** UTC calendar reused by {@link #writeProperties} to print the import date. */
private final GregorianCalendar gcalendar;
// Title, volume and issue currently held by the builder; set by addTitle, addVolume and addIssue.
private Title title;
private Volume volume;
private Issue issue;
/** Level used for log messages of the builder; taken from the configuration. */
private Level logLevel;
/**
 * Prepares the XSL transformer, the DOM builder and the XPath expressions
 * used to build packages.
 *
 * @param config provides the log level and the location of the XSL
 * @throws TransformerConfigurationException if the XSL cannot be loaded
 * @throws ParserConfigurationException if the DOM builder cannot be created
 * @throws XPathExpressionException if an XPath expression cannot be compiled
 */
public CejshBuilder(CejshConfig config) throws TransformerConfigurationException, ParserConfigurationException, XPathExpressionException {
    this.gcalendar = new GregorianCalendar(UTC);
    this.logLevel = config.getLogLevel();

    // XSL transformer with its own error listener
    TransformErrorListener xslErrors = new TransformErrorListener();
    Transformer xsl = TransformerFactory.newInstance()
            .newTransformer(new StreamSource(config.getCejshXslUrl()));
    if (xsl == null) {
        throw new TransformerConfigurationException("Cannot load XSL: " + config.getCejshXslUrl());
    }
    xsl.setOutputProperty(OutputKeys.INDENT, "yes");
    xsl.setErrorListener(xslErrors);
    this.tranformationErrorHandler = xslErrors;
    this.bwmetaXsl = xsl;

    // namespace aware DOM builder
    DocumentBuilderFactory domFactory = DocumentBuilderFactory.newInstance();
    domFactory.setNamespaceAware(true);
    this.db = domFactory.newDocumentBuilder();

    // XPath expressions over MODS; "m" is the MODS namespace prefix
    XPath modsXPath = ProarcXmlUtils.defaultXPathFactory().newXPath();
    modsXPath.setNamespaceContext(new SimpleNamespaceContext().add("m", ModsConstants.NS));
    this.issnPath = modsXPath.compile("m:mods/m:identifier[@type='issn' and not(@invalid)]");
    this.partNumberPath = modsXPath.compile("m:mods/m:titleInfo/m:partNumber");
    this.dateIssuedPath = modsXPath.compile("m:mods/m:originInfo/m:dateIssued");
    this.reviewedArticlePath = modsXPath.compile("m:mods/m:genre[text()='article' and @type='peer-reviewed']");
}
/**
 * @return the compiled expression {@code m:mods/m:originInfo/m:dateIssued}
 */
public XPathExpression getDateIssuedPath() {
    return this.dateIssuedPath;
}

/**
 * @return the compiled expression selecting the valid ISSN identifier of MODS
 */
public XPathExpression getIssnPath() {
    return this.issnPath;
}

/**
 * @return the compiled expression {@code m:mods/m:titleInfo/m:partNumber}
 */
public XPathExpression getPartNumberPath() {
    return this.partNumberPath;
}

/**
 * @return the compiled expression matching the peer-reviewed article genre
 */
public XPathExpression getReviewedArticlePath() {
    return this.reviewedArticlePath;
}
/**
 * Reads MODS of the article and creates its export description.
 *
 * @param article the article digital object
 * @param p the export context
 * @return the result of {@link #addArticle(Document, DigitalObjectElement, CejshContext)}
 */
public Article addArticle(DigitalObjectElement article, CejshContext p) {
    return addArticle(getModsDom(article, p), article, p);
}
/**
 * Creates the export description of an article from its parsed MODS.
 * <p>Problems are reported to the status handler of the context and
 * signalled by a {@code null} result.
 *
 * @param articleDom parsed MODS of the article; {@code null} is reported as an error
 * @param article the article digital object
 * @param p the export context providing the status handler
 * @return the article marked as reviewed, an empty article marked as not
 *         reviewed when MODS does not match the peer-reviewed genre,
 *         or {@code null} when an error was reported
 */
Article addArticle(Document articleDom, DigitalObjectElement article, CejshContext p) {
    if (articleDom == null) {
        p.getStatus().error(article, "Missing article MODS!", null, null);
        return null;
    }
    try {
        Object isReviewed = getReviewedArticlePath().evaluate(articleDom, XPathConstants.BOOLEAN);
        if (!Boolean.TRUE.equals(isReviewed)) {
            LOG.log(logLevel, "Skipped not reviewed article: {0}", article.getPid());
            return new Article().setReviewed(false);
        }
    } catch (XPathExpressionException ex) {
        p.getStatus().error(article, "Unexpected error!", null, ex);
        // the review state is unknown; never export such an article as reviewed
        return null;
    }
    if (getTitle() == null) {
        p.getStatus().error(article, "Missing title!", null, null);
        return null;
    }
    if (getVolume() == null) {
        p.getStatus().error(article, "Missing volume!", null, null);
        return null;
    }
    try {
        String articleIssn = getIssnPath().evaluate(articleDom);
        // XXX check mods vs modsCollection?
        Element modsElement = articleDom.getDocumentElement();
        return new Article(article, modsElement, articleIssn)
                .setReviewed(true);
    } catch (Exception ex) {
        p.getStatus().error(article, "Unexpected error!", null, ex);
    }
    return null;
}
/**
 * Resolves ISSN of the package. ISSN of the issue takes precedence over
 * ISSN of the title.
 *
 * @return the ISSN or {@code "NA"} when neither the issue nor the title provides one
 */
String getPackageIssn() {
    Issue currentIssue = getIssue();
    String issueIssn = currentIssue == null ? null : currentIssue.getIssn();
    if (issueIssn != null && !issueIssn.isEmpty()) {
        return issueIssn;
    }
    Title currentTitle = getTitle();
    String titleIssn = currentTitle == null ? null : currentTitle.getIssn();
    if (titleIssn != null && !titleIssn.isEmpty()) {
        return titleIssn;
    }
    return "NA";
}
/**
 * Transforms a collection of articles to the bwmeta document.
 * <p>The ISSN of the package and, when available, the volume and the issue
 * values are passed to the XSL as parameters.
 *
 * @param src modsCollection in MODS format
 * @param dst bwmeta document
 * @return the error handler; it lists errors of this transformation
 */
TransformErrorListener createCejshXml(Source src, Result dst) {
    // Drop only the parameters of the previous run. Transformer.reset() would
    // also discard the output properties and is not guaranteed to keep
    // the error listener registered in the constructor.
    bwmetaXsl.clearParameters();
    String packageIssn = getPackageIssn();
    bwmetaXsl.setParameter("issn", packageIssn);
    if (getVolume() != null) {
        bwmetaXsl.setParameter("volume", getVolume().getVolumeNumber());
        bwmetaXsl.setParameter("volumeId", getVolume().getVolumeId());
        bwmetaXsl.setParameter("year", getVolume().getYear());
    }
    if (getIssue() != null) {
        bwmetaXsl.setParameter("issue", getIssue().getIssueNumber());
        bwmetaXsl.setParameter("issueId", getIssue().getIssueId());
    }
    try {
        tranformationErrorHandler.reset();
        bwmetaXsl.transform(src, dst);
    } catch (TransformerException ex) {
        // make sure the failure is visible even if the listener was not notified
        if (tranformationErrorHandler.getErrors().isEmpty()) {
            tranformationErrorHandler.getErrors().add(ex.getMessageAndLocation());
        }
    }
    return tranformationErrorHandler;
}
/**
 * Writes the package of articles: the package folder with the import
 * properties and the bwmeta document, and the zip of the folder in case
 * the bwmeta document is valid. Failures are reported to the status
 * handler of the context.
 *
 * @param packageElm the digital object the package is built for
 * @param articles articles to export
 * @param p the export context
 * @return the package folder; {@code null} if there are no articles
 *         or the folder was not created
 */
public File writePackage(DigitalObjectElement packageElm, List<Article> articles, CejshContext p) {
    if (articles == null || articles.isEmpty()) {
        return null;
    }
    File pkgDir = null;
    try {
        Document modsCollection = mergeElements(articles);
        String name = createPackageName();
        pkgDir = ExportUtils.createFolder(p.getOutput(), name);
        File importsDir = new File(pkgDir, IMPORTS_NEW_FILENAME);
        importsDir.mkdirs();
        writeProperties(pkgDir, articles.size());

        File bwmetaFile = new File(importsDir, P0XML_FILENAME);
        TransformErrorListener xslErrors = createCejshXml(
                new DOMSource(modsCollection), new StreamResult(bwmetaFile));
        if (!xslErrors.getErrors().isEmpty()) {
            p.getStatus().error(packageElm, "Validation error!", xslErrors.getErrors().toString(), null);
            return pkgDir;
        }

        // the document is validated from the disk; the written file stays
        // available to the admin to check the output
        List<String> schemaErrors = validateCejshXml(new StreamSource(bwmetaFile));
        if (!schemaErrors.isEmpty()) {
            p.getStatus().error(packageElm, "Validation error!", ExportUtils.toString(schemaErrors), null);
        } else {
            writeZip(pkgDir, p.getStatus(), packageElm);
        }
    } catch (Exception ex) {
        p.getStatus().error(packageElm, "Unexpected error!", null, ex);
    }
    return pkgDir;
}
/**
 * Zips files of the package folder. Entry names are relative to the folder.
 *
 * @param pkgFile the zip file to write
 * @param packageFolder the folder with package contents
 * @throws ZipException if zipping fails
 */
void writeZip(File pkgFile, File packageFolder) throws ZipException {
    ZipFile archive = new ZipFile(pkgFile);
    ZipParameters params = new ZipParameters();
    // NOTE(review): an encryption method is selected but no password is set
    // in this method; confirm whether the setting has any effect.
    params.setEncryptionMethod(Zip4jConstants.ENC_METHOD_STANDARD);
    params.setCompressionMethod(Zip4jConstants.COMP_DEFLATE);
    params.setCompressionLevel(Zip4jConstants.DEFLATE_LEVEL_NORMAL);
    params.setIncludeRootFolder(false);
    params.setDefaultFolderPath(packageFolder.getAbsolutePath());
    ArrayList<File> content = listZipFiles(packageFolder);
    archive.addFiles(content, params);
}
/**
 * Lists all files that should be part of a zip. Folder entries are not part of the list.
 * Files of a folder precede files of its subfolders.
 * See issue #413.
 *
 * @param folder a folder to scan
 * @return the list of files; empty if the folder cannot be listed
 */
private static ArrayList<File> listZipFiles(File folder) {
    ArrayList<File> files = new ArrayList<File>();
    // listFiles returns null if the path is not a directory or cannot be read
    File[] children = folder.listFiles();
    if (children == null) {
        return files;
    }
    ArrayList<File> subfiles = new ArrayList<File>();
    for (File child : children) {
        if (child.isFile()) {
            files.add(child);
        } else if (child.isDirectory()) {
            subfiles.addAll(listZipFiles(child));
        }
    }
    files.addAll(subfiles);
    return files;
}
/**
 * Zips the package folder to a file placed next to the folder and named
 * after the folder with the {@code .zip} suffix.
 *
 * @param packageFolder the folder to zip
 * @param status the handler that receives a zipping error
 * @param packageElm the digital object the error is reported for
 * @return the zip file or {@code null} if zipping failed
 */
File writeZip(File packageFolder, CejshStatusHandler status, DigitalObjectElement packageElm) {
    File zip = new File(packageFolder.getParentFile(), packageFolder.getName() + ".zip");
    try {
        writeZip(zip, packageFolder);
    } catch (ZipException ex) {
        status.error(packageElm, "Zipping error!", zip.getPath(), ex);
        return null;
    }
    return zip;
}
/**
 * Gets the bwmeta schema. The schema is created from the class resource
 * on the first call and then reused.
 *
 * @return the schema
 * @throws SAXException if the schema cannot be created
 */
static Schema getBwSchema() throws SAXException {
    if (SCHEMA_BWMETA != null) {
        return SCHEMA_BWMETA;
    }
    SchemaFactory xsdFactory = SchemaFactory.newInstance(XMLConstants.W3C_XML_SCHEMA_NS_URI);
    SCHEMA_BWMETA = xsdFactory.newSchema(CejshBuilder.class.getResource(XSD_FILENAME));
    return SCHEMA_BWMETA;
}
/**
 * Validates a bwmeta document against the bwmeta schema. The validator
 * is created on the first call and then reused.
 *
 * @param bwmeta the document to validate
 * @return the error list of the validator's handler; it is cleared at the
 *         beginning of each call and is empty for a valid document
 * @throws SAXException if the schema cannot be created
 * @throws IOException if the document cannot be read
 */
List<String> validateCejshXml(Source bwmeta) throws SAXException, IOException {
    if (bwValidator == null) {
        Validator validator = getBwSchema().newValidator();
        validator.setErrorHandler(new ValidationErrorHandler());
        bwValidator = validator;
    }
    ValidationErrorHandler handler = (ValidationErrorHandler) bwValidator.getErrorHandler();
    List<String> errors = handler.getValidationErrors();
    errors.clear();
    try {
        bwValidator.validate(bwmeta);
    } catch (SAXException ex) {
        // the failure that stopped the validation goes first
        errors.add(0, ex.getMessage());
    }
    return errors;
}
/**
 * Writes the import properties of the package in UTF-8. The date comment
 * of {@link Properties#store} is filtered out.
 *
 * @param packageFolder the folder to write the file to
 * @param articleCount the number of exported articles
 * @return the written file
 * @throws IOException if the file cannot be written
 * @throws FileNotFoundException if the file cannot be created
 */
File writeProperties(File packageFolder, int articleCount) throws IOException, FileNotFoundException {
    File target = new File(packageFolder, IMPORT_PROPERTIES_FILENAME);
    Properties props = new Properties();
    // the import date is the current time printed in UTC
    gcalendar.setTimeInMillis(System.currentTimeMillis());
    props.setProperty(PROP_IMPORT_INFODATE, DatatypeConverter.printDateTime(gcalendar));
    props.setProperty(PROP_IMPORT_OBJECTS, String.valueOf(articleCount));
    props.setProperty(PROP_IMPORT_CONTENT_FILES, "0");
    props.setProperty(PROP_IMPORT_BWMETA_FILES, "1");
    Writer out = new NoCommentsWriter(
            new OutputStreamWriter(new FileOutputStream(target), Charsets.UTF_8));
    try {
        props.store(out, null);
    } finally {
        out.close();
    }
    return target;
}
/**
 * Builds the package name in the form
 * {@code ISSN_publicationYear_volumePartNumber_issuePartNumber}.
 * A missing volume or issue does not fail; its fragments are
 * resolved by {@link #safeFilename} the same way as missing values.
 *
 * @return the package name
 */
String createPackageName() {
    Volume currentVolume = getVolume();
    Issue currentIssue = getIssue();
    String issn = getPackageIssn();
    String issueNumber = safeFilename(currentIssue == null ? null : currentIssue.getIssueNumber());
    String volumeYear = safeFilename(currentVolume == null ? null : currentVolume.getYear());
    String volumeNumber = safeFilename(currentVolume == null ? null : currentVolume.getVolumeNumber());
    return String.format("%s_%s_%s_%s", issn, volumeYear, volumeNumber, issueNumber);
}
/**
 * Makes a value safe to use as a fragment of a file name. Only digits,
 * ASCII letters and the characters {@code - . , ;} are kept.
 *
 * @param name the value to clean up; may be {@code null}
 * @return the cleaned value, or {@code "NA"} if the value is {@code null}
 *         or nothing is left after the clean up
 */
static String safeFilename(String name) {
    if (name == null) {
        return "NA";
    }
    if (SAFE_FILENAME_RE == null) {
        // replace even '_' as it separates values in the result filename
        SAFE_FILENAME_RE = Pattern.compile("[^0-9a-zA-Z-.,;]");
    }
    // replaceAll returns a new string; the result must not be dropped
    String safeName = SAFE_FILENAME_RE.matcher(name).replaceAll("");
    return safeName.isEmpty() ? "NA" : safeName;
}
/**
 * Builds modsCollection from mods of articles. The mods elements are
 * adopted by the new document.
 *
 * @param articles articles providing mods elements
 * @return the document with modsCollection as the root element
 * @throws DOMException if an element cannot be adopted or appended
 */
Document mergeElements(List<Article> articles) throws DOMException {
    Document collectionDoc = db.newDocument();
    Element collection = collectionDoc.createElementNS(ModsConstants.NS, "modsCollection");
    collectionDoc.appendChild(collection);
    for (Article item : articles) {
        Node adopted = collectionDoc.adoptNode(item.getModsElement());
        collection.appendChild(adopted);
    }
    return collectionDoc;
}
/**
 * Reads ISSN from MODS of the title and makes the title current.
 * Failures are reported to the status handler of the context.
 *
 * @param current the digital object failures are reported for
 * @param title the title digital object
 * @param p the export context
 * @return {@code true} if the title was set
 */
boolean addTitle(DigitalObjectElement current, DigitalObjectElement title, CejshContext p) {
    if (title == null) {
        p.getStatus().error(current, "Missing title object!", null);
        return false;
    }
    if (getTitle() != null) {
        p.getStatus().error(current, "Title inside title? " + title.toLog() + ". Child title: " + getTitle().getIssn(), null);
        return false;
    }
    Document modsDom = getModsDom(title, p);
    if (modsDom == null) {
        p.getStatus().error(current, "Missing title MODS!" + title.toLog(), null);
        return false;
    }
    try {
        Title parsed = new Title();
        parsed.setIssn(getIssnPath().evaluate(modsDom));
        setTitle(parsed);
    } catch (XPathExpressionException ex) {
        p.getStatus().error(current, "Invalid XPath!", ex);
        return false;
    }
    return true;
}
/**
 * Reads the volume number and the year from MODS of the volume and makes
 * the volume current. Empty values are replaced with {@code "NA"}.
 * Failures are reported to the status handler of the context.
 *
 * @param current the digital object failures are reported for
 * @param volume the volume digital object
 * @param p the export context
 * @return {@code true} if the volume was set
 */
boolean addVolume(DigitalObjectElement current, DigitalObjectElement volume, CejshContext p) {
    if (volume == null) {
        p.getStatus().error(current, "Missing volume object!", null);
        return false;
    }
    if (getVolume() != null) {
        p.getStatus().error(current, "Volume inside volume? " + volume.toLog() + ". Child volume: " + getVolume().getVolumeId(), null);
        return false;
    }
    Document modsDom = getModsDom(volume, p);
    if (modsDom == null) {
        p.getStatus().error(current, "Missing volume MODS!" + volume.toLog(), null);
        return false;
    }
    try {
        Volume parsed = new Volume();
        parsed.setVolumeId(FoxmlUtils.pidAsUuid(volume.getPid()));
        String number = getPartNumberPath().evaluate(modsDom);
        parsed.setVolumeNumber(number == null || number.isEmpty() ? "NA" : number);
        String year = getDateIssuedPath().evaluate(modsDom);
        parsed.setYear(year == null || year.isEmpty() ? "NA" : year);
        setVolume(parsed);
    } catch (XPathExpressionException ex) {
        p.getStatus().error(current, "Invalid XPath!", ex);
        return false;
    }
    return true;
}
/**
 * Reads the issue number and ISSN from MODS of the issue and makes the
 * issue current. An empty issue number is replaced with {@code "NA"}.
 * Failures are reported to the status handler of the context.
 *
 * @param current the digital object failures are reported for
 * @param issue the issue digital object
 * @param p the export context
 * @return {@code true} if the issue was set
 */
boolean addIssue(DigitalObjectElement current, DigitalObjectElement issue, CejshContext p) {
    if (issue == null) {
        p.getStatus().error(current, "Missing issue object!", null);
        return false;
    }
    if (getIssue() != null) {
        p.getStatus().error(current, "Issue inside issue? " + issue.toLog() + ". Child issue: " + getIssue().getIssueId(), null);
        return false;
    }
    Document modsDom = getModsDom(issue, p);
    if (modsDom == null) {
        p.getStatus().error(current, "Missing issue MODS!" + issue.toLog(), null);
        return false;
    }
    try {
        Issue parsed = new Issue();
        parsed.setIssueId(FoxmlUtils.pidAsUuid(issue.getPid()));
        String number = getPartNumberPath().evaluate(modsDom);
        parsed.setIssueNumber(number == null || number.isEmpty() ? "NA" : number);
        parsed.setIssn(getIssnPath().evaluate(modsDom));
        setIssue(parsed);
    } catch (XPathExpressionException ex) {
        p.getStatus().error(current, "Invalid XPath!", ex);
        return false;
    }
    return true;
}
/**
 * @return the current title or {@code null}
 */
public Title getTitle() {
    return this.title;
}

/**
 * @param title the title to make current
 */
public void setTitle(Title title) {
    this.title = title;
}

/**
 * @return the current volume or {@code null}
 */
public Volume getVolume() {
    return this.volume;
}

/**
 * Logs the volume and makes it current.
 *
 * @param volume the volume to make current
 */
public void setVolume(Volume volume) {
    LOG.log(this.logLevel, String.valueOf(volume));
    this.volume = volume;
}

/**
 * @return the current issue or {@code null}
 */
public Issue getIssue() {
    return this.issue;
}

/**
 * Logs the issue and makes it current.
 *
 * @param issue the issue to make current
 */
public void setIssue(Issue issue) {
    LOG.log(this.logLevel, String.valueOf(issue));
    this.issue = issue;
}
/**
 * Reads MODS of a digital object and parses it to DOM. Failures are
 * reported to the status handler of the context.
 *
 * @param elm the digital object to read
 * @param p the export context
 * @return the parsed MODS or {@code null} if a failure was reported
 */
private Document getModsDom(DigitalObjectElement elm, CejshContext p) {
    Document parsed = null;
    try {
        MetadataHandler<?> handler = elm.getHandler().metadata();
        DescriptionMetadata<String> description = handler.getMetadataAsXml();
        String xml = description.getData();
        parsed = db.parse(new InputSource(new StringReader(xml)));
    } catch (DigitalObjectException ex) {
        p.getStatus().error(elm, "Missing MODS!", ex);
    } catch (SAXException ex) {
        p.getStatus().error(elm, "Invalid MODS!", ex);
    } catch (IOException ex) {
        p.getStatus().error(elm, null, ex);
    }
    return parsed;
}
/**
 * @return the DOM builder of this builder
 */
DocumentBuilder getDocumentBuilder() {
    return this.db;
}

/**
 * @return errors held by the XSL error listener
 */
List<String> getTranformationErrors() {
    return this.tranformationErrorHandler.getErrors();
}
/**
 * Values of a title used by the export.
 */
static class Title {

    private String issn;

    /** @return ISSN of the title; may be {@code null} */
    public String getIssn() {
        return this.issn;
    }

    /** @param issn ISSN of the title */
    public void setIssn(String issn) {
        this.issn = issn;
    }

    @Override
    public String toString() {
        StringBuilder text = new StringBuilder("Title{");
        text.append("issn=").append(issn);
        return text.append('}').toString();
    }
}
/**
 * Values of a volume used by the export.
 */
static class Volume {

    private String volumeId;
    private String volumeNumber;
    private String year;

    /** @return the year of the volume */
    public String getYear() {
        return this.year;
    }

    /** @param year the year of the volume */
    public void setYear(String year) {
        this.year = year;
    }

    /** @return the number of the volume */
    public String getVolumeNumber() {
        return this.volumeNumber;
    }

    /** @param volumeNumber the number of the volume */
    public void setVolumeNumber(String volumeNumber) {
        this.volumeNumber = volumeNumber;
    }

    /** @return the identifier of the volume */
    public String getVolumeId() {
        return this.volumeId;
    }

    /** @param volumeId the identifier of the volume */
    public void setVolumeId(String volumeId) {
        this.volumeId = volumeId;
    }

    @Override
    public String toString() {
        StringBuilder text = new StringBuilder("Volume{");
        text.append("year=").append(year);
        text.append(", volumeNumber=").append(volumeNumber);
        text.append(", volumeId=").append(volumeId);
        return text.append('}').toString();
    }
}
/**
 * Values of an issue used by the export.
 */
static class Issue {

    private String issn;
    private String issueId;
    private String issueNumber;

    /** @return the number of the issue */
    public String getIssueNumber() {
        return this.issueNumber;
    }

    /** @param issueNumber the number of the issue */
    public void setIssueNumber(String issueNumber) {
        this.issueNumber = issueNumber;
    }

    /** @return the identifier of the issue */
    public String getIssueId() {
        return this.issueId;
    }

    /** @param issueId the identifier of the issue */
    public void setIssueId(String issueId) {
        this.issueId = issueId;
    }

    /** @return ISSN of the issue; may be {@code null} */
    public String getIssn() {
        return this.issn;
    }

    /** @param issn ISSN of the issue */
    public void setIssn(String issn) {
        this.issn = issn;
    }

    @Override
    public String toString() {
        StringBuilder text = new StringBuilder("Issue{");
        text.append("issueNumber=").append(issueNumber);
        text.append(", issueId=").append(issueId);
        text.append(", issn=").append(issn);
        return text.append('}').toString();
    }
}
/**
 * An article to export: the digital object, its mods element, its ISSN
 * and the flag whether the article is reviewed.
 */
static class Article {

    private DigitalObjectElement article;
    private Element mods;
    private String issn;
    private boolean reviewed;

    /** Creates an article with no values; it is not reviewed. */
    public Article() {
    }

    /**
     * @param article the article digital object
     * @param mods the mods element of the article
     * @param issn ISSN of the article
     */
    public Article(DigitalObjectElement article, Element mods, String issn) {
        this.article = article;
        this.mods = mods;
        this.issn = issn;
    }

    /** @return the article digital object */
    public DigitalObjectElement getDigitalObject() {
        return this.article;
    }

    /** @return the mods element of the article */
    public Element getModsElement() {
        return this.mods;
    }

    /** @param mods the mods element of the article */
    public void setModsElement(Element mods) {
        this.mods = mods;
    }

    /** @return ISSN of the article */
    public String getIssn() {
        return this.issn;
    }

    /** @param issn ISSN of the article */
    public void setIssn(String issn) {
        this.issn = issn;
    }

    /** @return {@code true} if the article is flagged as reviewed */
    public boolean isReviewed() {
        return this.reviewed;
    }

    /**
     * @param reviewed the review flag
     * @return this article
     */
    public Article setReviewed(boolean reviewed) {
        this.reviewed = reviewed;
        return this;
    }

    /**
     * Collects digital objects of articles in the order of the list.
     *
     * @param articles the articles
     * @return the list of digital objects
     */
    public static List<DigitalObjectElement> toDigitalObjects(List<Article> articles) {
        List<DigitalObjectElement> result = new ArrayList<DigitalObjectElement>(articles.size());
        for (Article item : articles) {
            DigitalObjectElement dobj = item.getDigitalObject();
            result.add(dobj);
        }
        return result;
    }
}
/**
 * Filters out the default line comment with the date and the line
 * separator that follows it. The implementation heavily depends on
 * {@link Properties#store} and {@link BufferedWriter} implementations.
 */
private static class NoCommentsWriter extends BufferedWriter {

    private static final String LINE_SEPARATOR = System.getProperty("line.separator");

    /** The comment mark followed by the first 10 characters of the date taken at construction. */
    private final String expectedDateFragment;
    /** Set after the date comment was dropped; the next line separator is dropped as well. */
    private boolean skipNextLineSeparator;

    public NoCommentsWriter(Writer out) {
        super(out);
        String now = new Date().toString();
        expectedDateFragment = '#' + now.substring(0, 10);
    }

    @Override
    public void write(String str) throws IOException {
        if (skipNextLineSeparator && LINE_SEPARATOR.equals(str)) {
            skipNextLineSeparator = false;
        } else if (str.startsWith(expectedDateFragment)) {
            // the fragment is never empty, so an empty string falls through to the write
            skipNextLineSeparator = true;
        } else {
            super.write(str);
        }
    }
}
}