/* * Copyright (C) 2015 Jan Pokorsky * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see <http://www.gnu.org/licenses/>. */ package cz.cas.lib.proarc.common.export.cejsh; import cz.cas.lib.proarc.common.export.ExportUtils; import cz.cas.lib.proarc.common.export.mets.ValidationErrorHandler; import cz.cas.lib.proarc.common.fedora.DigitalObjectException; import cz.cas.lib.proarc.common.fedora.FoxmlUtils; import cz.cas.lib.proarc.common.mods.custom.ModsConstants; import cz.cas.lib.proarc.common.object.DescriptionMetadata; import cz.cas.lib.proarc.common.object.DigitalObjectElement; import cz.cas.lib.proarc.common.object.MetadataHandler; import cz.cas.lib.proarc.common.xml.ProarcXmlUtils; import cz.cas.lib.proarc.common.xml.SimpleNamespaceContext; import cz.cas.lib.proarc.common.xml.TransformErrorListener; import java.io.BufferedWriter; import java.io.File; import java.io.FileNotFoundException; import java.io.FileOutputStream; import java.io.IOException; import java.io.OutputStreamWriter; import java.io.StringReader; import java.io.Writer; import java.util.ArrayList; import java.util.Date; import java.util.GregorianCalendar; import java.util.List; import java.util.Properties; import java.util.TimeZone; import java.util.logging.Level; import java.util.logging.Logger; import java.util.regex.Pattern; import javax.xml.XMLConstants; import javax.xml.bind.DatatypeConverter; import javax.xml.parsers.DocumentBuilder; import javax.xml.parsers.DocumentBuilderFactory; import javax.xml.parsers.ParserConfigurationException; import javax.xml.transform.OutputKeys; import javax.xml.transform.Result; import javax.xml.transform.Source; import javax.xml.transform.Transformer; import javax.xml.transform.TransformerConfigurationException; import javax.xml.transform.TransformerException; import javax.xml.transform.TransformerFactory; import javax.xml.transform.dom.DOMSource; import javax.xml.transform.stream.StreamResult; import javax.xml.transform.stream.StreamSource; import javax.xml.validation.Schema; import javax.xml.validation.SchemaFactory; import javax.xml.validation.Validator; import javax.xml.xpath.XPath; import javax.xml.xpath.XPathConstants; import javax.xml.xpath.XPathExpression; import javax.xml.xpath.XPathExpressionException; import javax.xml.xpath.XPathFactory; import net.lingala.zip4j.core.ZipFile; import net.lingala.zip4j.exception.ZipException; import net.lingala.zip4j.model.ZipParameters; import net.lingala.zip4j.util.Zip4jConstants; import org.apache.commons.io.Charsets; import org.w3c.dom.DOMException; import org.w3c.dom.Document; import org.w3c.dom.Element; import org.w3c.dom.Node; import org.xml.sax.InputSource; import org.xml.sax.SAXException; /** * Builds the cejsh package of articles. * * @author Jan Pokorsky * @see <a href='https://trac.icm.edu.pl/yadda/Public:BWmeta_format'>BWmeta format</a> */ class CejshBuilder { static final String IMPORT_PROPERTIES_FILENAME = "import.properties"; static final String IMPORTS_NEW_FILENAME = "imports_new"; /** * The Cejsh namespace. */ public static final String NS_BWMETA105 = "http://yadda.icm.edu.pl/bwmeta-1.0.5.xsd"; static final String XSD_FILENAME = "bwmeta-1.0.5.xsd"; static final String P0XML_FILENAME = "p0.xml"; static final String PROP_IMPORT_BWMETA_FILES = "import.import_bwmeta_files"; static final String PROP_IMPORT_CONTENT_FILES = "import.content_files"; static final String PROP_IMPORT_OBJECTS = "import.import_objects"; static final String PROP_IMPORT_INFODATE = "import.info.date"; static final TimeZone UTC = TimeZone.getTimeZone("UTC"); private static Schema SCHEMA_BWMETA; private static Pattern SAFE_FILENAME_RE; private static final Logger LOG = Logger.getLogger(CejshBuilder.class.getName()); private final Transformer bwmetaXsl; private final TransformErrorListener tranformationErrorHandler; private Validator bwValidator; private final DocumentBuilder db; private final XPathExpression issnPath; private final XPathExpression partNumberPath; private final XPathExpression dateIssuedPath; private final XPathExpression reviewedArticlePath; private final GregorianCalendar gcalendar; private Title title; private Volume volume; private Issue issue; private Level logLevel; public CejshBuilder(CejshConfig config) throws TransformerConfigurationException, ParserConfigurationException, XPathExpressionException { this.gcalendar = new GregorianCalendar(UTC); this.logLevel = config.getLogLevel(); TransformerFactory xslFactory = TransformerFactory.newInstance(); tranformationErrorHandler = new TransformErrorListener(); bwmetaXsl = xslFactory.newTransformer(new StreamSource(config.getCejshXslUrl())); if (bwmetaXsl == null) { throw new TransformerConfigurationException("Cannot load XSL: " + config.getCejshXslUrl()); } bwmetaXsl.setOutputProperty(OutputKeys.INDENT, "yes"); bwmetaXsl.setErrorListener(tranformationErrorHandler); DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance(); dbf.setNamespaceAware(true); db = dbf.newDocumentBuilder(); XPathFactory xPathFactory = ProarcXmlUtils.defaultXPathFactory(); XPath xpath = xPathFactory.newXPath(); xpath.setNamespaceContext(new SimpleNamespaceContext().add("m", ModsConstants.NS)); issnPath = xpath.compile("m:mods/m:identifier[@type='issn' and not(@invalid)]"); partNumberPath = xpath.compile("m:mods/m:titleInfo/m:partNumber"); dateIssuedPath = xpath.compile("m:mods/m:originInfo/m:dateIssued"); reviewedArticlePath = xpath.compile("m:mods/m:genre[text()='article' and @type='peer-reviewed']"); } public XPathExpression getDateIssuedPath() { return dateIssuedPath; } public XPathExpression getIssnPath() { return issnPath; } public XPathExpression getPartNumberPath() { return partNumberPath; } public XPathExpression getReviewedArticlePath() { return reviewedArticlePath; } public Article addArticle(DigitalObjectElement article, CejshContext p) { Document articleDom = getModsDom(article, p); return addArticle(articleDom, article, p); } Article addArticle(Document articleDom, DigitalObjectElement article, CejshContext p) { if (articleDom == null) { p.getStatus().error(article, "Missing article MODS!", null, null); return null; } try { Object isReviewed = getReviewedArticlePath().evaluate(articleDom, XPathConstants.BOOLEAN); if (!(isReviewed instanceof Boolean) || !((Boolean) isReviewed)) { LOG.log(logLevel, "Skipped not reviewed article: {0}", article.getPid()); return new Article().setReviewed(false); } } catch (XPathExpressionException ex) { p.getStatus().error(article, "Unexpected error!", null, ex); } if (getTitle() == null) { p.getStatus().error(article, "Missing title!", null, null); return null; } if (getVolume() == null) { p.getStatus().error(article, "Missing volume!", null, null); return null; } try { String articleIssn = getIssnPath().evaluate(articleDom); // XXX check mods vs modsCollection? Element modsElement = articleDom.getDocumentElement(); return new Article(article, modsElement, articleIssn) .setReviewed(true); } catch (Exception ex) { p.getStatus().error(article, "Unexpected error!", null, ex); } return null; } String getPackageIssn() { String issn = getIssue() == null ? null : getIssue().getIssn(); if (issn == null || issn.isEmpty()) { issn = getTitle() == null ? null : getTitle().getIssn(); if (issn == null || issn.isEmpty()) { issn = "NA"; } } return issn; } /** * Transforms a collection of articles to the bwmeta document. * @param src modsCollection in MODS format * @param dst bwmeta document * @return the error handler */ TransformErrorListener createCejshXml(Source src, Result dst) { bwmetaXsl.reset(); String packageIssn = getPackageIssn(); bwmetaXsl.setParameter("issn", packageIssn); if (getVolume() != null) { bwmetaXsl.setParameter("volume", getVolume().getVolumeNumber()); bwmetaXsl.setParameter("volumeId", getVolume().getVolumeId()); bwmetaXsl.setParameter("year", getVolume().getYear()); } if (getIssue() != null) { bwmetaXsl.setParameter("issue", getIssue().getIssueNumber()); bwmetaXsl.setParameter("issueId", getIssue().getIssueId()); } try { tranformationErrorHandler.reset(); bwmetaXsl.transform(src, dst); } catch (TransformerException ex) { if (tranformationErrorHandler.getErrors().isEmpty()) { tranformationErrorHandler.getErrors().add(ex.getMessageAndLocation()); } } return tranformationErrorHandler; } public File writePackage(DigitalObjectElement packageElm, List<Article> articles, CejshContext p) { if (articles == null || articles.isEmpty()) { return null; } File packageFolder = null; try { Document doc = mergeElements(articles); String pkgName = createPackageName(); packageFolder = ExportUtils.createFolder(p.getOutput(), pkgName); File importFolder = new File(packageFolder, IMPORTS_NEW_FILENAME); importFolder.mkdirs(); writeProperties(packageFolder, articles.size()); File p0xml = new File(importFolder, P0XML_FILENAME); DOMSource domSource = new DOMSource(doc); TransformErrorListener cejshXslErrors = createCejshXml(domSource, new StreamResult(p0xml)); if (!cejshXslErrors.getErrors().isEmpty()) { p.getStatus().error(packageElm, "Validation error!", cejshXslErrors.getErrors().toString(), null); return packageFolder; } // validate XML after writing to disk to permit admin to check the output List<String> validationErrors = validateCejshXml(new StreamSource(p0xml)); if (validationErrors.isEmpty()) { writeZip(packageFolder, p.getStatus(), packageElm); } else { p.getStatus().error(packageElm, "Validation error!", ExportUtils.toString(validationErrors), null); } } catch (Exception ex) { p.getStatus().error(packageElm, "Unexpected error!", null, ex); } return packageFolder; } void writeZip(File pkgFile, File packageFolder) throws ZipException { ZipFile zipFile = new ZipFile(pkgFile); ZipParameters zipParameters = new ZipParameters(); zipParameters.setEncryptionMethod(Zip4jConstants.ENC_METHOD_STANDARD); zipParameters.setCompressionMethod(Zip4jConstants.COMP_DEFLATE); zipParameters.setCompressionLevel(Zip4jConstants.DEFLATE_LEVEL_NORMAL); zipParameters.setIncludeRootFolder(false); zipParameters.setDefaultFolderPath(packageFolder.getAbsolutePath()); zipFile.addFiles(listZipFiles(packageFolder), zipParameters); } /** * Lists all files that should be part of a zip. Folder entries are not part of the list. * See issue #413. * @param folder a folder to scan * @return the list of files */ private static ArrayList<File> listZipFiles(File folder) { ArrayList<File> files = new ArrayList<File>(); ArrayList<File> subfiles = new ArrayList<File>(); for (File file : folder.listFiles()) { if (file.isFile()) { files.add(file); } else if (file.isDirectory()) { subfiles.addAll(listZipFiles(file)); } } files.addAll(subfiles); return files; } File writeZip(File packageFolder, CejshStatusHandler status, DigitalObjectElement packageElm) { String pkgName = packageFolder.getName(); File pkgFile = new File(packageFolder.getParentFile(), pkgName + ".zip"); try { writeZip(pkgFile, packageFolder); return pkgFile; } catch (ZipException ex) { status.error(packageElm, "Zipping error!", pkgFile.getPath(), ex); return null; } } static Schema getBwSchema() throws SAXException { if (SCHEMA_BWMETA == null) { SCHEMA_BWMETA = SchemaFactory.newInstance(XMLConstants.W3C_XML_SCHEMA_NS_URI) .newSchema(CejshBuilder.class.getResource(XSD_FILENAME)); } return SCHEMA_BWMETA; } List<String> validateCejshXml(Source bwmeta) throws SAXException, IOException { if (bwValidator == null) { bwValidator = getBwSchema().newValidator(); bwValidator.setErrorHandler(new ValidationErrorHandler()); } List<String> errors = ((ValidationErrorHandler) bwValidator.getErrorHandler()).getValidationErrors(); try { errors.clear(); bwValidator.validate(bwmeta); } catch (SAXException ex) { errors.add(0, ex.getMessage()); } return errors; } File writeProperties(File packageFolder, int articleCount) throws IOException, FileNotFoundException { File propertiesFile = new File(packageFolder, IMPORT_PROPERTIES_FILENAME); Properties properties = new Properties(); gcalendar.setTimeInMillis(System.currentTimeMillis()); String importDate = DatatypeConverter.printDateTime(gcalendar); properties.setProperty(PROP_IMPORT_INFODATE, importDate); properties.setProperty(PROP_IMPORT_OBJECTS, String.valueOf(articleCount)); properties.setProperty(PROP_IMPORT_CONTENT_FILES, "0"); properties.setProperty(PROP_IMPORT_BWMETA_FILES, "1"); Writer propsWriter = new NoCommentsWriter(new OutputStreamWriter(new FileOutputStream(propertiesFile), Charsets.UTF_8)); try { properties.store(propsWriter, null); return propertiesFile; } finally { propsWriter.close(); } } String createPackageName() { //ISSN_publicationYear_volumePartNumber_issueParNumber String issn = getPackageIssn(); String issueNumber = safeFilename(getIssue() == null ? null : getIssue().getIssueNumber()); String volumeYear = safeFilename(getVolume().getYear()); String volumeNumber = safeFilename(getVolume().getVolumeNumber()); String pkgName = String.format("%s_%s_%s_%s", issn, volumeYear, volumeNumber, issueNumber); return pkgName; } static String safeFilename(String name) { if (name != null) { if (SAFE_FILENAME_RE == null) { // replace even '_' as it separates values in the result filename SAFE_FILENAME_RE = Pattern.compile("[^0-9a-zA-Z-.,;]"); } SAFE_FILENAME_RE.matcher(name).replaceAll(""); } return name == null || name.isEmpty() ? "NA" : name; } /** * Builds modsCollection from mods of articles. */ Document mergeElements(List<Article> articles) throws DOMException { Document doc = db.newDocument(); Element root = doc.createElementNS(ModsConstants.NS, "modsCollection"); for (Article article : articles) { Element modsElm = article.getModsElement(); Node n = doc.adoptNode(modsElm); root.appendChild(n); } doc.appendChild(root); return doc; } boolean addTitle(DigitalObjectElement current, DigitalObjectElement title, CejshContext p) { if (title == null) { p.getStatus().error(current, "Missing title object!", null); return false; } if (getTitle() != null) { p.getStatus().error(current, "Title inside title? " + title.toLog() + ". Child title: " + getTitle().getIssn(), null); return false; } Document modsDom = getModsDom(title, p); if (modsDom != null) { try { Title result = new Title(); result.setIssn(getIssnPath().evaluate(modsDom)); setTitle(result); return true; } catch (XPathExpressionException ex) { p.getStatus().error(current, "Invalid XPath!", ex); return false; } } else { p.getStatus().error(current, "Missing title MODS!" + title.toLog(), null); return false; } } boolean addVolume(DigitalObjectElement current, DigitalObjectElement volume, CejshContext p) { if (volume == null) { p.getStatus().error(current, "Missing volume object!", null); return false; } if (getVolume() != null) { p.getStatus().error(current, "Volume inside volume? " + volume.toLog() + ". Child volume: " + getVolume().getVolumeId(), null); return false; } Document modsDom = getModsDom(volume, p); if (modsDom != null) { try { Volume result = new Volume(); result.setVolumeId(FoxmlUtils.pidAsUuid(volume.getPid())); result.setVolumeNumber(getPartNumberPath().evaluate(modsDom)); if (result.getVolumeNumber() == null || result.getVolumeNumber().isEmpty()) { result.setVolumeNumber("NA"); } result.setYear(getDateIssuedPath().evaluate(modsDom)); if (result.getYear() == null || result.getYear().isEmpty()) { result.setYear("NA"); } setVolume(result); return true; } catch (XPathExpressionException ex) { p.getStatus().error(current, "Invalid XPath!", ex); return false; } } else { p.getStatus().error(current, "Missing volume MODS!" + volume.toLog(), null); return false; } } boolean addIssue(DigitalObjectElement current, DigitalObjectElement issue, CejshContext p) { if (issue == null) { p.getStatus().error(current, "Missing issue object!", null); return false; } if (getIssue() != null) { p.getStatus().error(current, "Issue inside issue? " + issue.toLog() + ". Child issue: " + getIssue().getIssueId(), null); return false; } Document modsDom = getModsDom(issue, p); if (modsDom != null) { try { Issue result = new Issue(); result.setIssueId(FoxmlUtils.pidAsUuid(issue.getPid())); result.setIssueNumber(getPartNumberPath().evaluate(modsDom)); if (result.getIssueNumber() == null || result.getIssueNumber().isEmpty()) { result.setIssueNumber("NA"); } result.setIssn(getIssnPath().evaluate(modsDom)); setIssue(result); return true; } catch (XPathExpressionException ex) { p.getStatus().error(current, "Invalid XPath!", ex); return false; } } else { p.getStatus().error(current, "Missing issue MODS!" + issue.toLog(), null); return false; } } public Title getTitle() { return title; } public void setTitle(Title title) { this.title = title; } public Volume getVolume() { return volume; } public void setVolume(Volume volume) { LOG.log(logLevel, String.valueOf(volume)); this.volume = volume; } public Issue getIssue() { return issue; } public void setIssue(Issue issue) { LOG.log(logLevel, String.valueOf(issue)); this.issue = issue; } private Document getModsDom(DigitalObjectElement elm, CejshContext p) { try { MetadataHandler<?> metadataHandler = elm.getHandler().metadata(); DescriptionMetadata<String> dm = metadataHandler.getMetadataAsXml(); String mods = dm.getData(); Document modsDom = db.parse(new InputSource(new StringReader(mods))); return modsDom; } catch (DigitalObjectException ex) { p.getStatus().error(elm, "Missing MODS!", ex); return null; } catch (SAXException ex) { p.getStatus().error(elm, "Invalid MODS!", ex); return null; } catch (IOException ex) { p.getStatus().error(elm, null, ex); return null; } } DocumentBuilder getDocumentBuilder() { return db; } List<String> getTranformationErrors() { return tranformationErrorHandler.getErrors(); } static class Title { private String issn; // private String year; public String getIssn() { return issn; } public void setIssn(String issn) { this.issn = issn; } @Override public String toString() { return "Title{" + "issn=" + issn + '}'; } } static class Volume { private String year; private String volumeNumber; private String volumeId; public String getYear() { return year; } public void setYear(String year) { this.year = year; } public String getVolumeNumber() { return volumeNumber; } public void setVolumeNumber(String volumeNumber) { this.volumeNumber = volumeNumber; } public String getVolumeId() { return volumeId; } public void setVolumeId(String volumeId) { this.volumeId = volumeId; } @Override public String toString() { return "Volume{" + "year=" + year + ", volumeNumber=" + volumeNumber + ", volumeId=" + volumeId + '}'; } } static class Issue { private String issueNumber; private String issueId; private String issn; // private String year; public String getIssueNumber() { return issueNumber; } public void setIssueNumber(String issueNumber) { this.issueNumber = issueNumber; } public String getIssueId() { return issueId; } public void setIssueId(String issueId) { this.issueId = issueId; } public String getIssn() { return issn; } public void setIssn(String issn) { this.issn = issn; } @Override public String toString() { return "Issue{" + "issueNumber=" + issueNumber + ", issueId=" + issueId + ", issn=" + issn + '}'; } } static class Article { private Element mods; private DigitalObjectElement article; private String issn; private boolean reviewed; public Article() { } public Article(DigitalObjectElement article, Element mods, String issn) { this.mods = mods; this.article = article; this.issn = issn; } public DigitalObjectElement getDigitalObject() { return article; } public Element getModsElement() { return mods; } public void setModsElement(Element mods) { this.mods = mods; } public String getIssn() { return issn; } public void setIssn(String issn) { this.issn = issn; } public boolean isReviewed() { return reviewed; } public Article setReviewed(boolean reviewed) { this.reviewed = reviewed; return this; } public static List<DigitalObjectElement> toDigitalObjects(List<Article> articles) { ArrayList<DigitalObjectElement> dobjs = new ArrayList<DigitalObjectElement>(articles.size()); for (Article article : articles) { dobjs.add(article.getDigitalObject()); } return dobjs; } } /** * Filters out default line comment with localized date. The implementation * heavily depends on {@link Properties#store} and {@link BufferedWriter} * implementations. */ private static class NoCommentsWriter extends BufferedWriter { private static final String LINE_SEPARATOR = System.getProperty("line.separator"); private boolean ignoreNextNewLine; private final String expextedDateFragment; public NoCommentsWriter(Writer out) { super(out); expextedDateFragment = '#' + new Date().toString().substring(0, 10); } @Override public void write(String str) throws IOException { if (ignoreNextNewLine && LINE_SEPARATOR.equals(str)) { ignoreNextNewLine = false; return; } if (str.length() > 0 && str.startsWith(expextedDateFragment)) { ignoreNextNewLine = true; return; } super.write(str); } } }