/**
* Copyright 2008 - 2009 Pro-Netics S.P.A.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package it.pronetics.madstore.crawler.publisher.impl;
import it.pronetics.madstore.common.AtomConstants;
import it.pronetics.madstore.common.dom.DomHelper;
import it.pronetics.madstore.crawler.publisher.AtomPublisher;
import it.pronetics.madstore.crawler.model.Page;
import it.pronetics.madstore.repository.CollectionRepository;
import it.pronetics.madstore.repository.EntryRepository;
import java.net.URL;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import org.joda.time.format.ISODateTimeFormat;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.transaction.annotation.Propagation;
import org.springframework.transaction.annotation.Transactional;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
/**
* {@link it.pronetics.madstore.crawler.publisher.AtomPublisher} implementation publishing an Atom feed as an AtomPub collection
* into the {@link it.pronetics.madstore.repository.CollectionRepository}, and all related entries
* into the {@link it.pronetics.madstore.repository.EntryRepository}.
* <br><br>
* Atom feeds are published only if not already existent into the repository, while entries are updated if their
* publishing date is newer than the one of the already stored entry.
* <br><br>
* Atom feeds and entries should have a proper feed and entry key, in order to properly manage updating of entries: if no
* key is provided, surrogated keys will be automatically generated based on page and URL heuristics, more specifically:
* <ul>
* <li>The host name of the crawled site will be used for generating the feed key.</li>
* <li>An hash of the entry title will be used for generating each entry key.</li>
* <li>All entries will be inserted under the same collection.</li>
* </ul>
* <br>
* Atom entries should have a proper updated date, too: if no such a date is found, the current one will be used.
*
* @author Salvatore Incandela
* @author Sergio Bossa
*/
public class AtomPublisherImpl implements AtomPublisher {
private static final Logger LOG = LoggerFactory.getLogger(AtomPublisherImpl.class);
private EntryRepository entryRepository;
private CollectionRepository collectionRepository;
public void setEntryRepository(EntryRepository entryRepository) {
this.entryRepository = entryRepository;
}
public void setCollectionRepository(CollectionRepository collectionRepository) {
this.collectionRepository = collectionRepository;
}
@Transactional(propagation = Propagation.REQUIRED, readOnly = false)
public void publish(Page page) {
try {
if (LOG.isDebugEnabled()) {
LOG.debug("Publishing feed:\n{}.", page.getData());
}
Element feed = DomHelper.getDomFeedFromString(page.getData());
String collectionKey = getOrGenerateCollectionKey(page, feed);
String collectionTitle = getOrGenerateCollectionTitle(page, feed);
String collectionHref = collectionKey;
Element collectionElement = createCollectionElement(collectionKey, collectionHref, collectionTitle);
if (LOG.isDebugEnabled()) {
LOG.debug("Publishing collection:\n{}.", DomHelper.getStringFromDomElement(collectionElement));
}
String newCollectionKey = collectionRepository.putIfAbsent(collectionElement);
if (newCollectionKey != null) {
LOG.info("Inserted collection with key {}.", collectionKey);
} else {
LOG.info("Collection {} already existent.", collectionKey);
}
NodeList entryNodes = feed.getElementsByTagNameNS(AtomConstants.ATOM_NS, AtomConstants.ATOM_ENTRY);
if (entryNodes != null && entryNodes.getLength() > 0) {
for (int i = 0; i < entryNodes.getLength(); i++) {
Element entry = (Element) entryNodes.item(i);
String entryKey = getOrGenerateEntryKey(entry);
setUpdatedDateTimeIfNecessary(entry);
if (LOG.isDebugEnabled()) {
LOG.debug("Publishing entry:\n{}.", DomHelper.getStringFromDomElement(entry));
}
String newEntryKey = entryRepository.putIfAbsent(collectionKey, entry);
if (newEntryKey != null) {
LOG.info("Entry with key {} inserted in collection {}.", entryKey, collectionKey);
} else {
LOG.info("Entry with key {} already existent in collection {}.", entryKey, collectionKey);
String updatedEntryKey = entryRepository.updateIfNewer(collectionKey, entry);
if (updatedEntryKey != null) {
LOG.info("Entry with key {} in collection {} was updated.", entryKey, collectionKey);
} else {
LOG.info("Entry with key {} wasn't updated because is older.", entryKey);
}
}
}
} else {
LOG.info("No entries for {}", page.getLink());
}
} catch (Exception e) {
LOG.info("Publishing abnormally terminated: {}", page.getLink());
LOG.warn(e.getMessage());
LOG.debug(e.getMessage(), e);
}
}
private Element createCollectionElement(String key, String href, String title) throws Exception {
DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
factory.setNamespaceAware(true);
DocumentBuilder builder = factory.newDocumentBuilder();
Document collectionDocument = builder.newDocument();
Element collectionelElement = collectionDocument.createElementNS(AtomConstants.APP_NS, AtomConstants.ATOM_COLLECTION);
collectionelElement.setAttribute(AtomConstants.ATOM_KEY, key);
collectionelElement.setAttribute(AtomConstants.ATOM_COLLECTION_HREF, href);
Element acceptElement = collectionDocument.createElementNS(AtomConstants.APP_NS, AtomConstants.ATOM_COLLECTION_ACCEPT);
Element titleElement = collectionDocument.createElementNS(AtomConstants.ATOM_NS, AtomConstants.ATOM_COLLECTION_TITLE);
titleElement.setTextContent(title);
collectionelElement.appendChild(acceptElement);
collectionelElement.appendChild(titleElement);
collectionDocument.appendChild(collectionelElement);
return collectionelElement;
}
private String getOrGenerateCollectionKey(Page page, Element feed) throws Exception {
String key = feed.getAttribute(AtomConstants.ATOM_KEY);
if (key == null || key.equals("")) {
LOG.warn("No feed key found, generating surrogate key ...");
URL url = new URL(page.getLink().getLink());
String path = url.getHost();
if (path.startsWith("/")) {
path = path.substring(1);
}
if (path.endsWith("/")) {
path = path.substring(0, path.length() - 1);
}
key = path.replaceAll("\\.", "_").replaceAll("/", "-").replaceAll("\\:", "-").replaceAll("\\,", "-");
LOG.warn("Surrogated feed key: {}", key);
}
return key;
}
private String getOrGenerateCollectionTitle(Page page, Element feed) throws Exception {
String key = feed.getAttribute(AtomConstants.ATOM_KEY);
if (key == null || key.equals("")) {
URL url = new URL(page.getLink().getLink());
return url.getHost();
} else {
return feed.getElementsByTagName(AtomConstants.ATOM_COLLECTION_TITLE).item(0).getTextContent();
}
}
private String getOrGenerateEntryKey(Element entry) throws Exception {
String key = entry.getAttribute(AtomConstants.ATOM_KEY);
if (key == null || key.equals("")) {
LOG.warn("No entry key found, generating surrogate key ...");
NodeList titleNodes = entry.getElementsByTagName(AtomConstants.ATOM_ENTRY_TITLE);
Node titleNode = titleNodes.item(0);
if (titleNode != null) {
int keyCode = titleNode.getTextContent().hashCode();
if (keyCode < 0) {
key = "e" + Integer.toString(keyCode * -1) + "n";
} else {
key = "e" + Integer.toString(keyCode) + "p";
}
} else {
key = Long.toString(System.currentTimeMillis());
}
entry.setAttribute(AtomConstants.ATOM_KEY, key);
LOG.warn("Surrogated entry key: {}", key);
}
return key;
}
private void setUpdatedDateTimeIfNecessary(Element entry) {
NodeList updatedNodes = entry.getElementsByTagName(AtomConstants.ATOM_ENTRY_UPDATED);
Node updatedNode = updatedNodes.item(0);
if (updatedNode != null) {
String entryUpdatedDateTime = updatedNode.getTextContent();
if (entryUpdatedDateTime == null || entryUpdatedDateTime.equals("")) {
LOG.warn("The entry has no updated date, using current time ...");
updatedNode.setTextContent(ISODateTimeFormat.dateTime().print(System.currentTimeMillis()));
}
} else {
LOG.warn("The entry has no updated date, using current time ...");
updatedNode = entry.getOwnerDocument().createElementNS(AtomConstants.ATOM_NS, AtomConstants.ATOM_ENTRY_UPDATED);
updatedNode.setTextContent(ISODateTimeFormat.dateTime().print(System.currentTimeMillis()));
entry.appendChild(updatedNode);
}
}
}