/* Copyright 2012 Tim Garrett, Mothsoft LLC
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.mothsoft.alexis.engine.retrieval;
import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import java.util.Date;
import javax.jms.Connection;
import javax.jms.ConnectionFactory;
import javax.jms.Destination;
import javax.jms.JMSException;
import javax.jms.MessageProducer;
import javax.jms.Session;
import javax.jms.TextMessage;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang.StringUtils;
import org.apache.log4j.Logger;
import org.springframework.transaction.PlatformTransactionManager;
import org.springframework.transaction.TransactionStatus;
import org.springframework.transaction.support.TransactionCallback;
import org.springframework.transaction.support.TransactionCallbackWithoutResult;
import org.springframework.transaction.support.TransactionTemplate;
import com.mothsoft.alexis.dao.DocumentDao;
import com.mothsoft.alexis.domain.Document;
import com.mothsoft.alexis.domain.DocumentContent;
import com.mothsoft.alexis.domain.DocumentState;
import com.mothsoft.alexis.engine.textual.WebContentParser;
import com.mothsoft.alexis.security.CurrentUserUtil;
import com.mothsoft.alexis.util.HttpClientResponse;
import com.mothsoft.alexis.util.NetworkingUtil;
/**
 * {@link RetrievalTask} implementation that processes documents in the
 * {@link DocumentState#DISCOVERED} state one at a time.
 * <p>
 * For each document obtained from
 * {@link DocumentDao#findAndLockOneDocument(DocumentState)}, the task fetches
 * the document's URL (unless the document already carries text), runs the
 * response through the configured {@link WebContentParser}, persists the
 * resulting content and retrieval metadata, and sends a JMS text message to
 * the request queue asking for the content to be parsed.
 * <p>
 * Collaborators are injected through the setters; the
 * {@link TransactionTemplate} used for all database work is created when
 * {@link #setTransactionManager(PlatformTransactionManager)} is called.
 */
public class DocumentRetrievalTaskImpl implements RetrievalTask {

    private static final Logger logger = Logger.getLogger(DocumentRetrievalTaskImpl.class);

    /** Name of the JMS message property that carries the document ID. */
    private static final String DOCUMENT_ID = "DOCUMENT_ID";

    /** Maximum number of documents handled by a single call to {@link #retrieve()}. */
    private static final int MAX_DOCUMENTS_PER_RUN = 100;

    private ConnectionFactory connectionFactory;
    private Destination requestQueue;
    private Destination responseQueue;
    private DocumentDao documentDao;
    private PlatformTransactionManager transactionManager;
    private TransactionTemplate transactionTemplate;
    private WebContentParser webContentParser;

    /** Used to pause between attempts after an error (see {@link #doRetrieve()}). */
    private final IntelligentDelay delay;

    public DocumentRetrievalTaskImpl() {
        super();
        delay = new IntelligentDelay("Document Retrieval", 2, 30);
    }

    public void setConnectionFactory(final ConnectionFactory connectionFactory) {
        this.connectionFactory = connectionFactory;
    }

    public void setRequestQueue(final Destination requestQueue) {
        this.requestQueue = requestQueue;
    }

    public void setResponseQueue(final Destination responseQueue) {
        this.responseQueue = responseQueue;
    }

    public void setDocumentDao(final DocumentDao documentDao) {
        this.documentDao = documentDao;
    }

    public void setWebContentParser(final WebContentParser webContentParser) {
        this.webContentParser = webContentParser;
    }

    /**
     * Stores the transaction manager and builds the {@link TransactionTemplate}
     * that every database operation in this class runs through.
     *
     * @param transactionManager the transaction manager to use
     */
    public void setTransactionManager(final PlatformTransactionManager transactionManager) {
        this.transactionManager = transactionManager;
        this.transactionTemplate = new TransactionTemplate(this.transactionManager);
    }

    /**
     * Processes pending documents under the system user's authentication.
     * <p>
     * Handles at most {@value #MAX_DOCUMENTS_PER_RUN} documents per call and
     * returns as soon as no further document is found. The authentication is
     * always cleared before returning.
     */
    public void retrieve() {
        try {
            CurrentUserUtil.setSystemUserAuthentication();

            // retrieve up to MAX_DOCUMENTS_PER_RUN per scheduler call,
            // returning as soon as none are found
            boolean fetching = true;
            int remaining = MAX_DOCUMENTS_PER_RUN;
            while (fetching && remaining > 0) {
                fetching = doRetrieve();
                remaining--;
            }

            if (!fetching) {
                logger.info("Document Retrieval found nothing to do, will return");
            }
        } finally {
            CurrentUserUtil.clearAuthentication();
        }
    }

    /**
     * Looks for one document pending retrieval and handles it.
     * <p>
     * Any exception is logged and followed by a throttling delay. If a document
     * had been found when the exception occurred, it is moved to
     * {@link DocumentState#ERROR_RETRIEVAL_FAILED}.
     *
     * @return {@code true} if a document was found (whether or not handling it
     *         succeeded), {@code false} otherwise
     */
    private boolean doRetrieve() {
        boolean foundSomething = false;
        Document document = null;

        try {
            logger.info("Looking for documents pending retrieval");
            document = findDocumentToRetrieve();

            if (document != null) {
                foundSomething = true;
                handleDocument(document);
            }
        } catch (final Exception e) {
            // log the cause before touching the database again so that it is
            // not lost should recording the error state fail as well
            logger.warn("Throttling document retrieval on error condition: " + e, e);

            if (document != null) {
                foundSomething = true;
                logger.warn("Document: " + document.getId() + " failed retrieval, will be set to error state");
                onErrorState(document.getId(), DocumentState.ERROR_RETRIEVAL_FAILED);
            } else {
                // not finding documents, but erroring. might indicate subsystem
                // problem (database, disk, etc.)
                logger.warn("Document not found or handled improperly, may retry again -- this may be FATAL!");
            }

            // throttle on error conditions
            delay.sleep();
        }

        return foundSomething;
    }

    /**
     * Obtains the content for a document and queues it for parsing.
     * <p>
     * When the document already has text, that text is queued as-is. Otherwise
     * the document's URL is fetched and the response body is run through the
     * {@link WebContentParser}. If an {@link IOException} occurs while fetching
     * or parsing, the document is moved to
     * {@link DocumentState#ERROR_PARSE_FAILED} and nothing is queued.
     *
     * @param document the document to process
     * @throws IOException if the document's URL is malformed
     */
    private void handleDocument(final Document document) throws IOException {
        logger.info("Retrieving document: " + document.getId() + ", URL: " + document.getUrl());

        // allow for content already ingested but not parsed
        String entryContent = document.getText();
        String etag = null;
        Date lastModifiedDate = null;
        final Date retrievalDate = new Date();

        if (StringUtils.isEmpty(entryContent)) {
            final URL url = new URL(document.getUrl());
            HttpClientResponse response = null;
            InputStream is = null;

            try {
                response = NetworkingUtil.get(url, null, null);
                is = response.getInputStream();
                entryContent = this.webContentParser.parse(is);
                etag = response.getEtag();
                lastModifiedDate = response.getLastModifiedDate();
                logger.debug("Document " + document.getId() + " has: " + entryContent.length() + " characters");
            } catch (IOException e) {
                // response is still null when the request itself failed
                if (response != null) {
                    response.abort();
                }
                logger.error("IOException while retrieving URL: " + url + " " + e, e);
                onErrorState(document.getId(), DocumentState.ERROR_PARSE_FAILED);
                // the document is now in an error state; do not queue it
                return;
            } finally {
                IOUtils.closeQuietly(is);
                IOUtils.closeQuietly(response);
            }
        }

        updateStateAndQueueForParsing(document.getId(), entryContent, etag, lastModifiedDate, retrievalDate);
    }

    /**
     * Asks the DAO, inside a transaction, for one document in the
     * {@link DocumentState#DISCOVERED} state.
     *
     * @return the value returned by
     *         {@link DocumentDao#findAndLockOneDocument(DocumentState)};
     *         callers treat {@code null} as "nothing to do"
     */
    private Document findDocumentToRetrieve() {
        return this.transactionTemplate.execute(new TransactionCallback<Document>() {
            public Document doInTransaction(TransactionStatus txStatus) {
                return DocumentRetrievalTaskImpl.this.documentDao.findAndLockOneDocument(DocumentState.DISCOVERED);
            }
        });
    }

    /**
     * Reloads the document inside a new transaction template execution and
     * records the given error state on it.
     *
     * @param documentId ID of the document to update
     * @param errorState the error state to record
     */
    private void onErrorState(final Long documentId, final DocumentState errorState) {
        logger.warn("Setting error state: " + errorState.toString() + " on document " + documentId);

        this.transactionTemplate.execute(new TransactionCallbackWithoutResult() {
            @Override
            protected void doInTransactionWithoutResult(TransactionStatus txStatus) {
                final Document attachedDocument = DocumentRetrievalTaskImpl.this.documentDao.get(documentId);
                attachedDocument.onErrorState(errorState);
                DocumentRetrievalTaskImpl.this.documentDao.update(attachedDocument);
            }
        });
    }

    /**
     * Persists the retrieval metadata and content for a document and sends the
     * parse request, all within one transaction template execution.
     * <p>
     * A {@link DocumentContent} is created only when the document does not
     * already have one.
     *
     * @param documentId       ID of the document to update
     * @param content          the text to store and send for parsing
     * @param etag             ETag reported by the HTTP response, may be {@code null}
     * @param lastModifiedDate last-modified date reported by the HTTP response,
     *                         may be {@code null}
     * @param retrievalDate    the time retrieval of this document started
     */
    private void updateStateAndQueueForParsing(final Long documentId, final String content, final String etag,
            final Date lastModifiedDate, final Date retrievalDate) {

        this.transactionTemplate.execute(new TransactionCallbackWithoutResult() {
            @Override
            protected void doInTransactionWithoutResult(TransactionStatus txStatus) {
                final Document attachedDocument = DocumentRetrievalTaskImpl.this.documentDao.get(documentId);
                attachedDocument.setEtag(etag);
                attachedDocument.setLastModifiedDate(lastModifiedDate);
                attachedDocument.setRetrievalDate(retrievalDate);

                if (attachedDocument.getDocumentContent() == null) {
                    final DocumentContent documentContent = new DocumentContent(attachedDocument, content);
                    attachedDocument.setDocumentContent(documentContent);
                    DocumentRetrievalTaskImpl.this.documentDao.add(documentContent);
                }

                DocumentRetrievalTaskImpl.this.documentDao.update(attachedDocument);

                // do in transaction to make sure failure to queue doesn't leave
                // dead doc
                requestParse(documentId, content);
            }
        });
    }

    /**
     * Sends a JMS text message containing the content to the request queue.
     * <p>
     * The message's reply-to is the response queue and the document ID is set
     * as the {@code DOCUMENT_ID} long property. The JMS producer, session and
     * connection are each closed afterwards; a failure to close one of them is
     * logged and does not prevent the others from being closed, nor does it
     * replace an exception raised while sending.
     *
     * @param documentId ID of the document the content belongs to
     * @param content    the text to send
     * @return the {@code content} argument, unchanged
     * @throws RuntimeException wrapping the {@link JMSException} if the message
     *                          could not be created or sent
     */
    private String requestParse(final Long documentId, final String content) {
        Connection connection = null;
        Session session = null;
        MessageProducer producer = null;

        // set up JMS connection, session, producer
        try {
            connection = this.connectionFactory.createConnection();
            session = connection.createSession(false, Session.AUTO_ACKNOWLEDGE);
            producer = session.createProducer(this.requestQueue);

            logger.info("Sending parse request, document ID: " + documentId);
            final TextMessage textMessage = session.createTextMessage(content);
            textMessage.setJMSReplyTo(this.responseQueue);
            textMessage.setLongProperty(DOCUMENT_ID, documentId);
            producer.send(textMessage);
        } catch (JMSException e) {
            throw new RuntimeException(e);
        } finally {
            closeQuietly(producer);
            closeQuietly(session);
            closeQuietly(connection);
        }

        return content;
    }

    /** Closes the producer if non-null, logging instead of propagating a failure. */
    private static void closeQuietly(final MessageProducer producer) {
        if (producer != null) {
            try {
                producer.close();
            } catch (JMSException e) {
                logger.warn("Failed to close JMS producer: " + e, e);
            }
        }
    }

    /** Closes the session if non-null, logging instead of propagating a failure. */
    private static void closeQuietly(final Session session) {
        if (session != null) {
            try {
                session.close();
            } catch (JMSException e) {
                logger.warn("Failed to close JMS session: " + e, e);
            }
        }
    }

    /** Closes the connection if non-null, logging instead of propagating a failure. */
    private static void closeQuietly(final Connection connection) {
        if (connection != null) {
            try {
                connection.close();
            } catch (JMSException e) {
                logger.warn("Failed to close JMS connection: " + e, e);
            }
        }
    }
}