package com.villemos.ispace.assembler;
import java.io.BufferedOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.lang.reflect.Field;
import java.net.URI;
import java.net.URL;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import org.apache.camel.Endpoint;
import org.apache.camel.Exchange;
import org.apache.camel.ProducerTemplate;
import org.apache.camel.builder.RouteBuilder;
import org.apache.camel.impl.DefaultExchange;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.http.HttpResponse;
import org.apache.http.client.methods.HttpGet;
import com.villemos.ispace.aperture.DocumentProcessor;
import com.villemos.ispace.aperture.InformationObject;
import com.villemos.ispace.aperture.enricher.MicrosoftPropertyReader;
import com.villemos.ispace.assembler.helper.Buffer;
import com.villemos.ispace.assembler.helper.LanguageDetector;
import com.villemos.ispace.assembler.helper.ReferenceIdBodyEnricher;
import com.villemos.ispace.httpcrawler.HttpClientConfigurer;
import com.villemos.ispace.httpcrawler.HttpCrawlerConsumer;
import com.villemos.ispace.ktree.KtreeAccessor;
import com.villemos.ispace.ktree.folder.Item;
import com.villemos.ispace.ktree.session.Session;
/**
 * Retrieves the documents listed in an incoming exchange and mirrors them
 * below a local root folder.
 *
 * The body of the exchange handed to {@link #doPoll(Exchange)} is expected to be a
 * {@code Map<String, List<Item>>}. Each key is used as a folder path relative to the
 * root folder configured on the {@link DirectoryAssemblerEndpoint}; each value lists
 * the documents of that folder. The entry with the key "Statistics" is skipped.
 *
 * A document is downloaded when it is missing locally or when its local size differs
 * from the size reported for it. When body parsing is enabled on the endpoint, each
 * file is additionally sent through an internal Camel route and the metadata
 * collected by that route is merged into the document's metadata.
 *
 * NOTE(review): the current exchange is kept in an instance field, so a single
 * instance is presumably not meant to be polled concurrently -- confirm.
 */
public class DocumentRetriever extends KtreeAccessor {

    private static final Log LOG = LogFactory.getLog(DocumentRetriever.class);

    /** Map key that is skipped when iterating the folders (presumably holds statistics, not documents). */
    private static final String STATISTICS_KEY = "Statistics";

    /** URI of the internal route used to parse downloaded files. */
    private static final String PARSER_URI = "direct:fileparser";

    /** Prefix of the document view request; the document id and the session id are appended. */
    private static final String DOCUMENT_VIEW_URL = "https://om.eo.esa.int/oem/kt/action.php?kt_path_info=ktcore.actions.document.view&fDocumentId=";

    /** Metadata key under which the local location of a document is stored. */
    private static final String ACCESSIBLE_THROUGH = "accessibleThrough";

    /** The exchange currently being processed; set by {@link #doPoll(Exchange)}. */
    protected Exchange exchange = null;

    /** The endpoint this retriever was created for. */
    protected DirectoryAssemblerEndpoint endpoint = null;

    /** Producer used to send files to the parser route; created lazily by {@link #initParser()}. */
    protected ProducerTemplate fileparser = null;

    /** Last bean of the parser route; holds the result of the most recent parse. */
    protected Buffer buffer = null;

    /**
     * Registers the file parser route on the endpoint's Camel context and creates the
     * producer template used to feed it.
     *
     * If the route cannot be registered the failure is logged and {@link #fileparser}
     * is left {@code null}, so callers can detect that parsing is unavailable.
     */
    protected void initParser() {
        RouteBuilder builder = new RouteBuilder() {
            public void configure() {
                DocumentProcessor extractor = new DocumentProcessor();
                MicrosoftPropertyReader property = new MicrosoftPropertyReader();
                LanguageDetector languageDetector = new LanguageDetector();
                ReferenceIdBodyEnricher bodyEnricher = new ReferenceIdBodyEnricher();
                buffer = new Buffer();
                from(PARSER_URI).split().method(extractor).bean(property).bean(languageDetector).bean(bodyEnricher).bean(buffer);
            }
        };

        try {
            getEndpoint().getCamelContext().addRoutes(builder);
        } catch (Exception e) {
            /* Without the route there is nothing to send files to; do not create the producer. */
            LOG.error("Failed to register the file parser route. Document bodies will not be parsed.", e);
            return;
        }

        fileparser = builder.getContext().createProducerTemplate();
    }

    /**
     * Creates a retriever bound to the given endpoint.
     *
     * @param endpoint the owning endpoint; must be a {@link DirectoryAssemblerEndpoint}
     * @param consumer the crawler consumer passed on to the super class
     */
    public DocumentRetriever(Endpoint endpoint, HttpCrawlerConsumer consumer) {
        super(endpoint, consumer);
        this.endpoint = (DirectoryAssemblerEndpoint) endpoint;
    }

    /**
     * Remembers the exchange holding the document listing and triggers a poll.
     *
     * @param exchange the exchange whose in-body describes the documents to retrieve
     * @throws Exception whatever the inherited poll raises
     */
    public void doPoll(Exchange exchange) throws Exception {
        this.exchange = exchange;
        this.poll();
    }

    /**
     * Method called by the iSpace httpcrawler when the initial request has
     * been performed and succeeded. The page corresponds to the first
     * page AFTER the login.
     *
     * Reads the session from the response, then walks all folders of the current
     * exchange body and makes sure each listed document is available locally.
     *
     * @param uri the URI that was requested
     * @param response the response to the initial request
     * @throws IOException if the response cannot be read or a local file URL cannot be built
     */
    @Override
    protected void processSite(URI uri, HttpResponse response) throws IOException {
        String page = HttpClientConfigurer.readFully(response.getEntity().getContent());
        session = new Session();
        xstream.fromXML(page, session);

        @SuppressWarnings("unchecked")
        Map<String, List<Item>> documents = (Map<String, List<Item>>) exchange.getIn().getBody();
        if (documents == null) {
            LOG.warn("The exchange body is empty. No documents to retrieve.");
            return;
        }

        /* Total number of documents; only used for the progress messages. */
        long count = 0;
        for (Entry<String, List<Item>> entry : documents.entrySet()) {
            if (!STATISTICS_KEY.equals(entry.getKey())) {
                count += entry.getValue().size();
            }
        }

        String rootFolder = ((DirectoryAssemblerEndpoint) getEndpoint()).getRootFolder() + File.separator;

        long downloaded = 0;
        for (Entry<String, List<Item>> entry : documents.entrySet()) {
            if (STATISTICS_KEY.equals(entry.getKey())) {
                continue;
            }

            String folderName = entry.getKey();
            String folderPath = rootFolder + folderName;

            /* Create the folder if needed. Its documents then take the same path as all others. */
            File folder = new File(folderPath);
            if (!folder.exists() && !folder.mkdirs()) {
                LOG.warn("Could not create folder '" + folderPath + "'.");
            }

            for (Item doc : entry.getValue()) {
                File file = new File(folderPath + File.separator + doc.filename);

                if (hasExpectedSize(file, doc)) {
                    LOG.info(downloaded + "/" + count + ". File '" + folderName + "/" + doc.filename + "' already exist. Has same size.");
                    doc.metadata.put(ACCESSIBLE_THROUGH, localFileUrl(folderName, doc.filename));
                }
                else {
                    /* Missing, or changed according to the file size. */
                    LOG.info(downloaded + "/" + count + ". Retrieving document '" + doc.absoluteFilename + "'.");
                    getDocument(folderPath, folderName, doc);
                }

                if (getAssemblerEndpoint().isParseBody()) {
                    parseDocumentBody(file, doc);
                }

                downloaded++;
            }
        }
    }

    /**
     * Tells whether the local file exists and has the size reported for the document.
     * A reported size that is missing or not a number counts as a mismatch, so the
     * document is retrieved again instead of aborting the whole run.
     */
    private boolean hasExpectedSize(File file, Item doc) {
        if (!file.exists()) {
            return false;
        }
        try {
            return file.length() == Long.parseLong(doc.filesize);
        } catch (NumberFormatException e) {
            LOG.warn("Document '" + doc.absoluteFilename + "' reports an invalid file size '" + doc.filesize + "'. Retrieving it again.");
            return false;
        }
    }

    /**
     * Sends the file through the parser route and merges the metadata left in the
     * buffer into the document. Does nothing if the parser could not be set up.
     */
    private void parseDocumentBody(File file, Item doc) {
        if (fileparser == null) {
            initParser();
        }
        if (fileparser == null || buffer == null) {
            return;
        }

        Exchange parseExchange = new DefaultExchange(getEndpoint().getCamelContext());
        parseExchange.getIn().setBody(file);
        fileparser.send(PARSER_URI, parseExchange);

        if (buffer.io != null) {
            doc.metadata.putAll(buffer.io.metadata);
            buffer.clear();
        }
    }

    /** Builds the relative file URL stored as the local location of a document. */
    private URL localFileUrl(String parentFolder, String filename) throws IOException {
        return new URL("file:." + File.separator + parentFolder + File.separator + filename);
    }

    /**
     * Downloads a document into the given folder and records its local location in the
     * document's metadata. Does nothing when downloading is disabled on the endpoint.
     * Failures are logged and do not stop the processing of further documents.
     *
     * @param downloadFolder the folder the file is written to
     * @param parentFolder the folder path used in the recorded relative file URL
     * @param doc the document to download
     */
    protected void getDocument(String downloadFolder, String parentFolder, Item doc) {
        if (!getAssemblerEndpoint().isDownload()) {
            return;
        }

        try {
            HttpGet get = new HttpGet(DOCUMENT_VIEW_URL + doc.id + "&session_id=" + session.results);
            HttpResponse response = client.execute(get);

            File newFile = new File(downloadFolder + File.separator + doc.filename);
            writeFile(response.getEntity().getContent(), newFile);

            doc.metadata.put(ACCESSIBLE_THROUGH, localFileUrl(parentFolder, doc.filename));
        }
        catch (Exception e) {
            /* The request URL is deliberately not logged; it contains the session id. */
            LOG.error("Failed to retrieve document '" + doc.absoluteFilename + "'.", e);
        }
    }

    /**
     * Copies the stream into the file, replacing any existing content. Both the
     * input stream and the file are closed when the method returns, also on failure.
     *
     * @param input the stream to read; closed by this method
     * @param file the file to write
     * @throws IOException if the file cannot be opened or the copy fails
     */
    protected void writeFile(InputStream input, File file) throws IOException {
        try {
            BufferedOutputStream out = new BufferedOutputStream(new FileOutputStream(file));
            try {
                byte[] chunk = new byte[4 * 1024];
                int bytesRead;
                while ((bytesRead = input.read(chunk)) != -1) {
                    out.write(chunk, 0, bytesRead);
                }
            } finally {
                out.close();
            }
        } finally {
            input.close();
        }
    }

    /**
     * Finds a public field of type {@link URL} on the object's class.
     *
     * @param object the object whose class is inspected
     * @return the first such field reported by {@link Class#getFields()}, or {@code null} if there is none
     */
    protected Field findUriField(Object object) {
        for (Field field : object.getClass().getFields()) {
            if (field.getType() == URL.class) {
                return field;
            }
        }
        return null;
    }

    /**
     * Finds a public field of type {@link String} on the object's class.
     *
     * @param object the object whose class is inspected
     * @return the first such field reported by {@link Class#getFields()}, or {@code null} if there is none
     */
    protected Field findFilenameField(Object object) {
        for (Field field : object.getClass().getFields()) {
            if (field.getType() == String.class) {
                return field;
            }
        }
        return null;
    }

    /**
     * @return the endpoint this retriever was created for
     */
    protected DirectoryAssemblerEndpoint getAssemblerEndpoint() {
        return endpoint;
    }
}