package org.myrobotlab.document.transformer;
import java.io.BufferedInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import java.util.List;
import org.apache.commons.io.IOUtils;
import org.myrobotlab.document.Document;
/**
* This stage will fetch a web page defined by the uriField and store its byte
* array in the bytesField.
*
* @author kwatters
*
*/
public class FetchURI extends AbstractStage {
private String uriField = "uri";
private String bytesField = "bytes";
@Override
public void startStage(StageConfiguration config) {
if (config != null) {
uriField = config.getProperty("uriField", "uri");
bytesField = config.getProperty("bytesField", "bytes");
}
}
@Override
public List<Document> processDocument(Document doc) {
// TODO: support https and other protocols
for (Object o : doc.getField(uriField)) {
byte[] page;
try {
page = fetchUrlAsByteArray(o.toString());
doc.addToField(bytesField, page);
} catch (IOException e) {
// TODO Auto-generated catch block
e.printStackTrace();
continue;
}
}
return null;
}
private byte[] fetchUrlAsByteArray(String uri) throws IOException {
URL url = new URL(uri);
InputStream in = null;
in = url.openStream();
DataInputStream dis = new DataInputStream(new BufferedInputStream(in));
ByteArrayOutputStream baos = new ByteArrayOutputStream();
IOUtils.copy(dis, baos);
return baos.toByteArray();
}
@Override
public void stopStage() {
// TODO Auto-generated method stub
}
@Override
public void flush() {
// TODO Auto-generated method stub
}
public String getUriField() {
return uriField;
}
public void setUriField(String uriField) {
this.uriField = uriField;
}
public String getBytesField() {
return bytesField;
}
public void setBytesField(String bytesField) {
this.bytesField = bytesField;
}
}