package org.juxtasoftware.resource;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.lang.reflect.Type;
import java.net.UnknownHostException;
import java.text.DateFormat;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import javax.xml.stream.XMLStreamException;
import org.apache.commons.fileupload.FileItem;
import org.apache.commons.fileupload.FileUploadBase.FileSizeLimitExceededException;
import org.apache.commons.fileupload.disk.DiskFileItemFactory;
import org.apache.commons.io.IOUtils;
import org.apache.commons.validator.routines.UrlValidator;
import org.apache.http.client.HttpClient;
import org.apache.http.client.HttpResponseException;
import org.apache.http.client.ResponseHandler;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.BasicResponseHandler;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.params.HttpProtocolParams;
import org.juxtasoftware.dao.SourceDao;
import org.juxtasoftware.model.Source;
import org.juxtasoftware.model.Usage;
import org.juxtasoftware.service.SourceRemover;
import org.juxtasoftware.util.ConversionUtils;
import org.juxtasoftware.util.EncodingUtils;
import org.juxtasoftware.util.HtmlUtils;
import org.juxtasoftware.util.MetricsHelper;
import org.restlet.data.MediaType;
import org.restlet.data.Status;
import org.restlet.ext.fileupload.RestletFileUpload;
import org.restlet.representation.Representation;
import org.restlet.resource.Delete;
import org.restlet.resource.Get;
import org.restlet.resource.Post;
import org.restlet.resource.ResourceException;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.beans.factory.config.BeanDefinition;
import org.springframework.context.annotation.Scope;
import org.springframework.stereotype.Service;
import com.google.gson.Gson;
import com.google.gson.GsonBuilder;
import com.google.gson.JsonArray;
import com.google.gson.JsonElement;
import com.google.gson.JsonObject;
import com.google.gson.JsonParser;
import com.google.gson.JsonPrimitive;
import com.google.gson.JsonSerializationContext;
import com.google.gson.JsonSerializer;
/**
 * Restlet resource for the collection of sources in a workspace. It lists
 * the sources as JSON or HTML, creates sources from POSTed data and removes
 * sources in batches.
 */
@Service
@Scope(BeanDefinition.SCOPE_PROTOTYPE)
public class SourcesResource extends BaseResource {
// data access object used to list, find, check and create sources
@Autowired private SourceDao sourceDao;
// upper bound compared against the source file length; the check is skipped when this is not positive
@Autowired private Long maxSourceSize;
// notified each time a source has been added
@Autowired private MetricsHelper metrics;
// performs the removal of individual sources during a batch delete
@Autowired private SourceRemover remover;
// set in doInit: true when the last segment of the request URI is "delete"
private boolean batchDelete;
/**
 * Initializes the resource and records whether the request targets the
 * batch delete endpoint, i.e. whether the last segment of the request URI
 * is "delete" (compared without regard to case).
 *
 * @throws ResourceException propagated from the base class initialization
 */
@Override
protected void doInit() throws ResourceException {
    super.doInit();
    // The reference may have no last segment; treat that as "not a delete
    // request" instead of failing with a NullPointerException.
    final String lastSeg = getRequest().getResourceRef().getLastSegment();
    this.batchDelete = "delete".equalsIgnoreCase(lastSeg);
}
/**
 * Lists every source of the current workspace as JSON. Each source is
 * written by {@link SourcesSerializer}.
 *
 * @return JSON representation of the source list
 */
@Get("json")
public Representation toJson() {
    final List<Source> sources = this.sourceDao.list(this.workspace);
    final GsonBuilder builder = new GsonBuilder();
    builder.registerTypeAdapter(Source.class, new SourcesSerializer());
    final String json = builder.create().toJson(sources);
    return toJsonRepresentation(json);
}
/**
 * Renders the sources of the current workspace with the "sources.ftl"
 * template.
 *
 * @return HTML representation of the source list
 */
@Get("html")
public Representation toHtml() {
    final List<Source> sources = this.sourceDao.list(this.workspace);
    final Map<String, Object> model = new HashMap<String, Object>();
    model.put("title", "Juxta Sources");
    model.put("page", "source");
    model.put("docs", sources);
    return toHtmlRepresentation("sources.ftl", model);
}
/**
 * Remove a batch of sources from the current workspace.
 *
 * The whole payload is validated before anything is removed, so a
 * malformed request is rejected with a 400 status and never results in a
 * partial delete. IDs that do not match a source of this workspace, and
 * sources whose removal fails, are logged and skipped.
 *
 * @param jsonContent JSON array of numeric source IDs
 * @return JSON representation of the usage data collected from the removed
 *         sources, or a text error message when the payload is invalid
 */
@Delete("json")
public Representation batchDelete( final String jsonContent) {
    LOG.info("Batch delete sources "+jsonContent);

    // Parse every ID up front. Null content, malformed JSON, a non-array
    // payload and non-numeric entries all surface as unchecked exceptions.
    final List<Long> ids = new ArrayList<Long>();
    try {
        final JsonArray jsonArray = new JsonParser().parse(jsonContent).getAsJsonArray();
        for ( Iterator<JsonElement> itr = jsonArray.iterator(); itr.hasNext(); ) {
            ids.add( itr.next().getAsLong() );
        }
    } catch ( RuntimeException e ) {
        LOG.error("Invalid batch delete data", e);
        setStatus(Status.CLIENT_ERROR_BAD_REQUEST);
        return toTextRepresentation("Invalid delete data");
    }

    final Set<Usage> usage = new HashSet<Usage>();
    for ( Long id : ids ) {
        final Source s = this.sourceDao.find(this.workspace.getId(), id);
        if ( s == null ) {
            LOG.warn("Source ID "+id+" is not a valid source for this workspace");
            continue;
        }
        try {
            usage.addAll( this.remover.removeSource(this.workspace, s));
        } catch ( ResourceException e ) {
            // best effort: keep going with the remaining sources
            LOG.warn(e.toString());
        }
    }

    final Gson gson = new Gson();
    return toJsonRepresentation( gson.toJson(usage) );
}
/**
 * Accept posts to create sources, or to delete a batch of sources when the
 * request was addressed to the "delete" segment (see doInit).
 *
 * Two kinds of create payload are supported:
 *
 * A multipart/form post with the form fields sourceName, contentType and
 * sourceFile (the file stream).
 *
 * A json array. Each entry is a json object with the members name, type and
 * data. Supported types are url and raw. For url, data contains a url that
 * will be scraped for source content. For raw, data contains the source
 * text itself and an additional contentType member names the source type.
 *
 * A batch delete post carries a json array of source IDs.
 *
 * @param entity the request body
 * @return json list of the IDs of the created sources (or the usage data of
 *         a batch delete), or a text error message
 */
@Post
public Representation create(Representation entity) throws ResourceException {
    // Both the delete and the create paths read the body, so reject a
    // missing body before touching it.
    if (entity == null) {
        setStatus(Status.CLIENT_ERROR_BAD_REQUEST);
        if ( this.batchDelete ) {
            return toTextRepresentation("Invalid delete data");
        }
        return toTextRepresentation("Missing source payload");
    }

    if ( this.batchDelete ) {
        String idList = null;
        try {
            idList = entity.getText();
        } catch (IOException e) {
            LOG.error("Unable to read batch delete request", e);
        }
        // getText yields null for an unavailable body
        if ( idList == null ) {
            setStatus(Status.CLIENT_ERROR_BAD_REQUEST);
            return toTextRepresentation("Invalid delete data");
        }
        return batchDelete(idList);
    }

    if (MediaType.MULTIPART_FORM_DATA.equals(entity.getMediaType(), true)) {
        return handleMutipartPost(entity);
    } else if (MediaType.APPLICATION_JSON.equals(entity.getMediaType(), true)) {
        return handleJsonPost(entity);
    }
    setStatus(Status.CLIENT_ERROR_BAD_REQUEST);
    return toTextRepresentation("Unsupported content type in post");
}
/**
 * Create sources based on a POST of JSON data. The body must be a JSON array
 * of objects, each with the string members type, name and data. For type
 * "url" the data is a URL whose content is fetched; for type "raw" the data
 * is the source text and a contentType member names the source type.
 *
 * Entries are processed in order; the first invalid entry or failure ends
 * the request with an error status and a text message. Sources created from
 * earlier entries are not rolled back.
 *
 * @param entity request body holding the JSON array
 * @return JSON list of the IDs of the created sources, or a text error message
 */
private Representation handleJsonPost(Representation entity) {
    // parse request into a JSON array
    JsonArray jsonArray = null;
    try {
        jsonArray = new JsonParser().parse(entity.getText()).getAsJsonArray();
    } catch (Exception e) {
        setStatus(Status.CLIENT_ERROR_BAD_REQUEST);
        return toTextRepresentation("Invalid JSON data in request");
    }

    final String[] requiredKeys = { "type", "name", "data" };
    List<Long> ids = new ArrayList<Long>();
    for (Iterator<JsonElement> itr = jsonArray.iterator(); itr.hasNext();) {
        JsonElement element = itr.next();
        if (element.isJsonObject() == false) {
            setStatus(Status.CLIENT_ERROR_BAD_REQUEST);
            return toTextRepresentation("Invalid JSON data in request");
        }
        JsonObject jsonObj = element.getAsJsonObject();

        // make sure all necessary data is present and is a plain value;
        // null, object or array members cannot be read as strings
        for (String key : requiredKeys) {
            if (jsonObj.has(key) == false || jsonObj.get(key).isJsonPrimitive() == false) {
                setStatus(Status.CLIENT_ERROR_BAD_REQUEST);
                return toTextRepresentation("Missing required information: " + key);
            }
        }

        String type = jsonObj.get("type").getAsString();
        String name = jsonObj.get("name").getAsString();
        String data = jsonObj.get("data").getAsString();

        try {
            if (type.equalsIgnoreCase("url")) {
                if ( UrlValidator.getInstance().isValid(data)) {
                    // pull content from the URL. Type will be determined from
                    // the HTTP response
                    ids.add( scrapeExternalUrl(name, data) );
                } else {
                    setStatus(Status.CLIENT_ERROR_BAD_REQUEST);
                    return toTextRepresentation("Malformed source URL");
                }
            } else if (type.equalsIgnoreCase("raw")) {
                if (jsonObj.has("contentType") == false || jsonObj.get("contentType").isJsonPrimitive() == false) {
                    setStatus(Status.CLIENT_ERROR_BAD_REQUEST);
                    return toTextRepresentation("Missing required information: contentType");
                }
                String rawContentType = jsonObj.get("contentType").getAsString();
                Source.Type contentType = null;
                try {
                    // Locale.ROOT keeps the conversion independent of the server locale
                    contentType = Source.Type.valueOf(rawContentType.toUpperCase(Locale.ROOT));
                } catch (IllegalArgumentException e) {
                    setStatus(Status.CLIENT_ERROR_BAD_REQUEST);
                    return toTextRepresentation("Unsupported content type: " + rawContentType);
                }
                ids.add( createSourceFromRawData(name, data, contentType) );
            } else {
                // report unknown types instead of silently skipping the entry
                setStatus(Status.CLIENT_ERROR_BAD_REQUEST);
                return toTextRepresentation("Unsupported source type: " + type);
            }
        } catch (HttpResponseException e) {
            LOG.error("Link to source "+data+" failed", e);
            setStatus( Status.valueOf(e.getStatusCode()));
            if ( e.getStatusCode() == 403 ) {
                String msg = "The target web site is not allowing Juxta to access its content.\n"
                    +"To work around this, download the content to your local system,\nthen upload it using option #1.";
                return toTextRepresentation(msg);
            } else {
                return toTextRepresentation("Link to "+data+" failed: "+e.getMessage());
            }
        } catch (UnknownHostException e) {
            setStatus(Status.CLIENT_ERROR_BAD_REQUEST);
            return toTextRepresentation("The URL contains an unknown host");
        } catch (IOException e) {
            setStatus(Status.SERVER_ERROR_INTERNAL);
            return toTextRepresentation("Unable to create source "+name+": "+e.toString());
        } catch (XMLStreamException e) {
            setStatus(Status.CLIENT_ERROR_BAD_REQUEST);
            return toTextRepresentation("Source "+name+" contains invalid XML: "+e.toString());
        } catch (DuplicateSourceException e) {
            // report the stored name (with extension), as the multipart handler does
            setStatus(Status.CLIENT_ERROR_CONFLICT);
            return toTextRepresentation("Source '" + e.getSourceName() + "' already exists in workspace '"
                + this.workspace.getName() + "'");
        }
    }

    Gson gson = new Gson();
    return toJsonRepresentation(gson.toJson(ids));
}
/**
 * Create a source from a multipart POST. The request must contain the form
 * fields sourceName and contentType plus the file part sourceFile.
 *
 * @param entity the request body (the parts are read from the request itself)
 * @return JSON list holding the ID of the created source, or a text error message
 */
private Representation handleMutipartPost(Representation entity) {
    String sourceName = null;
    String contentType = null;
    InputStream srcInputStream = null;
    try {
        try {
            // pull the list of items in this multipart request
            DiskFileItemFactory factory = new DiskFileItemFactory();
            factory.setSizeThreshold(1000240);
            RestletFileUpload upload = new RestletFileUpload(factory);
            List<FileItem> items = upload.parseRequest(getRequest());
            for (FileItem item : items) {
                // constant-first comparison: tolerate parts without a field name
                final String field = item.getFieldName();
                if ("sourceName".equals(field)) {
                    sourceName = item.getString();
                } else if ("contentType".equals(field)) {
                    contentType = item.getString();
                } else if ("sourceFile".equals(field)) {
                    // only the last file part is used; release any earlier one
                    IOUtils.closeQuietly(srcInputStream);
                    srcInputStream = item.getInputStream();
                }
            }

            // validate that everything needed is present
            if (srcInputStream == null) {
                setStatus(Status.CLIENT_ERROR_BAD_REQUEST);
                return toTextRepresentation("Missing file data in post");
            }
            if (sourceName == null || contentType == null) {
                setStatus(Status.CLIENT_ERROR_BAD_REQUEST);
                return toTextRepresentation("Missing name and/or content type information");
            }
        } catch (Exception e) {
            LOG.error("Unable to parse multipart data", e);
            setStatus(Status.CLIENT_ERROR_BAD_REQUEST);
            return toTextRepresentation("File upload failed");
        }

        List<Long> idList = new ArrayList<Long>();
        try {
            // create the source
            Long id = createSource(sourceName, MediaType.valueOf(contentType), srcInputStream);
            idList.add(id);
        } catch (IOException e) {
            setStatus(Status.SERVER_ERROR_INTERNAL);
            return toTextRepresentation("Unable to import source: " + e.getMessage());
        } catch (XMLStreamException e) {
            setStatus(Status.CLIENT_ERROR_BAD_REQUEST);
            // keep only the text after the first colon of the parser message;
            // the message itself may be absent
            String msg = e.getMessage();
            if (msg == null) {
                msg = e.toString();
            } else {
                int pos = msg.indexOf(":");
                if (pos > -1) {
                    msg = msg.substring(pos + 1).trim();
                }
            }
            return toTextRepresentation("This document contains malformed xml - " + msg);
        } catch (DuplicateSourceException e) {
            setStatus(Status.CLIENT_ERROR_CONFLICT);
            return toTextRepresentation("Source '" + e.getSourceName() + "' already exists in workspace '"
                + this.workspace.getName() + "'");
        }

        Gson gson = new Gson();
        return toJsonRepresentation(gson.toJson(idList));
    } finally {
        // the upload stream is opened here, so it is released here on every path
        IOUtils.closeQuietly(srcInputStream);
    }
}
/**
 * Create a source from an uploaded data stream.
 *
 * XML, HTML and plain text uploads, and uploads whose name ends with
 * ".wiki", are kept in their own format: their encoding is fixed up and
 * HTML is additionally stripped. Everything else is converted to plain text.
 *
 * @param sourceName name given to the source
 * @param mediaType media type declared for the upload
 * @param srcInputStream stream holding the uploaded content
 * @return ID of the new source
 * @throws IOException if the content cannot be read, converted or stored
 * @throws XMLStreamException declared by the storage step (writeSourceData)
 * @throws DuplicateSourceException if a source with the final name already exists
 */
private Long createSource(final String sourceName, final MediaType mediaType, InputStream srcInputStream) throws IOException, XMLStreamException, DuplicateSourceException {
    final boolean isXml = mediaType.equals(MediaType.APPLICATION_XML) || mediaType.equals(MediaType.TEXT_XML);
    final boolean isHtml = mediaType.equals(MediaType.TEXT_HTML);
    final boolean isWiki = sourceName.endsWith(".wiki");

    File srcFile = null;
    Source.Type contentType = Source.Type.TXT;

    // Special handling for files that will not be auto transformed:
    // be sure they are UTF-8 and set type flags. Strip scary stuff out
    // of HTML files
    if ( isXml || isHtml || mediaType.equals(MediaType.TEXT_PLAIN) || isWiki ) {
        srcFile = EncodingUtils.fixEncoding(srcInputStream);
        if ( isXml ) {
            contentType = Source.Type.XML;
        } else if ( isHtml ) {
            contentType = Source.Type.HTML;
            HtmlUtils.strip(srcFile);
        } else if ( isWiki ) {
            contentType = Source.Type.WIKI;
        }
    } else {
        // General case: auto transform to TXT
        try {
            srcFile = ConversionUtils.convertToText(srcInputStream);
        } catch (Exception e) {
            LOG.error("Unable to convert "+sourceName+" "+mediaType+" to text", e);
            // same message as before, but keep the original failure as the cause
            throw new IOException(e.getMessage(), e);
        }
    }

    return writeSourceData(srcFile, sourceName, contentType);
}
/**
 * Create a source from the string data passed along with the request. HTML
 * content is stripped before it is stored.
 *
 * @param name name given to the source
 * @param data the source content
 * @param contentType type of the source content
 * @return ID of the new source
 * @throws IOException if the content cannot be processed or stored
 * @throws XMLStreamException declared by the storage step (writeSourceData)
 * @throws DuplicateSourceException if a source with the final name already exists
 */
private Long createSourceFromRawData(final String name, final String data, final Source.Type contentType) throws IOException, XMLStreamException, DuplicateSourceException {
    // Encode explicitly as UTF-8: the platform default charset may not be
    // able to represent every character of the submitted text.
    final byte[] utf8 = data.getBytes("UTF-8");
    File fixed = EncodingUtils.fixEncoding(new ByteArrayInputStream(utf8));
    if (contentType.equals(Source.Type.HTML)) {
        HtmlUtils.strip(fixed);
    }
    return writeSourceData(fixed, name, contentType);
}
/**
 * Create a new source with the specified type and name from the content of
 * a work file. The work file is read as UTF-8 and is always deleted before
 * this method returns, whether or not the source was created.
 *
 * @param srcFile work file holding the source content; consumed by this call
 * @param name requested source name; an extension is appended when it has none
 * @param type type of the source
 * @return ID of the new source
 * @throws IOException if the file exceeds the configured size limit or cannot be read
 * @throws XMLStreamException declared for the storage step
 * @throws DuplicateSourceException if the workspace already has a source with the final name
 */
private Long writeSourceData(File srcFile, final String name, final Source.Type type) throws DuplicateSourceException, IOException, XMLStreamException {
    Long id = null;
    try {
        if (this.maxSourceSize > 0 && srcFile.length() > this.maxSourceSize) {
            String err = "Source size is " + srcFile.length() / 1024 + "K.\nThis exceeds the Juxta size limit of "
                + this.maxSourceSize / 1024 + "K.\n\n"
                + "Try breaking the source into smaller segments and re-submitting.";
            throw new IOException(err);
        }

        String finalName = appendExtension(name, type);

        // prevent duplicate file names
        if (this.sourceDao.exists(this.workspace, finalName)) {
            throw new DuplicateSourceException(finalName);
        }

        FileInputStream fis = null;
        InputStreamReader isr = null;
        try {
            fis = new FileInputStream(srcFile);
            isr = new InputStreamReader(fis, "UTF-8");
            id = this.sourceDao.create(this.workspace, finalName, type, isr);
        } finally {
            // release the file handle even when the create fails
            IOUtils.closeQuietly(isr);
            IOUtils.closeQuietly(fis);
        }
    } finally {
        // the work file is never needed again, on success or on failure
        srcFile.delete();
    }

    this.metrics.sourceAdded(this.workspace, this.sourceDao.find(this.workspace.getId(), id));
    return id;
}
/**
 * Fetch the content of a URL and store it as a new source. The source type
 * is derived from the media type detected for the downloaded content: XML,
 * HTML (stripped) and plain text are kept as they are, anything else is
 * converted to plain text.
 *
 * @param name name given to the source
 * @param url address to fetch the content from
 * @return ID of the new source
 * @throws HttpResponseException if the remote server answers with an error status
 * @throws IOException if the content cannot be fetched, converted or stored
 * @throws XMLStreamException declared by the storage step (writeSourceData)
 * @throws DuplicateSourceException if a source with the final name already exists
 */
private Long scrapeExternalUrl(final String name, final String url) throws HttpResponseException, IOException, XMLStreamException, DuplicateSourceException {
    // NOTE(review): the URL is supplied by the client and fetched from the
    // server; confirm that access to internal hosts is restricted elsewhere.

    // get the contents of the URL
    String response = null;
    HttpClient httpClient = new DefaultHttpClient();
    try {
        HttpProtocolParams.setUserAgent(httpClient.getParams(), "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_5) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1312.57 Safari/537.17");
        HttpGet get = new HttpGet(url);
        ResponseHandler<String> responseHandler = new BasicResponseHandler();
        response = httpClient.execute(get, responseHandler);
    } finally {
        // release the connections held by this one-shot client
        httpClient.getConnectionManager().shutdown();
    }

    File srcFile = null;
    MediaType mediaType = null;
    File rawFile = File.createTempFile("url", "dat");
    try {
        // store the response in rawFile
        OutputStreamWriter osw = new OutputStreamWriter(new FileOutputStream(rawFile), "UTF-8");
        try {
            IOUtils.write(response, osw);
            // close explicitly so a failed flush is reported, not swallowed
            osw.close();
        } finally {
            IOUtils.closeQuietly(osw);
        }

        // prepare source for addition to library
        mediaType = ConversionUtils.determineMediaType(rawFile);
        InputStream rawStream = new FileInputStream(rawFile);
        try {
            if (MediaType.TEXT_XML.isCompatible(mediaType) ||
                MediaType.APPLICATION_XML.isCompatible(mediaType) ||
                MediaType.TEXT_HTML.isCompatible(mediaType) ||
                MediaType.TEXT_PLAIN.isCompatible(mediaType)) {
                srcFile = EncodingUtils.fixEncoding(rawStream);
                if (MediaType.TEXT_HTML.isCompatible(mediaType)) {
                    HtmlUtils.strip(srcFile);
                }
            } else {
                mediaType = MediaType.TEXT_PLAIN;
                try {
                    srcFile = ConversionUtils.convertToText(rawStream);
                } catch (Exception e) {
                    // same message as before, but keep the original failure as the cause
                    throw new IOException(e.getMessage(), e);
                }
            }
        } finally {
            // close before deleting: an open handle can block the delete
            IOUtils.closeQuietly(rawStream);
        }
    } finally {
        rawFile.delete();
    }

    // Convert media type to Source Type
    Source.Type srcType = Source.Type.TXT;
    if (MediaType.TEXT_XML.isCompatible(mediaType) ||
        MediaType.APPLICATION_XML.isCompatible(mediaType) ) {
        srcType = Source.Type.XML;
    } else if (MediaType.TEXT_HTML.isCompatible(mediaType)) {
        srcType = Source.Type.HTML;
    }

    // dump results and enforce limits / uniqueness
    return writeSourceData(srcFile, name, srcType);
}
/**
 * Make sure a source name carries an extension. Names already ending in
 * .txt, .html, .htm, .xml or .wiki (in any letter case) are returned
 * unchanged; otherwise the lower-cased source type is appended.
 *
 * @param name requested source name
 * @param srcType type of the source, used to build the extension
 * @return the name, with an extension appended when it had none of the known ones
 */
private String appendExtension(String name, org.juxtasoftware.model.Source.Type srcType) {
    // Locale.ROOT: with a locale-sensitive conversion (e.g. Turkish) the
    // letter I does not map to i, which breaks ".wiki" and ".html"
    final String lcName = name.toLowerCase(Locale.ROOT);
    final String[] knownExtensions = { ".txt", ".html", ".htm", ".xml", ".wiki" };
    for (String extension : knownExtensions) {
        if (lcName.endsWith(extension)) {
            return name;
        }
    }
    return name + "." + srcType.toString().toLowerCase(Locale.ROOT);
}
private class SourcesSerializer implements JsonSerializer<Source> {
private final DateFormat format = new SimpleDateFormat("MM/dd/yyyy HH:mm:ss");
@Override
public JsonElement serialize(Source src, Type typeOfSrc, JsonSerializationContext context) {
JsonObject obj = new JsonObject();
obj.add("id", new JsonPrimitive(src.getId()));
obj.add("name", new JsonPrimitive(src.getName()));
obj.add("type", new JsonPrimitive(src.getType().toString()));
obj.add("length", new JsonPrimitive(src.getText().getLength()));
obj.add("created", new JsonPrimitive(this.format.format(src.getCreated())));
return obj;
}
}
/**
 * Signals that a source with the requested name already exists. The
 * exception has no message; the offending name is available through
 * {@link #getSourceName()}.
 */
private static class DuplicateSourceException extends Exception {
    private static final long serialVersionUID = 8890164370720970377L;

    /** Name of the source that already exists. */
    private final String sourceName;

    /**
     * @param duplicateName name of the source that already exists
     */
    public DuplicateSourceException(String duplicateName) {
        this.sourceName = duplicateName;
    }

    /**
     * @return name of the source that already exists
     */
    public String getSourceName() {
        return sourceName;
    }
}
}