// SourcesResource.java example
//
// Explorer
// juxta-service-master
package org.juxtasoftware.resource;

import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.lang.reflect.Type;
import java.net.UnknownHostException;
import java.text.DateFormat;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;

import javax.xml.stream.XMLStreamException;

import org.apache.commons.fileupload.FileItem;
import org.apache.commons.fileupload.FileUploadBase.FileSizeLimitExceededException;
import org.apache.commons.fileupload.disk.DiskFileItemFactory;
import org.apache.commons.io.IOUtils;
import org.apache.commons.validator.routines.UrlValidator;
import org.apache.http.client.HttpClient;
import org.apache.http.client.HttpResponseException;
import org.apache.http.client.ResponseHandler;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.BasicResponseHandler;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.params.HttpProtocolParams;
import org.juxtasoftware.dao.SourceDao;
import org.juxtasoftware.model.Source;
import org.juxtasoftware.model.Usage;
import org.juxtasoftware.service.SourceRemover;
import org.juxtasoftware.util.ConversionUtils;
import org.juxtasoftware.util.EncodingUtils;
import org.juxtasoftware.util.HtmlUtils;
import org.juxtasoftware.util.MetricsHelper;
import org.restlet.data.MediaType;
import org.restlet.data.Status;
import org.restlet.ext.fileupload.RestletFileUpload;
import org.restlet.representation.Representation;
import org.restlet.resource.Delete;
import org.restlet.resource.Get;
import org.restlet.resource.Post;
import org.restlet.resource.ResourceException;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.beans.factory.config.BeanDefinition;
import org.springframework.context.annotation.Scope;
import org.springframework.stereotype.Service;

import com.google.gson.Gson;
import com.google.gson.GsonBuilder;
import com.google.gson.JsonArray;
import com.google.gson.JsonElement;
import com.google.gson.JsonObject;
import com.google.gson.JsonParser;
import com.google.gson.JsonPrimitive;
import com.google.gson.JsonSerializationContext;
import com.google.gson.JsonSerializer;

@Service
@Scope(BeanDefinition.SCOPE_PROTOTYPE)
public class SourcesResource extends BaseResource {
    @Autowired private SourceDao sourceDao;
    @Autowired private Long maxSourceSize;
    @Autowired private MetricsHelper metrics;
    @Autowired private SourceRemover remover;
    private boolean batchDelete;
    
    /**
     * Initialize the resource and record whether this request targets the
     * batch delete endpoint; that is the case when the last path segment of
     * the resource reference is "delete" (compared case-insensitively).
     */
    @Override
    protected void doInit() throws ResourceException {
        super.doInit();
        // getLastSegment() can return null (e.g. a reference without a path), so
        // compare from the literal side. equalsIgnoreCase is also independent of
        // the default locale, unlike toLowerCase().
        final String lastSeg = getRequest().getResourceRef().getLastSegment();
        this.batchDelete = "delete".equalsIgnoreCase(lastSeg);
    }
    
    /**
     * List every source in the current workspace as JSON. Each source is
     * rendered by {@link SourcesSerializer}.
     *
     * @return JSON representation of the workspace sources
     */
    @Get("json")
    public Representation toJson() {
        final GsonBuilder builder = new GsonBuilder();
        builder.registerTypeAdapter(Source.class, new SourcesSerializer());
        final List<Source> sources = this.sourceDao.list(this.workspace);
        final String json = builder.create().toJson(sources);
        return toJsonRepresentation(json);
    }

    /**
     * Render the sources of the current workspace with the "sources.ftl"
     * template.
     *
     * @return HTML representation of the workspace sources
     */
    @Get("html")
    public Representation toHtml() {
        // model handed to the template: page id, page title and the source list
        final Map<String, Object> model = new HashMap<String, Object>();
        model.put("page", "source");
        model.put("title", "Juxta Sources");
        model.put("docs", this.sourceDao.list(this.workspace));
        return toHtmlRepresentation("sources.ftl", model);
    }
    
    /**
     * Delete a batch of sources from the current workspace.
     *
     * IDs that do not resolve to a source in this workspace, and sources whose
     * removal fails with a ResourceException, are logged and skipped.
     *
     * @param jsonContent JSON array of source IDs, for example [1, 2, 3]
     * @return JSON representation of the usage entries collected from every
     *         removed source, or a plain text error with status 400 when the
     *         payload is not a JSON array of numeric IDs
     */
    @Delete("json")
    public Representation batchDelete( final String jsonContent) {
        LOG.info("Batch delete sources "+jsonContent);

        // Parse and validate the complete ID list before removing anything, so a
        // malformed payload is rejected as a whole instead of being half-applied
        List<Long> ids = new ArrayList<Long>();
        try {
            JsonArray jsonArray = new JsonParser().parse(jsonContent).getAsJsonArray();
            for ( JsonElement ele : jsonArray ) {
                ids.add( ele.getAsLong() );
            }
        } catch ( RuntimeException e ) {
            // null content, JSON syntax errors, non-array payloads and non-numeric
            // entries all surface here as unchecked exceptions
            LOG.warn("Invalid batch delete payload: "+e.toString());
            setStatus(Status.CLIENT_ERROR_BAD_REQUEST);
            return toTextRepresentation("Invalid delete data");
        }

        Set<Usage> usage = new HashSet<Usage>();
        for ( Long id : ids ) {
            Source s = this.sourceDao.find(this.workspace.getId(), id);
            if ( s == null ) {
                LOG.warn("Source ID "+id+" is not a valid source for this workspace");
                continue;
            }
            try {
                usage.addAll( this.remover.removeSource(this.workspace, s));
            } catch ( ResourceException e ) {
                // best effort: keep going with the remaining sources
                LOG.warn(e.toString());
            }
        }
        return toJsonRepresentation( new Gson().toJson(usage) );
    }

    /**
     * Accept posts to create sources, or to batch delete them when the request
     * was addressed to the "delete" segment (see doInit).
     *
     * Two payload types are supported for creation:
     *
     * 1. A multipart/form-data post with the form fields sourceName,
     *    contentType (a media type name) and sourceFile (the file stream).
     *
     * 2. An application/json post containing a json array. Each entry is a json
     *    object with the members name, type and data. Supported types are url
     *    and raw. For url, data contains a url that will be scraped for source
     *    content. For raw, data contains the source text itself and an extra
     *    member, contentType, names the source type (for example txt, xml or html).
     *
     * Any other media type is rejected with status 400.
     *
     * @param entity the posted request body
     * @return json array of the IDs of the created sources (or the batch delete
     *         result), or a plain text error message
     */
    @Post
    public Representation create(Representation entity) throws ResourceException {
        // Reject a missing body first: the batch delete branch below reads the entity
        if (entity == null) {
            setStatus(Status.CLIENT_ERROR_BAD_REQUEST);
            return toTextRepresentation( this.batchDelete ? "Invalid delete data" : "Missing source payload" );
        }

        if ( this.batchDelete ) {
            try {
                return batchDelete(entity.getText());
            } catch (IOException e) {
                setStatus(Status.CLIENT_ERROR_BAD_REQUEST);
                return toTextRepresentation("Invalid delete data");
            }
        }

        if (MediaType.MULTIPART_FORM_DATA.equals(entity.getMediaType(), true)) {
            return handleMutipartPost(entity);
        } else if (MediaType.APPLICATION_JSON.equals(entity.getMediaType(), true)) {
            return handleJsonPost(entity);
        }

        setStatus(Status.CLIENT_ERROR_BAD_REQUEST);
        return toTextRepresentation("Unsupported content type in post");
    }

    /**
     * Create sources based on a POST of JSON data. The body must be a JSON array
     * of objects, each with the members type, name and data. When type is "url",
     * data is a URL whose content is scraped; when type is "raw", data is the
     * source text and the additional member contentType names the Source.Type.
     *
     * Entries are processed in order and processing stops at the first failure;
     * sources created from earlier entries of the same request are kept.
     *
     * @param entity the posted JSON body
     * @return JSON array with the IDs of the created sources, or a plain text
     *         error message with a matching error status
     */
    private Representation handleJsonPost(Representation entity) {
        // parse request into a JSON array
        JsonArray jsonArray = null;
        try {
            jsonArray = new JsonParser().parse(entity.getText()).getAsJsonArray();
        } catch (Exception e) {
            setStatus(Status.CLIENT_ERROR_BAD_REQUEST);
            return toTextRepresentation("Invalid JSON data in request");
        }

        List<Long> ids = new ArrayList<Long>();
        for (JsonElement element : jsonArray) {
            // every entry must be an object; getAsJsonObject() throws for anything else
            if (element.isJsonObject() == false) {
                setStatus(Status.CLIENT_ERROR_BAD_REQUEST);
                return toTextRepresentation("Invalid JSON data in request");
            }
            JsonObject jsonObj = element.getAsJsonObject();

            // make sure all necessary data is present
            String type = jsonStringMember(jsonObj, "type");
            if (type == null) {
                setStatus(Status.CLIENT_ERROR_BAD_REQUEST);
                return toTextRepresentation("Missing required information: type");
            }
            String name = jsonStringMember(jsonObj, "name");
            if (name == null) {
                setStatus(Status.CLIENT_ERROR_BAD_REQUEST);
                return toTextRepresentation("Missing required information: name");
            }
            String data = jsonStringMember(jsonObj, "data");
            if (data == null) {
                setStatus(Status.CLIENT_ERROR_BAD_REQUEST);
                return toTextRepresentation("Missing required information: data");
            }

            try {
                if (type.equalsIgnoreCase("url")) {
                    if ( UrlValidator.getInstance().isValid(data) == false ) {
                        setStatus(Status.CLIENT_ERROR_BAD_REQUEST);
                        return toTextRepresentation("Malformed source URL");
                    }
                    // pull content from the URL. Type will be determined from
                    // the HTTP response
                    ids.add( scrapeExternalUrl(name, data) );
                } else if (type.equalsIgnoreCase("raw")) {
                    String rawType = jsonStringMember(jsonObj, "contentType");
                    if (rawType == null) {
                        setStatus(Status.CLIENT_ERROR_BAD_REQUEST);
                        return toTextRepresentation("Missing required information: contentType");
                    }
                    Source.Type contentType;
                    try {
                        // Locale.ROOT keeps the upper-casing independent of the server's
                        // default locale (e.g. the Turkish dotted/dotless i)
                        contentType = Source.Type.valueOf(rawType.toUpperCase(java.util.Locale.ROOT));
                    } catch (IllegalArgumentException e) {
                        // not one of the Source.Type constants: a client error
                        setStatus(Status.CLIENT_ERROR_BAD_REQUEST);
                        return toTextRepresentation("Unsupported content type: "+rawType);
                    }
                    ids.add( createSourceFromRawData(name, data, contentType) );
                } else {
                    // report unknown types instead of silently creating nothing
                    setStatus(Status.CLIENT_ERROR_BAD_REQUEST);
                    return toTextRepresentation("Unsupported source type: "+type);
                }
            } catch (HttpResponseException e) {
                LOG.error("Link to source "+data+" failed", e);
                setStatus( Status.valueOf(e.getStatusCode()));
                if ( e.getStatusCode() == 403 ) {
                    String msg = "The target web site is not allowing Juxta to access its content.\n"
                        +"To work around this, download the content to your local system,\nthen upload it using option #1.";
                    return toTextRepresentation(msg);
                } else {
                    return toTextRepresentation("Link to "+data+" failed: "+e.getMessage());
                }
            } catch (UnknownHostException e) {
                setStatus(Status.CLIENT_ERROR_BAD_REQUEST);
                return toTextRepresentation("The URL contains an unknown host");
            } catch (IOException e) {
                LOG.error("Unable to create source "+name, e);
                setStatus(Status.SERVER_ERROR_INTERNAL);
                return toTextRepresentation("Unable to create source "+name+": "+e.toString());
            } catch (XMLStreamException e) {
                setStatus(Status.CLIENT_ERROR_BAD_REQUEST);
                return toTextRepresentation("Source "+name+" contains invalid XML: "+e.toString());
            } catch (DuplicateSourceException e) {
                setStatus(Status.CLIENT_ERROR_CONFLICT);
                return toTextRepresentation("Source '" + name + "' already exists in workspace '"
                    + this.workspace.getName() + "'");
            }
        }
        return toJsonRepresentation(new Gson().toJson(ids));
    }

    /**
     * Read a member of a JSON object as a string.
     *
     * @param jsonObj object to read from
     * @param member name of the member
     * @return the member value as a string, or null when the member is absent
     *         or is not a JSON primitive (null, object or array)
     */
    private String jsonStringMember(JsonObject jsonObj, String member) {
        JsonElement value = jsonObj.get(member);
        if (value == null || value.isJsonPrimitive() == false) {
            return null;
        }
        return value.getAsString();
    }

    /**
     * Create a source from a multipart POST. The form must carry the fields
     * sourceName, contentType (a media type name) and sourceFile (the upload).
     *
     * @param entity the posted multipart body (the parts are read from the request)
     * @return JSON array holding the ID of the created source, or a plain text
     *         error message with a matching error status
     */
    private Representation handleMutipartPost(Representation entity) {
        String sourceName = null;
        String contentType = null;
        InputStream srcInputStream = null;
        try {
            try {
                // pull the list of items in this multipart request
                DiskFileItemFactory factory = new DiskFileItemFactory();
                factory.setSizeThreshold(1000240);
                RestletFileUpload upload = new RestletFileUpload(factory);
                List<FileItem> items = upload.parseRequest(getRequest());
                for (FileItem item : items) {
                    // literal-first comparisons tolerate parts without a field name
                    String field = item.getFieldName();
                    if ("sourceName".equals(field)) {
                        sourceName = item.getString();
                    } else if ("contentType".equals(field)) {
                        contentType = item.getString();
                    } else if ("sourceFile".equals(field)) {
                        srcInputStream = item.getInputStream();
                    }
                }

                // validate that everything needed is present
                if (srcInputStream == null) {
                    setStatus(Status.CLIENT_ERROR_BAD_REQUEST);
                    return toTextRepresentation("Missing file data in post");
                }
                if (sourceName == null || contentType == null) {
                    setStatus(Status.CLIENT_ERROR_BAD_REQUEST);
                    return toTextRepresentation("Missing name and/or content type information");
                }
            } catch (Exception e) {
                LOG.error("Unable to parse multipart data", e);
                setStatus(Status.CLIENT_ERROR_BAD_REQUEST);
                return toTextRepresentation("File upload failed");
            }

            // Resolve the media type up front so a bad value is reported as a client
            // error. NOTE(review): MediaType.valueOf appears to return null for an
            // empty name and may reject malformed names - confirm against the
            // Restlet version in use; the guards below are harmless either way.
            MediaType mediaType = null;
            try {
                mediaType = MediaType.valueOf(contentType);
            } catch (IllegalArgumentException e) {
                LOG.warn("Invalid content type in multipart post: "+contentType);
            }
            if (mediaType == null) {
                setStatus(Status.CLIENT_ERROR_BAD_REQUEST);
                return toTextRepresentation("Missing name and/or content type information");
            }

            List<Long> idList = new ArrayList<Long>();
            try {
                // create the source
                Long id = createSource(sourceName, mediaType, srcInputStream);
                idList.add(id);
            } catch (IOException e) {
                LOG.error("Unable to import source "+sourceName, e);
                setStatus(Status.SERVER_ERROR_INTERNAL);
                return toTextRepresentation("Unable to import source: " + e.getMessage());
            } catch (XMLStreamException e) {
                setStatus(Status.CLIENT_ERROR_BAD_REQUEST);
                // keep only the text after the first colon of the parser message;
                // the message itself may be null
                String msg = e.getMessage();
                if (msg == null) {
                    msg = "";
                }
                int pos = msg.indexOf(":");
                if (pos > -1) {
                    msg = msg.substring(pos + 1).trim();
                }
                return toTextRepresentation("This document contains malformed xml - " + msg);
            } catch (DuplicateSourceException e) {
                setStatus(Status.CLIENT_ERROR_CONFLICT);
                return toTextRepresentation("Source '" + e.getSourceName() + "' already exists in workspace '"
                    + this.workspace.getName() + "'");
            }

            Gson gson = new Gson();
            return toJsonRepresentation(gson.toJson(idList));
        } finally {
            // release the upload stream on every exit path; closing it a second
            // time is harmless if a helper already closed it
            IOUtils.closeQuietly(srcInputStream);
        }
    }
    
    /**
     * Create a source from an uploaded data stream.
     *
     * XML, HTML and plain text uploads, and any upload whose name ends with
     * ".wiki", are only passed through EncodingUtils.fixEncoding (HTML is
     * additionally passed through HtmlUtils.strip). Everything else is converted
     * to plain text with ConversionUtils.convertToText and stored as TXT.
     *
     * @param sourceName name for the new source
     * @param mediaType media type reported for the upload
     * @param srcInputStream stream with the uploaded content
     * @return ID of the created source
     * @throws IOException if the content cannot be read or converted, or if it
     *         exceeds the configured size limit
     * @throws XMLStreamException declared by writeSourceData
     * @throws DuplicateSourceException if a source with the final name already
     *         exists in the workspace
     */
    private Long createSource(final String sourceName, final MediaType mediaType, InputStream srcInputStream) throws IOException, XMLStreamException, DuplicateSourceException {
        final boolean isXml = mediaType.equals(MediaType.APPLICATION_XML) || mediaType.equals(MediaType.TEXT_XML);
        final boolean isHtml = mediaType.equals(MediaType.TEXT_HTML);
        final boolean isWiki = sourceName.endsWith(".wiki");

        File srcFile = null;
        Source.Type contentType = Source.Type.TXT;

        // Special handling for files that will not be auto transformed:
        // be sure they are UTF-8 and set type flags. Strip scary stuff out
        // of HTML files
        if ( isXml || isHtml || isWiki || mediaType.equals(MediaType.TEXT_PLAIN) ) {
            srcFile = EncodingUtils.fixEncoding(srcInputStream);
            if ( isXml ) {
                contentType = Source.Type.XML;
            } else if ( isHtml ) {
                contentType = Source.Type.HTML;
                HtmlUtils.strip(srcFile);
            } else if ( isWiki ) {
                contentType = Source.Type.WIKI;
            }
        } else {
            // General case: auto transform to TXT
            try {
                srcFile = ConversionUtils.convertToText(srcInputStream);
            } catch (Exception e) {
                LOG.error("Unable to convert "+sourceName+" "+mediaType+" to text", e);
                // keep the original failure attached as the cause
                throw new IOException(e.getMessage(), e);
            }
        }

        return writeSourceData(srcFile, sourceName, contentType);
    }

    /**
     * Create a source from the string data passed along with the request. The
     * text is encoded as UTF-8, passed through EncodingUtils.fixEncoding and,
     * for HTML content, through HtmlUtils.strip before it is stored.
     *
     * @param name name for the new source
     * @param data the source text
     * @param contentType type to store the source as
     * @return ID of the created source
     * @throws DuplicateSourceException if a source with the final name already
     *         exists in the workspace
     * @throws XMLStreamException declared by writeSourceData
     * @throws IOException if the data cannot be processed or exceeds the
     *         configured size limit
     */
    private Long createSourceFromRawData(final String name, final String data, final Source.Type contentType) throws IOException, XMLStreamException, DuplicateSourceException {
        // Encode explicitly as UTF-8. The no-argument getBytes() uses the platform
        // default charset, which replaces every character it cannot represent.
        final byte[] utf8 = data.getBytes("UTF-8");
        File fixed = EncodingUtils.fixEncoding(new ByteArrayInputStream(utf8));
        if (contentType.equals(Source.Type.HTML)) {
            HtmlUtils.strip(fixed);
        }
        return writeSourceData(fixed, name, contentType);
    }

    /**
     * Create a new source with the specified type and name from the content of
     * a working file. The size limit (when maxSourceSize is greater than zero)
     * and name uniqueness within the workspace are enforced first. The working
     * file is always deleted before this method returns, whether or not the
     * source was created.
     *
     * @param srcFile working file holding the UTF-8 source content; consumed
     * @param name requested source name; an extension is appended when missing
     * @param type type of the source
     * @return ID of the created source
     * @throws IOException if the file exceeds the size limit or cannot be read
     * @throws XMLStreamException declared for the DAO create call
     * @throws DuplicateSourceException if the final name already exists in the
     *         workspace
     */
    private Long writeSourceData(File srcFile, final String name, final Source.Type type) throws DuplicateSourceException, IOException, XMLStreamException {
        Long id = null;
        FileInputStream fis = null;
        InputStreamReader isr = null;
        try {
            if (this.maxSourceSize > 0 && srcFile.length() > this.maxSourceSize) {
                String err = "Source size is " + srcFile.length() / 1024 + "K.\nThis exceeds the Juxta size limit of "
                    + this.maxSourceSize / 1024 + "K.\n\n"
                    + "Try breaking the source into smaller segments and re-submitting.";
                throw new IOException(err);
            }

            String finalName = appendExtension(name, type);

            // prevent duplicate file names
            if (this.sourceDao.exists(this.workspace, finalName)) {
                throw new DuplicateSourceException(finalName);
            }

            fis = new FileInputStream(srcFile);
            isr = new InputStreamReader(fis, "UTF-8");
            id = this.sourceDao.create(this.workspace, finalName, type, isr);
        } finally {
            // release the reader and the working file on every path, including
            // the size-limit, duplicate-name and DAO failure paths
            IOUtils.closeQuietly(isr);
            IOUtils.closeQuietly(fis);
            srcFile.delete();
        }

        this.metrics.sourceAdded(this.workspace, this.sourceDao.find(this.workspace.getId(), id));

        return id;
    }

    /**
     * Download the content of a URL and add it to the workspace as a source.
     * The media type is detected from the downloaded content: XML, HTML and
     * plain text are only passed through EncodingUtils.fixEncoding (HTML is
     * additionally stripped); anything else is converted to plain text.
     *
     * NOTE(review): the URL is supplied by the client and fetched from the
     * server; consider restricting which hosts may be reached.
     * NOTE(review): the body is read as a String by BasicResponseHandler and
     * re-encoded as UTF-8, which looks lossy for binary documents - confirm.
     *
     * @param name name for the new source
     * @param url address to download
     * @return ID of the created source
     * @throws HttpResponseException if the server answers with an error status
     * @throws IOException if the download, conversion or storage fails
     * @throws XMLStreamException declared by writeSourceData
     * @throws DuplicateSourceException if a source with the final name already
     *         exists in the workspace
     */
    private Long scrapeExternalUrl(final String name, final String url) throws HttpResponseException, IOException, XMLStreamException, DuplicateSourceException {

        // get the contents of the URL
        String response = null;
        HttpClient httpClient = new DefaultHttpClient();
        try {
            HttpProtocolParams.setUserAgent(httpClient.getParams(), "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_5) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1312.57 Safari/537.17");
            HttpGet get = new HttpGet(url);
            ResponseHandler<String> responseHandler = new BasicResponseHandler();
            response = httpClient.execute(get, responseHandler);
        } finally {
            // this client is used for a single request; release its connections
            httpClient.getConnectionManager().shutdown();
        }

        File srcFile = null;
        MediaType mediaType = null;
        File rawFile = File.createTempFile("url", "dat");
        InputStream rawStream = null;
        try {
            // store the response in rawFile
            OutputStreamWriter osw = new OutputStreamWriter(new FileOutputStream(rawFile), "UTF-8");
            try {
                IOUtils.write(response, osw);
                // close explicitly so a failed flush is reported instead of swallowed
                osw.close();
            } finally {
                IOUtils.closeQuietly(osw);
            }

            // prepare source for addition to library
            mediaType = ConversionUtils.determineMediaType(rawFile);
            rawStream = new FileInputStream(rawFile);
            if (MediaType.TEXT_XML.isCompatible(mediaType) ||
                MediaType.APPLICATION_XML.isCompatible(mediaType) ||
                MediaType.TEXT_HTML.isCompatible(mediaType) ||
                MediaType.TEXT_PLAIN.isCompatible(mediaType)) {

                srcFile = EncodingUtils.fixEncoding(rawStream);
                if (MediaType.TEXT_HTML.isCompatible(mediaType)) {
                    HtmlUtils.strip(srcFile);
                }
            } else {
                mediaType = MediaType.TEXT_PLAIN;
                try {
                    srcFile = ConversionUtils.convertToText(rawStream);
                } catch (Exception e) {
                    // keep the original failure attached as the cause
                    throw new IOException(e.getMessage(), e);
                }
            }
        } finally {
            // the raw download is only an intermediate; close the stream before
            // deleting so the delete also succeeds on platforms that lock open files
            IOUtils.closeQuietly(rawStream);
            rawFile.delete();
        }

        // Convert media type to Source Type
        Source.Type srcType = Source.Type.TXT;
        if (MediaType.TEXT_XML.isCompatible(mediaType) ||
            MediaType.APPLICATION_XML.isCompatible(mediaType) ) {
            srcType = Source.Type.XML;
        } else if (MediaType.TEXT_HTML.isCompatible(mediaType)) {
            srcType = Source.Type.HTML;
        }

        // dump results and enforce limits / uniqueness
        return writeSourceData(srcFile, name, srcType);
    }

    /**
     * Make sure a source name carries some kind of extension. Names that already
     * end (case-insensitively) with .txt, .html, .htm, .xml or .wiki are returned
     * unchanged; otherwise the lower-cased source type is appended as extension.
     *
     * @param name requested source name
     * @param srcType type of the source
     * @return the name, with an extension appended when it had none of the above
     */
    private String appendExtension(String name, org.juxtasoftware.model.Source.Type srcType) {
        // Locale.ROOT keeps case mapping independent of the server's default locale;
        // e.g. under a Turkish locale "WIKI".toLowerCase() does not yield "wiki"
        final String lcName = name.toLowerCase(java.util.Locale.ROOT);
        final String[] knownExtensions = { ".txt", ".html", ".htm", ".xml", ".wiki" };
        for (String ext : knownExtensions) {
            if (lcName.endsWith(ext)) {
                return name;
            }
        }
        return name + "." + srcType.toString().toLowerCase(java.util.Locale.ROOT);
    }

    private class SourcesSerializer implements JsonSerializer<Source> {
        private final DateFormat format = new SimpleDateFormat("MM/dd/yyyy HH:mm:ss");

        @Override
        public JsonElement serialize(Source src, Type typeOfSrc, JsonSerializationContext context) {

            JsonObject obj = new JsonObject();
            obj.add("id", new JsonPrimitive(src.getId()));
            obj.add("name", new JsonPrimitive(src.getName()));
            obj.add("type", new JsonPrimitive(src.getType().toString()));
            obj.add("length", new JsonPrimitive(src.getText().getLength()));
            obj.add("created", new JsonPrimitive(this.format.format(src.getCreated())));
            return obj;
        }

    }
    
    /**
     * Signals that a source with the requested name already exists in the
     * workspace. Carries the conflicting name, which is also included in the
     * exception message so it shows up in logs and stack traces.
     */
    private static class DuplicateSourceException extends Exception {
        private static final long serialVersionUID = 8890164370720970377L;
        private final String sourceName;

        /**
         * @param name the source name that is already taken
         */
        public DuplicateSourceException(String name) {
            super("Source '" + name + "' already exists");
            this.sourceName = name;
        }

        /**
         * @return the source name that is already taken
         */
        public String getSourceName() {
            return this.sourceName;
        }
    }
}