package info.aaronland.extruder;
import java.io.BufferedInputStream;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.lang.StringBuilder;
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;

import javax.ws.rs.Consumes;
import javax.ws.rs.GET;
import javax.ws.rs.POST;
import javax.ws.rs.Path;
import javax.ws.rs.Produces;
import javax.ws.rs.QueryParam;
import javax.ws.rs.core.MediaType;
import javax.ws.rs.core.Response;
import javax.ws.rs.core.Response.Status;

import com.sun.jersey.core.header.FormDataContentDisposition;
import com.sun.jersey.multipart.FormDataBodyPart;
import com.sun.jersey.multipart.FormDataMultiPart;

import org.apache.commons.io.FilenameUtils;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.sax.BodyContentHandler;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.xml.sax.ContentHandler;

import info.aaronland.extruder.Document;
import info.aaronland.extruder.DocumentView;
/**
 * JAX-RS resource mounted at {@code /tika} that runs a document through
 * Apache Tika and answers with the extracted text wrapped in a
 * {@link DocumentView}.
 *
 * <p>Two entry points are offered: {@code GET /tika?url=...} fetches the
 * document from a URL, and {@code POST /tika} accepts a multipart upload in
 * a form field named {@code file}.</p>
 */
@Path(value = "/tika")
@Produces({MediaType.TEXT_HTML + "; charset=UTF-8", MediaType.APPLICATION_JSON})
public class TikaResource {

    private static final Logger LOGGER = LoggerFactory.getLogger(TikaResource.class);

    /** Subtype used in generated titles when the content type is missing or unparseable. */
    private static final String UNKNOWN_TYPE = "mystery";

    /**
     * Fetches the document at {@code uri} and extracts its text.
     *
     * <p>NOTE(review): the URL comes straight from the query string and is
     * opened without any scheme or host check, so every protocol that
     * {@link URL} can open is reachable from here. Consider restricting this
     * to http/https if the service is exposed to untrusted callers.</p>
     *
     * @param uri the address of the document to fetch
     * @return 200 with a {@link DocumentView} entity on success; 406 when
     *         {@code uri} cannot be turned into a {@link URL}; 500 when the
     *         document cannot be fetched or parsed
     */
    @GET
    public Response extrudeThisUrl(@QueryParam("url") String uri){

        URL url;

        try {
            url = new URL(uri);
        }
        catch (Exception e){
            // A missing or malformed "url" parameter ends up here. The 406
            // status is kept for compatibility with existing clients.
            return Response.status(Response.Status.NOT_ACCEPTABLE).entity(e.toString()).build();
        }

        InputStream remote;

        try {
            URLConnection conn = url.openConnection();
            remote = new BufferedInputStream(conn.getInputStream());
        }
        catch (Exception e){
            return serverError(e);
        }

        // The parser consumes the stream but leaves closing it to the
        // caller, so it is released here whether or not parsing succeeds.
        try {
            return extrudeToResponse(remote);
        }
        finally {
            closeQuietly(remote);
        }
    }

    /**
     * Extracts the text of a document uploaded as multipart form data.
     *
     * @param formParams the multipart request body; the document is read
     *        from the field named {@code file}
     * @return 200 with a {@link DocumentView} entity on success; 400 when
     *         the {@code file} field is absent; 500 when the upload cannot
     *         be read or parsed
     */
    @POST
    @Consumes(MediaType.MULTIPART_FORM_DATA)
    public Response extrudeThisFile(FormDataMultiPart formParams){

        FormDataBodyPart part = (formParams == null) ? null : formParams.getField("file");

        if (part == null){
            return Response.status(Response.Status.BAD_REQUEST)
                .entity("Missing required multipart field: file")
                .build();
        }

        // The upload is copied into memory before parsing because handing
        // the request stream to Tika directly made it complain that the
        // stream was already closed. (20130831/straup)
        InputStream upload = null;
        ByteArrayInputStream buffer;

        try {
            upload = part.getValueAs(InputStream.class);
            buffer = new ByteArrayInputStream(readFully(upload));
        }
        catch (Exception e){
            return serverError(e);
        }
        finally {
            closeQuietly(upload);
        }

        return extrudeToResponse(buffer);
    }

    /**
     * Parses {@code in} and wraps the outcome in a response. Shared by both
     * endpoints so they report success and failure identically.
     *
     * @param in the document bytes; not closed by this method
     * @return 200 with the rendered view, or 500 carrying the failure text
     */
    private Response extrudeToResponse(InputStream in){

        DocumentView view;

        try {
            Document doc = extrudeThis(in);
            view = new DocumentView(doc);
        }
        catch (Exception e){
            return serverError(e);
        }

        return Response.status(Response.Status.OK).entity(view).build();
    }

    /**
     * Logs {@code e} and builds the 500 response whose entity is
     * {@code e.toString()}.
     */
    private static Response serverError(Exception e){
        LOGGER.error("Failed to extrude document", e);
        return Response.status(Response.Status.INTERNAL_SERVER_ERROR).entity(e.toString()).build();
    }

    /**
     * Reads {@code in} to exhaustion and returns everything that was read.
     *
     * @throws IOException if reading fails
     */
    private static byte[] readFully(InputStream in) throws IOException {

        ByteArrayOutputStream out = new ByteArrayOutputStream();
        byte[] chunk = new byte[8192];
        int read;

        while ((read = in.read(chunk)) != -1) {
            out.write(chunk, 0, read);
        }

        return out.toByteArray();
    }

    /** Closes {@code in} if it is non-null; a failure to close is logged and otherwise ignored. */
    private static void closeQuietly(InputStream in){

        if (in == null){
            return;
        }

        try {
            in.close();
        }
        catch (IOException ignored){
            // Nothing useful can be done about a failed close at this point.
            LOGGER.debug("Failed to close stream because " + ignored.toString());
        }
    }

    // TO DO: figure out how to make this return HTML instead of text
    // (20130831/straup)
    // I have no idea how that would square with the Document class...
    // (20130901/straup)

    /**
     * Runs Tika's auto-detecting parser over {@code buffer} and packages the
     * unwrapped body text and a title into a {@link Document}.
     *
     * <p>The title is taken from the parsed metadata. When that is missing
     * or blank a placeholder of the form
     * {@code "Untitled <SUBTYPE> Document #<millis>"} is generated from the
     * detected content type.</p>
     *
     * @param buffer the document bytes; consumed but not closed here
     * @return the extracted document
     * @throws RuntimeException wrapping whatever the parser threw
     */
    private Document extrudeThis(InputStream buffer){

        // http://stackoverflow.com/questions/6144708/apache-tika-and-character-limit-when-parsing-documents
        // NOTE(review): the no-argument BodyContentHandler applies Tika's
        // default write limit, so very long documents presumably fail to
        // parse rather than being truncated. Pass an explicit limit if
        // longer documents must be supported - confirm against the Tika
        // version in use.
        Parser parser = new AutoDetectParser();
        ContentHandler handler = new BodyContentHandler();
        Metadata metadata = new Metadata();

        try {
            parser.parse(buffer, handler, metadata, new ParseContext());
        }
        catch (Exception e){
            throw new RuntimeException(e);
        }

        String text = unwrapText(handler.toString());

        // http://www.celinio.net/techblog/?p=1295
        String title = metadata.get(Metadata.TITLE);

        if (title == null || title.trim().length() == 0){
            String type = subtypeOf(metadata.get(Metadata.CONTENT_TYPE));
            // Locale.ROOT keeps the upper-casing independent of the server's locale.
            title = "Untitled " + type.toUpperCase(Locale.ROOT) + " Document #" + System.currentTimeMillis();
        }

        return new Document(text, title);
    }

    /**
     * Returns the subtype of a content type string, e.g. {@code "pdf"} for
     * {@code "application/pdf"}. Parameters such as {@code "; charset=UTF-8"}
     * are dropped first.
     *
     * @param contentType the raw content type, possibly {@code null}
     * @return the subtype, or {@code "mystery"} when none can be determined
     */
    private static String subtypeOf(String contentType){

        if (contentType != null){

            int semi = contentType.indexOf(';');
            String mime = (semi >= 0) ? contentType.substring(0, semi) : contentType;
            String[] parts = mime.trim().split("/");

            if (parts.length > 1 && parts[1].trim().length() > 0){
                return parts[1].trim();
            }
        }

        LOGGER.info("Failed to parse content type because it is missing or malformed: " + contentType);
        return UNKNOWN_TYPE;
    }

    // Not awesome. No. (20130903/straup)

    /**
     * Re-flows hard-wrapped text into one line per paragraph.
     *
     * <p>Lines are trimmed; consecutive non-blank lines are joined with a
     * single space into one paragraph, and one or more blank lines end the
     * current paragraph. Each paragraph in the result is followed by
     * {@code "\n"}. Any of {@code \r\n}, {@code \r} or {@code \n} counts as
     * a line break, regardless of the platform the service runs on.</p>
     *
     * @param text the text to re-flow; {@code null} is treated as empty
     * @return the re-flowed text, empty when there are no non-blank lines
     */
    private static String unwrapText(String text){

        if (text == null){
            return "";
        }

        StringBuilder out = new StringBuilder(text.length());
        StringBuilder para = new StringBuilder();

        for (String ln : text.split("\\r\\n|\\r|\\n")){

            ln = ln.trim();

            if (ln.length() == 0){
                // A blank line closes the paragraph collected so far.
                if (para.length() > 0){
                    out.append(para).append("\n");
                    para.setLength(0);
                }
                continue;
            }

            // Separate joined lines with a space, but never lead with one.
            if (para.length() > 0){
                para.append(' ');
            }

            para.append(ln);
        }

        if (para.length() > 0){
            out.append(para).append("\n");
        }

        return out.toString();
    }
}