package org.meaningfulweb.servlet;

import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;

import javax.servlet.http.HttpServletRequest;
import javax.servlet.http.HttpServletResponse;

import org.apache.commons.codec.binary.Base64;
import org.apache.commons.lang.StringEscapeUtils;
import org.apache.commons.lang.StringUtils;
import org.meaningfulweb.cext.Extract;
import org.meaningfulweb.cext.ExtractForm;
import org.meaningfulweb.cext.HtmlExtractor;
import org.meaningfulweb.util.http.HttpClientService;
import org.meaningfulweb.util.http.HttpException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.beans.factory.annotation.Qualifier;
import org.springframework.stereotype.Controller;
import org.springframework.web.bind.annotation.RequestBody;
import org.springframework.web.bind.annotation.RequestMapping;
import org.springframework.web.bind.annotation.RequestMethod;
import org.springframework.web.bind.annotation.ResponseBody;

@Controller
@RequestMapping("/api/content/extract")
public class HtmlExtractorController {

  private final static Logger LOG = LoggerFactory
    .getLogger(HtmlExtractorController.class);

  @Autowired
  private HtmlExtractor htmlExtractor;

  @Autowired
  @Qualifier("httpClientService")
  private HttpClientService httpClientService;

  // builds an error map from an exception, including the http status code
  // when the failure came from the http client
  private Map getErrors(Exception e) {
    Map<String, String> errMap = new LinkedHashMap<String, String>();
    if (e instanceof HttpException) {
      HttpException he = (HttpException)e;
      errMap.put("statusCode", String.valueOf(he.getStatusCode()));
    }
    errMap.put("message", e.getMessage());
    return errMap;
  }

  // runs the configured pipelines and components over the raw content and
  // returns whatever the extractor pulled out
  private Map extractContent(byte[] content, Set<String> pipelines,
    Set<String> components, Map<String, Object> config,
    Map<String, Object> metadata) {

    Map output = new HashMap();
    if (content != null && content.length > 0) {
      Extract extract = new Extract(content);
      extract.setPipelines(pipelines);
      extract.setComponents(components);
      extract.setConfig(config);
      extract.setMetadata(metadata);
      try {
        htmlExtractor.extract(extract);
        output = extract.getExtracted();
      }
      catch (Exception e) {
        LOG.error("Error extracting content", e);
      }
    }
    return output;
  }

  @RequestMapping(value = "/url.json", method = RequestMethod.POST)
  public @ResponseBody Map extractContentFromUrl(
    @RequestBody ExtractForm extractForm, HttpServletRequest request,
    HttpServletResponse response) {

    Map errors = new HashMap();

    // check for a blank url
    String url = extractForm.getUrl();
    if (StringUtils.isBlank(url)) {
      errors.put("url.required", "Url is required and cannot be blank");
    }

    // check for no processors
    List<String> components = extractForm.getComponents();
    boolean hasComponents = (components != null && components.size() > 0);
    List<String> pipelines = extractForm.getPipelines();
    boolean hasPipelines = (pipelines != null && pipelines.size() > 0);
    if (!hasComponents && !hasPipelines) {
      errors.put("processors.required",
        "One or more components or pipelines must be specified to process "
          + "content");
    }

    // return errors if any exist
    if (errors.size() > 0) {
      return errors;
    }

    // add the url to the metadata
    extractForm.getMetadata().put("url", url);

    // get the url content
    Map output = new HashMap();
    byte[] content;
    try {
      content = httpClientService.get(url);
    }
    catch (Exception e) {
      Map<String, String> errMap = getErrors(e);
      errors.put("errors", errMap);
      return errors;
    }

    // return an empty map if no content
    if (content == null || content.length == 0) {
      return output;
    }

    // only copy component and pipeline lists that were actually supplied
    HashSet<String> componentSet = new HashSet<String>();
    if (hasComponents) {
      componentSet.addAll(components);
    }
    HashSet<String> pipelineSet = new HashSet<String>();
    if (hasPipelines) {
      pipelineSet.addAll(pipelines);
    }

    // process the content and return anything extracted
    return extractContent(content, pipelineSet, componentSet,
      extractForm.getConfig(), extractForm.getMetadata());
  }

  @RequestMapping(value = "/bytes.json", method = RequestMethod.POST)
  public @ResponseBody Map extractContentFromBytes(
    @RequestBody ExtractForm extractForm, HttpServletRequest request,
    HttpServletResponse response) {

    Map errors = new HashMap();

    // check for blank content
    String content = extractForm.getContent();
    if (StringUtils.isBlank(content)) {
      errors.put("content.required", "A post of the content bytes "
        + "is required, either utf-8 or base64 encoded.");
    }

    // check for no processors
    List<String> components = extractForm.getComponents();
    boolean hasComponents = (components != null && components.size() > 0);
    List<String> pipelines = extractForm.getPipelines();
    boolean hasPipelines = (pipelines != null && pipelines.size() > 0);
    if (!hasComponents && !hasPipelines) {
      errors.put("processors.required",
        "One or more components or pipelines must be specified to process "
          + "content");
    }

    // return errors if any exist
    if (errors.size() > 0) {
      return errors;
    }

    Map output = new HashMap();
    byte[] contentBytes = content.getBytes();
    if (Base64.isArrayByteBase64(contentBytes)) {
      // base64 decode the content into bytes, which should be the document
      // content to extract from
      try {
        Base64 base64 = new Base64();
        contentBytes = base64.decode(contentBytes);
        contentBytes = StringEscapeUtils.unescapeHtml(new String(contentBytes))
          .getBytes();
      }
      catch (Exception e) {
        Map<String, String> errMap = getErrors(e);
        errors.put("errors", errMap);
        return errors;
      }
    }

    // return an empty map if no content
    if (contentBytes == null || contentBytes.length == 0) {
      return output;
    }

    // only copy component and pipeline lists that were actually supplied
    HashSet<String> componentSet = new HashSet<String>();
    if (hasComponents) {
      componentSet.addAll(components);
    }
    HashSet<String> pipelineSet = new HashSet<String>();
    if (hasPipelines) {
      pipelineSet.addAll(pipelines);
    }

    // process the content and return anything extracted
    return extractContent(contentBytes, pipelineSet, componentSet,
      extractForm.getConfig(), extractForm.getMetadata());
  }

  public void setHtmlExtractor(HtmlExtractor htmlExtractor) {
    this.htmlExtractor = htmlExtractor;
  }

  public void setHttpClientService(HttpClientService httpClientService) {
    this.httpClientService = httpClientService;
  }
}
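
/*
 * Example request (a sketch, assuming default Spring JSON binding): the body
 * of a POST to /api/content/extract/url.json is bound onto ExtractForm, so the
 * field names below mirror the getters used above (url, components, pipelines,
 * config, metadata). The component name "extractText" and the url are
 * hypothetical placeholders, not identifiers defined by this class.
 *
 *   POST /api/content/extract/url.json
 *   Content-Type: application/json
 *
 *   {
 *     "url": "http://example.com/article.html",
 *     "components": ["extractText"],
 *     "pipelines": [],
 *     "config": {},
 *     "metadata": {}
 *   }
 *
 * Validation failures come back as a map of error keys to messages, e.g.
 * {"url.required": "Url is required and cannot be blank"}; on success the
 * response is the map of extracted values.
 */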