package org.meaningfulweb.api;
import java.io.BufferedInputStream;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.net.URI;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import org.apache.commons.io.IOUtils;
import org.apache.http.HttpEntity;
import org.apache.http.HttpHost;
import org.apache.http.HttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpUriRequest;
import org.apache.http.protocol.BasicHttpContext;
import org.apache.http.protocol.ExecutionContext;
import org.apache.http.protocol.HttpContext;
import org.apache.tika.detect.Detector;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.txt.TXTParser;
import org.meaningfulweb.cext.Extract;
import org.meaningfulweb.cext.HtmlContentProcessorFactory;
import org.meaningfulweb.cext.HtmlExtractor;
import org.meaningfulweb.detector.DetectorFactory;
import org.meaningfulweb.util.ImageUtil;
import org.meaningfulweb.util.URIUtils;
import org.meaningfulweb.util.URLUtil;
import org.meaningfulweb.util.http.HttpClientFactory;
import org.meaningfulweb.util.http.HttpClientService;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;
/**
 * Builds a {@link MeaningfulWebObject} describing a piece of web content.
 *
 * <p>The content is either fetched over HTTP ({@link #extractFromUrl(String)})
 * or supplied as an already-open stream
 * ({@link #extract(String, InputStream, Metadata)}). Its media type is
 * detected and, depending on the top-level type, the object's meta map is
 * filled from the URL itself (image/video), from a Tika parser (plain text
 * and application documents) or from the HTML extraction pipeline (HTML).
 */
public class MetaContentExtractor {

  private static final Logger logger = LoggerFactory.getLogger(MetaContentExtractor.class);

  /** Meta key under which the final (post-redirect) URL is stored. */
  private static final String RESOLVED_URL = "resolved-url";
  /** Meta key under which the HTTP status code is stored. */
  private static final String STATUS_CODE = "status-code";
  /** Bodies of this many bytes or more are rejected. */
  private static final long MAX_CONTENT_LEN = 500000;
  private static final int DEFAULT_BUFFER_SIZE = 1024 * 4;

  private final Detector _detector;
  private final Parser _autoParser;
  private final TXTParser _txtParser;
  private final HtmlContentProcessorFactory processorFactory;
  private final HtmlExtractor htmlExtractor;

  /**
   * Creates the detector, the Tika parsers and the HTML extractor, the latter
   * configured with a single "meaningfulweb" component.
   *
   * @throws Exception if any of the collaborators cannot be constructed
   */
  public MetaContentExtractor() throws Exception {
    _detector = DetectorFactory.getInstance().buildDetector();
    _autoParser = new AutoDetectParser(_detector);
    _txtParser = new TXTParser();

    // the config file and the url
    // TODO: should refactor here to take some sort of configuration object
    String jsonConfig = "{\"components\": [{"+
        "\"name\": \"meaningfulweb\","+
        "\"class\": \"org.meaningfulweb.cext.processors.MeaningfulwebCompositeProcessor\"}]}";
    processorFactory = new HtmlContentProcessorFactory(jsonConfig);
    htmlExtractor = new HtmlExtractor();
    htmlExtractor.setProcessorFactory(processorFactory);
  }

  /**
   * Reads {@code in} until end of stream, refusing bodies that reach
   * {@link #MAX_CONTENT_LEN} bytes.
   *
   * @param in stream to drain; it is not closed here
   * @return all bytes read from the stream
   * @throws IOException if reading fails or the size limit is reached
   */
  private static byte[] readBounded(InputStream in) throws IOException {
    ByteArrayOutputStream collected = new ByteArrayOutputStream();
    byte[] buffer = new byte[DEFAULT_BUFFER_SIZE];
    long totalBytesRead = 0L;
    int count;
    // Stop as soon as the limit is hit; there is no point reading the rest
    // of a body that is going to be rejected anyway.
    while (totalBytesRead < MAX_CONTENT_LEN && (count = in.read(buffer)) >= 0) {
      if (count > 0) {
        totalBytesRead += count;
        collected.write(buffer, 0, count);
      }
    }
    if (totalBytesRead >= MAX_CONTENT_LEN) {
      throw new IOException("content too large");
    }
    return collected.toByteArray();
  }

  /**
   * Runs the "meaningfulweb" HTML component over the page body.
   *
   * <p>Failures inside the extractor itself are logged and yield an empty
   * map (best effort); failures while reading the stream propagate.
   *
   * @param url URL of the page, handed to the extractor as metadata
   * @param in page body
   * @return the values produced by the extractor, possibly empty
   * @throws Exception if the body cannot be read or is too large
   */
  private Map<String,Object> extractHTMLContent(String url, InputStream in) throws Exception {
    // create base config
    Map<String, Object> config = new HashMap<String, Object>();
    config.put("perComponentDOM", false);
    config.put("perPipelineDOM", true);

    // create base metadata
    Map<String, Object> metadata = new HashMap<String, Object>();
    metadata.put("url", url);

    // create the pipelines and components to run
    List<String> components = new ArrayList<String>();
    components.add("meaningfulweb");

    Extract extract = new Extract(readBounded(in));
    extract.getComponents().addAll(components);
    extract.setConfig(config);
    extract.setMetadata(metadata);

    Map<String,Object> output = new HashMap<String,Object>();
    try {
      htmlExtractor.extract(extract);
      output = extract.getExtracted();
    }
    catch (Exception e) {
      logger.error("Error extracting content", e);
    }
    return output;
  }

  /**
   * Null-safe trim.
   *
   * @param str string to trim, may be {@code null}
   * @return the trimmed string, or the empty string for {@code null}
   */
  static String trim(String str) {
    return str == null ? "" : str.trim();
  }

  /**
   * Parses {@code in} with {@code parser}, discarding the document body, and
   * copies every metadata property the parser populated into {@code ogmeta}.
   * For each property only the value returned by {@code Metadata.get} is
   * copied.
   */
  private static void parseMeta(Parser parser, InputStream in, Metadata meta,
      Map<String,String> ogmeta) throws IOException, SAXException, TikaException {
    parser.parse(in, new DefaultHandler(), meta, new ParseContext());
    for (String propname : meta.names()) {
      ogmeta.put(propname, meta.get(propname));
    }
  }

  /**
   * Copies {@code from[fromKey]} into {@code to[toKey]} as a string when the
   * source value is present and non-null.
   */
  private static void copyIfPresent(Map<String,Object> from, String fromKey,
      Map<String,String> to, String toKey) {
    Object value = from.get(fromKey);
    if (value != null) {
      to.put(toKey, String.valueOf(value));
    }
  }

  /**
   * Picks a placeholder image for an {@code application/*} subtype by
   * substring matching.
   *
   * <p>NOTE(review): the matches are deliberately loose and order dependent;
   * for instance any subtype containing "doc" (which includes subtypes
   * containing "officedocument") is treated as a Word document because that
   * test runs before the Excel and PowerPoint ones.
   *
   * @return the image URL, or {@code null} when no rule matches
   */
  private static String iconForApplicationSubtype(String subType) {
    if (subType.contains("pdf")) {
      return ImageUtil.getPDFImage();
    }
    if (subType.contains("ps") || subType.contains("postscript")) {
      return ImageUtil.getPSImage();
    }
    if (subType.contains("word") || subType.contains("doc")) {
      return ImageUtil.getWordImage();
    }
    // "xls" (not "xsl"): the latter matched XSLT stylesheets instead of spreadsheets.
    if (subType.contains("excel") || subType.contains("xls")) {
      return ImageUtil.getExcelImage();
    }
    if (subType.contains("powerpoint") || subType.contains("ppt")) {
      return ImageUtil.getPowerpointImage();
    }
    return null;
  }

  /**
   * Detects the media type of {@code in} and fills a new
   * {@link MeaningfulWebObject} accordingly.
   *
   * <ul>
   *   <li>image: the URL is used as image, title and url;</li>
   *   <li>video: a placeholder image plus the URL as title and url;</li>
   *   <li>text/plain: metadata produced by the text parser;</li>
   *   <li>text/html: values produced by the HTML extraction component;</li>
   *   <li>application: metadata from the auto-detecting parser plus a
   *       placeholder image when the subtype is recognised;</li>
   *   <li>anything else: logged, only "content-type" is set.</li>
   * </ul>
   *
   * @param url URL the content came from
   * @param in content stream; it is not closed by this method
   * @param meta Tika metadata used as detection hints and parser output
   * @return the populated object, never {@code null}
   * @throws Exception if detection, parsing or reading the stream fails
   */
  public MeaningfulWebObject extract(String url, InputStream in, Metadata meta) throws Exception {
    MeaningfulWebObject obj = new MeaningfulWebObject();
    Map<String,String> ogMeta = obj.getMeta();

    // Detectors peek at the head of the stream via mark/reset. A stream
    // without mark support would lose those bytes for the parsers that run
    // afterwards, so give it a buffering wrapper.
    InputStream stream = in;
    if (stream != null && !stream.markSupported()) {
      stream = new BufferedInputStream(stream);
    }

    MediaType type = _detector.detect(stream, meta);
    ogMeta.put("content-type", type.toString());

    String primaryType = type.getType();
    if ("image".equals(primaryType)) {
      ogMeta.put("image", url);
      ogMeta.put("title", url);
      ogMeta.put("url", url);
    }
    else if ("video".equals(primaryType)) {
      ogMeta.put("image", ImageUtil.getVideoImage());
      ogMeta.put("title", url);
      ogMeta.put("url", url);
    }
    else if ("text".equals(primaryType)) {
      ogMeta.put("type", "text");
      String subtype = type.getSubtype();
      if ("plain".equals(subtype)) {
        parseMeta(_txtParser, stream, meta, ogMeta);
      }
      else if ("html".equals(subtype)) {
        Map<String,Object> extracted = extractHTMLContent(url, stream);
        ogMeta.put("url", url);

        // Well-known values are published under short keys ...
        copyIfPresent(extracted, "meaningfulweb.title", ogMeta, "title");
        copyIfPresent(extracted, "meaningfulweb.description", ogMeta, "description");
        copyIfPresent(extracted, "meaningfulweb.image", ogMeta, "image");
        copyIfPresent(extracted, "meaningfulweb.fullimage", ogMeta, "fullimage");
        copyIfPresent(extracted, "meaningfulweb.text", ogMeta, "content");

        // ... and every extracted value is also kept under its original key.
        Set<Entry<String,Object>> entries = extracted.entrySet();
        for (Entry<String,Object> entry : entries) {
          Object val = entry.getValue();
          if (val != null) {
            ogMeta.put(entry.getKey(), String.valueOf(val));
          }
        }
      }
    }
    else if ("application".equals(primaryType)) {
      parseMeta(_autoParser, stream, meta, ogMeta);
      String imgUrl = iconForApplicationSubtype(type.getSubtype());
      if (imgUrl != null) {
        ogMeta.put("image", imgUrl);
      }
    }
    else {
      logger.error("unable to handle media type: "+type);
    }
    return obj;
  }

  /**
   * Works out the URL that was finally requested, using the request and
   * target host recorded in the execution context.
   *
   * <p>The request URI is used on its own when it is already absolute;
   * otherwise it is appended to the target host. If the context does not
   * carry the needed attributes the originally requested URL is returned.
   */
  private static String resolveUrl(String requestedUrl, HttpContext context) {
    HttpUriRequest currentReq = (HttpUriRequest) context.getAttribute(
        ExecutionContext.HTTP_REQUEST);
    HttpHost currentHost = (HttpHost) context.getAttribute(
        ExecutionContext.HTTP_TARGET_HOST);
    if (currentReq == null || currentReq.getURI() == null) {
      return requestedUrl;
    }
    URI requestUri = currentReq.getURI();
    if (requestUri.isAbsolute()) {
      // Prepending the host here would duplicate scheme and authority.
      return requestUri.toString();
    }
    if (currentHost == null) {
      return requestedUrl;
    }
    return currentHost.toURI() + requestUri;
  }

  /**
   * Fetches {@code url} and extracts its content.
   *
   * <p>An invalid URI is passed through {@code URIUtils.fixInvalidUri} before
   * the request is made. The body is only processed when the status code is
   * below 400, the response has an entity and the declared content length is
   * below {@link #MAX_CONTENT_LEN}. Errors raised while processing the body
   * are logged and leave the result without content-derived entries. The
   * resolved URL and status code are always recorded in the meta map, and
   * the domain is set when it can be derived from the resolved URL.
   *
   * @param url URL to fetch
   * @return the extracted object, never {@code null}
   * @throws IOException if the HTTP exchange fails
   */
  public MeaningfulWebObject extractFromUrl(String url) throws IOException {
    HttpClientService httpClient = HttpClientFactory.getHttpClientService();

    // if the uri is invalid try and clean it up a little before fetching
    if (!URIUtils.isValidURI(url)) {
      String fixed = URIUtils.fixInvalidUri(url);
      logger.info("Fixed invalid URI: " + url + " to " + fixed);
      url = fixed;
    }

    HttpGet httpget = new HttpGet(url);
    MeaningfulWebObject obj = new MeaningfulWebObject();
    try {
      HttpContext context = new BasicHttpContext();
      HttpResponse response = httpClient.process(httpget, context);
      String resolvedUrl = resolveUrl(url, context);

      int statusCode = response.getStatusLine().getStatusCode();
      // The entity is absent for responses that carry no body.
      HttpEntity entity = response.getEntity();

      if (statusCode < 400 && entity != null
          && entity.getContentLength() < MAX_CONTENT_LEN) {
        Metadata metadata = new Metadata();
        metadata.add(Metadata.RESOURCE_NAME_KEY, resolvedUrl);
        // The Content-Type header is optional; detection then relies on the
        // resource name and the content itself.
        if (entity.getContentType() != null) {
          metadata.add(Metadata.CONTENT_TYPE, entity.getContentType().getValue());
        }
        try {
          obj = extract(resolvedUrl, entity.getContent(), metadata);
        }
        catch (Exception e) {
          logger.error(e.getMessage(), e);
        }
      }

      Map<String,String> metaMap = obj.getMeta();
      metaMap.put(RESOLVED_URL, resolvedUrl);
      metaMap.put(STATUS_CODE, String.valueOf(statusCode));

      String domain = URLUtil.extractDomainFromUrl(resolvedUrl);
      if (domain != null) {
        obj.setDomain(domain);
      }
    }
    finally {
      // Always release the connection, including when an unchecked
      // exception escapes the block above.
      httpget.abort();
    }
    return obj;
  }

  /** Ad-hoc driver: extracts a single hard-coded URL and prints the result. */
  public static void main(String[] args) throws Exception {
    MetaContentExtractor extractor = new MetaContentExtractor();
    String url = "http://bit.ly/il10nD";
    MeaningfulWebObject obj = extractor.extractFromUrl(url);
    System.out.println(obj);
  }

  /**
   * Ad-hoc driver: re-runs extraction for URLs listed in a log file whose
   * lines start with "skipping url with null title: " and records those that
   * still have no title, or the literal title "null", in an output file.
   * Both file paths are hard-coded.
   */
  public static void main2(String[] args) throws Exception {
    File f = new File("/Users/john/github/meaningfulweb/nulls.txt");
    File outFile = new File("/Users/john/github/meaningfulweb/outfile.txt");
    int offset = "skipping url with null title: ".length();

    BufferedWriter writer = new BufferedWriter(new FileWriter(outFile));
    try {
      BufferedReader reader = new BufferedReader(new FileReader(f));
      try {
        MetaContentExtractor extractor = new MetaContentExtractor();
        int count = 0;
        String line;
        while ((line = reader.readLine()) != null) {
          if (count % 10 == 0) {
            System.out.println(count+" urls processed.");
          }
          count++;
          try {
            line = line.substring(offset);
            MeaningfulWebObject obj = extractor.extractFromUrl(line);
            String title = obj.getTitle();
            if (title == null) {
              writer.write("no title: "+line+"\n");
            }
            else if ("null".equals(title)) {
              writer.write("null title: "+line+"\n");
            }
            writer.flush();
          }
          catch (Exception e) {
            // Keep going: one bad line or URL must not stop the batch.
            logger.error("failed to process: " + line, e);
          }
          // Small pause between requests.
          Thread.sleep(200);
        }
      }
      finally {
        reader.close();
      }
    }
    finally {
      writer.close();
    }
  }
}