package org.meaningfulweb.cext.processors; import java.util.Collection; import java.util.HashMap; import java.util.LinkedHashSet; import java.util.List; import java.util.Map; import java.util.Map.Entry; import java.util.Set; import org.apache.commons.lang.StringUtils; import org.jdom.Content; import org.jdom.Document; import org.jdom.Element; import org.meaningfulweb.cext.HtmlContentProcessor; import org.meaningfulweb.opengraph.OGObject; import org.meaningfulweb.opengraph.OpenGraphParser; import org.meaningfulweb.util.JDomUtils; public class OpengraphContentProcessor extends HtmlContentProcessor { private Set<String> names = new LinkedHashSet<String>(); private boolean includeAll = false; private boolean skipUnescapingHtml = false; public Collection<String> getNames() { return names; } public void setNames(Collection<String> names) { if (names != null) { if (names instanceof Set) { this.names = (Set<String>)names; } else { Set<String> newNames = new LinkedHashSet<String>(); newNames.addAll(names); this.names = newNames; } } } public boolean isIncludeAll() { return includeAll; } public void setIncludeAll(boolean includeAll) { this.includeAll = includeAll; } public boolean isSkipUnescapingHtml() { return skipUnescapingHtml; } public void setSkipUnescapingHtml(boolean skipUnescapingHtml) { this.skipUnescapingHtml = skipUnescapingHtml; } @Override public boolean processContent(Document document) { Element rootElem = document.getRootElement(); List<Content> contents = rootElem.getContent(); for (Content child : contents) { // loop through each element in the root, which should be just head, body Element elem = (Element)child; String name = StringUtils.lowerCase(elem.getName()); // get the head element if (name.equalsIgnoreCase("head")) { // create a datamap for open graph processing Map<String, String> datamap = new HashMap<String, String>(); // get all meta tags for the head element List<Element> metatags = JDomUtils.getElementsByName(elem, "meta"); for (Element metaElem : metatags) { String metaIdent = JDomUtils.getAttributeValue(metaElem, "property"); if (metaIdent != null && metaIdent.startsWith("og:")) { metaIdent = StringUtils.trim(StringUtils.lowerCase(metaIdent)); metaIdent = StringUtils.substring(metaIdent, OpenGraphParser.OG_PREFIX_CHAR_COUNT); String metaValue = JDomUtils.getAttributeValue(metaElem, "content"); if (StringUtils.isNotBlank(metaValue)) { datamap.put(metaIdent, metaValue); } } } // parse the elements with opengraph Set<String> fieldsToUnescape = skipUnescapingHtml ? null : OpenGraphParser.UNESCAPE_HTML_FIELDS; OGObject ogObj = OpenGraphParser.parse(datamap,fieldsToUnescape); if (!ogObj.isEmpty()) { Map<String, String> metaMap = ogObj.getMeta(); if (metaMap.size() > 0) { for (Entry<String, String> ogEntry : metaMap.entrySet()) { String ogName = ogEntry.getKey(); if (includeAll || names.contains(ogName)) { addExtractedValue(ogName, ogEntry.getValue()); } } if (includeAll || names.contains("audio")) { Map<String, String> audioMap = ogObj.getAudio(); if (audioMap.size() > 0) { addExtractedValue("audio", audioMap); } } if (includeAll || names.contains("video")) { Map<String, String> videoMap = ogObj.getVideo(); if (videoMap.size() > 0) { addExtractedValue("video", videoMap); } } } } } } return true; } }