package org.meaningfulweb.cext;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import org.apache.commons.lang.StringUtils;
import org.htmlcleaner.TagNode;
import org.jdom.Attribute;
import org.jdom.Element;
import org.meaningfulweb.util.HtmlExtractUtils;
/**
 * Static helpers for reading values out of extraction result maps and for
 * inspecting and cleaning parsed HTML nodes.
 */
public class ExtractUtils {

  /**
   * Returns the value stored under {@code name} as a single string.
   *
   * <p>If the value is a {@link List}, its non-blank elements are joined with
   * a single space; blank and null elements are skipped and never produce a
   * leading, trailing or doubled separator. If the value is a {@link String}
   * it is returned unchanged.
   *
   * @param extracted the map of extracted values, may be null
   * @param name the key to look up
   * @return the string form of the value, an empty string for a list with no
   *         non-blank elements, or null if the map is null, the key is absent
   *         or the value is neither a list nor a string
   * @throws ClassCastException if the value is a list that contains a
   *         non-null element which is not a {@link String}
   */
  public static String getExtractedStringValue(Map<String, Object> extracted,
    String name) {

    if (extracted == null) {
      return null;
    }

    Object current = extracted.get(name);
    if (current instanceof List) {

      // list values are assumed to hold strings only, a non-string element
      // fails with a ClassCastException while iterating
      @SuppressWarnings("unchecked")
      List<String> values = (List<String>)current;

      StringBuilder builder = new StringBuilder();
      for (String curVal : values) {
        if (StringUtils.isNotBlank(curVal)) {
          // separate only values that are actually emitted, so that blank
          // elements at the end of the list do not leave a trailing space
          if (builder.length() > 0) {
            builder.append(" ");
          }
          builder.append(curVal);
        }
      }
      return builder.toString();
    }
    else if (current instanceof String) {
      return (String)current;
    }

    return null;
  }

  /**
   * Copies selected extracted values into a new map under different names.
   *
   * <p>Each entry of {@code extractMapping} maps an extract name to an output
   * field name. An extract name containing {@code [} is treated as a path into
   * nested maps, for example {@code outer[inner][leaf]}; any other name is
   * looked up directly in {@code extracted}. A field is only written when the
   * final key is present in its map, and its value is converted with
   * {@link #getExtractedStringValue(Map, String)}, so the written value is
   * null when the extracted value is neither a list nor a string.
   *
   * @param extracted the extracted values, may be null or empty
   * @param extractMapping extract name to output field name, may be null
   * @return a new insertion ordered map of field name to string value, empty
   *         if either argument is null or nothing matched, never null
   */
  public static Map<String, String> getAndConvertExtractedFields(
    Map<String, Object> extracted, Map<String, String> extractMapping) {

    Map<String, String> output = new LinkedHashMap<String, String>();
    if (extracted == null || extracted.isEmpty() || extractMapping == null) {
      return output;
    }

    for (Entry<String, String> extractEntry : extractMapping.entrySet()) {

      String extractName = extractEntry.getKey();
      String fieldName = extractEntry.getValue();

      // a null extract name cannot match anything
      if (extractName == null) {
        continue;
      }

      // if nested objects otherwise if exact name
      if (extractName.contains("[")) {

        // split on [ the nested object character, the ending ] is removed
        // from each part when it is used
        String[] nameParts = StringUtils.split(extractName, "[");
        if (nameParts == null || nameParts.length == 0) {
          continue;
        }

        // walk down to the map that should hold the final name part, a null
        // parent means the path did not resolve
        Map<String, Object> parent = findNestedParent(extracted, nameParts);
        String lastPart = StringUtils.removeEnd(
          nameParts[nameParts.length - 1], "]");
        if (parent != null && parent.containsKey(lastPart)) {
          output.put(fieldName, getExtractedStringValue(parent, lastPart));
        }
      }
      else if (extracted.containsKey(extractName)) {
        output.put(fieldName, getExtractedStringValue(extracted, extractName));
      }
    }

    return output;
  }

  /**
   * Follows every name part except the last through nested maps and returns
   * the map reached, or null if any intermediate value is not a map.
   */
  private static Map<String, Object> findNestedParent(
    Map<String, Object> extracted, String[] nameParts) {

    Map<String, Object> currentMap = extracted;
    for (int i = 0; i < nameParts.length - 1; i++) {

      String currentPart = StringUtils.removeEnd(nameParts[i], "]");

      // a missing key yields null which is not a map either
      Object nestedObj = currentMap.get(currentPart);
      if (!(nestedObj instanceof Map)) {
        return null;
      }

      @SuppressWarnings("unchecked")
      Map<String, Object> nestedMap = (Map<String, Object>)nestedObj;
      currentMap = nestedMap;
    }
    return currentMap;
  }

  /**
   * Returns the lower cased values of the {@code class} and {@code id}
   * attributes of an element.
   *
   * @param elem the element to inspect, may be null
   * @return a new list holding the class value followed by the id value,
   *         leaving out any attribute that is missing or blank, never null
   */
  public static List<String> getClassAndIdVals(Element elem) {

    List<String> attrVals = new ArrayList<String>();
    if (elem != null) {
      addLowerCasedAttribute(elem, "class", attrVals);
      addLowerCasedAttribute(elem, "id", attrVals);
    }
    return attrVals;
  }

  /**
   * Appends the lower cased value of the named attribute to the list if the
   * attribute exists and its value is not blank.
   */
  private static void addLowerCasedAttribute(Element elem, String attrName,
    List<String> attrVals) {

    Attribute attr = elem.getAttribute(attrName);
    if (attr != null) {
      String attrVal = StringUtils.lowerCase(attr.getValue());
      if (StringUtils.isNotBlank(attrVal)) {
        attrVals.add(attrVal);
      }
    }
  }

  /**
   * Recursively removes from every descendant tag of {@code parent} each
   * attribute whose name is rejected by
   * {@code HtmlExtractUtils.isValidAttribute}.
   *
   * <p>Only child nodes and their descendants are cleaned, the attributes of
   * {@code parent} itself are not examined.
   *
   * @param parent the node whose descendants are cleaned, may be null
   */
  public static void cleanInvalidAttributes(TagNode parent) {

    if (parent == null) {
      return;
    }

    List<?> nodes = parent.getChildren();
    if (nodes == null) {
      return;
    }

    for (int i = 0; i < nodes.size(); i++) {

      Object curChild = nodes.get(i);
      if (!(curChild instanceof TagNode)) {
        continue;
      }
      TagNode curNode = (TagNode)curChild;

      // collect the names first and remove afterwards, presumably so the
      // attribute map is not modified while it is being iterated
      Map<?, ?> attrMap = curNode.getAttributes();
      Set<String> toRemove = new HashSet<String>();
      for (Object attrKey : attrMap.keySet()) {
        String attrName = (String)attrKey;
        if (!HtmlExtractUtils.isValidAttribute(attrName)) {
          toRemove.add(attrName);
        }
      }
      for (String remove : toRemove) {
        curNode.removeAttribute(remove);
      }

      cleanInvalidAttributes(curNode);
    }
  }
}