package ecologylab.bigsemantics.documentparsers;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.StringWriter;
import java.lang.reflect.Field;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.Stack;
import javax.xml.namespace.QName;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import javax.xml.xpath.XPath;
import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathExpressionException;
import javax.xml.xpath.XPathFactory;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import ecologylab.bigsemantics.actions.SemanticActionHandler;
import ecologylab.bigsemantics.actions.SemanticsConstants;
import ecologylab.bigsemantics.collecting.DocumentDownloadedEventHandler;
import ecologylab.bigsemantics.collecting.DocumentDownloadingMonitor;
import ecologylab.bigsemantics.collecting.DownloadStatus;
import ecologylab.bigsemantics.collecting.LinkedMetadataMonitor;
import ecologylab.bigsemantics.metadata.Metadata;
import ecologylab.bigsemantics.metadata.MetadataBase;
import ecologylab.bigsemantics.metadata.MetadataClassDescriptor;
import ecologylab.bigsemantics.metadata.MetadataFieldDescriptor;
import ecologylab.bigsemantics.metadata.builtins.Document;
import ecologylab.bigsemantics.metadata.scalar.MetadataParsedURL;
import ecologylab.bigsemantics.metadata.scalar.types.MetadataParsedURLScalarType;
import ecologylab.bigsemantics.metadata.scalar.types.MetadataScalarType;
import ecologylab.bigsemantics.metametadata.DefVar;
import ecologylab.bigsemantics.metametadata.FilterLocation;
import ecologylab.bigsemantics.metametadata.MetaMetadata;
import ecologylab.bigsemantics.metametadata.MetaMetadataCollectionField;
import ecologylab.bigsemantics.metametadata.MetaMetadataCompositeField;
import ecologylab.bigsemantics.metametadata.MetaMetadataField;
import ecologylab.bigsemantics.metametadata.MetaMetadataNestedField;
import ecologylab.bigsemantics.metametadata.MetaMetadataRepository;
import ecologylab.bigsemantics.metametadata.MetaMetadataScalarField;
import ecologylab.bigsemantics.metametadata.MetaMetadataValueField;
import ecologylab.bigsemantics.metametadata.ScalarDependencyException;
import ecologylab.bigsemantics.metametadata.ScalarDependencyManager;
import ecologylab.bigsemantics.metametadata.fieldops.FieldOp;
import ecologylab.bigsemantics.metametadata.fieldparsers.FieldParser;
import ecologylab.bigsemantics.metametadata.fieldparsers.FieldParserElement;
import ecologylab.bigsemantics.metametadata.fieldparsers.FieldParserForRegexSplit;
import ecologylab.bigsemantics.namesandnums.DocumentParserTagNames;
import ecologylab.collections.Scope;
import ecologylab.generic.HashMapArrayList;
import ecologylab.generic.ReflectionTools;
import ecologylab.generic.StringTools;
import ecologylab.net.ParsedURL;
import ecologylab.serialization.ClassDescriptor;
import ecologylab.serialization.DeserializationHookStrategy;
import ecologylab.serialization.SIMPLTranslationException;
import ecologylab.serialization.ScalarUnmarshallingContext;
import ecologylab.serialization.SimplTypesScope;
import ecologylab.serialization.XMLTools;
import ecologylab.serialization.formatenums.Format;
import ecologylab.serialization.formatenums.StringFormat;
import ecologylab.serialization.types.ScalarType;
/**
* This is the base class for all the document types which we create using meta-metadata.
*
* @author amathur
*
*/
@SuppressWarnings({"rawtypes", "unchecked"})
public abstract class ParserBase<D extends Document> extends HTMLDOMParser<D>
implements ScalarUnmarshallingContext, SemanticsConstants
{
/** Logger for this class; declared without an access modifier, so it is package-visible. */
static Logger logger = LoggerFactory.getLogger(ParserBase.class);
/** XPath evaluator, created in the constructor and used for the XPath evaluations in this class. */
protected XPath xpath;
/** Location of the document being parsed; assigned at the start of parse(Document, ...). */
protected ParsedURL truePURL;
/** Semantic action handler; a new instance is created at the start of parse(Document, ...). */
protected SemanticActionHandler handler;
public ParserBase()
{
super();
xpath = XPathFactory.newInstance().newXPath();
}
/**
* Returns the URL context for scalar unmarshalling by delegating to purl().
*
* @return whatever purl() returns.
*/
@Override
public ParsedURL purlContext()
{
return purl();
}
/**
* Returns the file context for scalar unmarshalling. This parser never provides one.
*
* @return always null.
*/
@Override
public File fileContext()
{
return null;
}
/**
 * Returns the location recorded in {@code truePURL}, or the superclass value when no location
 * has been recorded yet.
 *
 * @return the recorded location if it is non-null; otherwise the result of the super call.
 */
@Override
public ParsedURL getTruePURL()
{
  if (truePURL == null)
  {
    return super.getTruePURL();
  }
  return truePURL;
}
/**
* Populate the given metadata object using the meta-metadata definition, the DOM, and the
* semantic action handler. Implemented by concrete parsers.
*
* @param document
*          The metadata object to populate.
* @param metaMetadata
*          The meta-metadata guiding the extraction.
* @param dom
*          The DOM to extract information from.
* @param handler
*          The semantic action handler created for this parse.
* @return The populated metadata. The caller, parse(Document, ...), checks the result for null.
* @throws IOException
*           declared for implementations; conditions are implementation-specific.
*/
public abstract Document populateMetadata(Document document,
MetaMetadataCompositeField metaMetadata,
org.w3c.dom.Document dom,
SemanticActionHandler handler) throws IOException;
/**
 * Parses a document into metadata, then runs semantic actions, rebuilds the term vector and
 * links the result.
 *
 * Steps, in order: create the semantic action handler and parameter scope; populate the
 * metadata through {@link #populateMetadata}; optionally look for a favicon; log the result;
 * take semantic actions; rebuild the composite term vector unless it is ignored; try to link
 * the result and register link monitors.
 *
 * @param document
 *          The metadata object to populate; its location becomes {@code truePURL}.
 * @param metaMetadata
 *          The meta-metadata guiding the extraction.
 * @param dom
 *          The DOM to extract from.
 * @return The populated metadata, or null when nothing was populated or when the populated
 *         metadata cannot be serialized to XML.
 * @throws IOException
 *           propagated from {@link #populateMetadata}.
 */
public final Document parse(Document document,
                            MetaMetadataCompositeField metaMetadata,
                            org.w3c.dom.Document dom) throws IOException
{
  // init
  handler = new SemanticActionHandler(getSemanticsScope(), this);
  truePURL = document.getLocation();
  initializeParameterScope(metaMetadata);

  // build the metadata object
  Document resultingMetadata = populateMetadata(document, metaMetadata, dom, handler);
  if (resultingMetadata == null)
  {
    // Guard first: everything below dereferences the result.
    logger.warn("No metadata populated from: " + document.getLocation());
    return null;
  }

  resultingMetadata.setMetadataChanged(true);
  if (this.getSemanticsScope().ifLookForFavicon())
  {
    findFaviconPath(resultingMetadata, xpath);
  }

  try
  {
    logger.info("Metadata parsed from: " + document.getLocation());
    logger.debug(SimplTypesScope.serialize(resultingMetadata, StringFormat.XML).toString());
  }
  catch (Exception e)
  {
    // Metadata that cannot be serialized is treated as a failed parse.
    logger.error("Cannot serialize extracted metadata in XML: " + resultingMetadata, e);
    return null;
  }

  handler.takeSemanticActions(resultingMetadata);

  // make sure termVector is built here
  if (!resultingMetadata.ignoreInTermVector())
  {
    resultingMetadata.rebuildCompositeTermVector();
  }
  else
  {
    logger.debug("Do not build term vector because ignore_in_term_vector is true");
  }

  // linking
  MetaMetadataRepository metaMetaDataRepository =
      getSemanticsScope().getMetaMetadataRepository();
  LinkedMetadataMonitor monitor = metaMetaDataRepository.getLinkedMetadataMonitor();
  monitor.tryLink(metaMetaDataRepository, resultingMetadata);
  monitor.addMonitors(resultingMetadata);

  return resultingMetadata;
}
/**
* Parses the current document by delegating to parse(Document, ...) with this parser's
* document, meta-metadata and DOM. That method (1) populates metadata, (2) takes semantic
* actions, (3) rebuilds the composite term vector, and (4) links the result.
*
* @throws IOException
*/
@Override
public void parse() throws IOException
{
// the returned metadata is intentionally not used here
parse(getDocument(), getMetaMetadata(), getDom());
}
/**
 * Seeds the semantic action variable scope before extraction starts: registers the DOM root
 * under DOCUMENT_ROOT_NODE and evaluates the def_vars declared on the given meta-metadata.
 * Does nothing when the DOM is unavailable.
 *
 * @param metaMetadata
 *          The meta-metadata whose def_vars are evaluated against the DOM root.
 */
private void initializeParameterScope(MetaMetadataCompositeField metaMetadata)
{
  Node rootNode;
  try
  {
    rootNode = getDom();
  }
  catch (IOException e)
  {
    logger.error("Cannot get DOM for document " + truePURL, e);
    return;
  }
  if (rootNode == null)
  {
    return;
  }
  Scope<Object> variables = handler.getSemanticActionVariableMap();
  variables.put(DOCUMENT_ROOT_NODE, rootNode);
  updateDefVars(metaMetadata, rootNode);
}
/**
 * Evaluates every def_var declared on the given nested field and stores the values in the
 * semantic action variable scope. For each def_var the xpaths are tried in order until
 * extractDefVar() reports success.
 *
 * @param nestedField
 *          The field whose def_vars are evaluated.
 * @param contextNode
 *          The default context node for xpath evaluation.
 */
private void updateDefVars(MetaMetadataNestedField nestedField, Node contextNode)
{
  assert (handler != null);
  Scope<Object> variables = handler.getSemanticActionVariableMap();
  ArrayList<DefVar> declaredVars = nestedField.getDefVars();
  if (declaredVars == null)
  {
    return;
  }
  for (DefVar declaredVar : declaredVars)
  {
    List<String> candidateXpaths = declaredVar.getXpaths();
    if (candidateXpaths == null)
    {
      continue;
    }
    for (String candidateXpath : candidateXpaths)
    {
      boolean done = extractDefVar(declaredVar, contextNode, candidateXpath, variables);
      if (done)
      {
        break;
      }
    }
  }
}
/**
 * Resolves one def_var and stores it in the scope under the variable's name.
 *
 * A def_var with a constant value is stored as is. Otherwise the xpath is evaluated against
 * the context node (looked up from the scope by name when the def_var names one), using the
 * declared return type when present and plain string evaluation when not. When no context
 * node is available nothing is stored.
 *
 * @param defVar
 *          The variable definition.
 * @param contextNode
 *          The default context node.
 * @param xpathExpression
 *          The xpath to evaluate.
 * @param params
 *          The scope receiving the value.
 * @return true unless an exception occurred; false after an exception has been logged.
 */
private boolean extractDefVar(DefVar defVar,
                              Node contextNode,
                              String xpathExpression,
                              Scope<Object> params)
{
  final String name = defVar.getName();
  final QName returnType = defVar.getType();
  try
  {
    final String namedContext = defVar.getContextNode();
    final String constant = defVar.getValue();
    if (constant != null)
    {
      // If we have a variable value, we'll just use it!
      params.put(name, constant);
      return true;
    }

    // No constant: evaluate the xpath to obtain the value.
    if (namedContext != null)
    {
      // get the context node from parameters
      contextNode = (Node) params.get(namedContext);
    }
    if (contextNode == null)
    {
      return true;
    }

    final Object evaluated = (returnType == null)
        ? xpath.evaluate(xpathExpression, contextNode) // simple string evaluation
        : xpath.evaluate(xpathExpression, contextNode, returnType);
    params.put(name, evaluated);
    return true;
  }
  catch (Exception e)
  {
    logger.error(String.format(
                     "Error in <def_var>: contextNode=%s, xpath=%s, returnObjectType=%s",
                     contextNode, xpathExpression, returnType),
                 e);
    return false;
  }
}
/**
 * Result holder for extractNestedHelper(): the node or node list found for a nested field,
 * plus any field parser context(s) produced for it.
 *
 * @author quyin
 */
private static class NestedFieldHelper
{
  /** Node located for a composite field. */
  public Node node;

  /** Field parser key-value pairs for a composite field. */
  public Map<String, String> fieldParserContext;

  /** Nodes located for a collection field. */
  public NodeList nodeList;

  /** Field parser key-value pairs, one map per collection element. */
  public List<Map<String, String>> fieldParserContextList;

  /** Cached list size; negative until first computed. */
  private int listSize = -1;

  /**
   * Returns the number of elements: the size of the field parser context list if present,
   * else the node list length, else 0. The value is computed once and cached.
   */
  public int getListSize()
  {
    if (listSize >= 0)
    {
      return listSize;
    }
    int computed = 0;
    if (fieldParserContextList != null)
    {
      computed = fieldParserContextList.size();
    }
    else if (nodeList != null)
    {
      computed = nodeList.getLength();
    }
    listSize = computed;
    return listSize;
  }
}
/**
 * Walks the child fields of a nested meta-metadata field and extracts each one from the sub DOM
 * tree rooted at the context node into the given metadata object.
 *
 * @param mmdField
 *          The guiding meta-metadata field whose children describe what to extract and how.
 * @param metadata
 *          The metadata object receiving the extracted values.
 * @param contextNode
 *          The context node for extraction.
 * @param fieldParserContext
 *          The context of field parsers, if any.
 * @param params
 *          The scope containing variables during parsing and semantic actions.
 * @return false if the field has no children, or if a required child could not be extracted
 *         (extraction stops at that child); true otherwise.
 */
protected boolean recursiveExtraction(MetaMetadataNestedField mmdField,
                                      Metadata metadata,
                                      Node contextNode,
                                      Map<String, String> fieldParserContext,
                                      Scope<Object> params)
{
  HashMapArrayList<String, MetaMetadataField> children = mmdField.getChildrenMap();
  if (children == null || children.isEmpty())
  {
    return false;
  }

  pushSurroundingMmd(params, mmdField);
  updateDefVars(mmdField, contextNode);

  boolean allRequiredExtracted = true;
  synchronized (children)
  {
    // Fields with value dependencies must be visited after the fields they depend on.
    HashMapArrayList<String, MetaMetadataField> ordered = children;
    if (fieldsetContainsFieldsWithDependencies(children))
    {
      try
      {
        ScalarDependencyManager dependencies = new ScalarDependencyManager(children);
        ordered = dependencies.sortFieldSetByDependencies(metadata);
      }
      catch (ScalarDependencyException e)
      {
        logger.error("Error resolving scalar dependencies; "
                     + "proceeding anyway but will result in null values.",
                     e);
      }
    }

    for (MetaMetadataField child : ordered)
    {
      try
      {
        final boolean extracted;
        if (child instanceof MetaMetadataCompositeField)
        {
          extracted = extractComposite((MetaMetadataCompositeField) child,
                                       metadata, contextNode, fieldParserContext, params);
        }
        else if (child instanceof MetaMetadataCollectionField)
        {
          extracted = extractCollection((MetaMetadataCollectionField) child,
                                        metadata, contextNode, fieldParserContext, params);
        }
        else
        {
          // scalar
          extracted = extractScalar((MetaMetadataScalarField) child,
                                    metadata, contextNode, fieldParserContext, params);
        }

        if (child.isRequired() && !extracted)
        {
          allRequiredExtracted = false;
          break;
        }
      }
      catch (Exception e)
      {
        // A failing child is logged and skipped; the remaining children are still tried.
        logger.error("Error extracting " + child, e);
      }
    }
  }

  popSurroundingMmd(params);
  return allRequiredExtracted;
}
/**
 * Pushes the given field onto the stack of surrounding meta-metadata fields kept in the scope,
 * creating and registering the stack on first use.
 */
private void pushSurroundingMmd(Scope<Object> params, MetaMetadataNestedField surroundingMmd)
{
  Stack<MetaMetadataField> mmdStack =
      (Stack<MetaMetadataField>) params.get(SURROUNDING_META_METADATA_STACK);
  if (mmdStack != null)
  {
    mmdStack.push(surroundingMmd);
    return;
  }
  mmdStack = new Stack<MetaMetadataField>();
  params.put(SURROUNDING_META_METADATA_STACK, mmdStack);
  mmdStack.push(surroundingMmd);
}
/**
 * Pops the innermost surrounding meta-metadata field from the stack kept in the scope. Does
 * nothing when the stack is absent or empty.
 */
private void popSurroundingMmd(Scope<Object> params)
{
  Stack<MetaMetadataField> mmdStack =
      (Stack<MetaMetadataField>) params.get(SURROUNDING_META_METADATA_STACK);
  if (mmdStack == null || mmdStack.isEmpty())
  {
    return;
  }
  mmdStack.pop();
}
/**
 * Returns the innermost surrounding meta-metadata field without removing it.
 *
 * @return the top of the surrounding-field stack, or null when the stack is absent or empty.
 */
private MetaMetadataField getCurrentSurroundingField(Scope<Object> params)
{
  Stack<MetaMetadataField> mmdStack =
      (Stack<MetaMetadataField>) params.get(SURROUNDING_META_METADATA_STACK);
  boolean nothingToPeek = (mmdStack == null) || mmdStack.isEmpty();
  return nothingToPeek ? null : mmdStack.peek();
}
/**
 * Tells whether any scalar field in the given field set declares value dependencies (for
 * concatenation or other value semantics) on other fields.
 *
 * @param fieldSet
 *          The set of meta-metadata fields to check.
 * @return true as soon as one scalar field with value dependencies is found; false otherwise.
 */
private boolean fieldsetContainsFieldsWithDependencies(HashMapArrayList<String, MetaMetadataField> fieldSet)
{
  for (MetaMetadataField candidate : fieldSet)
  {
    boolean dependentScalar = candidate instanceof MetaMetadataScalarField
        && ((MetaMetadataScalarField) candidate).hasValueDependencies();
    if (dependentScalar)
    {
      return true;
    }
  }
  return false;
}
/**
 * Builds the context for extracting a nested field: the node or node list the field maps to
 * and, when a field parser is attached, the key-value context(s) produced by that parser.
 *
 * Without a field parser, the field's xpath is evaluated and the node / node list is recorded.
 * With a field parser:
 * <ul>
 * <li>composite: the input string comes from the surrounding field parser context (when a
 * field parser key is set) or from the node found by xpath; the parser yields one key-value
 * map;</li>
 * <li>collection of composites with a for-each-element parser: each node found by xpath is
 * parsed into its own key-value map;</li>
 * <li>other collections: a single input string (by key, or from the first node found) is
 * parsed into a list of key-value maps.</li>
 * </ul>
 * Exceptions are logged and whatever was gathered before the failure is returned.
 *
 * @param mmdField
 *          The nested field being extracted.
 * @param contextNode
 *          The current context node; replaced when the field names its own context node.
 * @param fieldParserContext
 *          The surrounding field parser context, if any.
 * @param params
 *          The scope containing variables during parsing.
 * @return A helper object holding necessary information, or null if no information is obtained.
 */
private NestedFieldHelper extractNestedHelper(MetaMetadataNestedField mmdField,
                                              Node contextNode,
                                              Map<String, String> fieldParserContext,
                                              Scope<Object> params)
{
  // get context node, field parser definition & key: basic information for following
  contextNode = findContextNodeIfNecessary(mmdField, contextNode, params);
  FieldParserElement fieldParserElement = mmdField.getFieldParserElement();
  String fieldParserKey = mmdField.getFieldParserKey();
  boolean hasParserKey = fieldParserKey != null && fieldParserKey.length() > 0;

  // init result
  NestedFieldHelper result = new NestedFieldHelper();
  if (mmdField instanceof MetaMetadata) // this should not happen, currently
  {
    result.node = contextNode;
    return result;
  }

  try
  {
    if (fieldParserElement == null)
    {
      evaluateXpathForField(mmdField, contextNode, params, result);
    }
    else
    {
      FieldParser fieldParser =
          getSemanticsScope().getFieldParserRegistry().get(fieldParserElement.getName());
      if (mmdField instanceof MetaMetadataCompositeField)
      {
        String valueString = null;
        if (hasParserKey)
        {
          valueString = getFieldParserValueByKey(fieldParserContext, fieldParserKey);
        }
        else
        {
          evaluateXpathForField(mmdField, contextNode, params, result);
          if (result.node != null)
          {
            valueString = mmdField.isExtractAsHtml()
                ? getInnerHtml(result.node)
                : result.node.getTextContent();
          }
        }
        if (valueString != null && valueString.length() > 0)
        {
          result.fieldParserContext =
              fieldParser.getKeyValuePairResult(fieldParserElement, valueString.trim());
        }
      }
      else if (mmdField instanceof MetaMetadataCollectionField)
      {
        if (!((MetaMetadataCollectionField) mmdField).isCollectionOfScalars()
            && fieldParserElement.isForEachElement())
        {
          evaluateXpathForField(mmdField, contextNode, params, result);
          result.fieldParserContextList = new ArrayList<Map<String, String>>();
          // The xpath evaluation leaves nodeList null when there is no context node or no
          // xpath; treat that as "no elements" instead of failing with a NullPointerException.
          int nodeCount = (result.nodeList == null) ? 0 : result.nodeList.getLength();
          for (int i = 0; i < nodeCount; ++i)
          {
            Node node = result.nodeList.item(i);
            String valueString = mmdField.isExtractAsHtml()
                ? getInnerHtml(node)
                : node.getTextContent();
            // NOTE(review): nodes with empty text add no context, so list indices can drift
            // from nodeList indices; confirm whether callers rely on them being aligned.
            if (valueString != null && valueString.length() > 0)
            {
              Map<String, String> aContext =
                  fieldParser.getKeyValuePairResult(fieldParserElement, valueString.trim());
              result.fieldParserContextList.add(aContext);
            }
          }
        }
        else
        {
          String valueString = null;
          if (hasParserKey)
          {
            valueString = getFieldParserValueByKey(fieldParserContext, fieldParserKey);
          }
          else
          {
            evaluateXpathForField(mmdField, contextNode, params, result);
            if (result.nodeList != null && result.nodeList.getLength() >= 1)
            {
              Node firstNode = result.nodeList.item(0);
              valueString = mmdField.isExtractAsHtml()
                  ? getInnerHtml(firstNode)
                  : firstNode.getTextContent();
            }
          }
          if (valueString != null && valueString.length() > 0)
          {
            result.fieldParserContextList =
                fieldParser.getCollectionResult(fieldParserElement, valueString.trim());
          }
        }
      }
    }
  }
  catch (Exception e)
  {
    logger.error("Error extracting " + mmdField + ", contextNode=" + contextNode,
                 e);
  }

  if (result.node == null
      && result.nodeList == null
      && result.fieldParserContext == null
      && result.fieldParserContextList == null)
  {
    return null;
  }
  return result;
}
/**
 * Picks the context node for a field: the node registered in the scope under the field's
 * context node name when the field declares one, otherwise the current context node. There is
 * no fallback to the document root when the lookup yields null.
 *
 * @return the context node to use; may be null.
 */
private Node findContextNodeIfNecessary(MetaMetadataField mmdField,
                                        Node currentContextNode,
                                        Scope<Object> params)
{
  String declaredContextName = mmdField.getContextNode();
  if (declaredContextName == null)
  {
    return currentContextNode;
  }
  return (Node) params.get(declaredContextName);
}
/**
* Evaluate the xpath(s) associated with a field, trying them in index order until one yields a
* result.
*
* For a scalar field, returns the first non-empty evaluation result as a string.
*
* For a nested field, returns null, but fills the result parameter for outputting: result.node
* for a composite field, result.nodeList for a collection field.
*
* When contextNode is null nothing is evaluated; null is returned and result is left untouched.
*
* @param mmdField
*          The field whose xpaths are evaluated.
* @param contextNode
*          The node the xpaths are evaluated against.
* @param params
*          The scope; used to find the surrounding field and passed to the xpath amender.
* @param result
*          Output holder for nested fields. It is only dereferenced in the composite and
*          collection branches; extractScalar() passes null.
* @return the scalar evaluation (the last one tried, possibly empty or null), or null for
*         nested fields.
* @throws XPathExpressionException
*/
private String evaluateXpathForField(MetaMetadataField mmdField,
Node contextNode,
Scope<Object> params,
NestedFieldHelper result) throws XPathExpressionException
{
String evaluation = null;
if (contextNode != null)
{
MetaMetadataField surroundingField = getCurrentSurroundingField(params);
int i = 0;
do
{
// This loop need to be executed at least once.
String xpathString = mmdField.getXpath(i);
if ((xpathString == null || xpathString.length() == 0)
&& mmdField.parent() == surroundingField
&& mmdField instanceof MetaMetadataNestedField
&& !((MetaMetadataNestedField) mmdField).isPolymorphicInherently())
{
// If there is no xpath associated with the current nested field, and
// the current field is intentionally authored, the runtime will just
// create the structure, and pass the contextNode to nested fields.
xpathString = ".";
}
// at this point, if the xpathString is still null, then:
// - if this field is a scalar, it has no xpath attached, and can be ignored safely;
// - if this field is a nested, it has no xpath attached and is purely inherited, thus can
// be ignored safely.
if (xpathString != null)
{
// let the scope's xpath amender rewrite the expression before evaluation
xpathString = getSemanticsScope().getXPathAmender().amend(xpathString, params);
if (mmdField instanceof MetaMetadataCompositeField)
{
result.node = (Node) xpath.evaluate(xpathString, contextNode, XPathConstants.NODE);
if (result.node != null)
{
// first xpath that finds a node wins
return null;
}
}
else if (mmdField instanceof MetaMetadataCollectionField)
{
result.nodeList = (NodeList) xpath.evaluate(xpathString,
contextNode,
XPathConstants.NODESET);
if (result.nodeList != null && result.nodeList.getLength() > 0)
{
// first xpath that finds at least one node wins; an empty list from the last xpath
// tried stays in result.nodeList
return null;
}
}
else if (mmdField instanceof MetaMetadataScalarField)
{
if (mmdField.isExtractAsHtml())
{
Node targetNode =
(Node) xpath.evaluate(xpathString, contextNode, XPathConstants.NODE);
if (targetNode != null)
{
evaluation = getInnerHtml(targetNode);
}
}
else
{
evaluation = xpath.evaluate(xpathString, contextNode);
}
if (evaluation != null && evaluation.length() > 0)
{
logger.debug("Extraction succeeded for {}, using xpath \"{}\", result={}",
mmdField, xpathString, evaluation);
return evaluation;
}
} // composite / collection / scalar
} // if (xpathString != null)
i++;
} while (i < mmdField.getXpathsSize());
}
return evaluation;
}
/**
 * Looks up a value in a field parser context. The key may list alternatives separated by '|';
 * in that case the value of the first alternative present in the context is returned.
 *
 * @param fieldParserContext
 *          The key-value pairs produced by a field parser; may be null.
 * @param fieldParserKey
 *          A single key, or alternatives separated by '|'.
 * @return the matching value, or null when the context is null or no key matches.
 */
private String getFieldParserValueByKey(Map<String, String> fieldParserContext,
                                        String fieldParserKey)
{
  if (fieldParserContext == null)
  {
    // No surrounding field parser produced a context, so there is nothing to look up.
    return null;
  }
  if (fieldParserKey.indexOf('|') < 0)
  {
    return fieldParserContext.get(fieldParserKey);
  }
  for (String key : fieldParserKey.split("\\|"))
  {
    if (fieldParserContext.containsKey(key))
    {
      return fieldParserContext.get(key);
    }
  }
  return null;
}
/**
 * Extracts a composite field: builds a metadata instance for the field, fills it recursively,
 * and on success assigns it to the matching Java field of the host metadata object.
 *
 * @param mmdField
 *          The composite field to extract.
 * @param metadata
 *          The host metadata object receiving the composite.
 * @param contextNode
 *          The context node for extraction.
 * @param fieldParserContext
 *          The context of field parsers, if any.
 * @param params
 *          The scope containing variables during parsing.
 * @return if extraction was successful.
 */
private boolean extractComposite(MetaMetadataCompositeField mmdField,
                                 Metadata metadata,
                                 Node contextNode,
                                 Map<String, String> fieldParserContext,
                                 Scope<Object> params)
{
  NestedFieldHelper nested =
      extractNestedHelper(mmdField, contextNode, fieldParserContext, params);
  if (nested == null)
  {
    return false;
  }

  // context handed down to the child fields
  Node compositeNode = nested.node;
  Map<String, String> compositeParserContext = nested.fieldParserContext;

  // create a metadata instance for this field
  Class<? extends Metadata> metadataClass = mmdField.getMetadataClass();
  Class[] ctorArgTypes = new Class[] { MetaMetadataCompositeField.class };
  Object[] ctorArgs = new Object[] { mmdField };
  Metadata thisMetadata = ReflectionTools.getInstance(metadataClass, ctorArgTypes, ctorArgs);
  thisMetadata.setSemanticsSessionScope(getSemanticsScope());

  boolean extracted = recursiveExtraction(mmdField, thisMetadata, compositeNode,
                                          compositeParserContext, params);
  if (!extracted)
  {
    return false;
  }

  if (getSemanticsScope().ifAutoUpdateDocRefs())
  {
    // prefer an already downloaded document for the same location
    Document alreadyDownloaded = lookupDownloadedDocument(thisMetadata);
    if (alreadyDownloaded != null)
    {
      thisMetadata = alreadyDownloaded;
    }
    setupDocumentChangedEventListener(mmdField, metadata, thisMetadata);
  }

  thisMetadata.setMetaMetadata(mmdField);
  Metadata retyped = lookupTrueMetaMetadata(mmdField.getRepository(), thisMetadata);
  if (retyped != null)
  {
    Metadata.fieldWiseCopy(retyped, thisMetadata);
    thisMetadata = retyped;
  }

  // TODO check for polymorphism. if this is an inherent polymorphic fields, we may need to
  // replace thisMetadata completely if its type changes.

  // here everything seems ok. assign result composite back to input metadata object
  Field targetField = mmdField.getMetadataFieldDescriptor().getField();
  ReflectionTools.setFieldValue(metadata, targetField, thisMetadata);

  // try to link result metadata
  MetaMetadataRepository repo = getSemanticsScope().getMetaMetadataRepository();
  LinkedMetadataMonitor linkMonitor = repo.getLinkedMetadataMonitor();
  linkMonitor.tryLink(repo, thisMetadata);
  return true;
}
/**
 * Registers a download listener for a referenced document that has not finished downloading,
 * keyed by the document's location and the Java field of the host metadata that refers to it.
 * Does nothing for non-Document metadata or for documents already downloaded.
 *
 * @param mmdField
 *          The field through which the host refers to the document.
 * @param hostMetadata
 *          The metadata object holding the reference.
 * @param docToDownload
 *          The referenced metadata, possibly a Document.
 */
private void setupDocumentChangedEventListener(MetaMetadataNestedField mmdField,
                                               Metadata hostMetadata, Metadata docToDownload)
{
  if (!(docToDownload instanceof Document))
  {
    return;
  }
  Document pendingDoc = (Document) docToDownload;
  if (pendingDoc.getDownloadStatus() == DownloadStatus.DOWNLOAD_DONE)
  {
    return;
  }
  ParsedURL watchedLocation = pendingDoc.getLocation();
  DocumentDownloadingMonitor downloadMonitor =
      getSemanticsScope().getDocumentDownloadingMonitor();
  DocumentDownloadedEventHandler downloadedHandler = new DocumentDownloadedEventHandler();
  Field referringField = mmdField.getMetadataFieldDescriptor().getField();
  downloadMonitor.listenForDocumentDownloading(hostMetadata, watchedLocation, referringField,
                                               downloadedHandler);
}
/**
 * Looks in the global document collection for an already downloaded document at the same
 * location, so that the existing object can be reused. When one is found, the given document
 * is added to it as a mixin.
 *
 * @param metadata
 *          The freshly extracted metadata.
 * @return the existing downloaded document, or null when the metadata is not a Document, has no
 *         location, or no downloaded document is known for that location.
 */
protected Document lookupDownloadedDocument(Metadata metadata)
{
  if (!(metadata instanceof Document))
  {
    return null;
  }
  Document extractedDoc = (Document) metadata;
  ParsedURL location = extractedDoc.getLocationOrFirstAdditionLocation();
  if (location == null)
  {
    return null;
  }
  Document known = getSemanticsScope().lookupDocument(location);
  boolean reusable = known != null && known.getDownloadStatus() == DownloadStatus.DOWNLOAD_DONE;
  if (!reusable)
  {
    return null;
  }
  // add the replaced Document object as a mixin in the downloaded Document.
  known.addMixin(extractedDoc);
  return known;
}
/**
 * For an extracted document, looks up a more specific meta-metadata type by its location. Before
 * doing connect(), what we can do to find out the true meta-metadata type is quite limited
 * (location, suffix, tag name); here we do location & suffix. Tag name is mainly used by direct
 * binding cases.
 *
 * When the located type uses the same metadata class, the meta-metadata is switched in place.
 * When it uses a subclass, a new metadata object of that class is constructed and returned; the
 * caller copies the values over.
 *
 * @param repository
 *          The repository used for the location-based lookup.
 * @param thisMetadata
 *          The extracted metadata.
 * @return a newly constructed metadata object of the more specific class, or null when no
 *         replacement object is needed or possible.
 */
protected Metadata lookupTrueMetaMetadata(MetaMetadataRepository repository,
                                          Metadata thisMetadata)
{
  if (!(thisMetadata instanceof Document))
  {
    return null;
  }
  ParsedURL location = thisMetadata.getLocation();
  if (location == null)
  {
    return null;
  }
  MetaMetadata locMmd = repository.getRichDocumentMM(location);
  if (locMmd == null
      || locMmd.getName().equals(DocumentParserTagNames.RICH_DOCUMENT_TAG))
  {
    return null;
  }

  Class currentClass = thisMetadata.getClass();
  Class locatedClass = locMmd.getMetadataClass();
  if (!currentClass.isAssignableFrom(locatedClass))
  {
    logger.error("Type mismatch when changing mmd for extracted value {} to {}, "
                 + "expected type: {}; "
                 + "check <selector> to see if it is not specific enough.",
                 thisMetadata, locMmd, thisMetadata.getMetaMetadata());
    return null;
  }

  logger.debug("Changing mmd for extracted value {} to {}", thisMetadata, locMmd);
  if (currentClass == locatedClass)
  {
    // when the two metadata classes are the same, we can safely change the meta-metadata
    // since they have exactly the same set of fields, and thus no binding errors will
    // occur.
    thisMetadata.setMetaMetadata(locMmd);
    return null;
  }

  // when the two metadata classes are not the same, we need to be careful. create
  // the right metadata object and copy values.
  Metadata replacement = locMmd.constructMetadata();
  replacement.setMetaMetadata(locMmd);
  return replacement;
}
/**
 * Extract a collection field of the given metadata object.
 *
 * The element type is the child scalar type for collections of scalars, or the element class
 * taken from the metadata field descriptor otherwise. Each node / field parser context found
 * for the field yields one candidate element; composite elements are filled recursively and
 * scalar elements are created from the (field-op processed) text or parser value.
 *
 * @param mmdField
 *          The collection field to extract.
 * @param metadata
 *          The host metadata object receiving the collection.
 * @param contextNode
 *          The context node for extraction.
 * @param fieldParserContext
 *          The context of field parsers, if any.
 * @param params
 *          The scope containing variables during parsing; ELEMENT_INDEX_IN_COLLECTION is set
 *          for each composite element.
 * @return true if the extraction was successful and the result collection is not empty; otherwise
 *         false. The result collection field will not contain null references or failed elements
 *         (elements that has no actual information or lacks required field values).
 */
private boolean extractCollection(MetaMetadataCollectionField mmdField,
                                  Metadata metadata,
                                  Node contextNode,
                                  Map<String, String> fieldParserContext,
                                  Scope<Object> params)
{
  NestedFieldHelper helper = extractNestedHelper(mmdField, contextNode, fieldParserContext,
                                                 params);
  if (helper == null)
    return false;

  // will be used for child fields
  NodeList nodeList = helper.nodeList;
  List<Map<String, String>> fieldParserContextList = helper.fieldParserContextList;
  int size = helper.getListSize();

  // get class of elements in the collection
  Class elementClass = null;
  MetadataScalarType scalarType = null;
  MetadataFieldDescriptor metadataFieldDescriptor = mmdField.getMetadataFieldDescriptor();
  if (mmdField.isCollectionOfScalars())
  {
    // registered at MetadataScalarScalarType.init()
    ScalarType theScalarType = mmdField.getChildScalarType();
    if (theScalarType instanceof MetadataScalarType)
    {
      scalarType = (MetadataScalarType) theScalarType;
      elementClass = scalarType.getJavaClass();
    }
    else
    {
      logger.error("child_scalar_type not specified or registered: " + mmdField);
      return false;
    }
  }
  else
  {
    if (metadataFieldDescriptor != null)
    {
      ClassDescriptor elementClassDescriptor =
          metadataFieldDescriptor.getElementClassDescriptor();
      if (metadataFieldDescriptor.isPolymorphic())
      {
        String polymorphTagName =
            mmdField.getElementComposite().getTypeMmd().getTagForTypesScope();
        if (polymorphTagName != null)
        {
          elementClassDescriptor =
              metadataFieldDescriptor.elementClassDescriptor(polymorphTagName);
        }
      }
      if (elementClassDescriptor != null)
      {
        elementClass = elementClassDescriptor.getDescribedClass();
      }
    }
    else
    {
      logger.warn("metadataFieldDescriptor not found in " + mmdField);
    }
  }
  if (elementClass == null)
  {
    // we cannot determine the class of this collection. this may be due to lack of type
    // specification there, but it may also be correct, e.g. for polymorphic fields
    return false;
  }

  // build the result list and populate
  ArrayList elements = new ArrayList();
  Class[] argClasses = new Class[] { MetaMetadataCompositeField.class };
  Object[] argObjects = new Object[] { mmdField.getElementComposite() };
  for (int i = 0; i < size; ++i)
  {
    Node thisNode = (nodeList == null) ? null : nodeList.item(i);
    Map<String, String> thisFieldParserContext =
        (fieldParserContextList == null) ? null : fieldParserContextList.get(i);
    if (!mmdField.isCollectionOfScalars())
    {
      Metadata element =
          (Metadata) ReflectionTools.getInstance(elementClass, argClasses, argObjects);
      element.setSemanticsSessionScope(getSemanticsScope());
      // the index of the current element in the current collection may be useful for further
      // extraction.
      params.put(ELEMENT_INDEX_IN_COLLECTION, i);
      if (recursiveExtraction(mmdField, element, thisNode, thisFieldParserContext, params))
      {
        if (getSemanticsScope().ifAutoUpdateDocRefs())
        {
          Document downloadedDocument = lookupDownloadedDocument(element);
          if (downloadedDocument != null)
          {
            element = downloadedDocument;
          }
          setupDocumentChangedEventListener(mmdField, metadata, element);
        }
        element.setMetaMetadata(mmdField);
        Metadata changedElement = lookupTrueMetaMetadata(mmdField.getRepository(), element);
        if (changedElement != null)
        {
          Metadata.fieldWiseCopy(changedElement, element);
          element = changedElement;
        }
        // TODO check for polymorphism. if this is an inherent polymorphic fields, we may need to
        // replace element completely if its type changes.
        elements.add(element);
      }
    }
    else
    {
      String value = null;
      if (fieldParserContextList != null)
      {
        // a field parser produced the elements; the scalar sits under the default key
        if (thisFieldParserContext != null)
        {
          value = thisFieldParserContext.get(FieldParserForRegexSplit.DEFAULT_KEY);
        }
      }
      else if (thisNode != null)
      {
        value = mmdField.isExtractAsHtml() ? getInnerHtml(thisNode) : thisNode.getTextContent();
      }
      if (value != null)
      {
        try
        {
          value = applyFieldOps(value, mmdField);
          MetadataBase element = (MetadataBase) scalarType.getInstance(value, null, this);
          if (element != null)
          {
            elements.add(element);
          }
        }
        catch (Exception e)
        {
          logger.error("Exception applying field ops.", e);
        }
      }
    }
  }

  if (elements.isEmpty())
  {
    return false;
  }
  if (metadataFieldDescriptor == null)
  {
    // Reachable for collections of scalars, where the descriptor is not checked above.
    // Without it there is no Java field to assign the collection to.
    logger.error("metadataFieldDescriptor not found in " + mmdField
                 + "; cannot assign extracted collection");
    return false;
  }
  // more than 0 elements were extracted: assign the collection back
  Field javaField = metadataFieldDescriptor.getField();
  ReflectionTools.setFieldValue(metadata, javaField, elements);
  return true;
}
/**
 * Extract a scalar field of a given metadata object.
 *
 * The raw value comes from the field's xpath (when an xpath and a context node are available
 * and no field parser key is set) or from the field parser context (when a field parser key is
 * set). Concatenation values, if declared, are then appended, and field ops are applied. For
 * ParsedURL fields the value is first normalized through the scalar type and, after field ops,
 * filtered through FilterLocation before being assigned.
 *
 * @param mmdField
 *          The scalar field to extract.
 * @param metadata
 *          The metadata object receiving the value.
 * @param contextNode
 *          The context node for extraction.
 * @param fieldParserContext
 *          The context of field parsers, if any.
 * @param params
 *          The scope containing variables during parsing.
 * @return true if the scalar value is not null / empty, or false. If &lt;filter&gt; defined it
 *         will be applied before checking null / empty value.
 * @throws Exception
 *           propagated from field ops or value assignment.
 */
private boolean extractScalar(MetaMetadataScalarField mmdField,
                              Metadata metadata,
                              Node contextNode,
                              Map<String, String> fieldParserContext,
                              Scope<Object> params) throws Exception
{
  String xpathString = mmdField.getXpath();
  String fieldParserKey = mmdField.getFieldParserKey();
  contextNode = findContextNodeIfNecessary(mmdField, contextNode, params);

  String evaluation = null;
  if (xpathString != null
      && xpathString.length() > 0
      && contextNode != null
      && fieldParserKey == null)
  {
    try
    {
      evaluation = evaluateXpathForField(mmdField, contextNode, params, null);
    }
    catch (Exception e)
    {
      logger.error("Error extracting " + mmdField + ", contextNode=" + contextNode.getNodeName(),
                   e);
    }
  }
  else if (fieldParserKey != null)
  {
    evaluation = fieldParserContext == null ? null : fieldParserContext.get(fieldParserKey);
  }
  else if (!mmdField.hasConcatenateValues())
  {
    return false; // This is the final catch all.
  }

  // The raw evaluation is legitimately null here, e.g. for concatenation-only fields, so it
  // must not be dereferenced; parameterized logging handles null.
  logger.debug("Raw evaluation for {}: {}", mmdField, evaluation);

  evaluation = concatenateValues(evaluation, mmdField, metadata, params);
  if (evaluation == null)
  {
    // Nothing was evaluated and nothing was concatenated: there is no value to assign.
    return false;
  }

  MetadataFieldDescriptor fd = mmdField.getMetadataFieldDescriptor();
  ScalarType fdScalarType = fd == null ? null : fd.getScalarType();
  boolean isParsedUrlField = fdScalarType instanceof MetadataParsedURLScalarType;

  if (isParsedUrlField)
  {
    MetadataParsedURL wholePurl =
        (MetadataParsedURL) fdScalarType.getInstance(evaluation, null, this);
    if (wholePurl != null)
    {
      evaluation = wholePurl.toString();
    }
    // else: keep the raw string; field ops below may still turn it into a usable URL.
  }

  // after we have evaluated the expression we might need to modify it.
  evaluation = applyFieldOps(evaluation, mmdField);
  if (StringTools.isNullOrEmpty(evaluation))
    return false;

  if (isParsedUrlField)
  {
    // if this is a ParsedURL, we try to filter it using <filter_location>, if applicable.
    MetadataParsedURL metadataPurl =
        (MetadataParsedURL) fdScalarType.getInstance(evaluation, null, this);
    if (metadataPurl != null)
    {
      ParsedURL purl = metadataPurl.getValue();
      logger.debug("ParsedURL for {}: {}", mmdField, purl);
      ParsedURL filteredPurl = FilterLocation.filterIfNeeded(purl, null, getSemanticsScope());
      if (filteredPurl != null && (purl == null || !purl.equals(filteredPurl)))
      {
        metadataPurl.setValue(filteredPurl);
      }
      if (metadataPurl.getValue() != null)
      {
        fd.setField(metadata, metadataPurl);
      }
    }
    // NOTE(review): true is returned even when no URL could be assigned, as before; confirm
    // whether required ParsedURL fields should count as missing in that case.
  }
  else
  {
    // for other scalar types, we only need to create and assign the value.
    metadata.setByFieldName(mmdField.getFieldNameInJava(false), evaluation, this);
  }
  return true;
}
/**
 * Applies the concatenation semantics of a scalar field to its value.
 * <p>
 * When the field declares no values to concatenate, the incoming value is returned untouched
 * (including null). Otherwise the result starts with the incoming value (if any) followed by each
 * referenced value in declaration order; a referenced value that resolves to null contributes
 * nothing and is reported with a warning.
 *
 * @param evaluation
 *          String originally in the scalar value (it is placed at the beginning of the result);
 *          may be null.
 * @param mmdField
 *          The scalar field with values to concatenate
 * @param metadata
 *          Metadata object that contains the field
 * @param params
 *          Scope of parsing with variables / etc
 * @return String value concatenated to pass onto other tasks (like regexing)
 */
private String concatenateValues(String evaluation,
                                 MetaMetadataScalarField mmdField,
                                 Metadata metadata,
                                 Scope<Object> params)
{
  if (!mmdField.hasConcatenateValues())
  {
    return evaluation;
  }

  List<MetaMetadataValueField> valueFields = mmdField.getConcatenateValues();
  StringBuilder joined = new StringBuilder();
  if (evaluation != null)
  {
    // An existing value for the field always comes first.
    joined.append(evaluation);
  }
  for (MetaMetadataValueField valueField : valueFields)
  {
    String referenced = valueField.getReferencedValue(mmdField, metadata, params);
    if (referenced != null)
    {
      joined.append(referenced);
    }
    else
    {
      // A null reference adds nothing to the result; just report it.
      logger.warn("Attempted to concatenate null value from value referenced as: "
          + valueField.getReferenceName());
    }
  }
  return joined.toString();
}
/**
 * Runs an evaluation result through the FieldOps defined for a field.
 * <p>
 * The value is first passed through {@code XMLTools.unescapeXML}, then handed to every field op
 * in list order, each op receiving the previous op's result. The final result is converted to a
 * string and trimmed.
 *
 * @param evaluation
 *          the value to modify; may be null.
 * @param field
 *          the field whose field ops (possibly none) are applied.
 * @return Modified evaluation, or null when the input or the final result is null.
 * @throws Exception
 *           propagated from the field ops.
 */
private String applyFieldOps(String evaluation, MetaMetadataField field) throws Exception
{
  if (evaluation != null)
  {
    Object current = XMLTools.unescapeXML(evaluation);
    List<FieldOp> ops = field.getFieldOps();
    if (ops != null)
    {
      for (FieldOp op : ops)
      {
        current = op.operateOn(current);
      }
    }
    if (current != null)
    {
      return current.toString().trim();
    }
  }
  return null;
}
/**
 * Output properties handed to the transformer in {@link #getInnerHtml(Node)}: output method
 * "html" with indentation switched on.
 */
private static Properties innerHtmlProps = new Properties();

static
{
  innerHtmlProps.setProperty(OutputKeys.METHOD, "html");
  innerHtmlProps.setProperty(OutputKeys.INDENT, "yes");
}
/**
 * Uses a pooled javax.xml.transform.Transformer to serialize a node as HTML.
 * <p>
 * The node is normalized first, then transformed into a string using {@code innerHtmlProps}. The
 * transformer is always handed back to the pool, even when configuring or running the
 * transformation fails.
 *
 * @param node
 *          the node to serialize; must not be null.
 * @return the serialized markup, or null if acquiring, running or releasing the transformer
 *         failed (the error is logged).
 */
private static String getInnerHtml(Node node)
{
  node.normalize();
  StringWriter w = new StringWriter();
  try
  {
    Transformer t = XmlTransformerPool.get().acquire();
    try
    {
      t.setOutputProperties(innerHtmlProps);
      t.transform(new DOMSource(node), new StreamResult(w));
    }
    finally
    {
      // Release on every path; otherwise a failing transformation would leak the pooled
      // transformer. Kept inside the outer try so a failing release is still logged below.
      if (t != null)
      {
        XmlTransformerPool.get().release(t);
      }
    }
  }
  catch (Exception e)
  {
    logger.error("Error getting inner HTML for " + node, e);
    return null;
  }
  return w.toString();
}
/**
 * Deserializes the downloaded content directly into a metadata object (direct binding) and makes
 * it the document of the current document closure.
 * <p>
 * The HTTP response content is deserialized as XML using the local metadata types scope of this
 * parser's meta-metadata, with a {@link ParserDeserializationHookStrategy} attaching meta-metadata
 * to the deserialized objects. The previous document is given the basic document meta-metadata
 * before it is replaced.
 *
 * @return Document subclass metadata resulting from s.im.pl deserialization of the input stream,
 *         or null if a SIMPLTranslationException occurred before a document was produced.
 * @throws IOException
 */
protected Document directBindingPopulateMetadata() throws IOException
{
  Document newDocument = null;
  try
  {
    // this must be a top-level metadata object (i.e. not a field)
    // thus it must have a MetaMetadata attached (i.e. not a MetaMetadataCompositeField)
    // thus this conversion is safe
    MetaMetadata metaMetadata = (MetaMetadata) this.getMetaMetadata();
    SimplTypesScope tscope = metaMetadata.getLocalMetadataTypesScope();
    // NOTE(review): the stream is not closed here; presumably it is owned by the HTTP response /
    // download controller - confirm.
    InputStream istream = getDownloadController().getHttpResponse().getContentAsStream();
    DeserializationHookStrategy<Metadata, MetadataFieldDescriptor> strategy =
        new ParserDeserializationHookStrategy(this);
    newDocument = (Document) tscope.deserialize(istream, strategy, Format.XML);

    // Serializing the whole document is only useful for the debug log line, so skip the work
    // entirely when debug logging is off.
    if (logger.isDebugEnabled())
    {
      logger.debug(SimplTypesScope.serialize(newDocument, StringFormat.XML).toString());
    }

    // the old document is basic, so give it basic meta-metadata (so recycle does not tank)
    Document oldDocument = getDocumentClosure().getDocument();
    oldDocument.setMetaMetadata(getSemanticsScope().DOCUMENT_META_METADATA);
    getDocumentClosure().changeDocument(newDocument);
  }
  catch (SIMPLTranslationException e)
  {
    logger.error("Cannot serialize direct binding result metadata: " + newDocument, e);
  }
  return newDocument;
}
/**
 * Helper class for direct binding.
 * <p>
 * Deserialization hooks that attach meta-metadata to every metadata object as it is deserialized.
 * A stack mirrors the nesting of the objects being deserialized: the pre hook pushes the
 * meta-metadata chosen for the object, the post hook pops it.
 *
 * @author quyin
 */
public static class ParserDeserializationHookStrategy
    implements DeserializationHookStrategy<Metadata, MetadataFieldDescriptor>
{
  /** The parser that supplies the current document and the semantics scope. */
  DocumentParser parser;

  /** Meta-metadata of the objects currently being deserialized, innermost on top. */
  Stack<MetaMetadataNestedField> currentMMstack = new Stack<MetaMetadataNestedField>();

  /** True until the pre hook has seen the first (root) object. */
  boolean deserializingRoot = true;

  /**
   * Set by the pre hook when the object just started may carry its own meta-metadata name (the
   * root, or an inherently polymorphic child); consumed and cleared by the in hook.
   */
  boolean polymorphMmd = false;

  public ParserDeserializationHookStrategy(DocumentParser parser)
  {
    this.parser = parser;
  }

  /**
   * For the root, compare the meta-metadata from the binding with the one we started with. Down
   * the hierarchy, try to perform similar bindings.
   *
   * @param deserializedMetadata
   *          the object whose deserialization is starting.
   * @param mfd
   *          descriptor of the field holding the object; only used for non-root objects.
   * @throws RuntimeException
   *           if no child meta-metadata matching the field can be found in the enclosing one.
   */
  @Override
  public void deserializationPreHook(Metadata deserializedMetadata, MetadataFieldDescriptor mfd)
  {
    if (deserializingRoot)
    {
      deserializingRoot = false;
      Document document = parser.getDocument();
      MetaMetadataCompositeField preMM = document.getMetaMetadata();
      MetadataClassDescriptor mcd =
          (MetadataClassDescriptor) ClassDescriptor.getClassDescriptor(deserializedMetadata);
      MetaMetadataCompositeField metaMetadata;
      String tagName = mcd.getTagName();
      if (preMM.getTagForTypesScope().equals(tagName))
      {
        // The deserialized root has the type we expected; keep the meta-metadata we started with.
        metaMetadata = preMM;
      }
      else
      {
        // just match in translation scope
        // TODO use local TranslationScope if there is one
        metaMetadata = parser.getSemanticsScope().getMetaMetadataRepository()
            .getMMByName(tagName);
      }
      deserializedMetadata.setMetaMetadata(metaMetadata);
      // The root is always re-checked against its own meta-metadata name in the in hook.
      polymorphMmd = true;
      currentMMstack.push(metaMetadata);
    }
    else
    {
      String mmName = mfd.getMmName();
      MetaMetadataNestedField currentMM = currentMMstack.peek();
      // the following fails for collections :-(
      MetaMetadataNestedField childMMNested =
          (MetaMetadataNestedField) currentMM.lookupChild(mmName);
      if (childMMNested == null)
      {
        String msg =
            String.format("Can't find composite child meta-metadata for %s amidst %s; "
                              + "This probably means mmd repository do not match metadata classes; "
                              + "Have you changed built-in metadata classes w/o updating primitives.xml?",
                          mmName,
                          mfd);
        logger.error(msg);
        throw new RuntimeException(msg);
      }
      MetaMetadataCompositeField childMMComposite = null;
      if (childMMNested.isPolymorphicInherently())
      {
        // Polymorphic field: pick the meta-metadata by the actual tag of the deserialized object
        // and let the in hook refine it by meta-metadata name.
        String tagName = ClassDescriptor.getClassDescriptor(deserializedMetadata).getTagName();
        childMMComposite =
            parser.getSemanticsScope().getMetaMetadataRepository().getMMByName(tagName);
        polymorphMmd = true;
      }
      else
      {
        childMMComposite = childMMNested.metaMetadataCompositeField();
      }
      deserializedMetadata.setMetaMetadata(childMMComposite);
      currentMMstack.push(childMMComposite);
    }
  }

  /**
   * If the pre hook flagged the current object as possibly polymorphic, looks up the
   * meta-metadata named by the object itself and, when found, attaches it in place of the one
   * chosen in the pre hook. The flag is cleared afterwards so that following non-polymorphic
   * objects skip the lookup.
   *
   * @param deserializedMetadata
   *          the object being deserialized.
   * @param mfd
   *          descriptor of the field holding the object (unused here).
   */
  @Override
  public void deserializationInHook(Metadata deserializedMetadata, MetadataFieldDescriptor mfd)
  {
    // for efficiency; if it is not polymorphic case we don't have to look up mmd
    // at this point of time
    if (polymorphMmd)
    {
      String mmName = deserializedMetadata.getMetaMetadataName();
      if (mmName != null && mmName.length() > 0)
      {
        MetaMetadata trueMmd =
            parser.getSemanticsScope().getMetaMetadataRepository().getMMByName(mmName);
        if (trueMmd != null)
        {
          logger.debug("setting [{}].metaMetadata to {} (mm_name={})...",
                       deserializedMetadata, trueMmd, mmName);
          deserializedMetadata.setMetaMetadata(trueMmd);
        }
        else
        {
          logger.warn("polymorphicly looking up meta-metadata failed: cannot find mmd named as "
              + mmName);
        }
      }
      // Consume the flag. Leaving it set would force this lookup for every later object, since
      // the pre hook only ever raises it.
      polymorphMmd = false;
    }
  }

  /**
   * Pops the meta-metadata pushed by the matching pre hook call.
   *
   * @param deserializedMetadata
   *          the object whose deserialization finished (unused here).
   * @param mfd
   *          descriptor of the field holding the object (unused here).
   */
  @Override
  public void deserializationPostHook(Metadata deserializedMetadata, MetadataFieldDescriptor mfd)
  {
    currentMMstack.pop();
  }

  /**
   * When the semantics scope has automatic document reference updating enabled, replaces the
   * deserialized object with the matching already-downloaded document, if the parser finds one.
   *
   * @param deserializedMetadata
   *          the object that was just deserialized.
   * @param mdf
   *          descriptor of the field holding the object (unused here).
   * @return the downloaded document if one was found, otherwise the deserialized object itself.
   */
  @Override
  public Metadata changeObjectIfNecessary(Metadata deserializedMetadata,
                                          MetadataFieldDescriptor mdf)
  {
    if (parser.getSemanticsScope().ifAutoUpdateDocRefs())
    {
      Document downloadedDoc = parser.lookupDownloadedDocument(deserializedMetadata);
      return downloadedDoc == null ? deserializedMetadata : downloadedDoc;
    }
    return deserializedMetadata;
  }
}
}