package ecologylab.bigsemantics.documentparsers;
import java.io.IOException;
import java.io.InputStream;
import java.io.Reader;
import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Locale;
import java.util.Set;
import java.util.regex.Pattern;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import ecologylab.bigsemantics.collecting.SemanticsGlobalScope;
import ecologylab.bigsemantics.html.DOMParserInterface;
import ecologylab.bigsemantics.html.ImgElement;
import ecologylab.bigsemantics.html.utils.HTMLNames;
import ecologylab.bigsemantics.html.utils.StringBuilderUtils;
import ecologylab.bigsemantics.metadata.builtins.AnonymousDocument;
import ecologylab.bigsemantics.metadata.builtins.Document;
import ecologylab.bigsemantics.metadata.builtins.DocumentClosure;
import ecologylab.bigsemantics.metadata.builtins.Image;
import ecologylab.bigsemantics.metadata.builtins.ImageClipping;
import ecologylab.generic.Continuation;
import ecologylab.generic.DomTools;
import ecologylab.net.ParsedURL;
import ecologylab.serialization.XMLTools;
/**
 * DOM parser for an HTML fragment supplied through a {@link Reader} and/or an
 * {@link InputStream}.
 * <p>
 * {@link #parse()} walks the fragment's DOM and collects:
 * <ul>
 * <li>the text found under the first {@code body} element ({@link #getBodyText()}),</li>
 * <li>a document for the first anchor with a resolvable href met while collecting that text
 * ({@link #getTextOutlink()}),</li>
 * <li>the container location / document announced through attributes on the fragment's
 * elements ({@link #getContainerPurl()}, {@link #getContainerDocument()}),</li>
 * <li>one {@link ImageClipping} per {@code img} element whose location can be resolved
 * ({@link #getImageClippings()}).</li>
 * </ul>
 * After {@link #recycle()} the collected state is released and the getters must not be relied on.
 */
public class HTMLFragmentDOMParser extends HTMLDOMParser implements DOMParserInterface, HTMLNames
{
	/** Lower-case element names that cause a line break in the collected body text. */
	private static final Set<String> BREAKLINE_NODE_NAMES = new HashSet<String>(
			Arrays.asList("p", "h1", "h2", "h3", "h4", "h5", "h6", "br", "div"));

	/** A run of one or more whitespace characters (newlines included). */
	private static final Pattern WHITESPACE_RUN = Pattern.compile("\\s+");

	/** Byte source of the fragment; handed out by {@link #inputStream()}. */
	InputStream fragmentStream;

	/** Character source of the fragment; handed out by {@link #reader()}. */
	Reader reader;

	/** Clippings created by {@link #parseImages(org.w3c.dom.Document)}. */
	ArrayList<ImageClipping> imageClippings = new ArrayList<ImageClipping>();

	/**
	 * Location of the container of this fragment. Null until one is found in the fragment's
	 * attributes; passed to {@code ImgElement.constructPurl} as the context for src / href values.
	 */
	ParsedURL containerPurl;

	/** Document that goes with {@link #containerPurl}; used as the source of image clippings. */
	Document containerDocument;

	/** Document for the first anchor with a resolvable href found by {@link #parseText}. */
	Document textOutlink;

	/** Accumulates the text collected from the body element. */
	StringBuilder bodyTextBuffy = new StringBuilder();

	/** Rewrites / extracts image and image-reference URLs for sites that need special handling. */
	SpecialImageUrlHandler specialImageUrlHandler = new SpecialImageUrlHandler();

	/**
	 * Creates a parser over the given sources and attaches it to the closure of a fresh
	 * {@link AnonymousDocument}.
	 *
	 * @param reader
	 *          character source of the fragment, returned by {@link #reader()}
	 * @param inputStream
	 *          byte source of the fragment, returned by {@link #inputStream()}
	 */
	public HTMLFragmentDOMParser(Reader reader, InputStream inputStream)
	{
		super();
		fragmentStream = inputStream;
		this.reader = reader;
		AnonymousDocument anonymousDocument = new AnonymousDocument();
		setDocumentClosure(anonymousDocument.getOrConstructClosure());
	}

	/**
	 * Parses the fragment: collects body text, container information and injected metadata from
	 * the first {@code body} element (when there is one), then builds clippings for all images.
	 *
	 * @throws IOException
	 *           declared by the overridden method
	 */
	@Override
	public void parse() throws IOException
	{
		org.w3c.dom.Document dom = getDom();
		NodeList bodyNodeList = dom.getElementsByTagName(BODY);
		if (bodyNodeList.getLength() > 0)
		{
			Node bodyNode = bodyNodeList.item(0);
			parseText(bodyTextBuffy, bodyNode);
			checkForSimplSourceLocation(bodyNode);
			checkForMetadata(bodyNode);
		}
		// Images are processed last so that containerPurl / containerDocument found above are used.
		parseImages(dom);
	}

	/**
	 * Builds an {@link ImageClipping} for every {@code img} element whose location resolves to a
	 * {@link ParsedURL}, and appends it to {@link #imageClippings}.
	 * <p>
	 * For each image the nearest enclosing anchor element is inspected: its href may supply the
	 * image location when the src attribute did not, and it may name a document that should
	 * become the clipping's source document once it has been downloaded.
	 *
	 * @param dom
	 *          the parsed fragment
	 */
	private void parseImages(org.w3c.dom.Document dom)
	{
		NodeList imgNodeList = dom.getElementsByTagName(IMG);
		int numImages = imgNodeList.getLength();
		for (int i = 0; i < numImages; i++)
		{
			Node imgNode = imgNodeList.item(i);
			Document outlink = null;
			boolean changeSourceDoc = false;

			String src = DomTools.getAttribute(imgNode, SRC);
			src = specialImageUrlHandler.changeImageUrlIfNeeded(src);
			ParsedURL imgPurl = ImgElement.constructPurl(containerPurl, src);

			// Walk up the ancestors; only the nearest anchor element is considered.
			for (Node parent = imgNode.getParentNode(); parent != null; parent = parent.getParentNode())
			{
				if (!A.equals(parent.getNodeName()))
					continue;

				String hrefString = DomTools.getAttribute(parent, HREF);
				if (hrefString != null)
				{
					try
					{
						if (imgPurl == null)
						{
							// src was unusable: the href may carry the image URL as a parameter.
							String srcUrl = specialImageUrlHandler.getImageUrlFromParameters(hrefString);
							if (srcUrl != null)
								imgPurl = ImgElement.constructPurl(containerPurl, srcUrl);
						}
						if (imgPurl == null)
							break; // no image location at all; the image is skipped below

						StringBuilder newImgHrefBuf = StringBuilderUtils.acquire();
						try
						{
							changeSourceDoc = specialImageUrlHandler.changeImageRefUrlAndSourceDocIfNeeded(
									hrefString, newImgHrefBuf);
							if (newImgHrefBuf.length() > 0)
								hrefString = newImgHrefBuf.toString();
						}
						finally
						{
							// Always hand the buffer back, even when decoding the href fails.
							StringBuilderUtils.release(newImgHrefBuf);
						}
					}
					catch (UnsupportedEncodingException e)
					{
						error("Image ref URL cannot be decoded because it is using unsupported encoding. " +
								"We support UTF-8 only.");
						e.printStackTrace();
					}
					ParsedURL aHref = ImgElement.constructPurl(containerPurl, hrefString);
					if (aHref != null)
						outlink = getSemanticsScope().getOrConstructDocument(aHref);
				}
				break;
			}

			if (imgPurl == null)
				continue;

			Image image = getSemanticsScope().getOrConstructImage(imgPurl);
			if (image == null)
				continue;

			String altText = DomTools.getAttribute(imgNode, ALT);
			final ImageClipping imageClipping =
					image.constructClipping(containerDocument, null /* outlink */, altText, null);

			// outlink stays null when the href did not resolve, so it must be checked before use.
			if (changeSourceDoc && outlink != null)
			{
				outlink.queueDownload(new Continuation<DocumentClosure>()
				{
					@Override
					public void callback(DocumentClosure o)
					{
						Document downloadedDoc = o.getDocument();
						if (downloadedDoc != null && !downloadedDoc.isRecycled())
						{
							imageClipping.setSourceDoc(downloadedDoc);
							imageClipping.setOutlinks(null);
							imageClipping.setMetadataChanged(true);
						}
					}
				});
			}
			imageClippings.add(imageClipping);
		}
	}

	/**
	 * Recursively appends the text found below {@code bodyNode} to {@code buffy}, separating
	 * pieces with single spaces and inserting line breaks around the elements accepted by
	 * {@link #shouldBreakLineWithNodeName(String)}. Comment nodes are skipped.
	 * <p>
	 * As a side effect, the first anchor child with a resolvable href sets {@link #textOutlink}
	 * (only while it is still null).
	 *
	 * @param buffy
	 *          buffer the text is appended to
	 * @param bodyNode
	 *          node whose descendants are visited
	 */
	public void parseText(StringBuilder buffy, Node bodyNode)
	{
		NodeList children = bodyNode.getChildNodes();
		// Declared outside the loop on purpose: a break requested by one child stays pending
		// until a later sibling that carries a node value has been appended.
		boolean addLine = false;
		for (int i = 0; i < children.getLength(); i++)
		{
			Node kid = children.item(i);
			String kidName = kid.getNodeName();

			if (textOutlink == null && A.equals(kidName)) // first cut; needs refinement
			{
				String hrefString = DomTools.getAttribute(kid, HREF);
				if (hrefString != null)
				{
					ParsedURL aHref = ImgElement.constructPurl(containerPurl, hrefString);
					if (aHref != null)
						textOutlink = getSemanticsScope().getOrConstructDocument(aHref);
				}
			}

			if (!addLine)
				addLine = shouldBreakLineWithNodeName(kidName);

			String value = kid.getNodeValue();
			if (value != null)
			{
				if (kidName.equals(HASH_COMMENT))
					continue;
				addWithOneSpaceBetween(buffy, value, false);
				if (addLine)
				{
					buffy.append('\n');
					addLine = false;
				}
			}
			else if (shouldBreakLineWithNodeName(kidName))
			{
				buffy.append('\n');
			}
			parseText(buffy, kid);
		}
	}

	/**
	 * Appends {@code v} to {@code buffy} with its whitespace normalized: leading whitespace is
	 * dropped and every remaining whitespace run becomes a single space.
	 * <p>
	 * A separating space is inserted first only when it is needed, i.e. when the buffer is not
	 * empty and does not already end in whitespace. Text that is empty after normalization
	 * appends nothing, so whitespace-only text nodes do not pile up spaces.
	 *
	 * @param buffy
	 *          buffer to append to
	 * @param v
	 *          raw text of a node
	 * @param newlineOK
	 *          kept for signature compatibility; newlines are folded into spaces regardless of
	 *          this flag, because whitespace runs (which include newlines) are always collapsed
	 */
	private static void addWithOneSpaceBetween(StringBuilder buffy, String v, boolean newlineOK)
	{
		String text = WHITESPACE_RUN.matcher(v).replaceAll(" ");
		if (text.startsWith(" "))
			text = text.substring(1);
		if (text.length() == 0)
			return;

		int length = buffy.length();
		if (length > 0 && !Character.isWhitespace(buffy.charAt(length - 1)))
			buffy.append(' ');
		buffy.append(text);
	}

	/**
	 * Tells whether a line break makes sense for a node, based on its name. (Original author:
	 * rhema.)
	 *
	 * @param nodeName
	 *          node name such as p, div, br; compared case-insensitively
	 * @return true for p, h1-h6, br and div
	 */
	private static boolean shouldBreakLineWithNodeName(String nodeName)
	{
		// Locale.ROOT keeps the lower-casing independent of the default locale.
		return BREAKLINE_NODE_NAMES.contains(nodeName.toLowerCase(Locale.ROOT));
	}

	/**
	 * Depth-first search, starting at {@code node}, for an element that announces the container
	 * location. The descent below a node stops as soon as {@link #containerPurl} is known.
	 *
	 * @param node
	 *          root of the subtree to search
	 */
	void checkForSimplSourceLocation(Node node)
	{
		if (node.getAttributes() != null && setContainerLocation(node) != null)
			return;

		NodeList children = node.getChildNodes();
		for (int i = 0; i < children.getLength(); i++)
		{
			checkForSimplSourceLocation(children.item(i));
		}
	}

	/**
	 * Sets {@link #containerPurl} and {@link #containerDocument} from the first non-empty of the
	 * SIMPL_SOURCE_LOCATION, SIMPL and CONTAINER attributes of {@code elementNode}. Does nothing
	 * when a container location is already known.
	 *
	 * @param elementNode
	 *          element to inspect; may be null
	 * @return the container location known after the call, or null if there is none yet
	 */
	private ParsedURL setContainerLocation(Node elementNode)
	{
		if (containerPurl == null && elementNode != null)
		{
			String containerLocation = DomTools.getAttribute(elementNode, SIMPL_SOURCE_LOCATION);
			if (containerLocation == null || containerLocation.length() == 0)
				containerLocation = DomTools.getAttribute(elementNode, SIMPL);
			if (containerLocation == null || containerLocation.length() == 0)
				containerLocation = DomTools.getAttribute(elementNode, CONTAINER);

			if (containerLocation != null && containerLocation.length() > 0)
			{
				containerLocation = XMLTools.unescapeXML(containerLocation);
				containerPurl = ParsedURL.getAbsolute(containerLocation);
				// Same null check as the other getOrConstructDocument call sites in this class.
				if (containerPurl != null)
					containerDocument = getSemanticsScope().getOrConstructDocument(containerPurl);
			}
		}
		return containerPurl;
	}

	/**
	 * Depth-first search, starting at {@code node}, for an element that carries injected
	 * metadata. The descent below a node stops as soon as {@link #containerPurl} is known.
	 *
	 * @param node
	 *          root of the subtree to search
	 */
	void checkForMetadata(Node node)
	{
		if (node.getAttributes() != null && parseInjectedMetadata(node) != null)
			return;

		NodeList children = node.getChildNodes();
		for (int i = 0; i < children.getLength(); i++)
		{
			checkForMetadata(children.item(i));
		}
	}

	/**
	 * Reads the SIMPL_METADATA attribute of {@code elementNode} and, when it deserializes to a
	 * document, makes that document the container document and its location the container
	 * location. Does nothing when a container location is already known.
	 *
	 * @param elementNode
	 *          element to inspect; may be null
	 * @return the container location known after the call, or null if there is none yet
	 */
	private ParsedURL parseInjectedMetadata(Node elementNode)
	{
		if (containerPurl == null && elementNode != null)
		{
			String containerMetadata = DomTools.getAttribute(elementNode, SIMPL_METADATA);
			if (containerMetadata != null && containerMetadata.length() > 0)
			{
				// Diagnostic dump only for the element that actually carries metadata; dumping every
				// visited element printed each subtree again and again.
				DomTools.prettyPrint(elementNode);
				System.out.println("\n\nsimpl:metadata:\n"+containerMetadata+"\n\n");
				Document metadataFromBrowser =
						Document.constructAndMapFromJson(containerMetadata, getSemanticsScope());
				if (metadataFromBrowser != null)
				{
					// workflows need to be modified to accommodate metadata coming from drag
					System.out.println("\nSetting container document to injected metadata\n");
					containerDocument = metadataFromBrowser;
					containerPurl = metadataFromBrowser.getLocation();
				}
			}
		}
		return containerPurl;
	}

	/** @return the byte source given to the constructor (null after {@link #recycle()}) */
	@Override
	public InputStream inputStream()
	{
		return fragmentStream;
	}

	/** @return the character source given to the constructor (null after {@link #recycle()}) */
	@Override
	public Reader reader()
	{
		return reader;
	}

	/** @return the body text collected so far */
	public String getBodyText()
	{
		return bodyTextBuffy.toString();
	}

	/** @return the document of the first anchor found in the text, or null if there was none */
	public Document getTextOutlink()
	{
		return textOutlink;
	}

	/** @return the live list of clippings built from the fragment's images */
	public ArrayList<ImageClipping> getImageClippings()
	{
		return imageClippings;
	}

	/** Intentionally empty. */
	public void setContent()
	{
	}

	/** Intentionally empty. */
	public void setIndexPage()
	{
	}

	/** @return the container document, or null if none was found */
	public Document getContainerDocument()
	{
		return containerDocument;
	}

	/** @return the container location, or null if none was found */
	public ParsedURL getContainerPurl()
	{
		return containerPurl;
	}

	/**
	 * Releases everything collected by this parser, then recycles the superclass state. Safe to
	 * call more than once as far as this class's own fields are concerned.
	 */
	@Override
	public void recycle()
	{
		fragmentStream = null;
		reader = null;
		if (imageClippings != null)
		{
			imageClippings.clear();
			imageClippings = null;
		}
		containerPurl = null;
		containerDocument = null;
		textOutlink = null;
		bodyTextBuffy = null;
		super.recycle();
	}
}