// OldHTMLDOMParser.java example
//
// Explorer
// BigSemanticsJava-master
package ecologylab.bigsemantics.html.standalone;

import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.io.Reader;
import java.io.StringWriter;
import java.util.ArrayList;
import java.util.TreeMap;

import org.w3c.dom.Document;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;


import ecologylab.bigsemantics.html.AElement;
import ecologylab.bigsemantics.html.DOMParserInterface;
import ecologylab.bigsemantics.html.DOMWalkInformationTagger;
import ecologylab.bigsemantics.html.ImgElement;
import ecologylab.bigsemantics.html.ParagraphText;
import ecologylab.bigsemantics.html.documentstructure.AnchorContext;
import ecologylab.bigsemantics.html.documentstructure.ContentPage;
import ecologylab.bigsemantics.html.documentstructure.ImageCollectionPage;
import ecologylab.bigsemantics.html.documentstructure.ImageFeatures;
import ecologylab.bigsemantics.html.documentstructure.IndexPage;
import ecologylab.bigsemantics.html.documentstructure.RecognizedDocumentStructure;
import ecologylab.bigsemantics.html.documentstructure.TextOnlyPage;
import ecologylab.bigsemantics.html.dom.IDOMProvider;
import ecologylab.bigsemantics.html.utils.HTMLNames;
import ecologylab.bigsemantics.html.utils.StringBuilderUtils;
import ecologylab.generic.StringTools;
import ecologylab.net.PURLConnection;
import ecologylab.net.ParsedURL;
import ecologylab.serialization.XMLTools;


/**
 * Standalone HTML DOM parsing plus image/text surrogate extraction.
 * <p>
 * WARNING: This code is deprecated in cF and ecologylabSemantics.
 * It remains only for the purpose of maintaining some standalone classes, which Andruid
 * imagines are for algorithm verification (and thus, useful.)
 * <p>
 * Connect to JTidy parser to parse HTML pages for standalone algorithm measurement.
 * This parsing code integrates with the Image-Text Surrogate extractor code.
 * <p>
 * Actual DOM construction is delegated to the {@link IDOMProvider} held in {@link #provider}.
 * Nothing in this class assigns that field, and this class's own {@link IDOMProvider}
 * methods are unimplemented stubs.
 * 
 * @author eunyee
 *
 */
@Deprecated
public class OldHTMLDOMParser
implements HTMLNames, IDOMProvider
{
	/** Connection passed to the most recent {@link #parse(PURLConnection)} call. */
	PURLConnection purlConnection;

	/**
	 * Parser that builds the DOM. Never assigned inside this class, so it must be set from
	 * elsewhere in the package before {@link #parse(PURLConnection)} is called.
	 */
	IDOMProvider provider;

	/**
	 * because Tidy extends Serializable
	 */
	private static final long serialVersionUID = 1L;

	public OldHTMLDOMParser()
	{
		super();
	}

	/**
	 * Parse HTML Document, and return the root DOM node.
	 * <p>
	 * Remembers <code>purlConnection</code> in {@link #purlConnection}, then hands its input
	 * stream to {@link #provider} (with a null output stream).
	 * 
	 * @param purlConnection	connection whose input stream is parsed.
	 * @return								whatever the provider's parseDOM() returns.
	 * @throws IOException
	 * @throws IllegalStateException	if {@link #provider} has not been set.
	 */
	public org.w3c.dom.Document parse(PURLConnection purlConnection) throws IOException
	{
		this.purlConnection		= purlConnection;
		// Fail with an explicit message instead of an anonymous NullPointerException.
		if (provider == null)
			throw new IllegalStateException("OldHTMLDOMParser.provider has not been set; cannot parse.");
		return provider.parseDOM(purlConnection.inputStream(), null);
	}

	/**
	 * Extract Image and Text surrogates while walk through DOM, then build anchor contexts
	 * for the hrefs found and pass them to <code>htmlType</code>.
	 * 
	 * @param purlConnection	connection whose input stream is parsed.
	 * @param htmlType				receiver of surrogates and candidate containers. When null,
	 * 												surrogate generation and container generation are skipped.
	 * @throws IOException 
	 */
	public void parse(PURLConnection purlConnection, DOMParserInterface htmlType) throws IOException
	{
		Document parsedDoc = parse(purlConnection);

		DOMWalkInformationTagger taggedDoc = walkAndTagDom(parsedDoc, htmlType);

		extractImageTextSurrogates(taggedDoc, htmlType);

		//Now, find hrefs, with their context and generate containers with metadata
		ArrayList<AnchorContext> anchorContexts = buildAnchorContexts(taggedDoc.getAllAnchorNodes(), purl());

		if (htmlType != null)
			htmlType.generateCandidateContainersFromContexts(anchorContexts, false);

		anchorContexts.clear();
		taggedDoc.recycle();
	}

	/**
	 * This is the walk of the dom that calls print tree, and the parser methods such as closeHref etc.
	 * <p>
	 * Creates a new {@link DOMWalkInformationTagger} for the current connection's purl and
	 * runs its tagTree() on <code>rootTdNode</code>. Requires {@link #purlConnection} to be set.
	 * 
	 * @param rootTdNode	root of the DOM to walk.
	 * @param htmlType		passed through to the tagger's constructor.
	 * @return						the tagger, after the walk.
	 */
	public DOMWalkInformationTagger walkAndTagDom(Node rootTdNode, DOMParserInterface htmlType)
	{
		DOMWalkInformationTagger domTagger = new DOMWalkInformationTagger(purlConnection.getPurl(), htmlType);

		// walk through the HTML document object.
		// gather all paragraphText and image objects in the data structure.
		//FIXME -- get rid of this call and object!
		domTagger.tagTree(rootTdNode);
		return domTagger;
	}

	/**
	 * Extract Image and Text Surrogates while walk through DOM
	 * 
	 * historically was called as pprint() in JTidy. 
	 * 
	 * @param taggedDoc	tagger that has already walked the document.
	 * @param htmlType	receiver of the generated surrogates; null means nothing is generated.
	 */
	public void extractImageTextSurrogates(DOMWalkInformationTagger taggedDoc, DOMParserInterface htmlType)
	{
		Node contentBody = RecognizedDocumentStructure.recognizeContentBody(taggedDoc);
		ArrayList<ImgElement> imgNodes = taggedDoc.getAllImgNodes();

		recognizeDocumentStructureToGenerateSurrogate(htmlType, taggedDoc, contentBody, imgNodes);
	}

	/**
	 * Recognize the page type based on whether the page has contentBody node or not, 
	 * text length in the whole page, and whether the informative images reside in the page. 
	 * 
	 * Based on the recognized page type, it generates surrogates.  
	 * 
	 * @param htmlType						receiver of surrogates; when null this method does nothing.
	 * @param domWalkInfoTagger		source of total text length and paragraph texts.
	 * @param contentBody					recognized content body node, or null if none was found.
	 * @param imgNodes						all image elements found in the page.
	 */
	private void recognizeDocumentStructureToGenerateSurrogate(DOMParserInterface htmlType,
			DOMWalkInformationTagger domWalkInfoTagger, Node contentBody,
			ArrayList<ImgElement> imgNodes) 
	{
		// parse(PURLConnection, DOMParserInterface) tolerates a null htmlType; without this
		// guard the dereferences below would throw before that tolerance could matter.
		if (htmlType == null)
			return;

		RecognizedDocumentStructure pageCategory = null;

		if (contentBody != null)
		{
			// Content Pages
			pageCategory = new ContentPage(purl());
		}
		else
		{
			final int numImgNodes = imgNodes.size();
			if ((numImgNodes > 0) && ((domWalkInfoTagger.getTotalTxtLength() / numImgNodes) < 200))
			{
				// Little text per image: high probability to be an image-collection page
				pageCategory = new ImageCollectionPage(purl());
			}
			else if (numImgNodes != 0)
			{
				// Index Pages (include index-content pages)
				//FIXME -- should also look at text only pages & especially use link ratio as a feature!!!!
				pageCategory = new IndexPage(purl());
			}
		}
		TreeMap<Integer, ParagraphText> paragraphTextsTMap = domWalkInfoTagger.getParagraphTextsTMap();

		if (pageCategory != null)
		{
			pageCategory.generateSurrogates(contentBody, imgNodes, domWalkInfoTagger.getTotalTxtLength(), paragraphTextsTMap, htmlType);
		}

		// No Informative images are in this document. Form surrogate only with text.
		// We cannot tell whether the images in the pages are informative or not until downloading all, thus this is the case after we
		// look through all the images in the page and determine no image is worth displaying.
		if ((htmlType.numExtractedClippings() == 0) && (paragraphTextsTMap.size() > 0))
		{
			pageCategory = new TextOnlyPage(purl());
			pageCategory.generateSurrogates(contentBody, imgNodes, domWalkInfoTagger.getTotalTxtLength(), paragraphTextsTMap, htmlType);
		}
		if (pageCategory != null)
			htmlType.setRecognizedDocumentStructure(pageCategory.getClass());
	}


	/**
	 * Transform a set of AElements (HTML a) into a set of AnchorContexts.
	 * In some cases, an AElement may result in no entry, because the anchor text and anchor context are both empty.
	 * 
	 * @param anchorElements	anchors to convert.
	 * @param sourcePurl			passed to each constructed AnchorContext.
	 * 
	 * @return								the non-null results of {@link #constructAnchorContext(AElement, ParsedURL)}.
	 */
	public ArrayList<AnchorContext> buildAnchorContexts(ArrayList<AElement> anchorElements, ParsedURL sourcePurl)
	{
		ArrayList<AnchorContext> anchorNodeContexts = new ArrayList<AnchorContext>();

		for (AElement aElement : anchorElements)
		{
			AnchorContext aContext = constructAnchorContext(aElement, sourcePurl);
			if (aContext != null)
				anchorNodeContexts.add(aContext);
		}
		return anchorNodeContexts;
	}

	/**
	 * Given the a element from the HTML, get its anchor text (text between open and close a tags),
	 * and its anchor context (surrounding text). If either of these is not null, then return an
	 * AnchorContext object.
	 * 
	 * The surrounding text is defined as all the text in the a element's parent node.
	 * This definition should perhaps be expanded, for example, by trying grandparent if parent
	 * is either null or the same as anchor text.
	 * 
	 * @param aElement		Anchor HTMLElement (a href=...)
	 * @param sourcePurl	passed to the constructed AnchorContext.
	 * 
	 * @return						AnchorContext object, or null when the element has no href or no text was found.
	 */
	public AnchorContext constructAnchorContext(AElement aElement, ParsedURL sourcePurl)
	{
		Node anchorNodeNode	= aElement.getNode();
		ParsedURL href			= aElement.getHref();
		if (href == null)
			return null;

		// A missing parent is tolerated: getTextInSubTree() returns null for a null node.
		Node parent									= anchorNodeNode.getParentNode();
		//FIXME -- this routine drops all sorts of significant stuff because it does not concatenate across tags.
		StringBuilder anchorContext	= getTextInSubTree(parent, false);

		//TODO: provide ability to specify alternate anchorContext
		StringBuilder anchorText		= getTextInSubTree(anchorNodeNode, true);
		if ((anchorContext == null) && (anchorText == null))
			return null;

		String anchorContextString	= normalizeAndRelease(anchorContext);
		String anchorTextString			= normalizeAndRelease(anchorText);
		return new AnchorContext(href, anchorTextString, anchorContextString, sourcePurl, false, false);
	}

	/**
	 * Unescape XML in the buffer, lower-case it, convert it to a String, and release the buffer
	 * back to StringBuilderUtils.
	 * 
	 * @param buffy	buffer to consume; may be null.
	 * @return			the resulting String, or null if <code>buffy</code> is null.
	 */
	private static String normalizeAndRelease(StringBuilder buffy)
	{
		if (buffy == null)
			return null;

		XMLTools.unescapeXML(buffy);
		StringTools.toLowerCase(buffy);
		String result	= StringTools.toString(buffy);
		StringBuilderUtils.release(buffy);
		return result;
	}

	/**
	 * Convenience form of {@link #getTextinSubTree(Node, boolean, StringBuilder)} that starts
	 * with no accumulated text.
	 * 
	 * @param node		node whose children are scanned; may be null.
	 * @param recurse	true to descend into child elements.
	 * @return				the collected text, or null if none was collected.
	 */
	public static StringBuilder getTextInSubTree(Node node, boolean recurse)
	{
		return getTextinSubTree(node, recurse, null);
	}

	/**
	 * Collect text from the children of <code>node</code> into <code>result</code>, separating
	 * chunks with a single space.
	 * <p>
	 * Text nodes contribute their text. When <code>recurse</code> is false, child elements are
	 * not entered (so text inside a nested anchor is not included), but an <code>img</code>
	 * child contributes its alt text unless ImageFeatures considers it bogus. When
	 * <code>recurse</code> is true, every named child other than <code>script</code> is entered
	 * recursively; this includes <code>img</code>, so its alt text is not collected in that mode.
	 * <p>
	 * XMLTools.unescapeXML() is applied to the accumulated buffer before each return, so text
	 * gathered at deeper levels passes through it more than once.
	 * 
	 * @param node		node whose children are scanned; null yields <code>result</code> unchanged.
	 * @param recurse	true to descend into child elements.
	 * @param result	buffer to append to, or null to have one acquired on demand.
	 * @return				the buffer holding the collected text, or null if there was none.
	 */
	//FIXME -- why is text in anchor node not included?
	public static StringBuilder getTextinSubTree(Node node, boolean recurse, StringBuilder result)
	{
		// e.g. an anchor with no parent node
		if (node == null)
			return result;

		NodeList children = node.getChildNodes();
		for (int i=0; i<children.getLength(); i++)
		{
			Node childNode		= children.item(i);
			String childName	= childNode.getNodeName();

			// Text nodes are tested first: a standard DOM names them "#text", which would
			// otherwise send them down the recursive branch, where (having no children)
			// their text would be silently dropped.
			if (childNode.getNodeType() == Node.TEXT_NODE)
			{
				int length	= 0;
				if (result != null)
				{
					result.append(' ');							// append space to separate text chunks
					length		= result.length();
				}
				result			= StringBuilderUtils.trimAndDecodeUTF8(result, childNode, 0, true);

				// take the space off if nothing was appended; length > 0 only when a space
				// was actually added, which keeps setLength() from seeing -1.
				if ((length > 0) && (result != null) && (length == result.length()))
					result.setLength(length - 1);
			}
			else if (recurse && (childName != null) && !childName.equals("script"))
			{
				//Recursive call with the childNode
				result = getTextinSubTree(childNode, true, result);
			}
			else if ("img".equals(childName))
			{
				Node altAtt	= childNode.getAttributes().getNamedItem(ALT);
				String alt	= (altAtt != null) ? altAtt.getNodeValue() : null;
				// a missing alt must not end up in the text as the literal "null"
				if ((alt != null) && !ImageFeatures.altIsBogus(alt))
				{
					if (result == null)
						result		= StringBuilderUtils.acquire();
					else
						result.append(' ');
					result.append(alt);
				}
			}
		}
		if (result != null)
			XMLTools.unescapeXML(result);

		return result;
	}

	/**
	 * @return	the purl of the current {@link #purlConnection}, which must already be set.
	 */
	ParsedURL purl()
	{
		return purlConnection.getPurl();
	}

	/**
	 * Not implemented; the argument is ignored.
	 */
	@Override
	public void setQuiet(boolean b)
	{
		// TODO Auto-generated method stub

	}

	/**
	 * Not implemented; the argument is ignored.
	 */
	@Override
	public void setShowWarnings(boolean b)
	{
		// TODO Auto-generated method stub

	}

	/**
	 * Not implemented.
	 * 
	 * @return	always null.
	 */
	@Override
	public Document parseDOM(InputStream inputStream, OutputStream out)
	{
		// TODO Auto-generated method stub
		return null;
	}

	/**
	 * Not implemented.
	 * 
	 * @return	always null.
	 */
	@Override
	public String xPathTagNamesToLower(String xpath)
	{
		// TODO Auto-generated method stub
		return null;
	}

	/**
	 * Not implemented.
	 * 
	 * @return	always null.
	 */
	@Override
	public Document parseDOM(Reader reader, OutputStream out) throws IOException
	{
		// TODO Auto-generated method stub
		return null;
	}

}