// ContentBodyRecognize.java example
//
// Explorer
// BigSemanticsJava-master
package ecologylab.bigsemantics.html.standalone;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.io.StringWriter;
import java.net.MalformedURLException;

import org.w3c.dom.Node;

import ecologylab.bigsemantics.html.DOMWalkInformationTagger;
import ecologylab.bigsemantics.html.ImgElement;
import ecologylab.bigsemantics.html.documentstructure.RecognizedDocumentStructure;
import ecologylab.net.PURLConnection;
import ecologylab.net.ParsedURL;


@Deprecated
public class ContentBodyRecognize extends OldHTMLDOMParser
{
	// Document-structure holder for the page being processed. In this class it is assigned
	// only by parse(PURLConnection) and read by informativeImages(); it stays null until
	// parse(PURLConnection) has been called.
	RecognizedDocumentStructure recPageType;
	
	/**
	 * Parses the document behind the given connection.
	 * <p>
	 * Before delegating to the superclass parser, a new {@link RecognizedDocumentStructure}
	 * is created for the connection's purl and stored in {@link #recPageType}.
	 *
	 * @param purlConnection connection to the document to parse
	 * @return the document produced by the superclass {@code parse}
	 * @throws IOException propagated from the superclass {@code parse}
	 */
	@Override
	public org.w3c.dom.Document parse(PURLConnection purlConnection) throws IOException
	{
		this.recPageType = new RecognizedDocumentStructure(purlConnection.getPurl());

		final org.w3c.dom.Document parsedDocument = super.parse(purlConnection);
		return parsedDocument;
	}

	/**
	 * Tags the DOM tree of {@code doc} and returns the node recognized as its content body.
	 * <p>
	 * The document element is walked with a {@link DOMWalkInformationTagger}, and the tagger
	 * is then handed to {@link RecognizedDocumentStructure#recognizeContentBody}. When a
	 * content body is found, {@link #informativeImages()} is run as well.
	 *
	 * @param doc  parsed document; when it is null or has no document element, nothing is
	 *             tagged and null is returned
	 * @param out  not used by this method; kept so existing callers keep compiling
	 * @param purl location of the document, passed on to the tagger
	 * @return the node reported by {@code recognizeContentBody}, or null when no content
	 *         body was recognized or there was no document to walk
	 */
	public Node pprint(org.w3c.dom.Document doc, OutputStream out, ParsedURL purl)
	{
		if (doc == null)
			return null;

		Node document = doc.getDocumentElement();
		if (document == null)
			return null;

		DOMWalkInformationTagger pprint = new DOMWalkInformationTagger(purl, null);
		pprint.tagTree(document);

		Node articleMain = RecognizedDocumentStructure.recognizeContentBody(pprint);
		if (articleMain != null)
		{
			// Collecting the images below articleMain is currently disabled:
			//   recPageType.findImgsInContentBodySubTree(articleMain.parent(), imgNodes);
			informativeImages();
		}

		return articleMain;
	}

	/**
	 * Reduces the content-body image list of {@link #recPageType} to the images that look
	 * informative, i.e. that pass {@link #isInformativeImage(ImgElement)}.
	 * <p>
	 * The earlier version of this loop appended every accepted image to the same list it was
	 * iterating over, so a single accepted image kept the loop running and the list growing
	 * without bound. The list is now filtered in place instead: rejected images are removed
	 * and accepted images are left where they are.
	 * <p>
	 * Does nothing when {@code recPageType} (or its image list) is null, which is the case
	 * whenever {@code parse(PURLConnection)} has not been called on this instance.
	 */
	protected void informativeImages()
	{
		if (recPageType == null || recPageType.getImgNodesInContentBody() == null)
			return;

		// Walk backwards so that remove(i) never shifts an element that is still to be visited.
		for (int i = recPageType.getImgNodesInContentBody().size() - 1; i >= 0; i--)
		{
			ImgElement imgElement = recPageType.getImgNodesInContentBody().get(i);
			if (!isInformativeImage(imgElement))
				recPageType.getImgNodesInContentBody().remove(i);
		}
	}

	/**
	 * Decides whether an image looks like article content rather than an ad or decoration.
	 *
	 * @param imgElement image to inspect
	 * @return false when the alt text contains "advertis" (ignoring case), or when a width or
	 *         height other than -1 is below 100; true otherwise
	 */
	private static boolean isInformativeImage(ImgElement imgElement)
	{
		// Advertisement keyword in the "alt" value. Locale.ROOT keeps the lower-casing
		// independent of the default locale (e.g. the Turkish dotless i).
		String altStr = imgElement.getAlt();
		if (altStr != null && altStr.toLowerCase(java.util.Locale.ROOT).contains("advertis"))
			return false;

		//FIXME -- andruid -- restore this!!!
		// Rejection by URL path chunk is disabled: split the image URL (imgElement.getSrc())
		// on "/" and reject when a lower-cased chunk equals "adv" or contains "advertis".
		//FIXME -- use compiled regex!

		// NOTE(review): -1 presumably marks an unspecified dimension -- confirm in ImgElement.
		int width = imgElement.getWidth();
		int height = imgElement.getHeight();
		boolean tooNarrow = (width != -1 && width < 100);
		boolean tooShort = (height != -1 && height < 100);

		return !(tooNarrow || tooShort);
	}


	/**
	 * Stub for checking a recognized content body against a label file.
	 * <p>
	 * The whole comparison logic below is commented out, so neither parameter is read and
	 * the method currently returns "no" for every input.
	 *
	 * @param labelFilePurl location of the label file; unused by the live code
	 * @param contentBodyID tag_id of the recognized content-body node; unused by the live code
	 * @return always the string "no"
	 */
	protected String getContentBody(ParsedURL labelFilePurl, String contentBodyID)
	{
		// Disabled: load the label document and compare contentBodyID with the tag_id of the
		// main partition and of each informative text, returning "yes" on a match.
		/*		try 
		{
			DocumentState ds = (DocumentState) ElementState.translateFromXML(labelFile, TranslationScope.get("collectionBrowseServlet", "collectionBrowseServlet"));
			if( ds!=null )
			{
				totalLabeledDocument++;

				PartitionState partitionState = ds.getPartitionSet().get(0);

				String mainPartitionTag_ID = partitionState.getTag_id();
System.out.println("contentBody Tag_ID : " + contentBodyID + "   mainPartitionTag_ID:" + mainPartitionTag_ID);
				if( contentBodyID.equals(mainPartitionTag_ID) )
				{
					System.out.println("ContentBody equals to main partition Tag_ID");
					correctContentBody++;
					return "yes";
				}


				InformTextSet informTextSet = partitionState.getInformTextSet();
				for( int i=0; i<informTextSet.size(); i++ )
				{
					InformTextState informText = informTextSet.get(i);
					String informTextID = informText.getTag_id();
System.out.println("informTextID : " + informTextID);					
					if( contentBodyID.equals(informTextID) )
					{
						System.out.println("Main partition Tag_ID equals to informTextID ");
						correctContentBody++;
						return "yes";
					}
				}
		 */				
		// Disabled: compare the tag_id of each labeled informative image with the tag_id of
		// each recognized article image node.
		/*
				InformImgSet informImgSet = partitionState.getInformImgSet();
				for( int i=0; i<informImgSet.size(); i++ )
				{
					InformImgState informImgState = informImgSet.get(i);
					String informImgTag_ID = informImgState.getTag_id();
					ArrayList imageNodes = this.getArticleImgNodes();
					for(int j=0; j<imageNodes.size(); j++ )
					{
						TdNode node = (TdNode) imageNodes.get(j);
						String imgNodeID = node.getAttrByName("tag_id").value;
//System.out.println(" ImageNodeID=" + imgNodeID + "   informImgTag_ID=" + informImgTag_ID);						
						if( imgNodeID.equals(informImgTag_ID) )
						{
							correctImage++;
							return "yes";
						}
					}
				}
		 */

		// Disabled: closing braces and exception handler of the try block above.
		/*			}
		} 
		catch (XMLTranslationException e) 
		{
			e.printStackTrace();
		}
		 */
		// With everything above disabled, no match is ever reported.
		return "no";
	}

	// Evaluation counters printed by main() on its "STAT" line. The only increments visible
	// in this file sit inside the commented-out body of getContentBody(), so as the code
	// stands all three remain 0.
	int totalLabeledDocument = 0;
	int correctContentBody = 0;
	int correctImage = 0;
	
	// Base URL that main() resolves each line of its input file against.
	// NOTE(review): the name misspells "COLLECTION" and the declaration ends in a stray
	// second ';' -- both are harmless to the compiler; renaming needs a check for other users.
	static final ParsedURL TEST_COLLECITON_BASE	= ParsedURL.getAbsolute("http://csdll.cs.tamu.edu:9080/TestCollections/websites/ResearchArticle/");;

	///*
	/**
	 * Batch driver: reads one site folder name per line from "researchSites.txt", resolves
	 * it against {@link #TEST_COLLECITON_BASE}, runs content-body recognition on that page
	 * and checks the recognized node's "tag_id" against the page's "label.xml" through
	 * {@link #getContentBody(ParsedURL, String)}. A "STAT" line with the counters is printed
	 * after the last page.
	 * <p>
	 * Differences from the earlier version of this method:
	 * <ul>
	 * <li>the input file reader is closed when the run ends;</li>
	 * <li>blank lines in the input file are skipped;</li>
	 * <li>a page whose connection is null is reported and skipped instead of failing with a
	 * NullPointerException;</li>
	 * <li>connecting and parsing happen inside the per-page try block, so connections are
	 * recycled even when parsing throws, and a FileNotFoundException for the page itself
	 * skips that page rather than ending the run;</li>
	 * <li>a content-body node without an attribute map is treated like a node without a
	 * "tag_id" attribute.</li>
	 * </ul>
	 *
	 * @param args ignored
	 */
	public static void main(String args[])
	{
		ContentBodyRecognize cbr = new ContentBodyRecognize();
		BufferedReader myInput = null;
		try 
		{
			File ff = new File( "researchSites.txt"); //"folderList.txt" )  
			myInput = new BufferedReader(new InputStreamReader(new FileInputStream(ff)));

			String temp = null;
			while( (temp=myInput.readLine())!=null )
			{
				String siteFolder = temp.trim();
				if (siteFolder.length() == 0)
					continue;	// a blank line would otherwise resolve to the collection base itself

				ParsedURL purl = TEST_COLLECITON_BASE.getRelative(siteFolder + "/");
				ParsedURL labelPurl = purl.getRelative("label.xml");
				System.out.println(purl.toString());

				PURLConnection purlConnection = null;
				PURLConnection labelConnection = null;
				try
				{
					purlConnection = purl.connect();
					if (purlConnection == null)
					{
						System.out.println("Could not connect to " + purl);
						continue;
					}

					Node contentBodyNode = cbr.pprint( cbr.parseDOM(purlConnection.inputStream(), null), null, purl);
					labelConnection = labelPurl.connect();
					Node tagIdAttribute = tagIdAttributeOf(contentBodyNode);

					if ((labelConnection != null) && (labelConnection.urlConnection().getContent()!=null)
							&& (tagIdAttribute != null))
					{
						String returnVal = cbr.getContentBody(labelPurl, tagIdAttribute.getNodeValue());
						if( returnVal.equals("no") )
							System.out.println("WHY NOT THIS!!!!!!!!!!! " + purl);
					}
					else 
						System.out.println("NO LABE for this document");
				}
				catch(FileNotFoundException e)
				{
					// Missing page or label file: move on to the next site.
					continue;
				}
				finally
				{
					if (purlConnection != null)
						purlConnection.recycle();
					if (labelConnection != null)
						labelConnection.recycle();
				}

				System.out.println("\n");
			}

			System.out.println("STAT : " + cbr.totalLabeledDocument + " : " + cbr.correctContentBody
					+ " :  images = " + cbr.correctImage );
		} 
		catch (MalformedURLException e) 
		{
			e.printStackTrace();
		} 
		catch (IOException e) 
		{
			e.printStackTrace();
		}
		finally
		{
			if (myInput != null)
			{
				try
				{
					myInput.close();
				}
				catch (IOException e)
				{
					e.printStackTrace();
				}
			}
		}
	}

	/**
	 * Looks up the "tag_id" attribute of a node.
	 *
	 * @param node node to inspect; may be null
	 * @return the "tag_id" attribute node, or null when {@code node} is null, has no
	 *         attribute map, or has no such attribute
	 */
	private static Node tagIdAttributeOf(Node node)
	{
		if (node == null || node.getAttributes() == null)
			return null;
		return node.getAttributes().getNamedItem("tag_id");
	}

	//	*/
	/*
	public static void main(String args[])
	{
		ContentBodyRecognize cbr = new ContentBodyRecognize();
		String urlString = "http://csdll.cs.tamu.edu:9080/TestCollections/websites/Article/1204410974562/";
		String labelURLStr = urlString + "label.xml";

		try{
			URL url = new URL(urlString);
			System.out.println(urlString);
			InputStream in = url.openConnection().getInputStream();
			TdNode contentBodyNode = cbr.pprint( cbr.parseDOM(in, null), null, urlString); 
	System.out.println("\n\n" + urlString );				
			URL labelURL = new URL(labelURLStr);

			if( (labelURL!=null) && (labelURL.openConnection()!=null) && (labelURL.getContent()!=null) 
					&& (contentBodyNode!=null) && (contentBodyNode.getAttrByName("tag_id")!=null) )
			{
				String returnVal=cbr.getContentBody(labelURL, contentBodyNode.getAttrByName("tag_id").value);
				if( returnVal.equals("no") )
					System.out.println("WHY NOT THIS!!!!!!!!!!! " + urlString);
			}
			else 
				System.out.println("NO LABE for this document");

		}catch(Exception e)
		{
			e.printStackTrace();
		}
		System.out.println("\n");

	}
	 */
}