/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.nutch.util;
import java.io.ByteArrayInputStream;
import junit.framework.TestCase;
import org.apache.xerces.parsers.DOMParser;
import org.w3c.dom.Node;
import org.xml.sax.InputSource;
/**
 * Unit tests for NodeWalker methods.
 *
 * <p>The test parses a small XHTML snapshot with the Xerces {@code DOMParser},
 * walks the resulting DOM with {@code NodeWalker}, and checks that calling
 * {@code skipChildren()} on the {@code <ul>} element keeps the text of its
 * list items out of the collected output.
 */
public class TestNodeWalker extends TestCase {

  /* a snapshot of the nutch webpage */
  private final static String WEBPAGE=
      "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Strict//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd\">"
      + "<html xmlns=\"http://www.w3.org/1999/xhtml\" lang=\"en\" xml:lang=\"en\"><head><title>Nutch</title></head>"
      + "<body>"
      + "<ul>"
      + "<li>crawl several billion pages per month</li>"
      + "<li>maintain an index of these pages</li>"
      + "<li>search that index up to 1000 times per second</li>"
      + "<li>provide very high quality search results</li>"
      + "<li>operate at minimal cost</li>"
      + "</ul>"
      + "</body>"
      + "</html>";

  /**
   * Text of every {@code <li>} inside the {@code <ul>} of {@link #WEBPAGE}.
   * All five items are listed so that a single leaked item is enough to
   * fail the skip assertion.
   */
  private final static String[] ULCONTENT = {
      "crawl several billion pages per month",
      "maintain an index of these pages",
      "search that index up to 1000 times per second",
      "provide very high quality search results",
      "operate at minimal cost"
  };

  public TestNodeWalker(String name) {
    super(name);
  }

  /**
   * Walks the parsed page twice: once visiting every node, once calling
   * {@code skipChildren()} on the {@code <ul>} element. The first pass must
   * see list-item text, the second pass must not.
   *
   * @throws Exception if the page cannot be parsed; propagated so the test
   *         fails with the real cause instead of a later, unrelated error
   */
  public void testSkipChildren() throws Exception {
    DOMParser parser = new DOMParser();
    // NOTE(review): WEBPAGE's DOCTYPE names an external DTD; whether Xerces
    // fetches it during parse depends on parser configuration - confirm the
    // test does not need network access.
    // The document carries no encoding declaration, so encode explicitly as
    // UTF-8 rather than relying on the platform default charset.
    parser.parse(new InputSource(new ByteArrayInputStream(WEBPAGE.getBytes("UTF-8"))));
    Node document = parser.getDocument();
    assertNotNull("Parser produced no document", document);

    String fullText = collectText(document, false);
    assertTrue("UL Content can NOT be found in the node", findSomeUlContent(fullText));

    String textWithoutUl = collectText(document, true);
    assertFalse("UL Content can be found in the node", findSomeUlContent(textWithoutUl));
  }

  /**
   * Concatenates the value of every text node visited by a fresh
   * {@code NodeWalker} over {@code root}, collapsing each run of whitespace
   * into a single space.
   *
   * @param root node the walker starts from
   * @param skipUlChildren when {@code true}, {@code skipChildren()} is called
   *        on the walker each time a node named "ul" (case-insensitive) is
   *        returned
   * @return the concatenated, whitespace-normalized text
   */
  private static String collectText(Node root, boolean skipUlChildren) {
    StringBuilder collected = new StringBuilder();
    NodeWalker walker = new NodeWalker(root);
    while (walker.hasNext()) {
      Node current = walker.nextNode();
      if (skipUlChildren && "ul".equalsIgnoreCase(current.getNodeName())) {
        walker.skipChildren();
      }
      if (current.getNodeType() == Node.TEXT_NODE) {
        collected.append(current.getNodeValue().replaceAll("\\s+", " "));
      }
    }
    return collected.toString();
  }

  /**
   * Tells whether {@code str} contains the text of at least one list item
   * from {@link #ULCONTENT}.
   *
   * @param str text to search
   * @return {@code true} if any list-item text occurs in {@code str},
   *         {@code false} otherwise
   */
  public boolean findSomeUlContent(String str) {
    for (String item : ULCONTENT) {
      if (str.contains(item)) {
        return true;
      }
    }
    return false;
  }
}