package org.apache.lucene.analysis.charfilter; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.io.BufferedReader; import java.io.File; import java.io.FileReader; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.io.Reader; import java.io.StringReader; import java.util.HashSet; import java.util.Set; import org.apache.lucene.analysis.CharReader; import org.apache.lucene.util.LuceneTestCase; public class HTMLStripCharFilterTest extends LuceneTestCase { public HTMLStripCharFilterTest(String s) { super(s); } @Override protected void setUp() throws Exception { super.setUp(); } @Override protected void tearDown() throws Exception { super.tearDown(); } //this is some text here is a link and another link . This is an entity: & plus a <. Here is an & // public void test() throws IOException { String html = "<div class=\"foo\">this is some text</div> here is a <a href=\"#bar\">link</a> and " + "another <a href=\"http://lucene.apache.org/\">link</a>. " + "This is an entity: & plus a <. Here is an &. <!-- is a comment -->"; String gold = " this is some text here is a link and " + "another link . " + "This is an entity: & plus a <. Here is an &. "; HTMLStripCharFilter reader = new HTMLStripCharFilter(CharReader.get(new StringReader(html))); StringBuilder builder = new StringBuilder(); int ch = -1; char [] goldArray = gold.toCharArray(); int position = 0; while ((ch = reader.read()) != -1){ char theChar = (char) ch; builder.append(theChar); assertTrue("\"" + theChar + "\"" + " at position: " + position + " does not equal: " + goldArray[position] + " Buffer so far: " + builder + "<EOB>", theChar == goldArray[position]); position++; } assertEquals(gold, builder.toString()); } //Some sanity checks, but not a full-fledged check public void testHTML() throws Exception { InputStream stream = getClass().getResourceAsStream("htmlStripReaderTest.html"); HTMLStripCharFilter reader = new HTMLStripCharFilter(CharReader.get(new InputStreamReader(stream, "UTF-8"))); StringBuilder builder = new StringBuilder(); int ch = -1; while ((ch = reader.read()) != -1){ builder.append((char)ch); } String str = builder.toString(); assertTrue("Entity not properly escaped", str.indexOf("<") == -1);//there is one > in the text assertTrue("Forrest should have been stripped out", str.indexOf("forrest") == -1 && str.indexOf("Forrest") == -1); assertTrue("File should start with 'Welcome to Solr' after trimming", str.trim().startsWith("Welcome to Solr")); assertTrue("File should start with 'Foundation.' after trimming", str.trim().endsWith("Foundation.")); } public void testGamma() throws Exception { String test = "Γ"; String gold = "\u0393"; Set<String> set = new HashSet<String>(); set.add("reserved"); Reader reader = new HTMLStripCharFilter(CharReader.get(new StringReader(test)), set); StringBuilder builder = new StringBuilder(); int ch = 0; while ((ch = reader.read()) != -1){ builder.append((char)ch); } String result = builder.toString(); // System.out.println("Resu: " + result + "<EOL>"); // System.out.println("Gold: " + gold + "<EOL>"); assertTrue(result + " is not equal to " + gold + "<EOS>", result.equals(gold) == true); } public void testEntities() throws Exception { String test = "  <foo> Übermensch = Γ bar Γ"; String gold = " <foo> \u00DCbermensch = \u0393 bar \u0393"; Set<String> set = new HashSet<String>(); set.add("reserved"); Reader reader = new HTMLStripCharFilter(CharReader.get(new StringReader(test)), set); StringBuilder builder = new StringBuilder(); int ch = 0; while ((ch = reader.read()) != -1){ builder.append((char)ch); } String result = builder.toString(); // System.out.println("Resu: " + result + "<EOL>"); // System.out.println("Gold: " + gold + "<EOL>"); assertTrue(result + " is not equal to " + gold + "<EOS>", result.equals(gold) == true); } public void testMoreEntities() throws Exception { String test = "  <junk/>   ! @ and ’"; String gold = " <junk/> ! @ and ’"; Set<String> set = new HashSet<String>(); set.add("reserved"); Reader reader = new HTMLStripCharFilter(CharReader.get(new StringReader(test)), set); StringBuilder builder = new StringBuilder(); int ch = 0; while ((ch = reader.read()) != -1){ builder.append((char)ch); } String result = builder.toString(); // System.out.println("Resu: " + result + "<EOL>"); // System.out.println("Gold: " + gold + "<EOL>"); assertTrue(result + " is not equal to " + gold, result.equals(gold) == true); } public void testReserved() throws Exception { String test = "aaa bbb <reserved ccc=\"ddddd\"> eeee </reserved> ffff <reserved ggg=\"hhhh\"/> <other/>"; Set<String> set = new HashSet<String>(); set.add("reserved"); Reader reader = new HTMLStripCharFilter(CharReader.get(new StringReader(test)), set); StringBuilder builder = new StringBuilder(); int ch = 0; while ((ch = reader.read()) != -1){ builder.append((char)ch); } String result = builder.toString(); // System.out.println("Result: " + result); assertTrue("Escaped tag not preserved: " + result.indexOf("reserved"), result.indexOf("reserved") == 9); assertTrue("Escaped tag not preserved: " + result.indexOf("reserved", 15), result.indexOf("reserved", 15) == 38); assertTrue("Escaped tag not preserved: " + result.indexOf("reserved", 41), result.indexOf("reserved", 41) == 54); assertTrue("Other tag should be removed", result.indexOf("other") == -1); } public void testMalformedHTML() throws Exception { String test = "a <a hr<ef=aa<a>> </close</a>"; String gold = "a <a hr<ef=aa > </close "; Reader reader = new HTMLStripCharFilter(CharReader.get(new StringReader(test))); StringBuilder builder = new StringBuilder(); int ch = 0; while ((ch = reader.read()) != -1){ builder.append((char)ch); } String result = builder.toString(); // System.out.println("Resu: " + result + "<EOL>"); // System.out.println("Gold: " + gold + "<EOL>"); assertTrue(result + " is not equal to " + gold + "<EOS>", result.equals(gold) == true); } public void testBufferOverflow() throws Exception { StringBuilder testBuilder = new StringBuilder(HTMLStripCharFilter.DEFAULT_READ_AHEAD + 50); testBuilder.append("ah<?> "); appendChars(testBuilder, HTMLStripCharFilter.DEFAULT_READ_AHEAD + 500); processBuffer(testBuilder.toString(), "Failed on pseudo proc. instr.");//processing instructions testBuilder.setLength(0); testBuilder.append("<!--");//comments appendChars(testBuilder, 3*HTMLStripCharFilter.DEFAULT_READ_AHEAD + 500);//comments have two lookaheads testBuilder.append("-->foo"); processBuffer(testBuilder.toString(), "Failed w/ comment"); testBuilder.setLength(0); testBuilder.append("<?"); appendChars(testBuilder, HTMLStripCharFilter.DEFAULT_READ_AHEAD + 500); testBuilder.append("?>"); processBuffer(testBuilder.toString(), "Failed with proc. instr."); testBuilder.setLength(0); testBuilder.append("<b "); appendChars(testBuilder, HTMLStripCharFilter.DEFAULT_READ_AHEAD + 500); testBuilder.append("/>"); processBuffer(testBuilder.toString(), "Failed on tag"); } private void appendChars(StringBuilder testBuilder, int numChars) { int i1 = numChars / 2; for (int i = 0; i < i1; i++){ testBuilder.append('a').append(' ');//tack on enough to go beyond the mark readahead limit, since <?> makes HTMLStripCharFilter think it is a processing instruction } } private void processBuffer(String test, String assertMsg) throws IOException { // System.out.println("-------------------processBuffer----------"); Reader reader = new HTMLStripCharFilter(CharReader.get(new BufferedReader(new StringReader(test))));//force the use of BufferedReader int ch = 0; StringBuilder builder = new StringBuilder(); try { while ((ch = reader.read()) != -1){ builder.append((char)ch); } } finally { // System.out.println("String (trimmed): " + builder.toString().trim() + "<EOS>"); } assertTrue(assertMsg + "::: " + builder.toString() + " is not equal to " + test, builder.toString().equals(test) == true); } public void testComment() throws Exception { String test = "<!--- three dashes, still a valid comment ---> "; String gold = " "; Reader reader = new HTMLStripCharFilter(CharReader.get(new BufferedReader(new StringReader(test))));//force the use of BufferedReader int ch = 0; StringBuilder builder = new StringBuilder(); try { while ((ch = reader.read()) != -1){ builder.append((char)ch); } } finally { // System.out.println("String: " + builder.toString()); } assertTrue(builder.toString() + " is not equal to " + gold + "<EOS>", builder.toString().equals(gold) == true); } public void doTestOffsets(String in) throws Exception { HTMLStripCharFilter reader = new HTMLStripCharFilter(CharReader.get(new BufferedReader(new StringReader(in)))); int ch = 0; int off = 0; // offset in the reader int strOff = -1; // offset in the original string while ((ch = reader.read()) != -1) { int correctedOff = reader.correctOffset(off); if (ch == 'X') { strOff = in.indexOf('X',strOff+1); assertEquals(strOff, correctedOff); } off++; } } public void testOffsets() throws Exception { doTestOffsets("hello X how X are you"); doTestOffsets("hello <p> X<p> how <p>X are you"); doTestOffsets("X & X ( X < > X"); // test backtracking doTestOffsets("X < &zz >X &# < X > < &l > &g < X"); } }