/**
* Licensed to Cloudera, Inc. under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. Cloudera, Inc. licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.arabidopsis.ahocorasick;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Iterator;
import java.util.NoSuchElementException;
import java.util.Set;
import junit.framework.TestCase;
/**
* Junit test cases for AhoCorasick.
*/
@SuppressWarnings("unchecked")
public class TestAhoCorasick extends TestCase {
private AhoCorasick tree;
public void setUp() {
this.tree = new AhoCorasick();
}
public void testConstruction() {
tree.add("hello".getBytes(), "hello".getBytes());
tree.add("hi".getBytes(), "hi".getBytes());
tree.prepare();
State s0 = tree.getRoot();
State s1 = s0.get((byte) 'h');
State s2 = s1.get((byte) 'e');
State s3 = s2.get((byte) 'l');
State s4 = s3.get((byte) 'l');
State s5 = s4.get((byte) 'o');
State s6 = s1.get((byte) 'i');
assertTrue(s6 != null);
assertEquals(s0, s1.getFail());
assertEquals(s0, s2.getFail());
assertEquals(s0, s3.getFail());
assertEquals(s0, s4.getFail());
assertEquals(s0, s5.getFail());
assertEquals(s0, s6.getFail());
assertEquals(0, s0.getOutputs().size());
assertEquals(0, s1.getOutputs().size());
assertEquals(0, s2.getOutputs().size());
assertEquals(0, s3.getOutputs().size());
assertEquals(0, s4.getOutputs().size());
assertEquals(1, s5.getOutputs().size());
assertEquals(1, s6.getOutputs().size());
}
public void testExample() {
tree.add("he".getBytes(), "he".getBytes());
tree.add("she".getBytes(), "she".getBytes());
tree.add("his".getBytes(), "his".getBytes());
tree.add("hers".getBytes(), "hers".getBytes());
assertEquals(10, tree.getRoot().size());
tree.prepare(); // after prepare, we can't call size()
State s0 = tree.getRoot();
State s1 = s0.get((byte) 'h');
State s2 = s1.get((byte) 'e');
State s3 = s0.get((byte) 's');
State s4 = s3.get((byte) 'h');
State s5 = s4.get((byte) 'e');
State s6 = s1.get((byte) 'i');
State s7 = s6.get((byte) 's');
State s8 = s2.get((byte) 'r');
State s9 = s8.get((byte) 's');
assertEquals(s0, s1.getFail());
assertEquals(s0, s2.getFail());
assertEquals(s0, s3.getFail());
assertEquals(s0, s6.getFail());
assertEquals(s0, s8.getFail());
assertEquals(s1, s4.getFail());
assertEquals(s2, s5.getFail());
assertEquals(s3, s7.getFail());
assertEquals(s3, s9.getFail());
assertEquals(0, s1.getOutputs().size());
assertEquals(0, s3.getOutputs().size());
assertEquals(0, s4.getOutputs().size());
assertEquals(0, s6.getOutputs().size());
assertEquals(0, s8.getOutputs().size());
assertEquals(1, s2.getOutputs().size());
assertEquals(1, s7.getOutputs().size());
assertEquals(1, s9.getOutputs().size());
assertEquals(2, s5.getOutputs().size());
}
public void testStartSearchWithSingleResult() {
tree.add("apple".getBytes(), "apple".getBytes());
tree.prepare();
SearchResult result = tree.startSearch("washington cut the apple tree"
.getBytes());
assertEquals(1, result.getOutputs().size());
assertEquals("apple", new String((byte[]) result.getOutputs().iterator()
.next()));
assertEquals(24, result.getLastIndex());
assertEquals(null, tree.continueSearch(result));
}
public void testStartSearchWithAdjacentResults() {
tree.add("john".getBytes(), "john".getBytes());
tree.add("jane".getBytes(), "jane".getBytes());
tree.prepare();
SearchResult firstResult = tree.startSearch("johnjane".getBytes());
SearchResult secondResult = tree.continueSearch(firstResult);
assertEquals(null, tree.continueSearch(secondResult));
}
public void testStartSearchOnEmpty() {
tree.add("cipher".getBytes(), Integer.valueOf(0));
tree.add("zip".getBytes(), Integer.valueOf(1));
tree.add("nought".getBytes(), Integer.valueOf(2));
tree.prepare();
SearchResult result = tree.startSearch("".getBytes());
assertEquals(null, result);
}
public void testMultipleOutputs() {
tree.add("x".getBytes(), "x");
tree.add("xx".getBytes(), "xx");
tree.add("xxx".getBytes(), "xxx");
tree.prepare();
SearchResult result = tree.startSearch("xxx".getBytes());
assertEquals(1, result.getLastIndex());
assertEquals(new HashSet(Arrays.asList(new String[] { "x" })), result
.getOutputs());
result = tree.continueSearch(result);
assertEquals(2, result.getLastIndex());
assertEquals(new HashSet(Arrays.asList(new String[] { "xx", "x" })), result
.getOutputs());
result = tree.continueSearch(result);
assertEquals(3, result.getLastIndex());
assertEquals(new HashSet(Arrays.asList(new String[] { "xxx", "xx", "x" })),
result.getOutputs());
assertEquals(null, tree.continueSearch(result));
}
public void testIteratorInterface() {
tree.add("moo".getBytes(), "moo");
tree.add("one".getBytes(), "one");
tree.add("on".getBytes(), "on");
tree.add("ne".getBytes(), "ne");
tree.prepare();
Iterator iter = tree.search("one moon ago".getBytes());
assertTrue(iter.hasNext());
SearchResult r = (SearchResult) iter.next();
assertEquals(new HashSet(Arrays.asList(new String[] { "on" })), r
.getOutputs());
assertEquals(2, r.getLastIndex());
assertTrue(iter.hasNext());
r = (SearchResult) iter.next();
assertEquals(new HashSet(Arrays.asList(new String[] { "one", "ne" })), r
.getOutputs());
assertEquals(3, r.getLastIndex());
assertTrue(iter.hasNext());
r = (SearchResult) iter.next();
assertEquals(new HashSet(Arrays.asList(new String[] { "moo" })), r
.getOutputs());
assertEquals(7, r.getLastIndex());
assertTrue(iter.hasNext());
r = (SearchResult) iter.next();
assertEquals(new HashSet(Arrays.asList(new String[] { "on" })), r
.getOutputs());
assertEquals(8, r.getLastIndex());
assertFalse(iter.hasNext());
try {
iter.next();
fail();
} catch (NoSuchElementException e) {
}
}
public void largerTextExample() {
String text = "The ga3 mutant of Arabidopsis is a gibberellin-responsive dwarf. We present data showing that the ga3-1 mutant is deficient in ent-kaurene oxidase activity, the first cytochrome P450-mediated step in the gibberellin biosynthetic pathway. By using a combination of conventional map-based cloning and random sequencing we identified a putative cytochrome P450 gene mapping to the same location as GA3. Relative to the progenitor line, two ga3 mutant alleles contained single base changes generating in-frame stop codons in the predicted amino acid sequence of the P450. A genomic clone spanning the P450 locus complemented the ga3-2 mutant. The deduced GA3 protein defines an additional class of cytochrome P450 enzymes. The GA3 gene was expressed in all tissues examined, RNA abundance being highest in inflorescence tissue.";
String[] terms = { "microsome", "cytochrome", "cytochrome P450 activity",
"gibberellic acid biosynthesis", "GA3", "cytochrome P450",
"oxygen binding", "AT5G25900.1", "protein", "RNA", "gibberellin",
"Arabidopsis", "ent-kaurene oxidase activity", "inflorescence",
"tissue", };
for (int i = 0; i < terms.length; i++) {
tree.add(terms[i].getBytes(), terms[i]);
}
tree.prepare();
Set termsThatHit = new HashSet();
for (Iterator iter = tree.search(text.getBytes()); iter.hasNext();) {
SearchResult result = (SearchResult) iter.next();
termsThatHit.addAll(result.getOutputs());
}
assertEquals(new HashSet(Arrays.asList(new String[] { "cytochrome", "GA3",
"cytochrome P450", "protein", "RNA", "gibberellin", "Arabidopsis",
"ent-kaurene oxidase activity", "inflorescence", "tissue", })),
termsThatHit);
}
}