/*
* Copyright 2008 Glencoe Software, Inc. All rights reserved.
* Use is subject to license terms supplied in LICENSE.txt
*/
package ome.server.utests;

import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import ome.services.fulltext.FullTextAnalyzer;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.Searcher;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;

import org.jmock.MockObjectTestCase;
import org.testng.annotations.Test;
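
/**
 * Tests {@link FullTextAnalyzer} tokenization of file names and paths, and
 * verifies that queries parsed with the same analyzer match the indexed
 * documents as expected.
 */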
@Test(timeOut = 1000) // Lucene initialization takes longer than the default 200ms.
public class TokenizationTest extends MockObjectTestCase {
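
    /**
     * Asserts that tokenizing {@code text} with the {@link FullTextAnalyzer}
     * produces exactly the given tokens, in order.
     */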
    void assertTokenizes(String text, String... tokens) {
        List<Token> results = tokenize(text);
        assertEquals(tokens.length, results.size());
        for (int i = 0; i < tokens.length; i++) {
            String term = results.get(i).term();
            assertEquals(String.format("%s!=%s:%s", tokens[i], term,
                    results.toString()), tokens[i], term);
        }
    }
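
    /**
     * ticket:3164: bare extensions and wildcard prefixes such as ".tif" and
     * "*.tif" should still reduce to the extension token itself.
     */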
    @Test(groups = "ticket:3164")
    public void testProperHandlingOfFileNames() {
        assertTokenizes(".tif", "tif");
        assertTokenizes("*.tif", "tif");
        assertTokenizes("s*.tif", "s", "tif");
    }
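
    /**
     * Separator characters (whitespace, '/', '|', ';', '-', '_', '.') split
     * terms, and all resulting tokens are lower-cased.
     */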
    @Test
    public void testDefaults() {
        assertTokenizes("foo bar", "foo", "bar");
        assertTokenizes("foo/bar", "foo", "bar");
        assertTokenizes("foo||bar", "foo", "bar");
        assertTokenizes("foo;;bar", "foo", "bar");
        assertTokenizes("foo||bar;;qaz", "foo", "bar", "qaz");
        assertTokenizes("foo-bar", "foo", "bar");
        assertTokenizes("foo_bar", "foo", "bar");
        assertTokenizes("foo.bar", "foo", "bar");
        assertTokenizes("U.S.A.", "u", "s", "a");
        assertTokenizes("26.8.06-antiCSFV/CSFV-GFP/CSFV-GFP01_1_R3D_D3D.dv",
                "26", "8", "06", "anticsfv", "csfv", "gfp", "csfv", "gfp01",
                "1", "r3d", "d3d", "dv");
        assertTokenizes("...FRAP-23.8.05/IAGFP-Noc05_R3D.dv", "frap", "23",
                "8", "05", "iagfp", "noc05", "r3d", "dv");
        assertTokenizes("will/Desktop/CSFV-GFP01_3_R3D_D3D.dv", "will",
                "desktop", "csfv", "gfp01", "3", "r3d", "d3d", "dv");
        assertTokenizes("Documents/biology-data/CSFV-GFP01_3_R3D_D3D.dv",
                "documents", "biology", "data", "csfv", "gfp01", "3", "r3d",
                "d3d", "dv");
    }
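
    /**
     * Indexes a few GFP file-name-like strings into an in-memory index and
     * checks that queries parsed with the same analyzer return the expected
     * number of hits.
     */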
    @Test
    public void testTokenizationWithQuery() throws Exception {
        Searcher searcher = null;
        try {
            Directory directory = new RAMDirectory();
            Analyzer analyzer = new FullTextAnalyzer();
            IndexWriter writer = new IndexWriter(directory, analyzer,
                    IndexWriter.MaxFieldLength.UNLIMITED);
            String[] docs = { "GFP-CSFV-abc", "GFP-H2B-123", "GFP_H2B-456" };
            addDocuments(writer, docs);
            searcher = new IndexSearcher(directory);

            Map<String, Integer> queryToResults = new HashMap<String, Integer>();
            queryToResults.put("GFP", 3);
            queryToResults.put("GFP*", 3);
            queryToResults.put("GFP-H2B", 2);
            queryToResults.put("\"GFP H2B\"", 2);
            queryToResults.put("\"H2B GFP\"", 0);

            QueryParser parser = new QueryParser("contents", analyzer);
            for (String queryStr : queryToResults.keySet()) {
                Query query = parser.parse(queryStr);
                System.out.println("Query: " + query.toString("contents"));
                ScoreDoc[] hits = searcher.search(query, null, docs.length).scoreDocs;
                assertEquals(queryStr, queryToResults.get(queryStr).intValue(),
                        hits.length);
                System.out.println(hits.length + " total results");
            }
        } finally {
            if (searcher != null) {
                searcher.close();
            }
        }
    }
    // Helpers
    // =============================================================

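    /**
     * Adds each string as a single analyzed "contents" field and closes the
     * writer so that the documents are committed to the index.
     */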
    private void addDocuments(IndexWriter writer, String[] docs)
            throws CorruptIndexException, IOException {
        for (int j = 0; j < docs.length; j++) {
            Document d = new Document();
            d.add(new Field("contents", docs[j], Field.Store.YES,
                    Field.Index.ANALYZED));
            writer.addDocument(d);
        }
        writer.close();
    }
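
    /**
     * Runs the given string through {@link FullTextAnalyzer} and collects the
     * resulting tokens.
     */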
    private List<Token> tokenize(String a) {
        // StandardAnalyzer sa = new StandardAnalyzer();
        FullTextAnalyzer sa = new FullTextAnalyzer();
        TokenStream ts = sa.tokenStream("field", new StringReader(a));
        List<Token> tokens = new ArrayList<Token>();
        try {
            Token t;
            while ((t = ts.next(new Token())) != null) {
                tokens.add(t);
            }
        } catch (IOException io) {
            // Not expected when reading from an in-memory StringReader.
            throw new RuntimeException(io);
        }
        return tokens;
    }
}