package org.apache.lucene.index.codecs.preflex;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.store.*;
import org.apache.lucene.document.*;
import org.apache.lucene.analysis.*;
import org.apache.lucene.index.*;
import org.apache.lucene.index.codecs.preflexrw.PreFlexRWCodec;
import org.apache.lucene.util.*;
import java.util.*;
import java.io.IOException;
import static org.junit.Assert.*;
import org.junit.Test;
/**
 * Verifies that the PreFlex (Lucene 3.x back-compat) codec enumerates and
 * seeks terms containing UTF-16 surrogate pairs in code-point (UTF-8 byte)
 * order, even though the underlying 3.x index stores terms in UTF-16 order.
 * The test indexes a mix of "difficult" strings (surrogate pairs plus
 * characters that sort differently in UTF-16 vs. code-point order) and then
 * checks straight enumeration, seek-to-existing-term, and
 * seek-to-missing-term behavior against an expected code-point-sorted list.
 */
public class TestSurrogates extends LuceneTestCaseJ4 {
/**
 * Builds a short random string biased toward the hard cases for surrogate
 * ordering: UTF-16 surrogate pairs (U+D800/U+D801 + U+DC00/U+DC01), low BMP
 * chars ('a'/'b'), and private-use BMP chars (U+E000/U+E001). U+E000+ sorts
 * BELOW surrogate pairs in code-point order but ABOVE lone surrogates in
 * UTF-16 order, which is exactly the discrepancy this test targets.
 * May return the empty string.
 */
private static String makeDifficultRandomUnicodeString(Random r) {
final int end = r.nextInt(20);
if (end == 0) {
// allow 0 length
return "";
}
final char[] buffer = new char[end];
for (int i = 0; i < end; i++) {
int t = r.nextInt(5);
// only emit a surrogate pair when there is room for both halves
if (0 == t && i < end - 1) {
// hi
buffer[i++] = (char) (0xd800 + r.nextInt(2));
// lo
buffer[i] = (char) (0xdc00 + r.nextInt(2));
} else if (t <= 3) {
buffer[i] = (char) ('a' + r.nextInt(2));
} else if (4 == t) {
// private-use area char: sorts between UTF-16 and code-point surrogate positions
buffer[i] = (char) (0xe000 + r.nextInt(2));
}
}
return new String(buffer, 0, end);
}
// Renders a term as field:hex-code-points for VERBOSE debug output.
private String toHexString(Term t) {
return t.field() + ":" + UnicodeUtil.toHexString(t.text());
}
/**
 * Picks a random term text: usually a "realistic" unicode string, sometimes
 * a fully random one, and occasionally one of the difficult surrogate-heavy
 * strings from {@link #makeDifficultRandomUnicodeString}.
 */
private String getRandomString(Random r) {
String s;
if (r.nextInt(5) == 1) {
if (r.nextInt(3) == 1) {
s = makeDifficultRandomUnicodeString(r);
} else {
s = _TestUtil.randomUnicodeString(r);
}
} else {
s = _TestUtil.randomRealisticUnicodeString(r);
}
return s;
}
// Orders terms by UTF-16 code-unit comparison (legacy 3.x index order),
// used only for the VERBOSE dump contrasting UTF-16 vs code-point order.
private static class SortTermAsUTF16Comparator implements Comparator<Term> {
public int compare(Term o1, Term o2) {
return o1.compareToUTF16(o2);
}
}
private static final SortTermAsUTF16Comparator termAsUTF16Comparator = new SortTermAsUTF16Comparator();
// single straight enum
/**
 * Walks every field/term of the reader once and asserts that terms come
 * back in strictly increasing code-point (BytesRef) order and match the
 * expected code-point-sorted fieldTerms list exactly.
 */
private void doTestStraightEnum(List<Term> fieldTerms, IndexReader reader, int uniqueTermCount) throws IOException {
if (VERBOSE) {
System.out.println("\nTEST: top now enum reader=" + reader);
}
FieldsEnum fieldsEnum = MultiFields.getFields(reader).iterator();
{
// Test straight enum:
String field;
int termCount = 0;
while((field = fieldsEnum.next()) != null) {
TermsEnum termsEnum = fieldsEnum.terms();
BytesRef text;
BytesRef lastText = null;
while((text = termsEnum.next()) != null) {
// fieldTerms is sorted by (field, code point), matching enum order,
// so a single running index lines up with the enumeration
Term exp = fieldTerms.get(termCount);
if (VERBOSE) {
System.out.println("  got term=" + field + ":" + UnicodeUtil.toHexString(text.utf8ToString()));
System.out.println("       exp=" + exp.field() + ":" + UnicodeUtil.toHexString(exp.text().toString()));
System.out.println();
}
if (lastText == null) {
lastText = new BytesRef(text);
} else {
// terms must be returned in strictly increasing byte (code point) order
assertTrue(lastText.compareTo(text) < 0);
lastText.copy(text);
}
assertEquals(exp.field(), field);
assertEquals(exp.bytes(), text);
termCount++;
}
if (VERBOSE) {
System.out.println("  no more terms for field=" + field);
}
}
assertEquals(uniqueTermCount, termCount);
}
}
// randomly seeks to term that we know exists, then next's
// from there
private void doTestSeekExists(Random r, List<Term> fieldTerms, IndexReader reader) throws IOException {
// cache one TermsEnum per field so repeated seeks reuse state
final Map<String,TermsEnum> tes = new HashMap<String,TermsEnum>();
// Test random seek to existing term, then enum:
if (VERBOSE) {
System.out.println("\nTEST: top now seek");
}
int num = 100 * RANDOM_MULTIPLIER;
for (int iter = 0; iter < num; iter++) {
// pick random field+term
int spot = r.nextInt(fieldTerms.size());
Term term = fieldTerms.get(spot);
String field = term.field();
if (VERBOSE) {
System.out.println("TEST: exist seek field=" + field + " term=" + UnicodeUtil.toHexString(term.text()));
}
// seek to it
TermsEnum te = tes.get(field);
if (te == null) {
te = MultiFields.getTerms(reader, field).iterator();
tes.put(field, te);
}
if (VERBOSE) {
System.out.println("  done get enum");
}
// seek should find the term
assertEquals(TermsEnum.SeekStatus.FOUND,
te.seek(term.bytes()));
// now .next() this many times:
int ct = _TestUtil.nextInt(r, 5, 100);
for(int i=0;i<ct;i++) {
if (VERBOSE) {
System.out.println("TEST: now next()");
}
if (1+spot+i >= fieldTerms.size()) {
break;
}
term = fieldTerms.get(1+spot+i);
// reference compare (!=) is intentional: Term interns its field name,
// so identity equality is the project's idiom here — TODO confirm interning holds in this version
if (term.field() != field) {
// crossed into the next field: this enum must be exhausted
assertNull(te.next());
break;
} else {
BytesRef t = te.next();
if (VERBOSE) {
System.out.println("  got term=" + (t == null ? null : UnicodeUtil.toHexString(t.utf8ToString())));
System.out.println("       exp=" + UnicodeUtil.toHexString(term.text().toString()));
}
assertEquals(term.bytes(), t);
}
}
}
}
/**
 * Seeks to random terms that (usually) do not exist and checks the
 * SeekStatus: END when the probe sorts past the field's last term,
 * otherwise NOT_FOUND positioned on the next greater term, from which
 * subsequent next() calls must continue in code-point order.
 */
private void doTestSeekDoesNotExist(Random r, int numField, List<Term> fieldTerms, Term[] fieldTermsArray, IndexReader reader) throws IOException {
final Map<String,TermsEnum> tes = new HashMap<String,TermsEnum>();
if (VERBOSE) {
System.out.println("TEST: top random seeks");
}
{
int num = 100 * RANDOM_MULTIPLIER;
for (int iter = 0; iter < num; iter++) {
// seek to random spot
// intern() so the reference compares against Term.field() below are valid
String field = ("f" + r.nextInt(numField)).intern();
Term tx = new Term(field, getRandomString(r));
int spot = Arrays.binarySearch(fieldTermsArray, tx);
// negative result means the term is absent (positive hits are skipped)
if (spot < 0) {
if (VERBOSE) {
System.out.println("TEST: non-exist seek to " + field + ":" + UnicodeUtil.toHexString(tx.text()));
}
// term does not exist:
TermsEnum te = tes.get(field);
if (te == null) {
te = MultiFields.getTerms(reader, field).iterator();
tes.put(field, te);
}
if (VERBOSE) {
System.out.println("  got enum");
}
// convert binarySearch result to the insertion point (index of next greater term)
spot = -spot - 1;
if (spot == fieldTerms.size() || fieldTerms.get(spot).field() != field) {
// probe sorts after every term in this field
assertEquals(TermsEnum.SeekStatus.END, te.seek(tx.bytes()));
} else {
assertEquals(TermsEnum.SeekStatus.NOT_FOUND, te.seek(tx.bytes()));
if (VERBOSE) {
System.out.println("  got term=" + UnicodeUtil.toHexString(te.term().utf8ToString()));
System.out.println("  exp term=" + UnicodeUtil.toHexString(fieldTerms.get(spot).text()));
}
// enum must be positioned on the smallest term greater than the probe
assertEquals(fieldTerms.get(spot).bytes(),
te.term());
// now .next() this many times:
int ct = _TestUtil.nextInt(r, 5, 100);
for(int i=0;i<ct;i++) {
if (VERBOSE) {
System.out.println("TEST: now next()");
}
if (1+spot+i >= fieldTerms.size()) {
break;
}
Term term = fieldTerms.get(1+spot+i);
// identity compare relies on interned field names (see above)
if (term.field() != field) {
assertNull(te.next());
break;
} else {
BytesRef t = te.next();
if (VERBOSE) {
System.out.println("  got term=" + (t == null ? null : UnicodeUtil.toHexString(t.utf8ToString())));
System.out.println("       exp=" + UnicodeUtil.toHexString(term.text().toString()));
}
assertEquals(term.bytes(), t);
}
}
}
}
}
}
}
/**
 * End-to-end driver: indexes random surrogate-laden terms across several
 * fields using the PreFlexRW codec (writes in legacy UTF-16 order), sorts
 * the expected terms in code-point order, then runs the three sub-tests.
 */
@Test
public void testSurrogatesOrder() throws Exception {
Random r = newRandom();
Directory dir = newDirectory(r);
// force the PreFlexRW codec so terms are written in legacy 3.x (UTF-16) order
RandomIndexWriter w = new RandomIndexWriter(r,
dir,
newIndexWriterConfig(r, TEST_VERSION_CURRENT,
new MockAnalyzer()).setCodecProvider(_TestUtil.alwaysCodec(new PreFlexRWCodec())));
final int numField = _TestUtil.nextInt(r, 2, 5);
int uniqueTermCount = 0;
int tc = 0;
List<Term> fieldTerms = new ArrayList<Term>();
for(int f=0;f<numField;f++) {
String field = "f" + f;
final int numTerms = 10000 * RANDOM_MULTIPLIER;
final Set<String> uniqueTerms = new HashSet<String>();
for(int i=0;i<numTerms;i++) {
// append a running counter so every term text is distinct
String term = getRandomString(r) + "_ " + (tc++);
uniqueTerms.add(term);
fieldTerms.add(new Term(field, term));
Document doc = new Document();
doc.add(new Field(field, term, Field.Store.NO, Field.Index.NOT_ANALYZED));
w.addDocument(doc);
}
uniqueTermCount += uniqueTerms.size();
}
IndexReader reader = w.getReader();
if (VERBOSE) {
// dump both orders so failures are easy to diagnose by eye
Collections.sort(fieldTerms, termAsUTF16Comparator);
System.out.println("\nTEST: UTF16 order");
for(Term t: fieldTerms) {
System.out.println("  " + toHexString(t));
}
}
// sorts in code point order:
Collections.sort(fieldTerms);
if (VERBOSE) {
System.out.println("\nTEST: codepoint order");
for(Term t: fieldTerms) {
System.out.println("  " + toHexString(t));
}
}
Term[] fieldTermsArray = fieldTerms.toArray(new Term[fieldTerms.size()]);
//SegmentInfo si = makePreFlexSegment(r, "_0", dir, fieldInfos, codec, fieldTerms);
//FieldsProducer fields = codec.fieldsProducer(new SegmentReadState(dir, si, fieldInfos, 1024, 1));
//assertNotNull(fields);
doTestStraightEnum(fieldTerms, reader, uniqueTermCount);
doTestSeekExists(r, fieldTerms, reader);
doTestSeekDoesNotExist(r, numField, fieldTerms, fieldTermsArray, reader);
reader.close();
w.close();
dir.close();
}
}