AbstractUnicodeKeyBuilderTestCase.java example

Explorer
blazegraph-master
- database-master
/*

Copyright (C) SYSTAP, LLC DBA Blazegraph 2006-2016.  All rights reserved.

Contact:
     SYSTAP, LLC DBA Blazegraph
     2501 Calvert ST NW #106
     Washington, DC 20008
     licenses@blazegraph.com

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; version 2 of the License.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA

*/
/*
 * Created on Nov 29, 2007
 */

package com.bigdata.btree.keys;

import java.util.Arrays;
import java.util.Locale;
import java.util.Properties;

import junit.framework.TestCase2;

import com.bigdata.btree.keys.DefaultKeyBuilderFactory;
import com.bigdata.btree.keys.IKeyBuilder;
import com.bigdata.btree.keys.KeyBuilder;
import com.bigdata.btree.keys.StrengthEnum;
import com.bigdata.btree.keys.KeyBuilder.Options;
import com.bigdata.util.BytesUtil;

/**
 * Base class for the test suites that examine support for Unicode sort keys in
 * {@link KeyBuilder}.
 * 
 * @author <a href="mailto:thompsonbry@users.sourceforge.net">Bryan Thompson</a>
 * @version $Id$
 */
abstract public class AbstractUnicodeKeyBuilderTestCase extends TestCase2 {

    /**
     * 
     */
    public AbstractUnicodeKeyBuilderTestCase() {
    }

    /**
     * @param arg0
     */
    public AbstractUnicodeKeyBuilderTestCase(String arg0) {
        super(arg0);
    }

    /**
     * Test ability to encode unicode data into a variable length byte[] that
     * allows direct byte-by-byte comparisons which maintain the local-specific
     * sort order of the original strings.
     */
    public void test_keyBuilder_unicode_string_key_us_primary() {

        /*
         * Setup an instance for US English with strength := PRIMARY.
         */

        final Properties properties = getProperties();
        
        properties.setProperty(Options.USER_LANGUAGE, Locale.US.getLanguage());

        properties.setProperty(Options.USER_COUNTRY, Locale.US.getCountry());
        
        properties.setProperty(Options.STRENGTH, StrengthEnum.Primary
                .toString());

        final DefaultKeyBuilderFactory factory = new DefaultKeyBuilderFactory(
                properties);

        assertEquals(Locale.US.getLanguage(), factory.getLocale().getLanguage());

        assertEquals(Locale.US.getCountry(), factory.getLocale().getCountry());

        assertEquals(StrengthEnum.Primary, factory.getStrength());
        
        final IKeyBuilder keyBuilder = factory.getKeyBuilder();

//        // verify assumption under that configuration.
//        {
//            RuleBasedCollator usCollator = (RuleBasedCollator) Collator
//                    .getInstance(Locale.US);
//
//            usCollator.setStrength(Collator.PRIMARY);
//
//            assertEquals(0, usCollator.compare("abc", "ABC"));
//        }
            
        byte[] key1 = keyBuilder.reset().append("abc").getKey();
        byte[] key2 = keyBuilder.reset().append("ABC").getKey();
        byte[] key3 = keyBuilder.reset().append("Abc").getKey();

        System.err.println("abc: "+BytesUtil.toString(key1));
        System.err.println("ABC: "+BytesUtil.toString(key2));
        System.err.println("Abc: "+BytesUtil.toString(key3));

        // all are equal using PRIMARY strength.
        assertEquals(0,BytesUtil.compareBytes(key1, key2));
        assertEquals(0,BytesUtil.compareBytes(key2, key3));
        
    }

    public void test_keyBuilder_unicode_string_key_us_identical() {

        /*
         * Setup an instance for US English with strength := IDENTICAL.
         */

        final Properties properties = new Properties();
        
        properties.setProperty(Options.USER_LANGUAGE, Locale.US.getLanguage());

        properties.setProperty(Options.USER_COUNTRY, Locale.US.getCountry());
        
        properties.setProperty(Options.STRENGTH, ""+StrengthEnum.Identical);

        final IKeyBuilder keyBuilder = KeyBuilder.newUnicodeInstance(properties);

//        // verify assumption under that configuration.
//        {
//            RuleBasedCollator usCollator = (RuleBasedCollator) Collator
//                    .getInstance(Locale.US);
//
//            usCollator.setStrength(Collator.IDENTICAL);
//
//            assertNotSame(0, usCollator.compare("abc", "ABC"));
//        }

        // IKeyBuilder keyBuilder = new UnicodeKeyBuilder(usCollator,1000);
            
        byte[] key1 = keyBuilder.reset().append("abc").getKey();
        byte[] key2 = keyBuilder.reset().append("ABC").getKey();
        byte[] key3 = keyBuilder.reset().append("Abc").getKey();

        System.err.println("abc: "+BytesUtil.toString(key1));
        System.err.println("ABC: "+BytesUtil.toString(key2));
        System.err.println("Abc: "+BytesUtil.toString(key3));

        // verify ordering for IDENTICAL comparison.
        assertTrue(BytesUtil.compareBytes(key1, key2)<0);
        assertTrue(BytesUtil.compareBytes(key2, key3)>0);
        
    }

    /**
     * Test verifies that the trailing <code>nul</code> byte is not part of
     * the key when a unicode string is appended to an {@link IKeyBuilder}.
     * <p>
     * Note: The trailing <code>nul</code> byte is appended by the ICU library
     * in order to have compatibility with their C library, but it is not of
     * interest for Java processing. However, note that a <code>nul</code>
     * byte MAY be used to separate components of a complex key.
     */
    public void test_keyBuilder_unicode_String_noTrailingNul() {

        final IKeyBuilder keyBuilder = KeyBuilder.newUnicodeInstance(getProperties());
        
        keyBuilder.append("Hello World!");
        
        final byte[] key = keyBuilder.getKey();
        
        assertNotSame("Not expecting a trailing nul byte.", (byte) 0,
                key[key.length - 1]);
        
    }

    /*
     * Note: This test has been disabled.  The SparseRowStore no longer makes
     * this assumption in order to be compatible with the JDK collator.
     */
//    /**
//     * Test verifies that the <code>nul</code> byte is not part of the key when
//     * a unicode string is appended to an {@link IKeyBuilder}.
//     * <p>
//     * Note: The {@link SparseRowStore} assumes that Unicode sort keys do not
//     * contain embedded <code>nul</code>s.
//     */
//    public void test_keyBuilder_unicode_String_noEmbeddedNuls() {
//
//        final IKeyBuilder keyBuilder = KeyBuilder
//                .newUnicodeInstance(getProperties());
//
//        keyBuilder.append("Hello World!");
//
//        final byte[] key = keyBuilder.getKey();
//
//        for (int i = 0; i < key.length; i++) {
//            if (key[i] == 0)
//                fail("Embedded nuls: key=" + BytesUtil.toString(key));
//        }
//        
//    }

    /**
     * Test of the ability to normalize trailing pad characters.
     */
    public void test_keyBuilder_normalizeTrailingPadCharacters() {
        
        final KeyBuilder keyBuilder = (KeyBuilder) KeyBuilder
                .newUnicodeInstance(getProperties());
    
        assertEquals(//
                keyBuilder.normalizeText(""),//
                keyBuilder.normalizeText(" ")//
                );
        assertEquals(//
                keyBuilder.normalizeText(""),//
                keyBuilder.normalizeText("  ")//
                );
        assertEquals(//
                keyBuilder.normalizeText(""),//
                keyBuilder.normalizeText("      ")//
                );
        assertEquals(//
                keyBuilder.normalizeText(" "),//
                keyBuilder.normalizeText("      ")//
                );
        assertEquals(//
                keyBuilder.normalizeText("abc"),//
                keyBuilder.normalizeText("abc      ")//
                );
        assertEquals(//
                keyBuilder.normalizeText("   abc"),//
                keyBuilder.normalizeText("   abc      ")//
                );
        assertNotSame(//
                keyBuilder.normalizeText("abc"),//
                keyBuilder.normalizeText("   abc      ")//
                );
        
    }
    
    /**
     * Test verifies that very long strings are truncated.
     * 
     * @todo verify that trailing whitespace is removed after truncation rather
     *       than before truncation.
     */
    public void test_keyBuilder_normalizeTruncatesVeryLongStrings() {

        final KeyBuilder keyBuilder = (KeyBuilder)KeyBuilder.newUnicodeInstance(getProperties());

        final String text = TestKeyBuilder.getMaximumLengthText();

        assertEquals(//
                keyBuilder.normalizeText(text),//
                keyBuilder.normalizeText(text+"abc")//
                );
        
    }
    
    /**
     * Test verifies the order among unicode sort keys, including verifying that
     * the pad byte causes a prefix such as "bro" to sort before a term which
     * extends that prefix, such as "brown".
     */
    public void test_keyBuilder_unicode_order() {        

        final KeyBuilder keyBuilder = (KeyBuilder) KeyBuilder.newUnicodeInstance(getProperties());
        
        final KVO<String>[] a = new KVO[] {
          
                new KVO<String>(TestKeyBuilder.asSortKey("bro"),null,"bro"),
                new KVO<String>(TestKeyBuilder.asSortKey("brown"),null,"brown"),
                new KVO<String>(TestKeyBuilder.asSortKey("bre"),null,"bre"),
                new KVO<String>(TestKeyBuilder.asSortKey("break"),null,"break"),
                
        };
        
        // sort by the assigned sort keys.
        Arrays.sort(a);
        
        /*
         * verify that "bre(ak)" is before "bro(wn)" and that "bre" is before
         * "break" and "bro" is before "brown".
         */
        assertEquals("bre", a[0].obj);
        assertEquals("break", a[1].obj);
        assertEquals("bro", a[2].obj);
        assertEquals("brown", a[3].obj);
        
    }
    
    /**
     * <p>
     * Test that lexiographic order is maintain when a variable length Unicode
     * field is followed by another field. This test works by comparing the
     * original multi-field key with the multi-field key formed from the
     * successor of the Unicode field followed by the other field:
     * </p>
     * 
     * <pre>
     *   
     *   [text][nextValue] LT [successor(text)][nextValue]
     *   
     * </pre>
     */
    public void test_keyBuilder_multiField_unicode() {

        final KeyBuilder keyBuilder = (KeyBuilder) KeyBuilder
                .newUnicodeInstance(getProperties());

        TestKeyBuilder.doMultiFieldTests(true/* unicode */, keyBuilder);

        /*
         * Now test some strings that contain code points outside of the 8-bit
         * range.
         */
        
//        final KeyBuilder keyBuilder = (KeyBuilder) KeyBuilder
//                .newUnicodeInstance();

        final boolean unicode = true;
        {
            
            // Note: This is "Japanese" in kanji.
            String text = "\u65E5\u672C\u8A9E / \u306B\u307B\u3093\u3054";
            
            TestKeyBuilder.doMultiFieldTest(keyBuilder, unicode, text, (byte) 0);
            TestKeyBuilder.doMultiFieldTest(keyBuilder, unicode, text, (byte) 1);
            TestKeyBuilder.doMultiFieldTest(keyBuilder, unicode, text, (byte) -1);
            TestKeyBuilder.doMultiFieldTest(keyBuilder, unicode, text, Byte.MIN_VALUE);
            TestKeyBuilder.doMultiFieldTest(keyBuilder, unicode, text, Byte.MAX_VALUE);
        }

    }
    
}