/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.analysis.charfilter;

import java.io.Reader;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Random;
import java.util.Set;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.CharFilter;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.util.TestUtil;
import org.apache.lucene.util.UnicodeUtil;

public class TestMappingCharFilter extends BaseTokenStreamTestCase {

  NormalizeCharMap normMap;

  @Override
  public void setUp() throws Exception {
    super.setUp();
    NormalizeCharMap.Builder builder = new NormalizeCharMap.Builder();

    builder.add("aa", "a");
    builder.add("bbb", "b");
    builder.add("cccc", "cc");

    builder.add("h", "i");
    builder.add("j", "jj");
    builder.add("k", "kkk");
    builder.add("ll", "llll");

    builder.add("empty", "");

    // non-BMP char (encoded as a surrogate pair in UTF-16):
    builder.add(UnicodeUtil.newString(new int[] {0x1D122}, 0, 1), "fclef");

    builder.add("\uff01", "full-width-exclamation");

    normMap = builder.build();
  }

  public void testReaderReset() throws Exception {
    CharFilter cs = new MappingCharFilter(normMap, new StringReader("x"));
    char[] buf = new char[10];
    int len = cs.read(buf, 0, 10);
    assertEquals(1, len);
    assertEquals('x', buf[0]);
    len = cs.read(buf, 0, 10);
    assertEquals(-1, len);

    // rewind
    cs.reset();
    len = cs.read(buf, 0, 10);
    assertEquals(1, len);
    assertEquals('x', buf[0]);
  }

  public void testNothingChange() throws Exception {
    CharFilter cs = new MappingCharFilter(normMap, new StringReader("x"));
    TokenStream ts = whitespaceMockTokenizer(cs);
    assertTokenStreamContents(ts, new String[] {"x"}, new int[] {0}, new int[] {1}, 1);
  }

  public void test1to1() throws Exception {
    CharFilter cs = new MappingCharFilter(normMap, new StringReader("h"));
    TokenStream ts = whitespaceMockTokenizer(cs);
    assertTokenStreamContents(ts, new String[] {"i"}, new int[] {0}, new int[] {1}, 1);
  }

  public void test1to2() throws Exception {
    CharFilter cs = new MappingCharFilter(normMap, new StringReader("j"));
    TokenStream ts = whitespaceMockTokenizer(cs);
    assertTokenStreamContents(ts, new String[] {"jj"}, new int[] {0}, new int[] {1}, 1);
  }

  public void test1to3() throws Exception {
    CharFilter cs = new MappingCharFilter(normMap, new StringReader("k"));
    TokenStream ts = whitespaceMockTokenizer(cs);
    assertTokenStreamContents(ts, new String[] {"kkk"}, new int[] {0}, new int[] {1}, 1);
  }
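  /**
   * Not part of the original suite: a minimal sketch showing how {@link CharFilter#correctOffset}
   * maps offsets in the filtered output back to offsets in the raw input. With the "k" -> "kkk"
   * rule above, the expanded output spans offsets 0..3 while the input spans only 0..1, so the
   * start offset corrects to 0 and the end offset corrects back to 1, matching the start/end
   * offsets test1to3 expects for the token "kkk".
   */
  public void testCorrectOffsetSketch() throws Exception {
    CharFilter cs = new MappingCharFilter(normMap, new StringReader("k"));
    StringBuilder out = new StringBuilder();
    int ch;
    while ((ch = cs.read()) != -1) {
      out.append((char) ch);
    }
    assertEquals("kkk", out.toString());
    assertEquals(0, cs.correctOffset(0)); // start of output "kkk" -> start of input "k"
    assertEquals(1, cs.correctOffset(3)); // end of output "kkk" -> end of input "k"
  }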
  public void test2to4() throws Exception {
    CharFilter cs = new MappingCharFilter(normMap, new StringReader("ll"));
    TokenStream ts = whitespaceMockTokenizer(cs);
    assertTokenStreamContents(ts, new String[] {"llll"}, new int[] {0}, new int[] {2}, 2);
  }

  public void test2to1() throws Exception {
    CharFilter cs = new MappingCharFilter(normMap, new StringReader("aa"));
    TokenStream ts = whitespaceMockTokenizer(cs);
    assertTokenStreamContents(ts, new String[] {"a"}, new int[] {0}, new int[] {2}, 2);
  }

  public void test3to1() throws Exception {
    CharFilter cs = new MappingCharFilter(normMap, new StringReader("bbb"));
    TokenStream ts = whitespaceMockTokenizer(cs);
    assertTokenStreamContents(ts, new String[] {"b"}, new int[] {0}, new int[] {3}, 3);
  }

  public void test4to2() throws Exception {
    CharFilter cs = new MappingCharFilter(normMap, new StringReader("cccc"));
    TokenStream ts = whitespaceMockTokenizer(cs);
    assertTokenStreamContents(ts, new String[] {"cc"}, new int[] {0}, new int[] {4}, 4);
  }

  public void test5to0() throws Exception {
    CharFilter cs = new MappingCharFilter(normMap, new StringReader("empty"));
    TokenStream ts = whitespaceMockTokenizer(cs);
    assertTokenStreamContents(ts, new String[0], new int[] {}, new int[] {}, 5);
  }

  public void testNonBMPChar() throws Exception {
    CharFilter cs =
        new MappingCharFilter(
            normMap, new StringReader(UnicodeUtil.newString(new int[] {0x1D122}, 0, 1)));
    TokenStream ts = whitespaceMockTokenizer(cs);
    assertTokenStreamContents(ts, new String[] {"fclef"}, new int[] {0}, new int[] {2}, 2);
  }

  public void testFullWidthChar() throws Exception {
    CharFilter cs = new MappingCharFilter(normMap, new StringReader("\uff01"));
    TokenStream ts = whitespaceMockTokenizer(cs);
    assertTokenStreamContents(
        ts, new String[] {"full-width-exclamation"}, new int[] {0}, new int[] {1}, 1);
  }

  //
  //                1111111111222
  //      01234567890123456789012
  //(in)  h i j k ll cccc bbb aa
  //
  //                1111111111222
  //      01234567890123456789012
  //(out) i i jj kkk llll cc b a
  //
  //    h, 0, 1 =>    i, 0, 1
  //    i, 2, 3 =>    i, 2, 3
  //    j, 4, 5 =>   jj, 4, 5
  //    k, 6, 7 =>  kkk, 6, 7
  //   ll, 8,10 => llll, 8,10
  // cccc,11,15 =>   cc,11,15
  //  bbb,16,19 =>    b,16,19
  //   aa,20,22 =>    a,20,22
  //
  public void testTokenStream() throws Exception {
    String testString = "h i j k ll cccc bbb aa";
    CharFilter cs = new MappingCharFilter(normMap, new StringReader(testString));
    TokenStream ts = whitespaceMockTokenizer(cs);
    assertTokenStreamContents(
        ts,
        new String[] {"i", "i", "jj", "kkk", "llll", "cc", "b", "a"},
        new int[] {0, 2, 4, 6, 8, 11, 16, 20},
        new int[] {1, 3, 5, 7, 10, 15, 19, 22},
        testString.length());
  }

  //
  //
  //         0123456789
  //(in)     aaaa ll h
  //(out-1)  aa llll i
  //(out-2)  a llllllll i
  //
  // aaaa,0,4 =>        a,0,4
  //   ll,5,7 => llllllll,5,7
  //    h,8,9 =>        i,8,9
  public void testChained() throws Exception {
    String testString = "aaaa ll h";
    CharFilter cs =
        new MappingCharFilter(normMap, new MappingCharFilter(normMap, new StringReader(testString)));
    TokenStream ts = whitespaceMockTokenizer(cs);
    assertTokenStreamContents(
        ts,
        new String[] {"a", "llllllll", "i"},
        new int[] {0, 5, 8},
        new int[] {4, 7, 9},
        testString.length());
  }

  public void testRandom() throws Exception {
    Analyzer analyzer =
        new Analyzer() {
          @Override
          protected TokenStreamComponents createComponents(String fieldName) {
            Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
            return new TokenStreamComponents(tokenizer, tokenizer);
          }

          @Override
          protected Reader initReader(String fieldName, Reader reader) {
            return new MappingCharFilter(normMap, reader);
          }
        };
    int numRounds = RANDOM_MULTIPLIER * 10000;
    checkRandomData(random(), analyzer, numRounds);
    analyzer.close();
  }
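  /**
   * Not part of the original suite: a minimal sketch making the two passes of testChained
   * explicit. The inner filter rewrites "aaaa" to "aa" (the "aa" -> "a" rule fires twice in one
   * left-to-right pass), and stacking a second filter rewrites that intermediate "aa" down to "a".
   */
  public void testChainedSketch() throws Exception {
    // Single pass: each non-overlapping "aa" becomes "a", leaving "aa".
    CharFilter onePass = new MappingCharFilter(normMap, new StringReader("aaaa"));
    StringBuilder out = new StringBuilder();
    int ch;
    while ((ch = onePass.read()) != -1) {
      out.append((char) ch);
    }
    assertEquals("aa", out.toString());

    // Two stacked passes: the second filter applies the rule to the intermediate "aa".
    CharFilter twoPass =
        new MappingCharFilter(normMap, new MappingCharFilter(normMap, new StringReader("aaaa")));
    out.setLength(0);
    while ((ch = twoPass.read()) != -1) {
      out.append((char) ch);
    }
    assertEquals("a", out.toString());
  }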
  //@Ignore("wrong finalOffset: https://issues.apache.org/jira/browse/LUCENE-3971")
  public void testFinalOffsetSpecialCase() throws Exception {
    final NormalizeCharMap.Builder builder = new NormalizeCharMap.Builder();
    builder.add("t", "");
    // even though the rule below has no effect, the test passes if you remove it!!
    builder.add("tmakdbl", "c");

    final NormalizeCharMap map = builder.build();

    Analyzer analyzer =
        new Analyzer() {
          @Override
          protected TokenStreamComponents createComponents(String fieldName) {
            Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
            return new TokenStreamComponents(tokenizer, tokenizer);
          }

          @Override
          protected Reader initReader(String fieldName, Reader reader) {
            return new MappingCharFilter(map, reader);
          }
        };

    String text = "gzw f quaxot";
    checkAnalysisConsistency(random(), analyzer, false, text);
    analyzer.close();
  }

  //@Ignore("wrong finalOffset: https://issues.apache.org/jira/browse/LUCENE-3971")
  public void testRandomMaps() throws Exception {
    int numIterations = atLeast(3);
    for (int i = 0; i < numIterations; i++) {
      final NormalizeCharMap map = randomMap();
      Analyzer analyzer =
          new Analyzer() {
            @Override
            protected TokenStreamComponents createComponents(String fieldName) {
              Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
              return new TokenStreamComponents(tokenizer, tokenizer);
            }

            @Override
            protected Reader initReader(String fieldName, Reader reader) {
              return new MappingCharFilter(map, reader);
            }
          };
      int numRounds = 100;
      checkRandomData(random(), analyzer, numRounds);
      analyzer.close();
    }
  }

  private NormalizeCharMap randomMap() {
    Random random = random();
    NormalizeCharMap.Builder builder = new NormalizeCharMap.Builder();
    // we can't add duplicate keys, or NormalizeCharMap gets angry
    Set<String> keys = new HashSet<>();
    int num = random.nextInt(5);
    //System.out.println("NormalizeCharMap=");
    for (int i = 0; i < num; i++) {
      String key = TestUtil.randomSimpleString(random);
      if (!keys.contains(key) && key.length() != 0) {
        String value = TestUtil.randomSimpleString(random);
        builder.add(key, value);
        keys.add(key);
        //System.out.println("mapping: '" + key + "' => '" + value + "'");
      }
    }
    return builder.build();
  }

  public void testRandomMaps2() throws Exception {
    final Random random = random();
    final int numIterations = atLeast(3);
    for (int iter = 0; iter < numIterations; iter++) {
      if (VERBOSE) {
        System.out.println("\nTEST iter=" + iter);
      }

      final char endLetter = (char) TestUtil.nextInt(random, 'b', 'z');
      final Map<String, String> map = new HashMap<>();
      final NormalizeCharMap.Builder builder = new NormalizeCharMap.Builder();
      final int numMappings = atLeast(5);
      if (VERBOSE) {
        System.out.println("  mappings:");
      }
      while (map.size() < numMappings) {
        final String key = TestUtil.randomSimpleStringRange(random, 'a', endLetter, 7);
        if (key.length() != 0 && !map.containsKey(key)) {
          final String value = TestUtil.randomSimpleString(random);
          map.put(key, value);
          builder.add(key, value);
          if (VERBOSE) {
            System.out.println("    " + key + " -> " + value);
          }
        }
      }

      final NormalizeCharMap charMap = builder.build();

      if (VERBOSE) {
        System.out.println("  test random documents...");
      }

      for (int iter2 = 0; iter2 < 100; iter2++) {
        final String content = TestUtil.randomSimpleStringRange(random, 'a', endLetter, atLeast(1000));
        if (VERBOSE) {
          System.out.println("  content=" + content);
        }

        // Do stupid dog-slow mapping:

        // Output string:
        final StringBuilder output = new StringBuilder();

        // Maps output offset to input offset:
        final List<Integer> inputOffsets = new ArrayList<>();

        int cumDiff = 0;
        int charIdx = 0;
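        // Reference algorithm (deliberately simple and slow): at each input
        // position try every mapping and take the longest match, mirroring
        // NormalizeCharMap's greedy semantics. cumDiff tracks the running
        // difference between input and output lengths so each output offset
        // can be shifted back into input space.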
        while (charIdx < content.length()) {

          int matchLen = -1;
          String matchRepl = null;

          for (Map.Entry<String, String> ent : map.entrySet()) {
            final String match = ent.getKey();
            if (charIdx + match.length() <= content.length()) {
              final int limit = charIdx + match.length();
              boolean matches = true;
              for (int charIdx2 = charIdx; charIdx2 < limit; charIdx2++) {
                if (match.charAt(charIdx2 - charIdx) != content.charAt(charIdx2)) {
                  matches = false;
                  break;
                }
              }

              if (matches) {
                final String repl = ent.getValue();
                if (match.length() > matchLen) {
                  // Greedy: longer match wins
                  matchLen = match.length();
                  matchRepl = repl;
                }
              }
            }
          }

          if (matchLen != -1) {
            // We found a match here!
            if (VERBOSE) {
              System.out.println(
                  "    match="
                      + content.substring(charIdx, charIdx + matchLen)
                      + " @ off="
                      + charIdx
                      + " repl="
                      + matchRepl);
            }
            output.append(matchRepl);
            final int minLen = Math.min(matchLen, matchRepl.length());

            // Common part, directly maps back to input
            // offset:
            for (int outIdx = 0; outIdx < minLen; outIdx++) {
              inputOffsets.add(output.length() - matchRepl.length() + outIdx + cumDiff);
            }

            cumDiff += matchLen - matchRepl.length();
            charIdx += matchLen;

            if (matchRepl.length() < matchLen) {
              // Replacement string is shorter than matched
              // input: nothing to do
            } else if (matchRepl.length() > matchLen) {
              // Replacement string is longer than matched
              // input: for all the "extra" chars we map
              // back to a single input offset:
              for (int outIdx = matchLen; outIdx < matchRepl.length(); outIdx++) {
                inputOffsets.add(output.length() + cumDiff - 1);
              }
            } else {
              // Same length: no change to offset
            }

            assert inputOffsets.size() == output.length()
                : "inputOffsets.size()="
                    + inputOffsets.size()
                    + " vs output.length()="
                    + output.length();
          } else {
            inputOffsets.add(output.length() + cumDiff);
            output.append(content.charAt(charIdx));
            charIdx++;
          }
        }

        final String expected = output.toString();
        if (VERBOSE) {
          System.out.print("    expected:");
          for (int charIdx2 = 0; charIdx2 < expected.length(); charIdx2++) {
            System.out.print(" " + expected.charAt(charIdx2) + "/" + inputOffsets.get(charIdx2));
          }
          System.out.println();
        }

        final MappingCharFilter mapFilter = new MappingCharFilter(charMap, new StringReader(content));

        final StringBuilder actualBuilder = new StringBuilder();
        final List<Integer> actualInputOffsets = new ArrayList<>();

        // Now consume the actual mapFilter, somewhat randomly:
        while (true) {
          if (random.nextBoolean()) {
            final int ch = mapFilter.read();
            if (ch == -1) {
              break;
            }
            actualBuilder.append((char) ch);
          } else {
            final char[] buffer = new char[TestUtil.nextInt(random, 1, 100)];
            final int off = buffer.length == 1 ? 0 : random.nextInt(buffer.length - 1);
            final int count = mapFilter.read(buffer, off, buffer.length - off);
            if (count == -1) {
              break;
            } else {
              actualBuilder.append(buffer, off, count);
            }
          }

          if (random.nextInt(10) == 7) {
            // Map offsets
            while (actualInputOffsets.size() < actualBuilder.length()) {
              actualInputOffsets.add(mapFilter.correctOffset(actualInputOffsets.size()));
            }
          }
        }

        // Finish mapping offsets
        while (actualInputOffsets.size() < actualBuilder.length()) {
          actualInputOffsets.add(mapFilter.correctOffset(actualInputOffsets.size()));
        }

        final String actual = actualBuilder.toString();

        // Verify:
        assertEquals(expected, actual);
        assertEquals(inputOffsets, actualInputOffsets);
      }
    }
  }
}