/*
* Copyright (c) 2013-2017 Cinchapi Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.cinchapi.concourse.server.storage.db;
import static com.cinchapi.concourse.server.GlobalState.STOPWORDS;
import java.util.Collection;
import java.util.Collections;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import javax.annotation.Nullable;
import javax.annotation.concurrent.ThreadSafe;
import com.cinchapi.concourse.annotate.DoNotInvoke;
import com.cinchapi.concourse.annotate.PackagePrivate;
import com.cinchapi.concourse.server.model.Position;
import com.cinchapi.concourse.server.model.PrimaryKey;
import com.cinchapi.concourse.server.model.Text;
import com.cinchapi.concourse.util.TStrings;
import com.google.common.collect.HashMultimap;
import com.google.common.collect.Maps;
import com.google.common.collect.Multimap;
import com.google.common.collect.Sets;
import com.google.common.collect.TreeMultimap;
/**
 * A collection of n-gram indexes that enable fulltext infix searching. For
 * every word in a {@link Value}, each substring index is mapped to a
 * {@link Position}. The entire SearchRecord contains a collection of these
 * mappings.
 *
 * @author Jeff Nelson
 */
@PackagePrivate
@ThreadSafe
final class SearchRecord extends Record<Text, Text, Position> {

    /**
     * DO NOT INVOKE. Use {@link Record#createSearchRecord(Text)} or
     * {@link Record#createSearchRecordPartial(Text, Text)} instead.
     *
     * @param locator the locator that is passed to the parent {@link Record}
     * @param key the key that is passed to the parent {@link Record}; may be
     *            {@code null}
     */
    @PackagePrivate
    @DoNotInvoke
    SearchRecord(Text locator, @Nullable Text key) {
        super(locator, key);
    }

    /**
     * Return the Set of primary keys for records that match {@code query}.
     * <p>
     * The query is lower cased and split on whitespace into terms. Stopwords
     * are not looked up, but each skipped stopword widens the gap that is
     * allowed between the surrounding terms by one position. A record matches
     * when every remaining term is found at the expected position relative to
     * the term before it. The read lock is held for the duration of the call.
     * </p>
     *
     * @param query the search phrase
     * @return the Set of PrimaryKeys, iterating from the most matches to the
     *         fewest
     */
    public Set<PrimaryKey> search(Text query) {
        read.lock();
        try {
            // NOTE(review): toLowerCase() uses the default locale; this is
            // left as-is because it presumably has to agree with how terms
            // are normalized when they are indexed -- confirm before changing
            String[] toks = query
                    .toString()
                    .toLowerCase()
                    .split(TStrings.REGEX_GROUP_OF_ONE_OR_MORE_WHITESPACE_CHARS);
            // Positions (grouped by record) where the phrase processed so far
            // ends
            Multimap<PrimaryKey, Integer> reference = HashMultimap.create();
            boolean initial = true;
            int offset = 0;
            for (String tok : toks) {
                if(tok.isEmpty()) {
                    // String#split produces an empty first token when the
                    // query starts with whitespace. It is not a term, so it
                    // must neither be looked up nor shift the offset.
                    continue;
                }
                if(STOPWORDS.contains(tok)) {
                    // When skipping a stop word, we must record an offset to
                    // correctly determine if the next term match is in the
                    // correct relative position to the previous term match
                    ++offset;
                    continue;
                }
                Multimap<PrimaryKey, Integer> temp = HashMultimap.create();
                for (Position position : get(Text.wrap(tok))) {
                    PrimaryKey key = position.getPrimaryKey();
                    int pos = position.getIndex();
                    // After the first term, a position only survives if the
                    // previous term matched in the same record exactly
                    // (1 + offset) positions earlier.
                    if(initial
                            || reference.containsEntry(key, pos - 1 - offset)) {
                        temp.put(key, pos);
                    }
                }
                initial = false;
                reference = temp;
                offset = 0;
                if(reference.isEmpty()) {
                    // Nothing survived this term, so no later term can
                    // produce a match either.
                    break;
                }
            }
            return rank(reference);
        }
        finally {
            read.unlock();
        }
    }

    @Override
    protected Map<Text, Set<Position>> mapType() {
        return Maps.newHashMap();
    }

    /**
     * Order the matched records by score, highest first.
     * <p>
     * The score of a record is the number of positions mapped from its key in
     * {@code matches}. Records with the same score are ordered by
     * {@link PrimaryKey.Sorter#INSTANCE}.
     * </p>
     *
     * @param matches the surviving positions, grouped by record
     * @return the PrimaryKeys in ranked iteration order
     */
    private static Set<PrimaryKey> rank(Multimap<PrimaryKey, Integer> matches) {
        Multimap<Integer, PrimaryKey> sorted = TreeMultimap.create(
                Collections.<Integer> reverseOrder(),
                PrimaryKey.Sorter.INSTANCE);
        for (Entry<PrimaryKey, Collection<Integer>> entry : matches.asMap()
                .entrySet()) {
            sorted.put(entry.getValue().size(), entry.getKey());
        }
        return Sets.newLinkedHashSet(sorted.values());
    }
}