package org.apache.solr.search;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.UUID;
import java.util.zip.DeflaterOutputStream;
import java.util.zip.GZIPInputStream;
import java.util.zip.GZIPOutputStream;
import java.util.zip.InflaterInputStream;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.NumericDocValues;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.search.BitSetQuery;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.MatchNoDocsQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.SolrCacheWrapper;
import org.apache.lucene.util.BitSet;
import org.apache.lucene.util.FixedBitSet;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.SolrException.ErrorCode;
import org.apache.solr.common.SolrInputDocument;
import org.apache.solr.common.SolrInputField;
import org.apache.solr.common.params.SolrParams;
import org.apache.solr.common.util.Base64;
import org.apache.solr.common.util.ContentStream;
import org.apache.solr.common.util.ContentStreamBase;
import org.apache.solr.common.util.ContentStreamBase.StringStream;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.handler.loader.CSVLoader;
import org.apache.solr.request.SolrQueryRequest;
import org.apache.solr.schema.FieldType;
import org.apache.solr.schema.SchemaField;
import org.apache.solr.schema.StrField;
import org.apache.solr.schema.TextField;
import org.apache.solr.schema.TrieIntField;
import org.apache.solr.uninverting.UninvertingReader;
import org.apache.solr.update.AddUpdateCommand;
import org.apache.solr.update.processor.UpdateRequestProcessor;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/*
* Flexible framework to query SOLR by (a large set)
* of ID's. This can be used to implement remote/fast
* bitset operations or to retrieve data that cannot
* be easily expressed using a query language; typically
* st like: id:1 OR id:2 OR id:3.............. id:n
*/
public class BitSetQParserPlugin extends QParserPlugin {
public static final Logger log = LoggerFactory.getLogger(BitSetQParserPlugin.class);
public static String NAME = "bitset";
private Set<String> allowedFields = new HashSet<String>();
private static Map<String, String> cacheMapping = new HashMap<String, String>();
private int maxAllowedGetSize = 5000;
@SuppressWarnings("rawtypes")
@Override
public void init(NamedList args) {
NamedList defs = (NamedList) args.get("defaults");
if (defs == null) {
defs = new NamedList();
}
if (defs.get("cache-mapping") != null) {
for (String s: ((String)defs.get("cache-mapping")).split(",")) {
String[] parts = s.split(":");
if (parts.length == 2) {
cacheMapping.put(parts[0], parts[1]);
}
else {
throw new SolrException(ErrorCode.SERVER_ERROR, "Wrong mapping format: " + s);
}
}
}
if (defs.get("allowed-fields") != null) {
for (String s: ((String) defs.get("allowed-fields")).split(",")) {
allowedFields.add(s);
}
}
if (defs.get("max-allowed-get-size") != null) {
maxAllowedGetSize = Integer.parseInt((String) defs.get("max-allowed-get-size"));
}
}
@Override
public QParser createParser(String qstr, SolrParams localParams,
SolrParams params, SolrQueryRequest req) {
return new QParser(qstr, localParams, params, req) {
@Override
public Query parse() {
List<DataProcessor> processors = new ArrayList<DataProcessor>();
try {
Iterable<ContentStream> streams = req.getContentStreams();
if (streams != null) {
for (ContentStream cs: req.getContentStreams()) {
DataProcessor streamProcessor = getStreamProcessor(cs);
if (streamProcessor != null) {
processors.add(streamProcessor);
}
}
}
String data;
// we also allow passing of data inside normal parametes (useful for testing)
data = localParams.get(QueryParsing.V);
if (data != null && data.length() > 0) {
if (data.length() > maxAllowedGetSize) { // solr loaded it anyway, but at least we are educating people ;)
throw new SolrException(ErrorCode.FORBIDDEN, "The data you sent is too big for GET requests. Use data streams instead");
}
StringStream cs = new ContentStreamBase.StringStream(data);
cs.setContentType("big-query/" + localParams.get("type", "bitset")
+ "-" + localParams.get("encoding", "none")
+ "; compression:" + localParams.get("compression", "none")
);
DataProcessor streamProcessor = getStreamProcessor(cs);
if (streamProcessor != null) {
processors.add(streamProcessor);
}
}
if (processors.size() == 0) {
return new MatchNoDocsQuery();
}
String[] operator = localParams.get("operator","and").split(",");
if (operator.length > 1 && operator.length != processors.size()-1) {
throw new SolrException(ErrorCode.BAD_REQUEST,
"There is " + processors.size() + " data streams, but inconsistent number of operators: " + localParams.get("operator","and"));
}
FixedBitSet topBits = null;
int i = 0;
for (DataProcessor processor : processors) {
FixedBitSet bits = processor.getBits();
if (bits == null) {
if (operator.length > 0) {
i++;
}
continue;
}
if (topBits == null) {
topBits = bits;
continue;
}
String op = operator[i];
if (op.equals("and")) {
topBits.and(bits);
}
else if (op.equals("or")) {
topBits.or(bits);
}
else if (op.equals("not")) {
topBits.andNot(bits);
}
else if (op.equals("xor")) {
topBits.xor(bits);
}
else {
throw new SolrException(ErrorCode.BAD_REQUEST, "Unknown bitset operator: " + op);
}
if (operator.length > 0) {
i++;
}
}
if (topBits.cardinality() < 1)
return new MatchNoDocsQuery();
BitSetQuery q = new BitSetQuery(topBits);
if (localParams.get("uniqueId", null) != null) {
q.setUUID(UUID.randomUUID().toString());
}
return q;
}
catch (Exception e) {
throw new SolrException(ErrorCode.SERVER_ERROR, e);
}
}
private DataProcessor getStreamProcessor(ContentStream cs) throws Exception {
// if 'streamId' is set, we may limit ourselves to grabbing only
// some streams
if (localParams.get("streamId", null) != null) {
if (!cs.getContentType().contains(localParams.get("streamId"))) {
return null;
}
}
String ct = cs.getContentType();
if (ct.contains("big-query/csv")) {
DataProcessor p = new DataProcessor(req);
CSVLoader loader = new CSVLoader();
loader.load(req, null, cs, p);
return p;
}
else if (ct.contains("big-query/bitset")) {
DataProcessor p = new DataProcessor(req) {
@Override
public FixedBitSet getBits() {
// we must harvest lucene docids
LeafReader reader = req.getSearcher().getSlowAtomicReader();
byte[] data;
try {
data = readBase64String(localParams.get(QueryParsing.V),
localParams.get("compression", "none"));
} catch (IOException e1) {
throw new SolrException(ErrorCode.BAD_REQUEST, e1);
}
FixedBitSet bits = fromByteArray(data,
localParams.getBool("little_endian", false)
? LITTLE_ENDIAN_BIT_MASK : BIG_ENDIAN_BIT_MASK);
// now, the bitset can contain lucene docids or it can be
// set of integer values that need translation into lucene
// docids; this depends on presence/absence of 'field' param
if (localParams.get("field", null) == null)
return bits;
String fieldName = localParams.get("field");
SchemaField field = req.getSchema().getField(fieldName);
if (field.multiValued()) {
throw new SolrException(ErrorCode.BAD_REQUEST, "I am sorry, you can't use bitset with multi-valued fields");
}
if (allowedFields.size() > 0 && !allowedFields.contains(fieldName)) {
throw new SolrException(ErrorCode.BAD_REQUEST, "I am sorry, you can't search against field " + fieldName + " (reason: field forbidden#!@#!)");
}
FieldType ftype = field.getType();
Class<? extends FieldType> c = ftype.getClass();
boolean fieldIsInt = true;
if (c.isAssignableFrom(TextField.class) || c.isAssignableFrom(StrField.class)) {
fieldIsInt = false;
}
else if (c.isAssignableFrom(TrieIntField.class)) {
//pass
}
else {
throw new SolrException(ErrorCode.BAD_REQUEST, "You make me sad - this field: " + fieldName + " is not indexed as integer :(");
}
FixedBitSet translatedBitSet = new FixedBitSet(reader.maxDoc());
SolrCacheWrapper<SolrCache<Object,Integer>> cacheWrapper = super.getCache(fieldName);
if (cacheWrapper != null) { // we are lucky and we have a cache that can translate values for us
for (int i = bits.nextSetBit(0); i >= 0 && i < DocIdSetIterator.NO_MORE_DOCS; i = bits.nextSetBit(i+1)) {
if (fieldIsInt) {
int v = cacheWrapper.getLuceneDocId(0, i);
if (v == -1)
continue;
translatedBitSet.set(v);
}
else {
int v = cacheWrapper.getLuceneDocId(0, Integer.toString(i));
if (v == -1)
continue;
translatedBitSet.set(v);
}
}
bits = translatedBitSet;
}
else {
if (!fieldIsInt) {
throw new SolrException(ErrorCode.BAD_REQUEST, "You make me sad - this field: " + fieldName + " is not indexed as integer :(");
}
Map<String, UninvertingReader.Type> mapping = new HashMap();
mapping.put(fieldName, UninvertingReader.Type.INTEGER_POINT);
UninvertingReader uninvertingReader = new UninvertingReader(reader, mapping);
NumericDocValues cache;
try {
cache = uninvertingReader.getNumericDocValues(fieldName);
} catch (IOException e) {
return translatedBitSet;
}
// suckers, we have to translate whateve integer value into a lucene docid
log.warn("We are translating values for a field without a cache: {}. Terrible, terrible idea!", fieldName);
int docid = 0; // lucene docid
int maxDoc = reader.maxDoc();
int docValue;
while(docid < maxDoc) {
docValue = (int) cache.get(docid);
if (docValue < bits.length() && docValue > 0 && bits.get(docValue)) {
translatedBitSet.set(docid);
}
docid++;
}
bits = translatedBitSet;
}
return bits;
}
};
return p;
}
return null;
}
};
}
public static class DataProcessor extends UpdateRequestProcessor {
private ArrayList<SolrInputDocument> docs;
SolrQueryRequest req;
public void processAdd(AddUpdateCommand cmd) throws IOException {
docs.add(cmd.solrDoc);
}
public DataProcessor(SolrQueryRequest req) {
super(null);
docs = new ArrayList<SolrInputDocument>();
this.req = req;
}
public FixedBitSet getBits() throws ParseException {
if (docs.size() == 0) {
return new FixedBitSet(0);
}
FixedBitSet bs = new FixedBitSet(req.getSearcher().maxDoc());
SolrInputDocument d = docs.get(0);
// for csv, we can assume that every doc has the same fields (?)
Iterator<SolrInputField> fi = d.iterator();
HashMap<String, SolrCacheWrapper<SolrCache<Object,Integer>>> translators = new HashMap<String, SolrCacheWrapper<SolrCache<Object,Integer>>>();
while (fi.hasNext()) {
SolrInputField field = fi.next();
SolrCacheWrapper<SolrCache<Object,Integer>> cache = getCache(field.getName());
if (cache == null) {
throw new SolrException(ErrorCode.BAD_REQUEST, "Uff, uff, I have no idea how to map this field (" + field.getName() + ") values into docids! Call 911");
}
translators.put(field.getName(), cache);
}
for (SolrInputDocument doc: docs) {
for (SolrInputField f: doc.values()) {
SolrCacheWrapper<SolrCache<Object,Integer>> c = translators.get(f.getName());
for (Object o: f.getValues()) {
int v = c.getLuceneDocId(0, o);
if (v == -1)
continue;
bs.set(v);
}
}
}
return bs;
}
@SuppressWarnings("unchecked")
public SolrCacheWrapper<SolrCache<Object, Integer>> getCache(String field) {
SolrCache<Object, Integer> sCache = null;
if (cacheMapping.containsKey(field)) {
sCache = (SolrCache<Object, Integer>) req.getSearcher().getCache(cacheMapping.get(field));
}
else {
sCache = (SolrCache<Object, Integer>) req.getSearcher().getCache(field);
}
if (sCache == null) {
return null;
}
return new SolrCacheWrapper<SolrCache<Object, Integer>>(sCache) {
@Override
public int getLuceneDocId(int sourceDocid, Object sourceValue) {
// extra checking necessary (we cannot be sure
// the id will be always correct....
if (sourceValue instanceof String) {
sourceValue = ((String) sourceValue).toLowerCase().trim();
}
Object v = cache.get().get(sourceValue);
if (v == null)
return -1;
return (Integer) v;
}
@Override
public int internalHashCode() {
return this.hashCode();
}
@Override
public String internalToString() {
return cache.get().toString();
}
};
}
}
public static class DataStream {
public DataStream(byte[] data, boolean bool) {
// TODO Auto-generated constructor stub
}
}
protected byte[] readBase64String(String string, String compression)
throws IOException {
byte[] data;
try {
data = decodeBase64(string.trim());
} catch (Exception e1) {
throw new SolrException(ErrorCode.BAD_REQUEST, e1);
}
if (compression != null && !compression.equals("none")) {
if (compression.equals("gzip")) {
data = unGZip(data);
}
else if (compression.equals("zip")) {
data = unZip(data);
}
else if ("none".equals(compression)) {
// do nothing
}
else {
throw new SolrException(ErrorCode.BAD_REQUEST, "Unsupporeted compression: " + compression);
}
}
return data;
}
protected String encodeBase64(byte[] data) throws Exception {
return Base64.byteArrayToBase64(data, 0, data.length);
}
protected byte[] decodeBase64(String data) throws Exception {
return Base64.base64ToByteArray(data);
}
protected byte[] doGZip(byte[] data) throws IOException {
ByteArrayOutputStream baos = new ByteArrayOutputStream();
GZIPOutputStream zipStream = new GZIPOutputStream(baos);
zipStream.write(data);
zipStream.flush();
zipStream.close();
return baos.toByteArray();
}
protected byte[] unGZip(byte[] data) throws IOException {
ByteArrayInputStream bais = new ByteArrayInputStream(data);
GZIPInputStream zipStream = new GZIPInputStream(bais);
ByteArrayOutputStream baos = new ByteArrayOutputStream();
byte[] buffer = new byte[1024];
int len;
while ((len = zipStream.read(buffer)) > 0) {
baos.write(buffer, 0, len);
}
bais.close();
zipStream.close();
return baos.toByteArray();
}
protected byte[] doZip(byte[] data) throws IOException {
ByteArrayOutputStream baos = new ByteArrayOutputStream();
DeflaterOutputStream zipStream = new DeflaterOutputStream(baos);
zipStream.write(data);
zipStream.flush();
zipStream.close();
return baos.toByteArray();
}
private byte[] unZip(byte[] data) throws IOException {
ByteArrayInputStream bais = new ByteArrayInputStream(data);
InflaterInputStream zipStream = new InflaterInputStream(bais);
ByteArrayOutputStream baos = new ByteArrayOutputStream();
byte[] buffer = new byte[1024];
int len;
while ((len = zipStream.read(buffer)) > 0) {
baos.write(buffer, 0, len);
}
bais.close();
zipStream.close();
return baos.toByteArray();
}
protected byte[] toByteArray(BitSet bitSet) {
byte[] bytes = new byte[(bitSet.length() + 7) / 8];
for ( int i = bitSet.nextSetBit(0); i >= 0 && i < DocIdSetIterator.NO_MORE_DOCS; i = bitSet.nextSetBit(i+1) ) {
bytes[i / 8] |= 128 >> (i % 8);
if (i+1 >= bitSet.length())
break;
}
return bytes;
}
// Returns a bitSet containing the values in bytes. Since I am planning to use
// python intbitsets and these are (probably) encoded using little endian
// we must be able to de-construct them properly, however internally, inside
// Java we should be using big endian
protected FixedBitSet fromByteArray(byte[] bytes, int[] bitMask) {
FixedBitSet bs = new FixedBitSet(bytes == null? 0 : bytes.length * 8);
int s = bytes.length * 8;
for (int i = 0; i < s; i++) {
if ((bytes[i/8] & bitMask[i%8]) != 0) // ((bytes[i/8] & (128 >> (i % 8))) != 0)
bs.set(i);
}
return bs;
}
protected FixedBitSet fromByteArray(byte[] bytes) {
return fromByteArray(bytes, BIG_ENDIAN_BIT_MASK);
}
protected int BIG_ENDIAN_BIT_MASK[] = {0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01};
protected int LITTLE_ENDIAN_BIT_MASK[] = {0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80};
}