package org.apache.solr.search;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.UUID;
import java.util.zip.DeflaterOutputStream;
import java.util.zip.GZIPInputStream;
import java.util.zip.GZIPOutputStream;
import java.util.zip.InflaterInputStream;

import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.NumericDocValues;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.search.BitSetQuery;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.MatchNoDocsQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.SolrCacheWrapper;
import org.apache.lucene.util.BitSet;
import org.apache.lucene.util.FixedBitSet;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.SolrException.ErrorCode;
import org.apache.solr.common.SolrInputDocument;
import org.apache.solr.common.SolrInputField;
import org.apache.solr.common.params.SolrParams;
import org.apache.solr.common.util.Base64;
import org.apache.solr.common.util.ContentStream;
import org.apache.solr.common.util.ContentStreamBase;
import org.apache.solr.common.util.ContentStreamBase.StringStream;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.handler.loader.CSVLoader;
import org.apache.solr.request.SolrQueryRequest;
import org.apache.solr.schema.FieldType;
import org.apache.solr.schema.SchemaField;
import org.apache.solr.schema.StrField;
import org.apache.solr.schema.TextField;
import org.apache.solr.schema.TrieIntField;
import org.apache.solr.uninverting.UninvertingReader;
import org.apache.solr.update.AddUpdateCommand;
import org.apache.solr.update.processor.UpdateRequestProcessor;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/*
 * Flexible framework to query Solr by a (potentially very large) set
 * of IDs. This can be used to implement remote/fast bitset operations,
 * or to retrieve data that cannot easily be expressed in a query
 * language; typically something like: id:1 OR id:2 OR id:3 ... OR id:n
 */
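/*
 * Illustrative configuration only (the names below are examples, not
 * requirements): the plugin could be registered in solrconfig.xml roughly
 * like this, using the defaults that init() understands:
 *
 *   <queryParser name="bitset" class="org.apache.solr.search.BitSetQParserPlugin">
 *     <lst name="defaults">
 *       <str name="cache-mapping">id:id-to-docid</str>
 *       <str name="allowed-fields">id</str>
 *       <str name="max-allowed-get-size">5000</str>
 *     </lst>
 *   </queryParser>
 */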
public class BitSetQParserPlugin extends QParserPlugin {

  public static final Logger log = LoggerFactory.getLogger(BitSetQParserPlugin.class);
  public static String NAME = "bitset";

  private Set<String> allowedFields = new HashSet<String>();
  private static Map<String, String> cacheMapping = new HashMap<String, String>();
  private int maxAllowedGetSize = 5000;

  @SuppressWarnings("rawtypes")
  @Override
  public void init(NamedList args) {
    NamedList defs = (NamedList) args.get("defaults");
    if (defs == null) {
      defs = new NamedList();
    }
    // 'cache-mapping' is a comma-separated list of fieldName:cacheName pairs
    if (defs.get("cache-mapping") != null) {
      for (String s : ((String) defs.get("cache-mapping")).split(",")) {
        String[] parts = s.split(":");
        if (parts.length == 2) {
          cacheMapping.put(parts[0], parts[1]);
        } else {
          throw new SolrException(ErrorCode.SERVER_ERROR, "Wrong mapping format: " + s);
        }
      }
    }
    if (defs.get("allowed-fields") != null) {
      for (String s : ((String) defs.get("allowed-fields")).split(",")) {
        allowedFields.add(s);
      }
    }
    if (defs.get("max-allowed-get-size") != null) {
      maxAllowedGetSize = Integer.parseInt((String) defs.get("max-allowed-get-size"));
    }
  }
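  /*
   * Illustrative invocations (the values are made up): the data can arrive
   * either as the local-param value, e.g.
   *
   *   q={!bitset compression=gzip field=id}<base64-encoded bitset>
   *
   * or as one or more content streams whose content type follows the
   * "big-query/<type>-<encoding>; compression:<compression>" convention.
   * Multiple streams are combined left to right with operator=and|or|not|xor,
   * e.g. operator=and,not for three streams.
   */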
  @Override
  public QParser createParser(String qstr, SolrParams localParams, SolrParams params, SolrQueryRequest req) {
    return new QParser(qstr, localParams, params, req) {

      @Override
      public Query parse() {
        List<DataProcessor> processors = new ArrayList<DataProcessor>();
        try {
          Iterable<ContentStream> streams = req.getContentStreams();
          if (streams != null) {
            for (ContentStream cs : streams) {
              DataProcessor streamProcessor = getStreamProcessor(cs);
              if (streamProcessor != null) {
                processors.add(streamProcessor);
              }
            }
          }

          // we also allow passing the data inside a normal parameter (useful for testing)
          String data = localParams.get(QueryParsing.V);
          if (data != null && data.length() > 0) {
            if (data.length() > maxAllowedGetSize) {
              // Solr loaded it anyway, but at least we are educating people ;)
              throw new SolrException(ErrorCode.FORBIDDEN,
                  "The data you sent is too big for GET requests. Use data streams instead");
            }
            StringStream cs = new ContentStreamBase.StringStream(data);
            cs.setContentType("big-query/" + localParams.get("type", "bitset")
                + "-" + localParams.get("encoding", "none")
                + "; compression:" + localParams.get("compression", "none"));
            DataProcessor streamProcessor = getStreamProcessor(cs);
            if (streamProcessor != null) {
              processors.add(streamProcessor);
            }
          }

          if (processors.size() == 0) {
            return new MatchNoDocsQuery();
          }

          String[] operator = localParams.get("operator", "and").split(",");
          if (operator.length > 1 && operator.length != processors.size() - 1) {
            throw new SolrException(ErrorCode.BAD_REQUEST,
                "There are " + processors.size() + " data streams, but an inconsistent number of operators: "
                + localParams.get("operator", "and"));
          }

          FixedBitSet topBits = null;
          int i = 0;
          for (DataProcessor processor : processors) {
            FixedBitSet bits = processor.getBits();
            if (bits == null) {
              if (operator.length > 0) {
                i++;
              }
              continue;
            }
            if (topBits == null) {
              topBits = bits;
              continue;
            }
            // a single operator applies to all streams; otherwise they are consumed one by one
            String op = operator.length == 1 ? operator[0] : operator[i];
            if (op.equals("and")) {
              topBits.and(bits);
            } else if (op.equals("or")) {
              topBits.or(bits);
            } else if (op.equals("not")) {
              topBits.andNot(bits);
            } else if (op.equals("xor")) {
              topBits.xor(bits);
            } else {
              throw new SolrException(ErrorCode.BAD_REQUEST, "Unknown bitset operator: " + op);
            }
            if (operator.length > 0) {
              i++;
            }
          }

          if (topBits == null || topBits.cardinality() < 1) {
            return new MatchNoDocsQuery();
          }

          BitSetQuery q = new BitSetQuery(topBits);
          if (localParams.get("uniqueId", null) != null) {
            q.setUUID(UUID.randomUUID().toString());
          }
          return q;
        } catch (Exception e) {
          throw new SolrException(ErrorCode.SERVER_ERROR, e);
        }
      }

      private DataProcessor getStreamProcessor(ContentStream cs) throws Exception {
        // if 'streamId' is set, we may limit ourselves to grabbing only some streams
        if (localParams.get("streamId", null) != null) {
          if (!cs.getContentType().contains(localParams.get("streamId"))) {
            return null;
          }
        }
        String ct = cs.getContentType();
        if (ct.contains("big-query/csv")) {
          DataProcessor p = new DataProcessor(req);
          CSVLoader loader = new CSVLoader();
          loader.load(req, null, cs, p);
          return p;
        } else if (ct.contains("big-query/bitset")) {
          DataProcessor p = new DataProcessor(req) {
            @Override
            public FixedBitSet getBits() {
              // we must harvest Lucene docids
              LeafReader reader = req.getSearcher().getSlowAtomicReader();
              byte[] data;
              try {
                // NOTE: the bitset payload is read from the local 'v' parameter
                data = readBase64String(localParams.get(QueryParsing.V),
                    localParams.get("compression", "none"));
              } catch (IOException e1) {
                throw new SolrException(ErrorCode.BAD_REQUEST, e1);
              }
              FixedBitSet bits = fromByteArray(data,
                  localParams.getBool("little_endian", false) ? LITTLE_ENDIAN_BIT_MASK : BIG_ENDIAN_BIT_MASK);

              // The bitset can contain Lucene docids, or it can be a set of
              // integer values that need translation into Lucene docids;
              // which one it is depends on the presence/absence of the 'field' param.
              if (localParams.get("field", null) == null) {
                return bits;
              }

              String fieldName = localParams.get("field");
              SchemaField field = req.getSchema().getField(fieldName);
              if (field.multiValued()) {
                throw new SolrException(ErrorCode.BAD_REQUEST,
                    "I am sorry, you can't use bitset with multi-valued fields");
              }
              if (allowedFields.size() > 0 && !allowedFields.contains(fieldName)) {
                throw new SolrException(ErrorCode.BAD_REQUEST,
                    "I am sorry, you can't search against field " + fieldName + " (reason: field forbidden)");
              }

              FieldType ftype = field.getType();
              Class<? extends FieldType> c = ftype.getClass();
              boolean fieldIsInt = true;
              if (c.isAssignableFrom(TextField.class) || c.isAssignableFrom(StrField.class)) {
                fieldIsInt = false;
              } else if (c.isAssignableFrom(TrieIntField.class)) {
                // pass
              } else {
                throw new SolrException(ErrorCode.BAD_REQUEST,
                    "You make me sad - this field: " + fieldName + " is not indexed as integer :(");
              }

              FixedBitSet translatedBitSet = new FixedBitSet(reader.maxDoc());
              SolrCacheWrapper<SolrCache<Object, Integer>> cacheWrapper = super.getCache(fieldName);
              if (cacheWrapper != null) {
                // we are lucky and have a cache that can translate values for us
                for (int i = bits.nextSetBit(0); i >= 0 && i < DocIdSetIterator.NO_MORE_DOCS; i = bits.nextSetBit(i + 1)) {
                  if (fieldIsInt) {
                    int v = cacheWrapper.getLuceneDocId(0, i);
                    if (v != -1) {
                      translatedBitSet.set(v);
                    }
                  } else {
                    int v = cacheWrapper.getLuceneDocId(0, Integer.toString(i));
                    if (v != -1) {
                      translatedBitSet.set(v);
                    }
                  }
                  if (i + 1 >= bits.length()) break; // mirror toByteArray(): never call nextSetBit() past the end
                }
                bits = translatedBitSet;
              } else {
                if (!fieldIsInt) {
                  throw new SolrException(ErrorCode.BAD_REQUEST,
                      "You make me sad - this field: " + fieldName + " is not indexed as integer :(");
                }
                Map<String, UninvertingReader.Type> mapping = new HashMap<String, UninvertingReader.Type>();
                mapping.put(fieldName, UninvertingReader.Type.INTEGER_POINT);
                UninvertingReader uninvertingReader = new UninvertingReader(reader, mapping);
                NumericDocValues cache;
                try {
                  cache = uninvertingReader.getNumericDocValues(fieldName);
                } catch (IOException e) {
                  return translatedBitSet;
                }
                // suckers, we have to translate whatever integer value into a Lucene docid
                log.warn("We are translating values for a field without a cache: {}. Terrible, terrible idea!", fieldName);
                int docid = 0; // lucene docid
                int maxDoc = reader.maxDoc();
                int docValue;
                while (docid < maxDoc) {
                  docValue = (int) cache.get(docid);
                  if (docValue < bits.length() && docValue > 0 && bits.get(docValue)) {
                    translatedBitSet.set(docid);
                  }
                  docid++;
                }
                bits = translatedBitSet;
              }
              return bits;
            }
          };
          return p;
        }
        return null;
      }
    };
  }
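  /*
   * A minimal sketch (illustrative only) of producing a payload this parser
   * can consume, using the helpers defined below; it assumes a plugin
   * instance in scope and the default big-endian bit order:
   *
   *   BitSetQParserPlugin plugin = new BitSetQParserPlugin();
   *   FixedBitSet bs = new FixedBitSet(100);
   *   bs.set(10);
   *   bs.set(67);
   *   String payload = plugin.encodeBase64(plugin.doGZip(plugin.toByteArray(bs)));
   *   // sent as: q={!bitset compression=gzip}<payload>
   */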
  public static class DataProcessor extends UpdateRequestProcessor {

    private ArrayList<SolrInputDocument> docs;
    SolrQueryRequest req;

    public DataProcessor(SolrQueryRequest req) {
      super(null);
      docs = new ArrayList<SolrInputDocument>();
      this.req = req;
    }

    @Override
    public void processAdd(AddUpdateCommand cmd) throws IOException {
      docs.add(cmd.solrDoc);
    }

    public FixedBitSet getBits() throws ParseException {
      if (docs.size() == 0) {
        return new FixedBitSet(0);
      }
      FixedBitSet bs = new FixedBitSet(req.getSearcher().maxDoc());
      SolrInputDocument d = docs.get(0);
      // for csv, we can assume that every doc has the same fields (?)
      Iterator<SolrInputField> fi = d.iterator();
      HashMap<String, SolrCacheWrapper<SolrCache<Object, Integer>>> translators =
          new HashMap<String, SolrCacheWrapper<SolrCache<Object, Integer>>>();
      while (fi.hasNext()) {
        SolrInputField field = fi.next();
        SolrCacheWrapper<SolrCache<Object, Integer>> cache = getCache(field.getName());
        if (cache == null) {
          throw new SolrException(ErrorCode.BAD_REQUEST,
              "Uff, uff, I have no idea how to map this field (" + field.getName()
              + ") values into docids! Call 911");
        }
        translators.put(field.getName(), cache);
      }
      for (SolrInputDocument doc : docs) {
        for (SolrInputField f : doc.values()) {
          SolrCacheWrapper<SolrCache<Object, Integer>> c = translators.get(f.getName());
          for (Object o : f.getValues()) {
            int v = c.getLuceneDocId(0, o);
            if (v == -1) continue;
            bs.set(v);
          }
        }
      }
      return bs;
    }

    @SuppressWarnings("unchecked")
    public SolrCacheWrapper<SolrCache<Object, Integer>> getCache(String field) {
      SolrCache<Object, Integer> sCache = null;
      if (cacheMapping.containsKey(field)) {
        sCache = (SolrCache<Object, Integer>) req.getSearcher().getCache(cacheMapping.get(field));
      } else {
        sCache = (SolrCache<Object, Integer>) req.getSearcher().getCache(field);
      }
      if (sCache == null) {
        return null;
      }
      return new SolrCacheWrapper<SolrCache<Object, Integer>>(sCache) {
        @Override
        public int getLuceneDocId(int sourceDocid, Object sourceValue) {
          // extra checking is necessary (we cannot be sure the id will always be correct)
          if (sourceValue instanceof String) {
            sourceValue = ((String) sourceValue).toLowerCase().trim();
          }
          Object v = cache.get().get(sourceValue);
          if (v == null) {
            return -1;
          }
          return (Integer) v;
        }

        @Override
        public int internalHashCode() {
          return this.hashCode();
        }

        @Override
        public String internalToString() {
          return cache.get().toString();
        }
      };
    }
  }
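  /*
   * Example of the cache contract getCache() assumes (the values are made
   * up): with cache-mapping "id:id-to-docid", a lookup of field value 42
   * goes to the Solr cache named "id-to-docid", which is expected to map
   * raw field values to Lucene docids, e.g. {42 -> 7, 43 -> 19};
   * getLuceneDocId(0, 42) would then return 7, or -1 when unknown.
   */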
  public static class DataStream {
    public DataStream(byte[] data, boolean bool) {
      // TODO Auto-generated constructor stub
    }
  }

  protected byte[] readBase64String(String string, String compression) throws IOException {
    byte[] data;
    try {
      data = decodeBase64(string.trim());
    } catch (Exception e1) {
      throw new SolrException(ErrorCode.BAD_REQUEST, e1);
    }
    if (compression != null && !compression.equals("none")) {
      if (compression.equals("gzip")) {
        data = unGZip(data);
      } else if (compression.equals("zip")) {
        data = unZip(data);
      } else {
        throw new SolrException(ErrorCode.BAD_REQUEST, "Unsupported compression: " + compression);
      }
    }
    return data;
  }

  protected String encodeBase64(byte[] data) throws Exception {
    return Base64.byteArrayToBase64(data, 0, data.length);
  }

  protected byte[] decodeBase64(String data) throws Exception {
    return Base64.base64ToByteArray(data);
  }

  protected byte[] doGZip(byte[] data) throws IOException {
    ByteArrayOutputStream baos = new ByteArrayOutputStream();
    GZIPOutputStream zipStream = new GZIPOutputStream(baos);
    zipStream.write(data);
    zipStream.flush();
    zipStream.close();
    return baos.toByteArray();
  }

  protected byte[] unGZip(byte[] data) throws IOException {
    ByteArrayInputStream bais = new ByteArrayInputStream(data);
    GZIPInputStream zipStream = new GZIPInputStream(bais);
    ByteArrayOutputStream baos = new ByteArrayOutputStream();
    byte[] buffer = new byte[1024];
    int len;
    while ((len = zipStream.read(buffer)) > 0) {
      baos.write(buffer, 0, len);
    }
    bais.close();
    zipStream.close();
    return baos.toByteArray();
  }

  protected byte[] doZip(byte[] data) throws IOException {
    ByteArrayOutputStream baos = new ByteArrayOutputStream();
    DeflaterOutputStream zipStream = new DeflaterOutputStream(baos);
    zipStream.write(data);
    zipStream.flush();
    zipStream.close();
    return baos.toByteArray();
  }

  private byte[] unZip(byte[] data) throws IOException {
    ByteArrayInputStream bais = new ByteArrayInputStream(data);
    InflaterInputStream zipStream = new InflaterInputStream(bais);
    ByteArrayOutputStream baos = new ByteArrayOutputStream();
    byte[] buffer = new byte[1024];
    int len;
    while ((len = zipStream.read(buffer)) > 0) {
      baos.write(buffer, 0, len);
    }
    bais.close();
    zipStream.close();
    return baos.toByteArray();
  }
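  /*
   * Worked example of the bit order handled below (illustrative): the single
   * byte 0x01 (binary 00000001) sets bit 7 when read with BIG_ENDIAN_BIT_MASK
   * (most significant bit first), but bit 0 when read with
   * LITTLE_ENDIAN_BIT_MASK - which is (probably) what Python's intbitset
   * produces, per the note on fromByteArray() below.
   */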
  protected byte[] toByteArray(BitSet bitSet) {
    byte[] bytes = new byte[(bitSet.length() + 7) / 8];
    for (int i = bitSet.nextSetBit(0); i >= 0 && i < DocIdSetIterator.NO_MORE_DOCS; i = bitSet.nextSetBit(i + 1)) {
      bytes[i / 8] |= 128 >> (i % 8); // big-endian: most significant bit first
      if (i + 1 >= bitSet.length()) break; // nextSetBit() must not be called past the end
    }
    return bytes;
  }

  // Returns a bitset containing the values in bytes. Since we plan to use
  // Python intbitsets, and these are (probably) encoded little-endian, we
  // must be able to deconstruct them properly; internally, however, Java
  // should use big-endian order.
  protected FixedBitSet fromByteArray(byte[] bytes, int[] bitMask) {
    if (bytes == null) {
      return new FixedBitSet(0);
    }
    FixedBitSet bs = new FixedBitSet(bytes.length * 8);
    int s = bytes.length * 8;
    for (int i = 0; i < s; i++) {
      if ((bytes[i / 8] & bitMask[i % 8]) != 0) { // big-endian case: (bytes[i/8] & (128 >> (i % 8))) != 0
        bs.set(i);
      }
    }
    return bs;
  }

  protected FixedBitSet fromByteArray(byte[] bytes) {
    return fromByteArray(bytes, BIG_ENDIAN_BIT_MASK);
  }

  protected int[] BIG_ENDIAN_BIT_MASK = {0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01};
  protected int[] LITTLE_ENDIAN_BIT_MASK = {0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80};
}