package org.apache.lucene.search;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Random;
import java.util.Set;
import java.util.stream.Collectors;

import org.apache.lucene.document.Document;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.BooleanQuery.Builder;
import org.apache.lucene.util.BytesRefBuilder;
import org.apache.lucene.util.LegacyNumericUtils;
import org.apache.solr.request.SolrQueryRequest;
import org.apache.solr.search.CitationLRUCache;
import org.apache.solr.search.SolrIndexSearcher;
import org.junit.BeforeClass;

import monty.solr.util.MontySolrAbstractTestCase;
import monty.solr.util.MontySolrSetup;

@SuppressWarnings({"rawtypes", "unchecked"})
public class TestCitationsSearch extends MontySolrAbstractTestCase {

  private boolean debug = true;
  private SolrQueryRequest tempReq;

  @BeforeClass
  public static void beforeClass() throws Exception {
    makeResourcesVisible(Thread.currentThread().getContextClassLoader(),
        new String[] {
            MontySolrSetup.getMontySolrHome() + "/contrib/examples/adsabs/server/solr/collection1/conf",
            MontySolrSetup.getSolrHome() + "/example/solr/collection1/conf"
        });

    System.setProperty("solr.allow.unsafe.resourceloading", "true");

    schemaString = MontySolrSetup.getMontySolrHome()
        + "/contrib/adsabs/src/test-files/solr/collection1/conf/"
        + "schema-citations-transformer.xml";

    configString = MontySolrSetup.getMontySolrHome()
        + "/contrib/adsabs/src/test-files/solr/collection1/conf/"
        + "citation-cache-solrconfig.xml";

    initCore(configString, schemaString, MontySolrSetup.getSolrHome() + "/example/solr");
  }

  @Override
  public void setUp() throws Exception {
    if (debug) {
      // TODO: set the codec; new File(TEMP_DIR, "index-citations")
    }
    super.setUp();
  }

  @Override
  public void tearDown() throws Exception {
    if (tempReq != null) {
      tempReq.close();
    }
    super.tearDown();
  }

  /**
   * Indexes documents with random citation links. Each document gets an id,
   * a bibcode ("b" + id), a year, and up to five reference/ireference values
   * pointing at other generated ids (which may or may not exist in the index).
   * Returns a map of id -> ids it references.
   */
  public HashMap<Integer, int[]> createRandomDocs(int start, int numDocs) throws IOException {
    Random randomSeed = new Random();

    // pool of candidate reference targets
    int[] randData = new int[numDocs / 10];
    for (int i = 0; i < randData.length; i++) {
      randData[i] = Math.abs(randomSeed.nextInt(numDocs) - start);
    }

    // pick 0-5 references per document from the pool
    int x = 0;
    int[][] randi = new int[numDocs - start][];
    for (int i = 0; i < numDocs - start; i++) {
      int howMany = randomSeed.nextInt(6);
      randi[i] = new int[howMany];
      for (int j = 0; j < howMany; j++) {
        if (x >= randData.length) {
          x = 0;
        }
        randi[i][j] = randData[x++];
      }
    }

    HashMap<Integer, int[]> data = new HashMap<Integer, int[]>(randi.length);
    List<String> thisDoc = new ArrayList<String>();
    for (int k = 0; k < randi.length; k++) {
      thisDoc.clear();
      thisDoc.add("id");
      thisDoc.add(String.valueOf(k + start));
      thisDoc.add("bibcode");
      thisDoc.add("b" + (k + start));
      thisDoc.add("year");
      if (k % 2 == 0) {
        thisDoc.add("2000");
      } else {
        thisDoc.add("1995");
      }

      int[] row = new int[randi[k].length];
      x = 0;
      for (int v : randi[k]) {
        row[x] = v + start;
        thisDoc.add("reference");
        thisDoc.add("b" + String.valueOf(v + start));
        thisDoc.add("ireference");
        thisDoc.add(String.valueOf(v + start));
        x++;
      }

      assertU(adoc(thisDoc.toArray(new String[thisDoc.size()])));
      data.put(k + start, row);
      if (debug) System.out.println(thisDoc);
    }

    if (debug) System.out.println("Created random docs: " + start + " - " + numDocs);
    return data;
  }
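  /**
   * End-to-end check of the citation cache and the second-order collectors:
   * indexes random citation data across several segments, rebuilds the
   * reference/citation maps directly from stored fields, and then verifies
   * that the cache wrappers and the SecondOrder* collectors return the same
   * lucene docids.
   */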
  public void testCitesCollector() throws Exception {
    int maxHits = 1000;
    int maxHitsFound = new Float(maxHits * 0.3f).intValue();

    createRandomDocs(0, new Float(maxHits * 0.4f).intValue());
    assertU(commit("waitSearcher", "true")); // closes the writer, creates a new segment

    createRandomDocs(new Float(maxHits * 0.3f).intValue(), new Float(maxHits * 0.7f).intValue());
    assertU(commit("waitSearcher", "true")); // closes the writer, creates a new segment

    createRandomDocs(new Float(maxHits * 0.71f).intValue(), new Float(maxHits * 1.0f).intValue());
    assertU(commit("waitSearcher", "true")); // closes the writer, creates a new segment

    createRandomDocs(0, new Float(maxHits * 0.2f).intValue());
    assertU(commit("waitSearcher", "true")); // closes the writer, creates a new segment

    // get the cache
    tempReq = req("test");
    SolrIndexSearcher searcher = tempReq.getSearcher();
    final CitationLRUCache cache = (CitationLRUCache) searcher.getCache("citations-cache-from-references");
    assertNotNull(cache);

    @SuppressWarnings("rawtypes")
    SolrCacheWrapper citationsWrapper = new SolrCacheWrapper.CitationsCache(cache);
    @SuppressWarnings("rawtypes")
    SolrCacheWrapper referencesWrapper = new SolrCacheWrapper.ReferencesCache(cache);

    // invert ourselves - this is what we expect to find
    HashMap<Integer, int[]> references = reconstructCitationCache(searcher);
    HashMap<Integer, int[]> citations = invert(references);

    // every reference must show up in the citation cache of the cited paper
    for (Entry<Integer, int[]> es : references.entrySet()) {
      int docid = es.getKey();
      int[] docids = es.getValue();
      for (int reference : docids) {
        List<Integer> a = Arrays.stream(citations.get(reference)).boxed().collect(Collectors.toList());
        List<Integer> b = Arrays.stream(citationsWrapper.getLuceneDocIds(reference)).boxed().collect(Collectors.toList());
        assertTrue(a.contains(docid));
        assertTrue(b.contains(docid));
        assertEquals(a, b);
      }
    }

    // and every citation must show up in the reference cache of the citing paper
    for (Entry<Integer, int[]> es : citations.entrySet()) {
      int docid = es.getKey();
      int[] docids = es.getValue();
      for (int reference : docids) {
        List<Integer> a = Arrays.stream(references.get(reference)).boxed().collect(Collectors.toList());
        List<Integer> b = Arrays.stream(referencesWrapper.getLuceneDocIds(reference)).boxed().collect(Collectors.toList());
        Collections.sort(a);
        Collections.sort(b);
        assertTrue(a.contains(docid));
        assertTrue(b.contains(docid));
        assertEquals(docid + " produced diff cache results", a, b);
      }
    }

    // to collect the measurement data
    Map<Integer, Integer> histogram = new HashMap<Integer, Integer>();

    SecondOrderCollectorCites coll = new SecondOrderCollectorCites(referencesWrapper, new String[] {"reference"});
    coll.searcherInitialization(searcher, null);

    // run 2nd order through the whole index (no IO error should happen)
    searcher.search(new SecondOrderQuery(new MatchAllDocsQuery(), coll), 10);

    ScoreDoc[] hits;
    for (int i = 0; i < maxHits; i++) {
      // int fields must be searched with the prefix-coded bytes value (not the plain string)
      BytesRefBuilder br = new BytesRefBuilder();
      LegacyNumericUtils.intToPrefixCoded(i, 0, br);

      ScoreDoc[] doc = searcher.search(new TermQuery(new Term("id", br.get().utf8ToString())), 1000).scoreDocs;
      if (doc.length == 0) // that's ok, some docs are missing
        continue;

      assertEquals("Not found : " + i, 1, doc.length);
      Document document = searcher.getIndexReader().document(doc[0].doc);
      int docid = doc[0].doc;

      // references(id:X)
      if (debug)
        System.out.println(i + " cites : " + join(references.get(docid))
            + " -> " + join(referencesWrapper.getLuceneDocIds(docid)));
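      // the term-based and the RAM-based "cites" collectors should both reproduce
      // the reference list reconstructed from stored fields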
      hits = searcher.search(new SecondOrderQuery(new TermQuery(new Term("id", br.get().utf8ToString())),
          new SecondOrderCollectorCites(referencesWrapper, new String[] {"reference"})), maxHitsFound).scoreDocs;
      hitsEquals(docid, references, hits);

      hits = searcher.search(new SecondOrderQuery(new TermQuery(new Term("id", br.get().utf8ToString())),
          new SecondOrderCollectorCitesRAM(referencesWrapper), false), maxHitsFound).scoreDocs;
      hitsEquals(docid, references, hits);

      // citations(id:X)
      if (debug)
        System.out.println(i + " cited-by : " + join(citations.get(docid))
            + " -> " + join(citationsWrapper.getLuceneDocIds(docid)));

      hits = searcher.search(new SecondOrderQuery(new TermQuery(new Term("id", br.get().utf8ToString())),
          new SecondOrderCollectorCitedBy(citationsWrapper), false), maxHitsFound).scoreDocs;
      hitsEquals(docid, citations, hits);

      if (!histogram.containsKey(hits.length)) {
        histogram.put(hits.length, 0);
      }
      histogram.put(hits.length, histogram.get(hits.length) + 1);

      // reference:X + year:1995
      Builder builder = new BooleanQuery.Builder();
      builder.add(new BooleanClause(new TermQuery(new Term("reference", document.get("bibcode"))), Occur.MUST));
      builder.add(new BooleanClause(new TermQuery(new Term("year", "1995")), Occur.MUST));
      Query expected = builder.build();

      builder = new BooleanQuery.Builder();
      builder.add(new BooleanClause(new SecondOrderQuery(new TermQuery(new Term("id", br.get().utf8ToString())),
          new SecondOrderCollectorCitedBy(citationsWrapper), false), Occur.MUST));
      builder.add(new BooleanClause(new TermQuery(new Term("year", "1995")), Occur.MUST));
      Query seeked = builder.build();

      ScoreDoc[] hitsA = searcher.search(expected, maxHitsFound).scoreDocs;
      ScoreDoc[] hitsB = searcher.search(seeked, maxHitsFound).scoreDocs;

      if (debug) {
        System.out.print("hitsA: ");
        prn(hitsA);
        System.out.print("hitsB: ");
        prn(hitsB);
      }
      assertScoreDocsEqual(hitsA, hitsB);

      if (i % 5000 == 0 && debug) {
        System.out.println("Done: " + i);
      }
    }

    int sum = 0;
    for (Entry<Integer, Integer> x : histogram.entrySet()) {
      if (debug) System.out.println(x.getKey() + " : " + x.getValue());
      sum += x.getValue();
    }
    if (debug) System.out.println(sum);
  }

  private void assertScoreDocsEqual(ScoreDoc[] a, ScoreDoc[] b) {
    ArrayList<Integer> hitsA = new ArrayList<Integer>();
    ArrayList<Integer> hitsB = new ArrayList<Integer>();
    for (ScoreDoc d : a) {
      hitsA.add(d.doc);
    }
    for (ScoreDoc d : b) {
      hitsB.add(d.doc);
    }
    assertEquals(hitsA, hitsB);
  }

  private void prn(ScoreDoc[] hits) {
    for (ScoreDoc doc : hits) {
      System.out.print(doc);
      System.out.print(", ");
    }
    System.out.println();
  }

  private String join(int[] l) {
    if (l == null) return "";
    StringBuilder sb = new StringBuilder();
    for (int s : l) {
      if (sb.length() > 0) sb.append(",");
      sb.append(s);
    }
    return sb.toString();
  }

  /**
   * Rebuilds the citation graph directly from stored fields: maps every
   * bibcode to its lucene docid and records the "reference" values of each
   * document. Returns a map of docid -> docids it references.
   */
  private HashMap<Integer, int[]> reconstructCitationCache(SolrIndexSearcher searcher) throws IOException {
    Map<String, Integer> bibcodeToDocid = new HashMap<String, Integer>();
    Map<String, String[]> references = new HashMap<String, String[]>();

    searcher.search(new MatchAllDocsQuery(), new SimpleCollector() {
      private LeafReaderContext context;

      @Override
      public boolean needsScores() {
        return false;
      }

      @Override
      protected void doSetNextReader(LeafReaderContext context) throws IOException {
        this.context = context;
      }

      @Override
      public void collect(int doc) throws IOException {
        Document d = searcher.doc(doc + this.context.docBase);
        bibcodeToDocid.put(d.get("bibcode"), doc + this.context.docBase);
        references.put(d.get("bibcode"), d.getValues("reference"));
      }
    });

    HashMap<Integer, int[]> out = new HashMap<Integer, int[]>();
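    // resolve every stored reference bibcode back to a lucene docid; references
    // pointing at bibcodes that were never indexed are skipped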
    for (Entry<String, String[]> es : references.entrySet()) {
      int docid = bibcodeToDocid.get(es.getKey());
      Set<Integer> docids = new HashSet<Integer>();
      String[] refs = es.getValue();
      for (int i = 0; i < refs.length; i++) {
        if (bibcodeToDocid.get(refs[i]) == null)
          continue;
        docids.add(bibcodeToDocid.get(refs[i]));
      }
      out.put(docid, Arrays.stream(docids.toArray(new Integer[docids.size()]))
          .mapToInt(Integer::intValue).toArray());
    }
    return out;
  }

  /**
   * Inverts a docid -> references map into a docid -> citations map
   * (i.e. for every paper, which papers cite it).
   */
  private HashMap<Integer, int[]> invert(HashMap<Integer, int[]> cites) {
    HashMap<Integer, List<Integer>> result = new HashMap<Integer, List<Integer>>(cites.size());
    for (Entry<Integer, int[]> e : cites.entrySet()) {
      for (int paperId : e.getValue()) {
        if (!result.containsKey(paperId)) {
          result.put(paperId, new ArrayList<Integer>());
        }
        result.get(paperId).add(e.getKey());
      }
    }

    HashMap<Integer, int[]> out = new HashMap<Integer, int[]>();
    for (Entry<Integer, List<Integer>> e : result.entrySet()) {
      List<Integer> list = e.getValue();
      int[] ret = new int[list.size()];
      for (int i = 0; i < ret.length; i++)
        ret[i] = list.get(i);
      out.put(e.getKey(), ret);
    }
    return out;
  }

  private boolean hitsEquals(int docid, HashMap<Integer, int[]> cache, ScoreDoc[] hits)
      throws CorruptIndexException, IOException {
    int[] links = cache.get(docid);

    ArrayList<Integer> result = new ArrayList<Integer>();
    for (ScoreDoc d : hits) {
      result.add(d.doc);
    }

    ArrayList<Integer> expected = new ArrayList<Integer>();
    if (links != null) {
      for (int r : links) {
        expected.add(r);
      }
    }

    Collections.sort(expected);
    Collections.sort(result);
    assertEquals(docid + " differs", expected, result);
    return true;
  }

  // purely to support running under the JUnit 3 runner
  public static junit.framework.Test suite() {
    return new junit.framework.JUnit4TestAdapter(TestCitationsSearch.class);
  }
}