package org.apache.lucene.search;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Random;
import java.util.Set;
import java.util.stream.Collectors;

import org.apache.lucene.document.Document;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.BooleanQuery.Builder;
import org.apache.lucene.util.BytesRefBuilder;
import org.apache.lucene.util.LegacyNumericUtils;
import org.apache.solr.request.SolrQueryRequest;
import org.apache.solr.search.CitationLRUCache;
import org.apache.solr.search.SolrIndexSearcher;
import org.junit.BeforeClass;

import monty.solr.util.MontySolrAbstractTestCase;
import monty.solr.util.MontySolrSetup;

@SuppressWarnings({"rawtypes", "unchecked"})
public class TestCitationsSearch extends MontySolrAbstractTestCase {

  private boolean debug = true;
  private SolrQueryRequest tempReq;

  @BeforeClass
  public static void beforeClass() throws Exception {
    makeResourcesVisible(Thread.currentThread().getContextClassLoader(),
        new String[] {
            MontySolrSetup.getMontySolrHome() + "/contrib/examples/adsabs/server/solr/collection1/conf",
            MontySolrSetup.getSolrHome() + "/example/solr/collection1/conf"
        });

    System.setProperty("solr.allow.unsafe.resourceloading", "true");

    schemaString = MontySolrSetup.getMontySolrHome()
        + "/contrib/adsabs/src/test-files/solr/collection1/conf/"
        + "schema-citations-transformer.xml";

    configString = MontySolrSetup.getMontySolrHome()
        + "/contrib/adsabs/src/test-files/solr/collection1/conf/"
        + "citation-cache-solrconfig.xml";

    initCore(configString, schemaString, MontySolrSetup.getSolrHome() + "/example/solr");
  }

  @Override
  public void setUp() throws Exception {
    if (debug) {
      // TODO: set the codec; new File(TEMP_DIR, "index-citations")
    }
    super.setUp();
  }

  @Override
  public void tearDown() throws Exception {
    if (tempReq != null) {
      tempReq.close();
    }
    super.tearDown();
  }

  /**
   * Indexes documents with random citation links. Each document gets an id,
   * a bibcode ("b" + id), a year, and up to five reference/ireference values
   * pointing at other generated ids (which may or may not exist in the index).
   * Returns a map of id -> ids it references.
   */
  public HashMap<Integer, int[]> createRandomDocs(int start, int numDocs) throws IOException {
    Random randomSeed = new Random();

    // pool of candidate reference targets
    int[] randData = new int[numDocs / 10];
    for (int i = 0; i < randData.length; i++) {
      randData[i] = Math.abs(randomSeed.nextInt(numDocs) - start);
    }

    // pick 0-5 references per document from the pool
    int x = 0;
    int[][] randi = new int[numDocs - start][];
    for (int i = 0; i < numDocs - start; i++) {
      int howMany = randomSeed.nextInt(6);
      randi[i] = new int[howMany];
      for (int j = 0; j < howMany; j++) {
        if (x >= randData.length) {
          x = 0;
        }
        randi[i][j] = randData[x++];
      }
    }

    HashMap<Integer, int[]> data = new HashMap<Integer, int[]>(randi.length);
    List<String> thisDoc = new ArrayList<String>();
    for (int k = 0; k < randi.length; k++) {
      thisDoc.clear();
      thisDoc.add("id");
      thisDoc.add(String.valueOf(k + start));
      thisDoc.add("bibcode");
      thisDoc.add("b" + (k + start));
      thisDoc.add("year");
      if (k % 2 == 0) {
        thisDoc.add("2000");
      } else {
        thisDoc.add("1995");
      }

      int[] row = new int[randi[k].length];
      x = 0;
      for (int v : randi[k]) {
        row[x] = v + start;
        thisDoc.add("reference");
        thisDoc.add("b" + String.valueOf(v + start));
        thisDoc.add("ireference");
        thisDoc.add(String.valueOf(v + start));
        x++;
      }

      assertU(adoc(thisDoc.toArray(new String[thisDoc.size()])));
      data.put(k + start, row);
      if (debug) System.out.println(thisDoc);
    }

    if (debug) System.out.println("Created random docs: " + start + " - " + numDocs);
    return data;
  }
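  /**
   * End-to-end check of the citation cache and the second-order collectors:
   * indexes random citation data across several segments, rebuilds the
   * reference/citation maps directly from stored fields, and then verifies
   * that the cache wrappers and the SecondOrder* collectors return the same
   * lucene docids.
   */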
  public void testCitesCollector() throws Exception {
    int maxHits = 1000;
    int maxHitsFound = new Float(maxHits * 0.3f).intValue();

    createRandomDocs(0, new Float(maxHits * 0.4f).intValue());
    assertU(commit("waitSearcher", "true")); // closes the writer, creates a new segment

    createRandomDocs(new Float(maxHits * 0.3f).intValue(), new Float(maxHits * 0.7f).intValue());
    assertU(commit("waitSearcher", "true")); // closes the writer, creates a new segment

    createRandomDocs(new Float(maxHits * 0.71f).intValue(), new Float(maxHits * 1.0f).intValue());
    assertU(commit("waitSearcher", "true")); // closes the writer, creates a new segment

    createRandomDocs(0, new Float(maxHits * 0.2f).intValue());
    assertU(commit("waitSearcher", "true")); // closes the writer, creates a new segment

    // get the cache
    tempReq = req("test");
    SolrIndexSearcher searcher = tempReq.getSearcher();
    final CitationLRUCache cache = (CitationLRUCache) searcher.getCache("citations-cache-from-references");
    assertNotNull(cache);

    @SuppressWarnings("rawtypes")
    SolrCacheWrapper citationsWrapper = new SolrCacheWrapper.CitationsCache(cache);
    @SuppressWarnings("rawtypes")
    SolrCacheWrapper referencesWrapper = new SolrCacheWrapper.ReferencesCache(cache);

    // invert ourselves - this is what we expect to find
    HashMap<Integer, int[]> references = reconstructCitationCache(searcher);
    HashMap<Integer, int[]> citations = invert(references);

    // every reference must show up in the citation cache of the cited paper
    for (Entry<Integer, int[]> es : references.entrySet()) {
      int docid = es.getKey();
      int[] docids = es.getValue();
      for (int reference : docids) {
        List<Integer> a = Arrays.stream(citations.get(reference)).boxed().collect(Collectors.toList());
        List<Integer> b = Arrays.stream(citationsWrapper.getLuceneDocIds(reference)).boxed().collect(Collectors.toList());
        assertTrue(a.contains(docid));
        assertTrue(b.contains(docid));
        assertEquals(a, b);
      }
    }

    // and every citation must show up in the reference cache of the citing paper
    for (Entry<Integer, int[]> es : citations.entrySet()) {
      int docid = es.getKey();
      int[] docids = es.getValue();
      for (int reference : docids) {
        List<Integer> a = Arrays.stream(references.get(reference)).boxed().collect(Collectors.toList());
        List<Integer> b = Arrays.stream(referencesWrapper.getLuceneDocIds(reference)).boxed().collect(Collectors.toList());
        Collections.sort(a);
        Collections.sort(b);
        assertTrue(a.contains(docid));
        assertTrue(b.contains(docid));
        assertEquals(docid + " produced diff cache results", a, b);
      }
    }

    // to collect the measurement data
    Map<Integer, Integer> histogram = new HashMap<Integer, Integer>();

    SecondOrderCollectorCites coll = new SecondOrderCollectorCites(referencesWrapper, new String[] {"reference"});
    coll.searcherInitialization(searcher, null);

    // run 2nd order through the whole index (no IO error should happen)
    searcher.search(new SecondOrderQuery(new MatchAllDocsQuery(), coll), 10);

    ScoreDoc[] hits;
    for (int i = 0; i < maxHits; i++) {
      // int fields must be searched with the prefix-coded bytes value (not the plain string)
      BytesRefBuilder br = new BytesRefBuilder();
      LegacyNumericUtils.intToPrefixCoded(i, 0, br);

      ScoreDoc[] doc = searcher.search(new TermQuery(new Term("id", br.get().utf8ToString())), 1000).scoreDocs;
      if (doc.length == 0) // that's ok, some docs are missing
        continue;

      assertEquals("Not found : " + i, 1, doc.length);
      Document document = searcher.getIndexReader().document(doc[0].doc);
      int docid = doc[0].doc;

      // references(id:X)
      if (debug)
        System.out.println(i + " cites : " + join(references.get(docid))
            + " -> " + join(referencesWrapper.getLuceneDocIds(docid)));
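      // the term-based and the RAM-based "cites" collectors should both reproduce
      // the reference list reconstructed from stored fields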
      hits = searcher.search(new SecondOrderQuery(new TermQuery(new Term("id", br.get().utf8ToString())),
          new SecondOrderCollectorCites(referencesWrapper, new String[] {"reference"})), maxHitsFound).scoreDocs;
      hitsEquals(docid, references, hits);

      hits = searcher.search(new SecondOrderQuery(new TermQuery(new Term("id", br.get().utf8ToString())),
          new SecondOrderCollectorCitesRAM(referencesWrapper), false), maxHitsFound).scoreDocs;
      hitsEquals(docid, references, hits);

      // citations(id:X)
      if (debug)
        System.out.println(i + " cited-by : " + join(citations.get(docid))
            + " -> " + join(citationsWrapper.getLuceneDocIds(docid)));

      hits = searcher.search(new SecondOrderQuery(new TermQuery(new Term("id", br.get().utf8ToString())),
          new SecondOrderCollectorCitedBy(citationsWrapper), false), maxHitsFound).scoreDocs;
      hitsEquals(docid, citations, hits);

      if (!histogram.containsKey(hits.length)) {
        histogram.put(hits.length, 0);
      }
      histogram.put(hits.length, histogram.get(hits.length) + 1);

      // reference:X + year:1995
      Builder builder = new BooleanQuery.Builder();
      builder.add(new BooleanClause(new TermQuery(new Term("reference", document.get("bibcode"))), Occur.MUST));
      builder.add(new BooleanClause(new TermQuery(new Term("year", "1995")), Occur.MUST));
      Query expected = builder.build();

      builder = new BooleanQuery.Builder();
      builder.add(new BooleanClause(new SecondOrderQuery(new TermQuery(new Term("id", br.get().utf8ToString())),
          new SecondOrderCollectorCitedBy(citationsWrapper), false), Occur.MUST));
      builder.add(new BooleanClause(new TermQuery(new Term("year", "1995")), Occur.MUST));
      Query seeked = builder.build();

      ScoreDoc[] hitsA = searcher.search(expected, maxHitsFound).scoreDocs;
      ScoreDoc[] hitsB = searcher.search(seeked, maxHitsFound).scoreDocs;

      if (debug) {
        System.out.print("hitsA: ");
        prn(hitsA);
        System.out.print("hitsB: ");
        prn(hitsB);
      }
      assertScoreDocsEqual(hitsA, hitsB);

      if (i % 5000 == 0 && debug) {
        System.out.println("Done: " + i);
      }
    }

    int sum = 0;
    for (Entry<Integer, Integer> x : histogram.entrySet()) {
      if (debug) System.out.println(x.getKey() + " : " + x.getValue());
      sum += x.getValue();
    }
    if (debug) System.out.println(sum);
  }

  private void assertScoreDocsEqual(ScoreDoc[] a, ScoreDoc[] b) {
    ArrayList<Integer> hitsA = new ArrayList<Integer>();
    ArrayList<Integer> hitsB = new ArrayList<Integer>();
    for (ScoreDoc d : a) {
      hitsA.add(d.doc);
    }
    for (ScoreDoc d : b) {
      hitsB.add(d.doc);
    }
    assertEquals(hitsA, hitsB);
  }

  private void prn(ScoreDoc[] hits) {
    for (ScoreDoc doc : hits) {
      System.out.print(doc);
      System.out.print(", ");
    }
    System.out.println();
  }

  private String join(int[] l) {
    if (l == null) return "";
    StringBuilder sb = new StringBuilder();
    for (int s : l) {
      if (sb.length() > 0) sb.append(",");
      sb.append(s);
    }
    return sb.toString();
  }

  /**
   * Rebuilds the citation graph directly from stored fields: maps every
   * bibcode to its lucene docid and records the "reference" values of each
   * document. Returns a map of docid -> docids it references.
   */
  private HashMap<Integer, int[]> reconstructCitationCache(SolrIndexSearcher searcher) throws IOException {
    Map<String, Integer> bibcodeToDocid = new HashMap<String, Integer>();
    Map<String, String[]> references = new HashMap<String, String[]>();

    searcher.search(new MatchAllDocsQuery(), new SimpleCollector() {
      private LeafReaderContext context;

      @Override
      public boolean needsScores() {
        return false;
      }

      @Override
      protected void doSetNextReader(LeafReaderContext context) throws IOException {
        this.context = context;
      }

      @Override
      public void collect(int doc) throws IOException {
        Document d = searcher.doc(doc + this.context.docBase);
        bibcodeToDocid.put(d.get("bibcode"), doc + this.context.docBase);
        references.put(d.get("bibcode"), d.getValues("reference"));
      }
    });

    HashMap<Integer, int[]> out = new HashMap<Integer, int[]>();
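    // resolve every stored reference bibcode back to a lucene docid; references
    // pointing at bibcodes that were never indexed are skipped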
    for (Entry<String, String[]> es : references.entrySet()) {
      int docid = bibcodeToDocid.get(es.getKey());
      Set<Integer> docids = new HashSet<Integer>();
      String[] refs = es.getValue();
      for (int i = 0; i < refs.length; i++) {
        if (bibcodeToDocid.get(refs[i]) == null)
          continue;
        docids.add(bibcodeToDocid.get(refs[i]));
      }
      out.put(docid, Arrays.stream(docids.toArray(new Integer[docids.size()]))
          .mapToInt(Integer::intValue).toArray());
    }
    return out;
  }

  /**
   * Inverts a docid -> references map into a docid -> citations map
   * (i.e. for every paper, which papers cite it).
   */
  private HashMap<Integer, int[]> invert(HashMap<Integer, int[]> cites) {
    HashMap<Integer, List<Integer>> result = new HashMap<Integer, List<Integer>>(cites.size());
    for (Entry<Integer, int[]> e : cites.entrySet()) {
      for (int paperId : e.getValue()) {
        if (!result.containsKey(paperId)) {
          result.put(paperId, new ArrayList<Integer>());
        }
        result.get(paperId).add(e.getKey());
      }
    }

    HashMap<Integer, int[]> out = new HashMap<Integer, int[]>();
    for (Entry<Integer, List<Integer>> e : result.entrySet()) {
      List<Integer> list = e.getValue();
      int[] ret = new int[list.size()];
      for (int i = 0; i < ret.length; i++)
        ret[i] = list.get(i);
      out.put(e.getKey(), ret);
    }
    return out;
  }

  private boolean hitsEquals(int docid, HashMap<Integer, int[]> cache, ScoreDoc[] hits)
      throws CorruptIndexException, IOException {
    int[] links = cache.get(docid);

    ArrayList<Integer> result = new ArrayList<Integer>();
    for (ScoreDoc d : hits) {
      result.add(d.doc);
    }

    ArrayList<Integer> expected = new ArrayList<Integer>();
    if (links != null) {
      for (int r : links) {
        expected.add(r);
      }
    }

    Collections.sort(expected);
    Collections.sort(result);
    assertEquals(docid + " differs", expected, result);
    return true;
  }

  // purely to support running under the JUnit 3 runner
  public static junit.framework.Test suite() {
    return new junit.framework.JUnit4TestAdapter(TestCitationsSearch.class);
  }
}