package gate.mimir.test; import it.unimi.dsi.fastutil.IndirectPriorityQueue; import it.unimi.dsi.fastutil.ints.IntArrayPriorityQueue; import it.unimi.dsi.fastutil.ints.IntComparator; import it.unimi.dsi.fastutil.ints.IntHeapIndirectPriorityQueue; import it.unimi.dsi.fastutil.ints.IntHeapPriorityQueue; import it.unimi.dsi.fastutil.ints.IntIndirectPriorityQueue; import it.unimi.dsi.fastutil.ints.IntPriorityQueue; import it.unimi.di.big.mg4j.index.Index; import it.unimi.di.big.mg4j.index.IndexIterator; import it.unimi.di.big.mg4j.index.IndexReader; import it.unimi.di.big.mg4j.search.DocumentIterator; import it.unimi.di.big.mg4j.search.score.BM25FScorer; import it.unimi.di.big.mg4j.search.score.BM25Scorer; import it.unimi.di.big.mg4j.search.score.CountScorer; import it.unimi.di.big.mg4j.search.score.DelegatingScorer; import it.unimi.di.big.mg4j.search.score.Scorer; import it.unimi.di.big.mg4j.search.score.TfIdfScorer; import java.io.BufferedReader; import java.io.ByteArrayInputStream; import java.io.ByteArrayOutputStream; import java.io.File; import java.io.IOException; import java.io.InputStreamReader; import java.io.ObjectInputStream; import java.io.ObjectOutputStream; import java.lang.reflect.InvocationTargetException; import java.net.MalformedURLException; import java.net.URI; import java.net.URISyntaxException; import java.net.URL; import java.text.NumberFormat; import java.util.Arrays; import java.util.Enumeration; import java.util.concurrent.BlockingQueue; import java.util.concurrent.Callable; import java.util.concurrent.LinkedBlockingQueue; import java.util.zip.ZipEntry; import java.util.zip.ZipFile; import org.apache.commons.configuration.ConfigurationException; import gate.Document; import gate.Factory; import gate.Gate; import gate.corpora.DocumentImpl; import gate.creole.ResourceData; import gate.mimir.AbstractSemanticAnnotationHelper; import gate.mimir.IndexConfig; import gate.mimir.MimirIndex; import gate.mimir.index.AtomicIndex; import gate.mimir.index.AtomicTokenIndex; import gate.mimir.index.GATEDocument; import gate.mimir.index.IndexException; import gate.mimir.search.IndexReaderPool; import gate.mimir.search.QueryEngine; import gate.mimir.search.QueryEngine.IndexType; import gate.mimir.search.QueryRunner; import gate.mimir.search.RankingQueryRunnerImpl; import gate.mimir.search.RemoteQueryRunner; import gate.mimir.search.query.QueryExecutor; import gate.mimir.search.query.QueryNode; import gate.mimir.search.query.parser.QueryParser; import gate.mimir.search.score.BindingScorer; import gate.mimir.search.score.DelegatingScoringQueryExecutor; import gate.mimir.search.score.MimirScorer; import gate.mimir.search.terms.AndTermsQuery; import gate.mimir.search.terms.DocumentTermsQuery; import gate.mimir.search.terms.DocumentsAndTermsQuery; import gate.mimir.search.terms.DocumentsOrTermsQuery; import gate.mimir.search.terms.LimitTermsQuery; import gate.mimir.search.terms.OrTermsQuery; import gate.mimir.search.terms.SortedTermsQuery; import gate.mimir.search.terms.TermTypeTermsQuery; import gate.mimir.search.terms.TermsQuery; import gate.mimir.search.terms.TermsResultSet; import gate.mimir.tool.WebUtils; import gate.mimir.util.IndexUpgrader; import gate.mimir.util.MG4JTools; import gate.util.GateException; public class Scratch { public static void main (String[] args) throws Exception { // mainIndexConvert(args); // mainIndexer5(args); mainSimple(args); // mainDirectIndexes(args); // mainBuildDirectIndex(args); // mainQueryIndex(args); // mainRemote(args); } public static final void mainIndexConvert(String[] args) throws Exception { Gate.setGateHome(new File("gate-home")); Gate.setUserConfigFile(new File("gate-home/user-gate.xml")); Gate.init(); // load the tokeniser plugin Gate.getCreoleRegister().registerDirectories( new File("gate-home/plugins/ANNIE-tokeniser").toURI().toURL()); // load the DB plugin Gate.getCreoleRegister().registerDirectories( new File("../plugins/db-h2").toURI().toURL()); // load the measurements plugin Gate.getCreoleRegister().registerDirectories( new File("../plugins/measurements").toURI().toURL()); Gate.getCreoleRegister().registerDirectories( new File("../plugins/sparql").toURI().toURL()); if(args.length != 1) throw new RuntimeException( "You need to provide a single commnad line parameter, which should " + "be the path to a pre-5.0 Mímir index."); File indexDir = new File(args[0]); IndexUpgrader indexUpgrader = new IndexUpgrader(); indexUpgrader.upgradeIndex(indexDir); } /** * Interactive tool for querying a MG4J index (e.g. a Mímir sub-index, or a * Mímir sub-index batch). * * @param args * @throws NoSuchMethodException * @throws InvocationTargetException * @throws IllegalAccessException * @throws InstantiationException * @throws ClassNotFoundException * @throws URISyntaxException * @throws IOException * @throws SecurityException * @throws ConfigurationException */ public static void mainQueryIndex(String[] args) throws ConfigurationException, SecurityException, IOException, URISyntaxException, ClassNotFoundException, InstantiationException, IllegalAccessException, InvocationTargetException, NoSuchMethodException { IndexReaderPool termSource = null; // open the term supplying index URI termsIndexUri = new File("/data/mimir-indexes/index-fastvac-1M.mimir/mg4j/mimir-token-2").toURI(); Index termsIndex = MG4JTools.openMg4jIndex(termsIndexUri); termSource = new IndexReaderPool(termsIndex, termsIndexUri); if(args == null || args.length < 2) { System.out.println("Usage:\njava Scratch indexDir indexName...\n" + "where indexDir is a mimir index directory, indexName is the basename of an index file (the file name without any extension)."); return; } // open the MG4J index URI[] indexURIs = new URI[args.length - 1]; File mg4jDir = new File(new File(args[0]), "mg4j"); IndexReaderPool[] readerPools = new IndexReaderPool[args.length - 1]; for(int i = 0; i < indexURIs.length; i++) { indexURIs[i] = new File(mg4jDir, args[i + 1]).toURI(); Index theIndex = MG4JTools.openMg4jIndex(indexURIs[i]); readerPools[i] = new IndexReaderPool(theIndex, indexURIs[i]); } BufferedReader in = new BufferedReader(new InputStreamReader(System.in)); System.out.print("Query:"); String line = in.readLine(); while(line != null && line.length() > 0) { for(int i = 0; i < readerPools.length; i++) { System.out.print("From " + indexURIs[i] + ":\n\t"); IndexReader indexReader = readerPools[i].borrowReader(); try { IndexIterator indexIter = indexReader.documents(Long.parseLong(line)); long docId = indexIter.nextDocument(); boolean first = true; while(docId > 0 && docId != IndexIterator.END_OF_LIST) { if(first) first = false; else System.out.print(", "); System.out.print(Long.toString(docId)); if(termSource != null) { System.out.print("('" + termSource.getTerm(docId) + "')"); } docId = indexIter.nextDocument(); } System.out.println("\n"); } finally { readerPools[i].returnReader(indexReader); } } System.out.print("Query:"); line = in.readLine(); } } public static void mainSimple(String[] args) throws Exception { Gate.setGateHome(new File("gate-home")); Gate.setUserConfigFile(new File("gate-home/user-gate.xml")); Gate.init(); // load the tokeniser plugin Gate.getCreoleRegister().registerDirectories( new File("gate-home/plugins/ANNIE-tokeniser").toURI().toURL()); // load the DB plugin Gate.getCreoleRegister().registerDirectories( new File("../plugins/db-h2").toURI().toURL()); // load the measurements plugin Gate.getCreoleRegister().registerDirectories( new File("../plugins/measurements").toURI().toURL()); Gate.getCreoleRegister().registerDirectories( new File("../plugins/sparql").toURI().toURL()); MimirIndex mainIndex = new MimirIndex(new File(args[0])); QueryEngine qEngine = mainIndex.getQueryEngine(); // String query = "electrical"; // String query = "{Document date > 20070000}"; // String query = "{Abstract}"; String[] queries = new String[] {"{Mention inst=\"http://dbpedia.org/resource/Sean_Bean\"}"}; //new String[]{"electrical", "the", "{Document date > 20070000}"}; long start = System.currentTimeMillis(); NumberFormat nf = NumberFormat.getNumberInstance(); for(String query : queries) { System.out.println("Query: " + query); QueryNode qNode = QueryParser.parse(query); long startLocal = System.currentTimeMillis(); QueryExecutor qExecutor = qNode.getQueryExecutor(qEngine); long latestDoc = qExecutor.nextDocument(-1); int totalHitCount = 0; int docCount = 0; while(latestDoc >= 0) { docCount++; int hitCount = 0; while(qExecutor.nextHit() != null) hitCount++; totalHitCount += hitCount; System.out.println("Doc " + latestDoc + ", hits: " + hitCount); latestDoc = qExecutor.nextDocument(-1); } System.out.println("Found " + nf.format(totalHitCount) + " hits in " + nf.format(docCount) + " documents, in " + nf.format(System.currentTimeMillis() - startLocal) + " ms.\n" + "========================================================\n" + "========================================================"); qExecutor.close(); } System.out.println("Total time " + nf.format(System.currentTimeMillis() - start) + " ms."); mainIndex.close(); } /** * Scratch code to exercise the 5.0 indexer framework */ public static void mainIndexer5(String[] args) throws Exception { Gate.setGateHome(new File("gate-home")); Gate.setUserConfigFile(new File("gate-home/user-gate.xml")); Gate.init(); // load the tokeniser plugin Gate.getCreoleRegister().registerDirectories(new File("gate-home/plugins/ANNIE-tokeniser").toURI().toURL()); // load the DB plugin Gate.getCreoleRegister().registerDirectories(new File("../plugins/db-h2").toURI().toURL()); // load the measurements plugin Gate.getCreoleRegister().registerDirectories(new File("../plugins/measurements").toURI().toURL()); File indexDir = new File(args[0]); IndexConfig indexConfig = TestUtils.getTestIndexConfig(indexDir, Class.forName("gate.mimir.db.DBSemanticAnnotationHelper", true, Gate.getClassLoader()).asSubclass( AbstractSemanticAnnotationHelper.class)); MimirIndex mainIndex = new MimirIndex(indexConfig); mainIndex.setOccurrencesPerBatch(1000000); // index some documents File zipFile = new File(args[1]); String fileURI = zipFile.toURI().toString(); ZipFile zip = new ZipFile(args[1]); Enumeration<? extends ZipEntry> entries = zip.entries(); int copies = 10; boolean compress = true; ResourceData docRd = Gate.getCreoleRegister().get(DocumentImpl.class.getName()); while(entries.hasMoreElements()) { ZipEntry entry = entries.nextElement(); if(entry.isDirectory()) { continue; } URL url = new URL("jar:" + fileURI + "!/" + entry.getName()); Document doc = gate.Factory.newDocument(url, "UTF-8"); ByteArrayOutputStream baos = new ByteArrayOutputStream(); ObjectOutputStream oos = new ObjectOutputStream(baos); oos.writeObject(doc); oos.close(); Factory.deleteResource(doc); byte[] docBytes = baos.toByteArray(); for(int i = 0; i < copies; i++) { doc = (Document) new ObjectInputStream(new ByteArrayInputStream(docBytes)).readObject(); docRd.addInstantiation(doc); mainIndex.indexDocument(doc); } } if(compress){ mainIndex.requestSyncToDisk(); mainIndex.requestCompactIndex(); } mainIndex.compactDocumentCollection(); mainIndex.close(); } /** * Scratch code to exercise the 5.0 indexer framework */ public static void mainAtomicTokenIndexer5(String[] args) throws Exception { Gate.setGateHome(new File("gate-home")); Gate.setUserConfigFile(new File("gate-home/user-gate.xml")); Gate.init(); // load the tokeniser plugin Gate.getCreoleRegister().registerDirectories(new File("gate-home/plugins/ANNIE-tokeniser").toURI().toURL()); // load the DB plugin Gate.getCreoleRegister().registerDirectories(new File("../plugins/db-h2").toURI().toURL()); // load the measurements plugin Gate.getCreoleRegister().registerDirectories(new File("../plugins/measurements").toURI().toURL()); File indexDir = new File(args[0]); IndexConfig indexConfig = TestUtils.getTestIndexConfig(indexDir, Class.forName("gate.mimir.db.DBSemanticAnnotationHelper", true, Gate.getClassLoader()).asSubclass( AbstractSemanticAnnotationHelper.class)); MimirIndex mainIndex = new MimirIndex(indexDir); // build a token indexer BlockingQueue<GATEDocument> inputQueue = new LinkedBlockingQueue<GATEDocument>(); BlockingQueue<GATEDocument> outputQueue = new LinkedBlockingQueue<GATEDocument>(); AtomicTokenIndex ati = new AtomicTokenIndex(mainIndex, "tokens-0", false, inputQueue, outputQueue, indexConfig.getTokenIndexers()[0], false); File zipFile = new File(args[1]); String fileURI = zipFile.toURI().toString(); ZipFile zip = new ZipFile(args[1]); Enumeration<? extends ZipEntry> entries = zip.entries(); int copies = 100; while(entries.hasMoreElements()) { ZipEntry entry = entries.nextElement(); if(entry.isDirectory()) { continue; } URL url = new URL("jar:" + fileURI + "!/" + entry.getName()); Document doc = gate.Factory.newDocument(url, "UTF-8"); for(int i = 0; i < copies; i++) { GATEDocument gateDoc = new GATEDocument(doc, indexConfig); inputQueue.put(gateDoc); } // now let's do some searches Index index = ati.getIndex(); int resDocs = 0; if(index != null) { IndexIterator indexIter = index.getReader().documents("patent"); long docId = indexIter.nextDocument(); while(docId != DocumentIterator.END_OF_LIST) { resDocs ++; docId = indexIter.nextDocument(); } } System.out.println("=================================\n" + "Matched " + resDocs + " documents.\n" + "=================================\n"); } // compress the index ati.requestCompactIndex(); System.out.println("=================================\n" + "Compressing index.\n" + "=================================\n"); Thread.sleep(5000); // and search one last time Index index = ati.getIndex(); int resDocs = 0; if(index != null) { IndexIterator indexIter = index.getReader().documents("patent"); long docId = indexIter.nextDocument(); while(docId != DocumentIterator.END_OF_LIST) { resDocs ++; docId = indexIter.nextDocument(); } } System.out.println("=================================\n" + "Matched " + resDocs + " documents.\n" + "=================================\n"); ati.close(); } /** * Version that exercises the scorers * @param args */ public static void mainScorers(String[] args) throws Exception { Gate.setGateHome(new File("gate-home")); Gate.setUserConfigFile(new File("gate-home/user-gate.xml")); Gate.init(); // load the tokeniser plugin Gate.getCreoleRegister().registerDirectories( new File("gate-home/plugins/ANNIE-tokeniser").toURI().toURL()); // load the DB plugin Gate.getCreoleRegister().registerDirectories( new File("../plugins/db-h2").toURI().toURL()); // load the measurements plugin Gate.getCreoleRegister().registerDirectories( new File("../plugins/measurements").toURI().toURL()); // load the SPARQL plugin Gate.getCreoleRegister().registerDirectories( new File("../plugins/sparql").toURI().toURL()); QueryEngine qEngine = new MimirIndex(new File(args[0])).getQueryEngine(); qEngine.setScorerSource(new Callable<MimirScorer>() { @Override public MimirScorer call() throws Exception { return new BindingScorer(2, 0.9); // return new DelegatingScoringQueryExecutor(new TfIdfScorer()); // return new DelegatingScoringQueryExecutor(new CountScorer()); // return new DelegatingScoringQueryExecutor(new BM25Scorer()); } }); BufferedReader input = new BufferedReader(new InputStreamReader(System.in)); String query = null; do { try{ System.out.print("? "); query = input.readLine(); long start = System.currentTimeMillis(); if(query == null || query.trim().length() == 0) break; QueryRunner qRunner = qEngine.getQueryRunner(query); while(qRunner.getDocumentsCount() < 0) { Thread.sleep(100); } double minScore = Double.MAX_VALUE; double maxScore = Double.MIN_VALUE; long docCount = qRunner.getDocumentsCount(); for(int i = 0; i < docCount; i++) { double score = qRunner.getDocumentScore(i); if(score < minScore) minScore = score; if(score > maxScore) maxScore = score; // exercise the runner qRunner.getDocumentID(i); qRunner.getDocumentTitle(i); qRunner.getDocumentURI(i); qRunner.getDocumentHits(i); } System.out.println(String.format( "Matched %d documents, scores %02.4f - %02.4f, in %02.2f seconds", docCount, minScore, maxScore, (double)(System.currentTimeMillis() - start)/1000)); qRunner.close(); }catch(Exception e) { e.printStackTrace(System.err); } } while (query != null); qEngine.close(); } /** * Main version for testing direct indexes * @param args * @throws Exception sometimes */ public static void mainDirectIndexes(String[] args) throws Exception { Gate.setGateHome(new File("gate-home")); Gate.setUserConfigFile(new File("gate-home/user-gate.xml")); Gate.init(); // load the tokeniser plugin Gate.getCreoleRegister().registerDirectories( new File("gate-home/plugins/ANNIE-tokeniser").toURI().toURL()); // load the DB plugin Gate.getCreoleRegister().registerDirectories( new File("../plugins/db-h2").toURI().toURL()); // load the measurements plugin Gate.getCreoleRegister().registerDirectories( new File("../plugins/measurements").toURI().toURL()); Gate.getCreoleRegister().registerDirectories( new File("../plugins/sparql").toURI().toURL()); MimirIndex mainIndex = new MimirIndex(new File(args[0])); QueryEngine qEngine = mainIndex.getQueryEngine(); TermsQuery query = null; // query = new DocumentTermsQuery("root", IndexType.TOKENS, true, true, 0); // printTermQuery(query, qEngine); // System.out.println("\n======================================="); // query = new DocumentTermsQuery("root", IndexType.TOKENS, // true, true, DocumentTermsQuery.NO_LIMIT, 1); // printTermQuery(query, qEngine); // System.out.println("\n======================================="); // query = new DocumentsOrTermsQuery("root", IndexType.TOKENS, // true, true, TermsQuery.NO_LIMIT, 0, 1); // printTermQuery(query, qEngine); // System.out.println("\n======================================="); // TermsQuery q1 = new DocumentTermsQuery("root", IndexType.TOKENS, // true, true, TermsQuery.NO_LIMIT, 0); // TermsQuery q2 = new DocumentTermsQuery("root", IndexType.TOKENS, // true, true, TermsQuery.NO_LIMIT, 1); // query = new OrTermsQuery(true, true, TermsQuery.NO_LIMIT, q1, q2); // // query = new LimitTermsQuery(new SortedTermsQuery(query), 100); // query = new LimitTermsQuery( // new SortedTermsQuery( // new DocumentsOrTermsQuery("root", IndexType.TOKENS, true, false, 0, 1, 2)) // , 100); // printTermQuery(query, qEngine); // // System.out.println("\n======================================="); query = new LimitTermsQuery( new SortedTermsQuery( // new TermTypeTermsQuery("root", IndexType.TOKENS)) new DocumentTermsQuery("root", IndexType.TOKENS, true, true, 1) ), 100); printTermQuery(query, qEngine); System.out.println("\n======================================="); mainIndex.close(); } /** * Scratch code for using the remote query runner * @param args 2 string: index URL and query * @throws Exception */ public static void mainRemote(String[] args) throws Exception { if(args.length != 2) { System.out.println("Usage: Scratch indexUrl queryString"); return; } RemoteQueryRunner rqr = new RemoteQueryRunner(args[0], args[1], null, new WebUtils()); long docCount = rqr.getDocumentsCount(); while(docCount < 0) { System.out.println("Working (found " + rqr.getDocumentsCurrentCount() + " documents so far)"); Thread.sleep(1000); docCount = rqr.getDocumentsCount(); } System.out.println("Search complete; found: " + docCount + " documents."); } private static NumberFormat nf = NumberFormat.getNumberInstance(); private static void printTermQuery(TermsQuery query, QueryEngine qEngine) throws IOException { long start = System.currentTimeMillis(); TermsResultSet res = query.execute(qEngine); for(int i = 0; i < res.termStrings.length; i++) { System.out.print("\"" + res.termStrings[i] + "\"\t"); if(res.termLengths != null) { System.out.print("len:" + res.termLengths[i] + "\t"); } if(res.termCounts != null) { System.out.print("cnt:" + res.termCounts[i]); } System.out.println(); } System.out.println("Found " + nf.format(res.termStrings.length) + " hits in " + nf.format(System.currentTimeMillis() - start) + " ms."); } }