/**
 * Copyright 2010 T Jake Luciani
 *
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package lucandra.wikipedia;

import java.util.List;
import java.util.Random;
import java.util.concurrent.Callable;
import java.util.concurrent.ConcurrentLinkedQueue;

import lucandra.CassandraUtils;
import lucandra.IndexWriter;

import org.apache.cassandra.thrift.Cassandra;
import org.apache.cassandra.thrift.TokenRange;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.cjk.CJKAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Field.Index;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.document.Field.TermVector;
import org.apache.lucene.util.Version;
import org.apache.thrift.transport.TTransportException;

/**
 * Task that adds one {@link Article} to the "wikipedia" index.
 *
 * <p>Each thread that runs these tasks lazily creates its own
 * {@link lucandra.IndexWriter} (kept in a {@link ThreadLocal}) with auto-commit
 * switched off, and commits explicitly once every {@value #COMMIT_BATCH_SIZE}
 * documents. Every writer is also recorded in a shared queue so that a JVM
 * shutdown hook can commit whatever is still pending in a partially filled batch.
 */
public class WikipediaIndexWorker implements Callable<Integer> {

    /** Number of documents a thread adds between two explicit commits. */
    private static final int COMMIT_BATCH_SIZE = 64;

    // every writer created by any worker thread; drained by the shutdown hook
    private static final ConcurrentLinkedQueue<lucandra.IndexWriter> allClients =
            new ConcurrentLinkedQueue<IndexWriter>();

    // each worker thread has a connection to cassandra
    private static final ThreadLocal<lucandra.IndexWriter> clientPool =
            new ThreadLocal<lucandra.IndexWriter>();

    // per-thread count of documents added through that thread's writer
    private static final ThreadLocal<Integer> batchCount = new ThreadLocal<Integer>();

    // get ring info
    private static final List<TokenRange> ring;

    static {
        try {
            Cassandra.Iface client = CassandraUtils.createConnection();
            ring = client.describe_ring(CassandraUtils.keySpace);
        } catch (Exception e) {
            // class initialization fails if the ring cannot be described
            throw new RuntimeException(e);
        }
    }

    // Add shutdown hook for batched commits to complete
    static {
        Runtime.getRuntime().addShutdownHook(new Thread() {
            @Override
            public void run() {
                // NOTE(review): this commits from the hook thread; it presumes the
                // worker threads are no longer using their writers - confirm.
                lucandra.IndexWriter w;
                while ((w = allClients.poll()) != null) {
                    try {
                        w.commit();
                    } catch (RuntimeException e) {
                        // one failing writer must not prevent the others from committing
                        System.err.println("commit failed during shutdown: " + e);
                    }
                }
                System.err.println("committed");
            }
        });
    }

    // this is shared by all workers
    private static final Analyzer analyzer = new CJKAnalyzer(Version.LUCENE_CURRENT);

    // this is the article to index
    private final Article article;

    /**
     * Creates a task for a single article.
     *
     * @param article the article to index when {@link #call()} runs
     */
    public WikipediaIndexWorker(Article article) {
        this.article = article;
    }

    /**
     * Returns the calling thread's index writer, creating it on first use.
     *
     * <p>A new writer connects to an endpoint picked at random from a randomly
     * chosen token range of the ring, has auto-commit disabled, and is registered
     * in {@code allClients} so the shutdown hook can commit its pending documents.
     *
     * @return the writer bound to the current thread
     * @throws TTransportException propagated from opening the Cassandra connection
     */
    private lucandra.IndexWriter getIndexWriter() throws TTransportException {
        lucandra.IndexWriter indexWriter = clientPool.get();
        if (indexWriter != null) {
            return indexWriter;
        }

        Random r = new Random();
        List<String> endpoints = ring.get(r.nextInt(ring.size())).endpoints;
        String endpoint = endpoints.get(r.nextInt(endpoints.size()));

        // NOTE(review): 9160 is presumably the Thrift port; the meaning of the two
        // boolean flags is not visible from this file - confirm against CassandraUtils.
        indexWriter = new lucandra.IndexWriter("wikipedia",
                CassandraUtils.createRobustConnection(endpoint, 9160, false, false));

        // finish configuring the writer before it becomes visible anywhere else
        indexWriter.setAutoCommit(false);
        batchCount.set(0);
        clientPool.set(indexWriter);

        // Without this registration the shutdown hook has nothing to drain and the
        // last partial batch of every thread is never committed.
        allClients.add(indexWriter);

        return indexWriter;
    }

    /**
     * Builds a Lucene document from the article (title, optional text, url), adds
     * it through the current thread's writer, and commits after every
     * {@value #COMMIT_BATCH_SIZE}th document added by this thread.
     *
     * @return the value of {@code article.getSize()}
     * @throws Exception if obtaining the writer, decoding the text as UTF-8,
     *         adding the document, or committing fails
     */
    @Override
    public Integer call() throws Exception {
        lucandra.IndexWriter indexWriter = getIndexWriter();

        Document d = new Document();
        d.add(new Field("title", article.title, Store.YES, Index.ANALYZED, TermVector.WITH_POSITIONS));

        // the text field is skipped when the article carries no text
        if (article.text != null) {
            d.add(new Field("text", new String(article.text, "UTF-8"), Store.YES, Index.ANALYZED,
                    TermVector.WITH_POSITIONS_OFFSETS));
        }

        d.add(new Field("url", article.url, Store.YES, Index.NOT_ANALYZED));

        indexWriter.addDocument(d, analyzer);

        // auto-commit is off, so flush explicitly once per full batch
        int added = batchCount.get() + 1;
        if (added % COMMIT_BATCH_SIZE == 0) {
            indexWriter.commit();
        }
        batchCount.set(added);

        return article.getSize();
    }
}