/**
 *
 */
package com.facebook.infrastructure.loader;

import com.facebook.infrastructure.config.DatabaseDescriptor;
import com.facebook.infrastructure.db.RowMutation;
import com.facebook.infrastructure.db.Table;
import com.facebook.infrastructure.io.SSTable;
import com.facebook.infrastructure.net.EndPoint;
import com.facebook.infrastructure.service.StorageService;
import com.facebook.infrastructure.utils.LogUtil;

import org.apache.log4j.Logger;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;

import javax.xml.bind.JAXBContext;
import javax.xml.bind.Unmarshaller;
import java.io.*;
import java.util.*;

/**
 * This class is used to load the storage endpoints with the relevant data:
 * both the data they are responsible for and the data that should be
 * replicated onto them. Population is driven by an XML file which should
 * adhere to a schema.
 *
 * Author : Avinash Lakshman ( alakshman@facebook.com) & Prashant Malik ( pmalik@facebook.com )
 */
public class Loader
{
    /* Nap time, in milliseconds, after forcing a compaction or a relocation. */
    private static long siesta_ = 60 * 1000;
    private static Logger logger_ = Logger.getLogger(Loader.class);
    /* Import descriptor unmarshalled from the XML file. */
    private Importer importer_;
    private StorageService storageService_;

    public Loader(StorageService storageService)
    {
        storageService_ = storageService;
    }

    /*
     * This method loads all the keys into a special column family
     * called "RecycleBin". This column family is used for temporary
     * processing of data and can then be recycled. The idea is that
     * after the load is complete we have all the keys in the system.
     * We then force a compaction and examine the single index file
     * that is generated to determine how the nodes need to relocate
     * to be perfectly load balanced.
     *
     * @param rootDirectory - directory at which the parsing begins.
     * @param table - table that will be populated.
     * @param cfName - name of the column that will be populated. This is
     * passed in so that we do not unnecessarily allocate temporary String objects.
     */
    private void preParse(File rootDirectory, String table, String cfName) throws Throwable
    {
        File[] files = rootDirectory.listFiles();
        for (File file : files)
        {
            if (file.isDirectory())
            {
                preParse(file, table, cfName);
            }
            else
            {
                /* Record just the file name as a key in the RecycleBin. */
                String fileName = file.getName();
                RowMutation rm = new RowMutation(table, fileName);
                rm.add(cfName, fileName.getBytes(), 0);
                rm.apply();
            }
        }
    }

    /*
     * Merges a list of strings with a particular combiner.
     */
    String merge(List<String> listFields, String combiner)
    {
        if (listFields.size() == 0)
            return null;
        if (listFields.size() == 1)
            return listFields.get(0);

        String mergedKey = null;
        for (String field : listFields)
        {
            if (mergedKey == null)
                mergedKey = field;
            else
                mergedKey = mergedKey + combiner + field;
        }
        return mergedKey;
    }

    /*
     * This method checks if the local storage endpoint
     * is responsible for storing this key.
     */
    boolean checkIfProcessKey(String key)
    {
        EndPoint[] endPoints = storageService_.getNStorageEndPoint(key);
        EndPoint localEndPoint = StorageService.getLocalStorageEndPoint();
        for (EndPoint endPoint : endPoints)
        {
            if (endPoint.equals(localEndPoint))
                return true;
        }
        return false;
    }
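    /*
     * For illustration only: a minimal sketch of what an import descriptor
     * consumed by parse() might look like. The element and attribute names
     * below are hypothetical, inferred from the fields that parse() reads
     * (key.fields.field, key.combiner, columnFamily.delimiter, etc.); the
     * authoritative shape is whatever the JAXB-bound Importer and ColumnType
     * classes define.
     *
     *   <importer>
     *     <table>Mailbox</table>
     *     <key combiner="-" optimizeIt="true">
     *       <fields><field>0</field></fields>
     *     </key>
     *     <columnFamily name="MessageList" delimiter="," directory="/var/data/mbox">
     *       <column field="1">
     *         <value field="2"/>
     *         <timestamp field="3"/>
     *       </column>
     *     </columnFamily>
     *   </importer>
     */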
    /*
     * This method parses each file based on the delimiter specified in the
     * XML file. It also looks at all the parameters specified in the XML and
     * based on those populates the internal Row structure.
     */
    void parse(String filepath) throws Throwable
    {
        BufferedReader bufReader = new BufferedReader(
                new InputStreamReader(new FileInputStream(filepath)), 16 * 1024 * 1024);
        try
        {
            String line = null;
            String delimiter = ",";
            RowMutation rm = null;
            Map<String, RowMutation> rms = new HashMap<String, RowMutation>();

            if (importer_.columnFamily.delimiter != null)
            {
                delimiter = importer_.columnFamily.delimiter;
            }

            while ((line = bufReader.readLine()) != null)
            {
                StringTokenizer st = new StringTokenizer(line, delimiter);
                List<String> tokenList = new ArrayList<String>();
                String key = null;
                while (st.hasMoreElements())
                {
                    tokenList.add((String) st.nextElement());
                }

                /* Construct the key from the configured fields. */
                List<String> keyFields = new ArrayList<String>();
                for (int fieldId : importer_.key.fields.field)
                {
                    keyFields.add(tokenList.get(fieldId));
                }
                key = merge(keyFields, importer_.key.combiner);

                /*
                 * When the key optimization is off, check per key whether
                 * this endpoint is responsible for it.
                 */
                if (importer_.key.optimizeIt != null && !importer_.key.optimizeIt)
                {
                    if (!checkIfProcessKey(key))
                        continue;
                }

                rm = rms.get(key);
                if (rm == null)
                {
                    rm = new RowMutation(importer_.table, key);
                    rms.put(key, rm);
                }

                if (importer_.columnFamily.superColumn != null)
                {
                    /* Build the super column name(s) from the configured fields. */
                    List<String> superColumnList = new ArrayList<String>();
                    for (int fieldId : importer_.columnFamily.superColumn.fields.field)
                    {
                        superColumnList.add(tokenList.get(fieldId));
                    }
                    String superColumnName = merge(superColumnList, " ");
                    superColumnList.clear();

                    if (importer_.columnFamily.superColumn.tokenize)
                    {
                        /* Tokenize the merged name; each token becomes a super column. */
                        Analyzer analyzer = new StandardAnalyzer();
                        TokenStream ts = analyzer.tokenStream("superColumn", new StringReader(superColumnName));
                        Token token = ts.next();
                        while (token != null)
                        {
                            superColumnList.add(token.termText());
                            token = ts.next();
                        }
                    }
                    else
                    {
                        superColumnList.add(superColumnName);
                    }

                    for (String sName : superColumnList)
                    {
                        String cfName = importer_.columnFamily.name + ":" + sName;
                        if (importer_.columnFamily.column != null)
                        {
                            for (ColumnType column : importer_.columnFamily.column)
                            {
                                String cfColumn = cfName + ":"
                                        + (column.name == null ? tokenList.get(column.field) : column.name);
                                rm.add(cfColumn,
                                       tokenList.get(column.value.field).getBytes(),
                                       Integer.parseInt(tokenList.get(column.timestamp.field)));
                            }
                        }
                    }
                }
                else
                {
                    if (importer_.columnFamily.column != null)
                    {
                        for (ColumnType column : importer_.columnFamily.column)
                        {
                            String cfColumn = importer_.columnFamily.name + ":"
                                    + (column.name == null ? tokenList.get(column.field) : column.name);
                            rm.add(cfColumn,
                                   tokenList.get(column.value.field).getBytes(),
                                   Integer.parseInt(tokenList.get(column.timestamp.field)));
                        }
                    }
                }
            }

            /*
             * Now apply the data for all keys.
             * TODO : Add checks for large data sizes; maybe we want to check
             * the data size and then apply.
             */
            Set<String> keys = rms.keySet();
            for (String pKey : keys)
            {
                rm = rms.get(pKey);
                if (rm != null)
                {
                    rm.apply();
                }
            }
        }
        finally
        {
            bufReader.close();
        }
    }

    void parseFileList(File dir)
    {
        File[] files = dir.listFiles();
        for (File file : files)
        {
            if (file.isDirectory())
            {
                parseFileList(file);
            }
            else
            {
                try
                {
                    if (importer_.key.optimizeIt != null && importer_.key.optimizeIt)
                    {
                        /*
                         * In optimized mode the file name stands in for the key,
                         * so skip files whose keys this endpoint does not own.
                         */
                        if (checkIfProcessKey(file.getName()))
                        {
                            parse(file.getAbsolutePath());
                        }
                    }
                    else
                    {
                        parse(file.getAbsolutePath());
                    }
                }
                catch (Throwable ex)
                {
                    logger_.error(ex.toString());
                }
            }
        }
    }
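    /*
     * Worked example (hypothetical input, for illustration only): with the
     * sketch descriptor above, delimiter "," and the input line
     *
     *   alice,subject,hello world,1213141516
     *
     * parse() would build the key "alice" from field 0 and issue a
     * RowMutation adding the column "MessageList:subject" with the value
     * "hello world" and the timestamp 1213141516. When a <superColumn>
     * element is present, the column name gains a middle segment:
     * "MessageList:<superColumn>:<column>".
     */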
    void preLoad(File rootDirectory) throws Throwable
    {
        String table = DatabaseDescriptor.getTables().get(0);
        String cfName = Table.recycleBin_ + ":" + "Keys";
        /* Populate just the keys. */
        preParse(rootDirectory, table, cfName);
        /* Dump the memtables. */
        Table.open(table).flush(false);
        /* Force a compaction of the files. */
        Table.open(table).forceCompaction(null, null, null);
        /*
         * This is a hack to let everyone finish. Just sleep for
         * a couple of minutes.
         */
        logger_.info("Taking a nap after forcing a compaction ...");
        Thread.sleep(Loader.siesta_);

        /* Figure out the keys in the index file to relocate the node. */
        List<String> ssTables = Table.open(table).getAllSSTablesOnDisk();
        /* Load the indexes into memory. */
        SSTable.onStart(ssTables);
        /* We should have only one file since we just compacted. */
        List<String> indexedKeys = SSTable.getSortedKeys();
        storageService_.relocate(indexedKeys.toArray(new String[0]));
        /*
         * This is a hack to let everyone relocate and learn about
         * each other. Just sleep for a couple of minutes.
         */
        logger_.info("Taking a nap after relocating ...");
        Thread.sleep(Loader.siesta_);

        /*
         * Do the cleanup necessary. Delete all commit logs and
         * the SSTables and reset the load state in the StorageService.
         */
        SSTable.delete(ssTables.get(0));
        // File commitLogDirectory = new File(DatabaseDescriptor.getLogFileLocation());
        // FileUtils.delete(commitLogDirectory.listFiles());
        storageService_.resetLoadState();
        logger_.info("Finished all the requisite clean up ...");
    }

    void load(String xmlFile) throws Throwable
    {
        try
        {
            JAXBContext jc = JAXBContext.newInstance(this.getClass().getPackage().getName());
            Unmarshaller u = jc.createUnmarshaller();
            importer_ = (Importer) u.unmarshal(new FileInputStream(xmlFile));
            String directory = importer_.columnFamily.directory;
            File rootDirectory = new File(directory);
            preLoad(rootDirectory);
            parseFileList(rootDirectory);
        }
        catch (Exception e)
        {
            logger_.info(LogUtil.throwableToString(e));
        }
    }

    /**
     * @param args
     */
    public static void main(String[] args) throws Throwable
    {
        StorageService s = StorageService.instance();
        s.start();
        Loader loader = new Loader(s);
        loader.load("mbox_importer.xml");
    }
}
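/*
 * Usage sketch (assumptions: the node's storage configuration is already in
 * place, and an import descriptor named mbox_importer.xml, as referenced by
 * main() above, exists in the working directory):
 *
 *   java com.facebook.infrastructure.loader.Loader
 */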