/**
 * Copyright 2008 - CommonCrawl Foundation
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 *
 **/

package org.commoncrawl.service.pagerank.master;

import java.io.BufferedInputStream;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.URL;
import java.text.NumberFormat;
import java.text.ParseException;
import java.util.ArrayList;
import java.util.StringTokenizer;
import java.util.TreeSet;
import java.util.Vector;
import java.util.zip.CRC32;

import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.commoncrawl.async.Timer;
import org.commoncrawl.crawl.common.internal.CrawlEnvironment;
import org.commoncrawl.db.RecordStore;
import org.commoncrawl.server.CommonCrawlServer;
import org.commoncrawl.service.pagerank.BaseConfig;
import org.commoncrawl.service.pagerank.IterationInfo;
import org.commoncrawl.service.pagerank.PRMasterState;
import org.commoncrawl.service.pagerank.PageRankJobConfig;
import org.commoncrawl.service.pagerank.SlaveStatus;
import org.commoncrawl.service.pagerank.SlaveStatus.State;
import org.commoncrawl.util.CCStringUtils;

/**
 * Master server for a page rank job. It reads a slaves file, connects to the
 * listed slaves, and drives them through a state machine
 * (STARTED -> ITERATING_DISTRIBUTING -> ITERATING_CALCULATING -> ... ->
 * FINISHING -> IDLE) based on the status each slave reports and on a
 * periodic poll timer.
 *
 * @author rana
 *
 */
public class PageRankMaster extends CommonCrawlServer {

  private static final int MAX_ITERATION_DEFAULT = 50;
  /** interval passed to the poll timer that re-evaluates the master state **/
  private static final int POLL_TIMER_INTERVAL = 1000;
  private static final int DEFAULT_INSTANCES_PER_SLAVE = 1;
  private static final String PRMasterStateKey = "PRMasterState";
  private static final String PRJobConfigKey = "PRJobConfig";

  /** formats / parses the zero padded (5 digit) numbers used in file names **/
  private static final NumberFormat NUMBER_FORMAT = NumberFormat.getInstance();
  static {
    NUMBER_FORMAT.setMinimumIntegerDigits(5);
    NUMBER_FORMAT.setGroupingUsed(false);
  }

  /** slaves file name, set via --slaves **/
  private String _slavesFile;
  /** CRC32 of the slaves file contents, computed by parseSlavesFile **/
  private long _slavesFileCRC = -1;
  private FileSystem _fileSystem;
  private String _hdfsWorkingDir = "crawl/pageRank/jobs";
  /** job id, set via --jobId (-1 means not specified) **/
  private long _jobId = -1;
  private boolean _pageRankStarted = false;
  private Timer _pollTimer = null;
  /** record store object used to persist state **/
  private RecordStore _recordStore = new RecordStore();
  private boolean _serverPaused = false;
  /** one entry per slave instance, indexed by slave id **/
  private Vector<PageRankRemoteSlave> _slaves = new Vector<PageRankRemoteSlave>();
  /** comma terminated list of fully qualified slave names **/
  private String _slavesList = null;
  /** last merged status per slave, indexed by slave id **/
  private SlaveStatus _slaveStates[] = null;
  private PRMasterState _serverState = null;
  private PageRankJobConfig _jobConfig = null;

  //private Vector<PageRankJobConfig> _jobQueue = new Vector<PageRankJobConfig>();
  //private Vector<PageRankJobConfig> _jobList = new Vector<PageRankJobConfig>();

  public PageRankMaster() {
    setAsyncWebDispatch(true);
  }

  /** @return the job config stored in the server state, regardless of server status **/
  public PageRankJobConfig getActiveJobConfig() {
    return _serverState.getActiveJobConfig();
  }

  /** @return the current server status rendered as a string **/
  public String getMasterState() {
    return PRMasterState.ServerStatus.toString(_serverState.getServerStatus());
  }

  /** @return the active job config, or null when the server is IDLE **/
  public PageRankJobConfig getActiveJob() {
    if (_serverState.getServerStatus() != PRMasterState.ServerStatus.IDLE) {
      return _serverState.getActiveJobConfig();
    }
    return null;
  }

  /** @return the name of the active job, or an empty string when the server is IDLE **/
  public String getActiveJobName() {
    if (_serverState.getServerStatus() != PRMasterState.ServerStatus.IDLE) {
      return getJobName(_serverState.getActiveJobConfig());
    }
    return "";
  }

  //public Vector<PageRankJobConfig> getJobQueue() { return _jobQueue; }

  /** @return "job-" followed by the job id of the given config **/
  public static String getJobName(PageRankJobConfig jobConfig) {
    return "job-" + jobConfig.getJobId();
  }

  /*
  public Vector<PageRankJobConfig> getQueueableJobList() {
    Vector<PageRankJobConfig> listOut = new Vector<PageRankJobConfig>();
    HashSet<PageRankJobConfig> queuedJobs = new HashSet<PageRankJobConfig>();
    queuedJobs.addAll(_jobQueue);
    for (PageRankJobConfig job : _jobList) {
      if (!queuedJobs.contains(job)) {
        listOut.add(job);
      }
    }
    return listOut;
  }
  */

  /**
   * Populates a new job config from the given parameters.
   *
   * NOTE: the serialization / job list registration code is commented out, so
   * the config built here is currently discarded when the method returns.
   *
   * @param inputValuePath path of the input values
   * @param graphPath path of the outlinks data
   * @param maxIterations maximum iteration number for the job
   * @param slaveCount slave count for the job
   * @throws IOException declared for callers; nothing in the current body throws it
   */
  public void createNewJob(String inputValuePath, String graphPath, int maxIterations, int slaveCount) throws IOException {
    PageRankJobConfig jobConfig = new PageRankJobConfig();

    // the job id is the creation timestamp
    jobConfig.setJobId(System.currentTimeMillis());
    jobConfig.setIterationNumber(0);
    jobConfig.setMaxIterationNumber(maxIterations);
    jobConfig.setSlaveCount(slaveCount);
    jobConfig.setInputValuesPath(inputValuePath);
    jobConfig.setOutlinksDataPath(graphPath);
    // set up job dir ...
    Path jobPath = new Path(_hdfsWorkingDir, Long.toString(jobConfig.getJobId()));
    // mk the job dir
    // _fileSystem.mkdirs(jobPath);
    jobConfig.setJobWorkPath(jobPath.toString());
    jobConfig.setAlgorithmId(0);
    jobConfig.setAlpha(.85f);

    //serializeJobConfig(jobConfig);
    //_jobList.add(jobConfig);
  }

  /*
  public void queueJob(String jobIdStr) {
    long jobId = Long.parseLong(jobIdStr);
    for (PageRankJobConfig job : _jobList) {
      if (job.getJobId() == jobId) {
        _jobList.remove(job);
        _jobQueue.add(job);
        break;
      }
    }
  }
  */

  Vector<PageRankRemoteSlave> getSlaves() {
    return _slaves;
  }

  public boolean isServerIdle() {
    return _serverState.getServerStatus() == PRMasterState.ServerStatus.IDLE;
  }

  /** @return true for every server status other than IDLE **/
  public boolean isPageRankActive() {
    return _serverState.getServerStatus() != PRMasterState.ServerStatus.IDLE;
  }

  public boolean isPageRankTerminating() {
    return _serverState.getServerStatus() == PRMasterState.ServerStatus.FINISHING;
  }

  public boolean isServerPaused() {
    return _serverPaused;
  }

  /** @return true when the server status lies in [ITERATING_DISTRIBUTING, ITERATING_CALCULATING] **/
  public boolean isIterationActive() {
    return _serverState.getServerStatus() >= PRMasterState.ServerStatus.ITERATING_DISTRIBUTING
        && _serverState.getServerStatus() <= PRMasterState.ServerStatus.ITERATING_CALCULATING;
  }

  /** @return Phase.CALCULATE when the server is ITERATING_CALCULATING, Phase.DISTRIBUTE otherwise **/
  public int getSlaveIterationPhase() {
    if (_serverState.getServerStatus() == PRMasterState.ServerStatus.ITERATING_CALCULATING) {
      return IterationInfo.Phase.CALCULATE;
    } else {
      return IterationInfo.Phase.DISTRIBUTE;
    }
  }

  /** @return the active job id, or -1 when no page rank job is active **/
  public long getCurrentJobNumber() {
    return (isPageRankActive()) ? _serverState.getActiveJobConfig().getJobId() : -1;
  }

  /** @return the active job's iteration number, or -1 when no page rank job is active **/
  public int getCurrentIterationNumber() {
    return (isPageRankActive()) ? _serverState.getActiveJobConfig().getIterationNumber() : -1;
  }

  /** @return the active job's max iteration number, or -1 when no page rank job is active **/
  public int getMaxIteration() {
    return (isPageRankActive()) ? _serverState.getActiveJobConfig().getMaxIterationNumber() : -1;
  }

  /**
   * Builds the base configuration handed to a slave.
   *
   * @param slave the slave the config is built for (supplies the slave id)
   * @return a config carrying the working dir, total slave count, slave id and slaves list
   */
  public BaseConfig getBaseConfigForSlave(PageRankRemoteSlave slave) {
    BaseConfig baseConfig = new BaseConfig();

    baseConfig.setBaseWorkingDir(_hdfsWorkingDir);
    // baseConfig.setFileSystem(_fileSystem.getUri().toString());
    baseConfig.setSlaveCount(_slaves.size());
    baseConfig.setSlaveId(slave.getSlaveId());
    baseConfig.setSlavesList(_slavesList);

    return baseConfig;
  }

  public void createJob() {

  }

  //@Override
  protected String getDefaultLogFileName() {
    return "prmaster";
  }

  @Override
  protected String getDefaultDataDir() {
    return CrawlEnvironment.DEFAULT_DATA_DIR;
  }

  @Override
  protected String getDefaultHttpInterface() {
    return CrawlEnvironment.DEFAULT_HTTP_INTERFACE;
  }

  @Override
  protected int getDefaultHttpPort() {
    return CrawlEnvironment.DEFAULT_PAGERANK_MASTER_HTTP_PORT;
  }

  @Override
  protected String getDefaultRPCInterface() {
    return CrawlEnvironment.DEFAULT_RPC_INTERFACE;
  }

  @Override
  protected int getDefaultRPCPort() {
    return CrawlEnvironment.DEFAULT_PAGERANK_MASTER_RPC_PORT;
  }

  @Override
  protected String getWebAppName() {
    return CrawlEnvironment.PAGERANK_MASTER_WEBAPP_NAME;
  }

  /**
   * Initializes the master: parses the slaves file, builds the server state,
   * allocates one status slot per slave, connects to the slaves and installs
   * the poll timer.
   *
   * @return true on success; false when a required argument is missing or an
   *         IOException occurs during initialization
   */
  @Override
  protected boolean initServer() {
    // report each missing argument with its own message
    if (_slavesFile == null) {
      LOG.error("Slaves File not specified. Specify Slaves file via --slaves");
      return false;
    }
    if (_jobId == -1) {
      LOG.error("Job Id not specified. Specify Job Id via --jobId");
      return false;
    }

    try {
      // get a pointer to the hdfs file system
      // _fileSystem = CrawlEnvironment.getDefaultFileSystem();

      // parse slaves file ..
      parseSlavesFile();
      // load database state ...
      loadState();
      // init slave states array
      _slaveStates = new SlaveStatus[_slaves.size()];
      // and populate it ...
      for (int i = 0; i < _slaveStates.length; ++i) {
        _slaveStates[i] = new SlaveStatus();
      }

      // connect to slaves ...
      connectToSlaves();

      // setup poll timer
      _pollTimer = new Timer(POLL_TIMER_INTERVAL, true, new Timer.Callback() {

        @Override
        public void timerFired(Timer timer) {
          potentaillyUpdateServerStatus();
        }
      });

      getEventLoop().setTimer(_pollTimer);

      return true;
    } catch (IOException e) {
      LOG.error(CCStringUtils.stringifyException(e));
    }
    return false;
  }

  /**
   * Parses the command line. Recognized (case insensitive) options are
   * --slaves &lt;file&gt; and --jobId &lt;long&gt;; anything else is ignored.
   *
   * @param argv command line arguments
   * @return false when --jobId has no value or a non numeric value (presumably
   *         treated as a parse failure by the base class - confirm); true otherwise
   */
  @Override
  protected boolean parseArguements(String[] argv) {

    for (int i = 0; i < argv.length; ++i) {
      if (argv[i].equalsIgnoreCase("--slaves")) {
        // a trailing --slaves without a value is ignored; initServer reports the missing file
        if (i + 1 < argv.length) {
          _slavesFile = argv[++i];
        }
      } else if (argv[i].equalsIgnoreCase("--jobId")) {
        // guard against a trailing --jobId with no value
        if (i + 1 >= argv.length) {
          LOG.error("Missing value for --jobId");
          return false;
        }
        String jobIdStr = argv[++i];
        try {
          _jobId = Long.parseLong(jobIdStr);
        } catch (NumberFormatException e) {
          LOG.error("Invalid value for --jobId:" + jobIdStr);
          return false;
        }
      }
    }
    return true;
  }

  @Override
  protected void printUsage() {

  }

  @Override
  protected boolean startDaemons() {
    return true;
  }

  @Override
  protected void stopDaemons() {

  }

  /**
   * Loads the slaves file, creating one PageRankRemoteSlave per instance and
   * building the slaves list string, then computes the file's CRC32.
   *
   * Each non comment, non blank line must have exactly three ':' separated
   * tokens; the first is used as the slave name and the second as the
   * instance count. The third token is read but not used here. Lines starting
   * with '#' are skipped.
   *
   * @throws IOException if the file cannot be read or contains an invalid entry
   */
  void parseSlavesFile() throws IOException {

    LOG.info("Loading Slaves File from:" + _slavesFile);

    // the slaves file is looked up as a hadoop config resource first and as a plain file second
    URL resourceURL = CrawlEnvironment.getHadoopConfig().getResource(_slavesFile);
    if (resourceURL == null) {
      LOG.info("Could not load resource as an URL. Trying as an absolute pathname");
    }

    StringBuffer slavesListWriter = new StringBuffer();
    int slaveCount = 0;

    BufferedReader reader = new BufferedReader(new InputStreamReader(new BufferedInputStream(openSlavesFile(resourceURL))));
    try {
      LOG.info("Loading slaves file");

      String slaveHostPlusCount = null;
      while ((slaveHostPlusCount = reader.readLine()) != null) {
        // skip comments and blank lines
        if (slaveHostPlusCount.startsWith("#") || slaveHostPlusCount.trim().length() == 0) {
          continue;
        }

        StringTokenizer tokenizer = new StringTokenizer(slaveHostPlusCount, ":");
        if (tokenizer.countTokens() != 3) {
          throw new IOException("Invalid Slave Entry:" + slaveHostPlusCount + " in slaves File");
        }

        String slaveName = tokenizer.nextToken();
        int instanceCount;
        try {
          instanceCount = Integer.parseInt(tokenizer.nextToken());
        } catch (NumberFormatException e) {
          // surface a malformed count as the same checked error as any other bad entry
          throw new IOException("Invalid Slave Entry:" + slaveHostPlusCount + " in slaves File");
        }

        // slave ids are assigned sequentially across all hosts
        for (int i = 0; i < instanceCount; ++i) {
          PageRankRemoteSlave state = new PageRankRemoteSlave(this, slaveCount++, slaveName, i);
          LOG.info("Adding slave:" + slaveName + "instance:" + i);
          _slaves.add(state);
          slavesListWriter.append(state.getFullyQualifiedName() + ",");
        }
      }
    } finally {
      // closing the reader also closes the underlying stream
      reader.close();
    }

    // update finalized slaves list
    _slavesList = slavesListWriter.toString();

    // now reopen the file to compute the crc ...
    _slavesFileCRC = computeSlavesFileCRC(resourceURL);
    LOG.info("Slaves File CRC is:" + _slavesFileCRC);
  }

  /** Opens the slaves file via the resource url when available, otherwise as a local file. **/
  private InputStream openSlavesFile(URL resourceURL) throws IOException {
    if (resourceURL != null) {
      return resourceURL.openStream();
    }
    return new FileInputStream(new File(_slavesFile));
  }

  /** Reads the slaves file from start to end and returns the CRC32 of its bytes. **/
  private long computeSlavesFileCRC(URL resourceURL) throws IOException {
    CRC32 fileCRC = new CRC32();
    InputStream crcStream = openSlavesFile(resourceURL);
    try {
      byte[] buf = new byte[4096];
      int nRead = 0;
      while ((nRead = crcStream.read(buf, 0, buf.length)) > 0) {
        fileCRC.update(buf, 0, nRead);
      }
    } finally {
      crcStream.close();
    }
    return fileCRC.getValue();
  }

  /** Calls connect() on every slave in the slaves list. **/
  void connectToSlaves() throws IOException {
    LOG.info("Connecting to Slaves");
    for (PageRankRemoteSlave slave : _slaves) {
      slave.connect();
    }
  }

  /**
   * Reacts to a status change in a pagerank slave: merges the slave's last
   * known status into the master's copy, then either sends the start command
   * (server IDLE and all slaves idle) or resyncs the slave and re-evaluates
   * the master state.
   *
   * @param slave the slave whose status changed
   */
  void slaveStatusChanged(PageRankRemoteSlave slave) {

    LOG.info("slaveStatusChanged from slave:" + slave.getFullyQualifiedName()
        + " NewStatus:" + slave.getLastKnowStatus()
        + " ServerState:" + _serverState.getServerStatus());

    try {
      _slaveStates[slave.getSlaveId()].merge(slave.getLastKnowStatus());
    } catch (CloneNotSupportedException e) {
      // do not swallow silently - the master would keep acting on a stale status
      LOG.error("Failed to merge status from slave:" + slave.getFullyQualifiedName() + " "
          + CCStringUtils.stringifyException(e));
    }

    if (_serverState.getServerStatus() == PRMasterState.ServerStatus.IDLE) {
      if (!_pageRankStarted) {
        int idleCount = 0;
        // check to see if all slaves are ready
        for (SlaveStatus slaveState : _slaveStates) {
          // count the slaves that report the IDLE state ..
          if (slaveState.getState() == SlaveStatus.State.IDLE) {
            ++idleCount;
          }
        }
        // if all slaves are in idle state
        if (idleCount == _slaveStates.length) {
          // ready to send page rank start command
          LOG.info("All Slaves Online and Idle. Sending Start PageRank Cmd");
          // and send out start page rank command
          sendSlavesStartPageRankCmd(PRMasterState.ServerStatus.STARTED);
          _pageRankStarted = true;
        } else {
          LOG.info(idleCount + " Slave Idle/Online. Waiting on:" + (_slaveStates.length - idleCount));
        }
      }
    } else {
      potentiallyResyncSlaveState(slave.getSlaveId());
      potentaillyUpdateServerStatus();
    }
  }

  /**
   * Compares the given slave's reported state with the master state and sends
   * whatever command is needed to bring the slave back in step (start, do
   * iteration, end page rank or reset).
   *
   * @param slaveIdx index of the slave in the slaves list / slave states array
   */
  private void potentiallyResyncSlaveState(int slaveIdx) {

    SlaveStatus status = _slaveStates[slaveIdx];

    //check to see if the slave even has a potentially valid state ...
    if (status.isFieldDirty(SlaveStatus.Field_STATE)) {
      // do we want to reset this slave...
      boolean resetSlave = false;
      // ok now try to see if it is out of sync with the master ...
      switch (status.getState()) {

        // slave is in an initialized but idle state ...
        case SlaveStatus.State.IDLE: {
          // if page rank is active ... send the appropriate command to the slave ...
          if (isPageRankActive()) {
            LOG.info("Slave is IDLE while Master has PageRankActive. Sending Start Page Rank to Slave:" + _slaves.get(slaveIdx).getFullyQualifiedName());
            _slaves.get(slaveIdx).sendStartPageRankCmd(_serverState.getServerStatus());
          }
        }
        break;

        case SlaveStatus.State.STARTED_IDLE:
        case SlaveStatus.State.DONE_CALCULATING: {
          // if page rank is active ... figure out next steps based on host state
          if (isPageRankActive()) {
            // if we are iterating ...
            if (isIterationActive()) {

              boolean sendDoIterationCommand = true;

              // a slave that already finished calculating the current iteration needs no new command
              if (status.getState() == SlaveStatus.State.DONE_CALCULATING
                  && _serverState.getServerStatus() == PRMasterState.ServerStatus.ITERATING_CALCULATING) {
                if (status.getCurrentIteration() == _serverState.getActiveJobConfig().getIterationNumber()) {
                  sendDoIterationCommand = false;
                  LOG.info("Slave is DONE_CALCULATING and Master is in CALCULATING... IGORING Slave:" + _slaves.get(slaveIdx).getFullyQualifiedName());
                }
              }

              if (sendDoIterationCommand) {
                LOG.info("Slave is in "+ SlaveStatus.State.toString(status.getState()) + " while Master is in Iteration. Sending StartIteration to Slave:" + _slaves.get(slaveIdx).getFullyQualifiedName());
                sendSlaveDoIterationCmd(_slaves.get(slaveIdx));
              }
            }
            // are we terminating ... ?
            else if (isPageRankTerminating()) {
              LOG.info("Slave is in "+ SlaveStatus.State.toString(status.getState()) +" while Master is in PageRankTerminating. Sending EndPageRank to Slave:" + _slaves.get(slaveIdx).getFullyQualifiedName());
              // send end page rank command to slave ...
              _slaves.get(slaveIdx).sendEndPageRankCmd();
            }
          } else {
            // bad slave is out of sync ... reset it ...
            resetSlave = true;
          }
        }
        break;

        // if slave is in iteration state ...
        case SlaveStatus.State.DISTRIBUTING:
        case SlaveStatus.State.DONE_DISTRIBUTING:
        case SlaveStatus.State.CALCULATING: {

          // a slave finishing distribution frees a slot for the next deferred slave
          if (status.getState() == State.DONE_DISTRIBUTING && _deferredSlaves.size() != 0) {
            PageRankRemoteSlave nextSlave = _deferredSlaves.first();
            LOG.info("Sending Do Iteration To Deferred Slave:" + nextSlave.getFullyQualifiedName());
            _deferredSlaves.remove(nextSlave);
            sendSlaveDoIterationCmd(nextSlave);
          }

          // validate that master is in sync ...
          // NOTE(review): with '&&' the slave is only reset when the master is
          // not active at all; a master that is active but not iterating
          // (e.g. STARTED) leaves an iterating slave alone - confirm intent.
          if (!isPageRankActive() && !isIterationActive()) {
            resetSlave = true;
          }
        }
        break;
      }

      if (resetSlave) {
        LOG.error("Slave:" + _slaves.get(slaveIdx).getFullyQualifiedName() +" out of Sync with Master." + "Master State:" + _serverState + "Slave State:" + status.getState() + " -- Sending RESET");
        // reset slave state
        _slaveStates[slaveIdx].clear();
        // and send reset cmd to slave...
        _slaves.get(slaveIdx).sendResetCmd();
      }
    }
  }

  /**
   * Evaluates the collected slave states and advances the master state
   * machine when every slave has reached the state the master is waiting
   * for. Invoked from the poll timer and after a slave status change.
   */
  private void potentaillyUpdateServerStatus() {

    try {
      // if the server is in an idle state ...
      if (_serverState.getServerStatus() == PRMasterState.ServerStatus.IDLE) {
        // start the next job potentially
        //potentiallyStartNextPRJob();
      }
      // paused state handling ...
      /*
      if (_serverState.getServerStatus() == PRMasterState.ServerStatus.PAUSED) {
        // if server is in a paused state but we are ok to start iterating ...
        if (!isServerPaused()) {
          LOG.info("Server Moving from PAUSED to NextIteration State");
          advanceToNextPRIteration();
        }
      }
      */
      else if (_serverState.getServerStatus() == PRMasterState.ServerStatus.STARTED) {

        int completionCount = 0;

        for (SlaveStatus slaveState : _slaveStates) {
          // count slaves that are STARTED_IDLE on the active job ..
          if (slaveState.getState() == SlaveStatus.State.STARTED_IDLE
              && slaveState.getActiveJobId() == getActiveJobConfig().getJobId()) {
            ++completionCount;
          }
        }

        if (completionCount == _slaves.size()) {

          LOG.info("Server in STARTED STATE and All Clients are STARTED_IDLE. Moving to NextIteration");
          // and update server state ...
          _serverState.setServerStatus(PRMasterState.ServerStatus.ITERATING_DISTRIBUTING);
          // reset txn id
          _serverState.setCurrentTxnId(System.currentTimeMillis());

          // finally serialize state
          //serializeServerState();
          // and send out the next iteration command to the slaves ...
          sendSlavesDoIterationCmd();
        }
      }
      // if in iteration state ...
      else if (_serverState.getServerStatus() == PRMasterState.ServerStatus.ITERATING_DISTRIBUTING
          || _serverState.getServerStatus() == PRMasterState.ServerStatus.ITERATING_CALCULATING) {

        int completedSlaveCount = 0;

        // the slave state that marks completion of the current phase
        int desiredTransitionState = (_serverState.getServerStatus() == PRMasterState.ServerStatus.ITERATING_DISTRIBUTING)
            ? SlaveStatus.State.DONE_DISTRIBUTING
            : SlaveStatus.State.DONE_CALCULATING;

        for (int slaveIdx = 0; slaveIdx < _slaveStates.length; ++slaveIdx) {

          SlaveStatus slaveState = _slaveStates[slaveIdx];

          // if this slave is finished with current iteration ..
          if (slaveState.getState() == desiredTransitionState
              && slaveState.getCurrentIteration() == getCurrentIterationNumber()) {
            ++completedSlaveCount;
          }
        }

        // if all slaves are done with the current iteration ...
        if (completedSlaveCount == _slaves.size()) {

          int completedCheckpointsCount = 0;

          for (int slaveIdx = 0; slaveIdx < _slaveStates.length; ++slaveIdx) {

            SlaveStatus slaveState = _slaveStates[slaveIdx];

            // check to see if we need to send this slave a checkpoint command
            if (slaveState.getCurrentCheckpointId() != _serverState.getCurrentTxnId()) {
              LOG.info("Sending Slave:" + _slaves.get(slaveIdx).getFullyQualifiedName() + " Checkpoint Command - CurrentCheckpointId:" + slaveState.getCurrentCheckpointId());
              // record the id locally so the command is sent only once per txn
              slaveState.setCurrentCheckpointId(_serverState.getCurrentTxnId());
              sendSlaveCheckpointCommand(slaveIdx, _serverState.getCurrentTxnId(), getSlaveIterationPhase(), getCurrentIterationNumber());
            } else {
              // ok the current slave completed active checkpoint command
              if (slaveState.getCommittedCheckpointId() == _serverState.getCurrentTxnId()) {
                // increment completion count ...
                completedCheckpointsCount++;
                LOG.info("Slave:"+_slaves.get(slaveIdx).getFullyQualifiedName() + " Has Completed Checkpoint. Completed CheckpointCount:" + completedCheckpointsCount);
              }
            }
          }

          // if everyone completed active checkpoint
          if (completedCheckpointsCount == _slaves.size()) {

            // OK. advance to next MASTER state ..
            LOG.info("All Slaves Report Successfull Checkpoint Status for Txn:" + _serverState.getCurrentTxnId());

            // if we were in the distribute phase ...
            if (_serverState.getServerStatus() == PRMasterState.ServerStatus.ITERATING_DISTRIBUTING) {
              LOG.info("All Clients Done DISTRIBUTING. Moving to CALCULATING State");
              // advance master state
              _serverState.setServerStatus(PRMasterState.ServerStatus.ITERATING_CALCULATING);
              // reset txn id
              _serverState.setCurrentTxnId(System.currentTimeMillis());
              // serialize the state
              // /serializeServerState();
              // notify slaves ...
              sendSlavesDoIterationCmd();
            }
            // otherwise, if in the calculation phase ...
            else {
              // if we reached the last iteration .. .we need to do a clean shutdown ...
              if (getCurrentIterationNumber() == getMaxIteration()) {
                LOG.info("All Clients Done CALCULATING and Iteration Number == Max Iteration.Moving to FINISHING STATE");
                // set our appropriate state ...
                _serverState.setServerStatus(PRMasterState.ServerStatus.FINISHING);
                // serialize the state
                //serializeServerState();
                // and send shutdown command to slaves ...
                sendSlavesEndPageRankCmd();
              }
              // otherwise,
              else {
                // if server is not paused ...
                if (!isServerPaused()) {
                  LOG.info("All Clients Done CALCULATING and Iteration Number < Max Iteration.Moving to Next Iteration");
                  advanceToNextPRIteration();
                } else {
                  LOG.info("All Clients Done CALCULATING and Iteration Number < Max Iteration BUT Server is PAUSED.Moving to PAUSED STATE");
                  // set pause state
                  _serverState.setServerStatus(PRMasterState.ServerStatus.PAUSED);
                  // serialize state
                  //serializeServerState();
                }
              }
            }
          }
        }
      }
      // shutdown state handling ...
      else if (_serverState.getServerStatus() == PRMasterState.ServerStatus.FINISHING) {

        int idledCount = 0;

        for (SlaveStatus slaveState : _slaveStates) {
          // if this slave is finished with current iteration ..
          if ((slaveState.getState() == SlaveStatus.State.DONE_CALCULATING || slaveState.getState() == SlaveStatus.State.STARTED_IDLE)
              && slaveState.getCurrentIteration() == getCurrentIterationNumber()) {
            ++idledCount;
          }
        }

        // all slaves are done cleaning up ...
        if (idledCount == _slaves.size()) {

          LOG.info("SERVER in FINISHING State and ALL Clients FINISHED.Finishing PR Job");
          // do cleanup ...
          //finishPageRankJob(_serverState.getActiveJobConfig(),false);

          _serverState.getActiveJobConfig().clear();
          _serverState.setFieldClean(PRMasterState.Field_ACTIVEJOBCONFIG);
          // and reset state ...
          _serverState.setServerStatus(PRMasterState.ServerStatus.IDLE);
          // and serialize state
          //serializeServerState();
        }
      }
    } catch (IOException e) {
      LOG.error("Unexpected IOException: " + CCStringUtils.stringifyException(e));
      //TODO: KILL SERVER HERE ...
    }
  }

  /**
   * Increments the active job's iteration number, moves the master back to
   * ITERATING_DISTRIBUTING with a fresh transaction id and tells the slaves
   * to start the iteration.
   */
  private void advanceToNextPRIteration() throws IOException {
    //advance iteration number and restart distribution ...
    _serverState.getActiveJobConfig().setIterationNumber(_serverState.getActiveJobConfig().getIterationNumber() + 1);
    // and update server state ...
    _serverState.setServerStatus(PRMasterState.ServerStatus.ITERATING_DISTRIBUTING);
    // update transaction id
    _serverState.setCurrentTxnId(System.currentTimeMillis());
    // finally serialize state
    //serializeServerState();
    // and send out the next iteration command to the slaves ...
    sendSlavesDoIterationCmd();
  }

  /*
  private void serializeServerState() throws IOException {
    _recordStore.beginTransaction();
    _recordStore.updateRecordByKey(PRMasterStateKey, _serverState);
    if (_serverState.isFieldDirty(PRMasterState.Field_ACTIVEJOBCONFIG)) {
      // update the disk state of the active job config
      serializeJobConfig(_serverState.getActiveJobConfig());
    }
    _recordStore.commitTransaction();
  }
  */

  /**
   * Deletes every "*-CheckpointComplete-*" and "OutlinkPR-*" entry found
   * directly under the job data path.
   *
   * @param fs file system holding the job data
   * @param jobDataPath job directory to clean
   * @throws IOException on file system failure
   */
  private void clearAllCheckpointAndDistributionFiles(FileSystem fs, Path jobDataPath) throws IOException {
    deleteMatchingFiles(fs, new Path(jobDataPath, "*-CheckpointComplete-*"));
    deleteMatchingFiles(fs, new Path(jobDataPath, "OutlinkPR-*"));
  }

  /** Deletes (non recursively) every path matching the given glob pattern. **/
  private void deleteMatchingFiles(FileSystem fs, Path searchPattern) throws IOException {
    FileStatus candidates[] = fs.globStatus(searchPattern);
    // globStatus can return null instead of an empty array
    if (candidates == null) {
      return;
    }
    for (FileStatus candidate : candidates) {
      LOG.info("Deleting:" + candidate.getPath());
      fs.delete(candidate.getPath(), false);
    }
  }

  /**
   * Scans the job directory for "value_NNNNN-*" files and returns the highest
   * iteration number that has exactly CrawlEnvironment.PR_NUMSLAVES such files.
   *
   * @param fs file system holding the job data
   * @param jobDataPath job directory to scan
   * @return the last valid iteration number, or -1 when none qualifies
   * @throws IOException on file system failure
   */
  private int findListValidIteration(FileSystem fs, Path jobDataPath) throws IOException {
    // scan job directory for best value candidate
    Path valueSearchPattern = new Path(jobDataPath, "value_*-00000");

    FileStatus candidates[] = fs.globStatus(valueSearchPattern);

    int lastValidIterationNo = -1;

    // globStatus can return null instead of an empty array
    if (candidates == null) {
      return lastValidIterationNo;
    }

    for (FileStatus candidate : candidates) {
      // extract iteration portion of name (the 5 characters following "value_")
      String iterationStr = candidate.getPath().getName().substring("value_".length(), "value_".length() + 5);
      // parse
      try {
        int iterationId = NUMBER_FORMAT.parse(iterationStr).intValue();
        // now see if we have PR_NUMSLAVES values for this iteration
        Path iterationSpecificSearchPattern = new Path(jobDataPath, "value_" + iterationStr + "-*");
        // count result
        FileStatus iterationSpecificEntires[] = fs.globStatus(iterationSpecificSearchPattern);
        int entryCount = (iterationSpecificEntires != null) ? iterationSpecificEntires.length : 0;

        if (entryCount == CrawlEnvironment.PR_NUMSLAVES) {
          LOG.info("Iteration Number:" + iterationId + " has the proper number of results");

          if (lastValidIterationNo == -1 || lastValidIterationNo < iterationId) {
            // set this iteration as the valid iteration
            lastValidIterationNo = iterationId;
            LOG.info("Setting Iteration:"+ iterationId + " as last valid iteration number");
          }
        } else {
          LOG.error("Skipping Iteration Number:" + iterationId + ". It only has:" + entryCount + "results");
        }
      } catch (ParseException e) {
        LOG.error(CCStringUtils.stringifyException(e));
      }
    }
    return lastValidIterationNo;
  }

  /**
   * Builds the in-memory server state for the job given via --jobId: locates
   * the seed values / edges, determines the next iteration from the files in
   * the job directory, clears stale checkpoint and distribution files, and
   * leaves the server in the STARTED state with the job as active job.
   *
   * @throws IOException on file system failure
   */
  private void loadState() throws IOException {

    FileSystem fs = CrawlEnvironment.getDefaultFileSystem();

    // allocate a new server state ...
    _serverState = new PRMasterState();
    // initially in idle state
    _serverState.setServerStatus(PRMasterState.ServerStatus.IDLE);

    // paths
    Path valuesFilePath = new Path("crawl/pageRank/seed/" + _jobId + "/values");
    Path edgesFilePath = new Path("crawl/pageRank/seed/" + _jobId + "/edges");

    // figure out number of values (globStatus can return null instead of an empty array)
    FileStatus valueFiles[] = fs.globStatus(new Path(valuesFilePath, "value_*"));
    int itemCount = (valueFiles != null) ? valueFiles.length : 0;
    LOG.info("There are:" + itemCount + " values for job:" + _jobId);

    Path jobsPath = new Path("crawl/pageRank/jobs/" + _jobId);
    fs.mkdirs(jobsPath);

    // find last valid iteration
    int lastValidIteration = findListValidIteration(fs, jobsPath);
    LOG.info("Last Valid Iteration for Job:" + _jobId + " is:" + lastValidIteration);

    // with no valid iteration (-1) the job starts at iteration 0
    int nextIteration = lastValidIteration + 1;

    // clear all check point and distribution files
    clearAllCheckpointAndDistributionFiles(fs, jobsPath);

    PageRankJobConfig jobConfig = new PageRankJobConfig();

    jobConfig.setJobId(_jobId);
    jobConfig.setIterationNumber(nextIteration);
    jobConfig.setMaxIterationNumber(1000);
    jobConfig.setSlaveCount(itemCount);
    jobConfig.setInputValuesPath(valuesFilePath.toString());
    jobConfig.setOutlinksDataPath(edgesFilePath.toString());
    // set up job dir ...
    Path jobPath = new Path(_hdfsWorkingDir, Long.toString(jobConfig.getJobId()));
    jobConfig.setJobWorkPath(jobPath.toString());
    jobConfig.setAlgorithmId(0);
    jobConfig.setAlpha(.85f);

    _jobConfig = jobConfig;

    // update server state ...
    _serverState.setActiveJobConfig(_jobConfig);
    _serverState.setFieldDirty(PRMasterState.Field_ACTIVEJOBCONFIG);
    _serverState.setServerStatus(PRMasterState.ServerStatus.STARTED);
  }

  /*
  private void loadState() throws IOException {
    // initialize database ...
    File databasePath = new File(getDataDirectory().getAbsolutePath() + "/" + CrawlEnvironment.PRMASTER_DB);
    LOG.info("Config says PRMaster State db path is: "+databasePath);
    // initialize record store
    _recordStore.initialize(databasePath, null);
    // load db state ...
    _serverState = (PRMasterState) _recordStore.getRecordByKey(PRMasterStateKey);
    if (_serverState == null) {
      // allocate a brand new state
      _serverState = new PRMasterState();
      // update crc
      _serverState.setLastKnownSlaveFileCRC(_slavesFileCRC);
      // and write to disk ...
      _recordStore.beginTransaction();
      _recordStore.insertRecord(null, PRMasterStateKey, _serverState);
      _recordStore.commitTransaction();
    }
    else {
      // validate crc
      if (_serverState.getLastKnownSlaveFileCRC() != _slavesFileCRC) {
        LOG.warn("Slave Config changed since last load. Discarding any pending transactions");
        if (_serverState.isFieldDirty(PRMasterState.Field_ACTIVEJOBCONFIG)) {
          // clear disk state for last job ...
          finishPageRankJob(_serverState.getActiveJobConfig(),true);
          // clear server state
          _serverState.getActiveJobConfig().clear();
          // set field clean on job config
          _serverState.setFieldClean(PRMasterState.Field_ACTIVEJOBCONFIG);
          // set state to idle ...
          _serverState.setServerStatus(PRMasterState.ServerStatus.IDLE);
          // and write new state to disk
          serializeServerState();
        }
      }
    }
    // now load job configs ...
    Vector<Long> jobRecordList = _recordStore.getChildRecordsByParentId(PRJobConfigKey);
    LOG.info("Found "+ jobRecordList.size() + " serialized jobs. Loading job configs...");
    for (long recordId : jobRecordList) {
      PageRankJobConfig jobConfig = (PageRankJobConfig) _recordStore.getRecordById(recordId);
      _jobList.add(jobConfig);
    }
    //TODO:HACK
    _serverState.setServerStatus(PRMasterState.ServerStatus.ITERATING_CALCULATING);
    _serverState.getActiveJobConfig().setIterationNumber(10);
  }
  */

  /*
  private void serializeJobConfig(PageRankJobConfig jobConfig) throws IOException {
    if (jobConfig.getRecordId() != 0) {
      _recordStore.beginTransaction();
      _recordStore.updateRecordById(jobConfig.getRecordId(), jobConfig);
      _recordStore.commitTransaction();
    }
    else {
      _recordStore.beginTransaction();
      _recordStore.insertRecord(PRJobConfigKey, PRJobConfigKey + "_" + jobConfig.getJobId(), jobConfig);
      _recordStore.commitTransaction();
    }
  }
  */

  /** @return the prefix followed by the instance id formatted with NUMBER_FORMAT (zero padded to 5 digits) **/
  private static synchronized String getOutputName(String prefix, int instanceId) {
    return prefix + NUMBER_FORMAT.format(instanceId);
  }

  /*
  private void finishPageRankJob(PageRankJobConfig config,boolean jobFailed)throws IOException {
    LOG.info("Finishing PageRankJob:" + config.getJobId() + "JobFailed:" + jobFailed);
    Path jobBasePath = new Path(config.getJobWorkPath());
    LOG.info("Constructing Output Directory:" + config.getOutputValuesPath());
    if (!jobFailed) {
      Path outputPath = new Path(config.getOutputValuesPath());
      //_fileSystem.mkdirs(outputPath);
      //_fileSystem.delete(new Path(outputPath,"*"), true);
      LOG.info("Copying data files to output directory");
      // iterate slaves collecting data ...
      for (int i=0;i<_slaves.size();++i) {
        Path jobInstancePath = new Path(jobBasePath,getOutputName("",i));
        Path jobOutputPath = new Path(jobInstancePath,Constants.PR_VALUES_FILE_DIR + "/part-0");
        Path finalDestinationPath = new Path(outputPath,getOutputName("part-",i));
        LOG.info("Moving " + jobOutputPath + " to:" + finalDestinationPath);
        //_fileSystem.rename(jobOutputPath, finalDestinationPath);
      }
    }
    LOG.info("Purging Job Dir");
    //_fileSystem.delete(jobBasePath,true);
  }
  */

  /** @return a one line summary (input path, output path, algorithm id, alpha) of the given job config **/
  private String dumpPageRankJobInfo(PageRankJobConfig config) {
    return "InputPath:" + config.getInputValuesPath()
        + " OutputPath:" + config.getOutputValuesPath()
        + " Algorithm:" + config.getAlgorithmId()
        + " Alpha:" + config.getAlpha();
  }

  /*
  private void potentiallyStartNextPRJob() throws IOException {
    if (true) {
      while (_jobQueue.size() != 0) {
        // get first element off of queue ...
        PageRankJobConfig config = _jobQueue.remove(0);
        LOG.info("Processing Job:" + dumpPageRankJobInfo(config));
        /*
        // validate the input path ...
        Path inputValuePath = new Path(config.getInputValuesPath());
        FileStatus[] valuesPaths = _fileSystem.globStatus(new Path(inputValuePath,"part-?????"));
        Path outlinksPath = new Path(config.getOutlinksDataPath());
        FileStatus[] outlinksPaths = _fileSystem.globStatus(new Path(inputValuePath,"part-?????"));
        */
        /*
        if (outlinksPaths.length != _slaves.size()) {
          LOG.error("Rejecting Job. NEED " + _slaves.size() + " Outlink Files - Found:" + outlinksPaths.length);
        }
        else
        */
        /*
        {
          // set up job dir ...
          Path jobPath = new Path(_hdfsWorkingDir,"job-" + config.getJobId());
          // mk the job dir dir
          _fileSystem.mkdirs(jobPath);
          LOG.info("Assigning Job WorkingDir:" + jobPath);
          config.setJobWorkPath(jobPath.toString());
          */
          /*
          // update server state ...
          _serverState.setActiveJobConfig(config);
          _serverState.setFieldDirty(PRMasterState.Field_ACTIVEJOBCONFIG);
          _serverState.setServerStatus(PRMasterState.ServerStatus.STARTED);
          // serilalize the state ...
          serializeServerState();
          // and send out start page rank command
          sendSlavesStartPageRankCmd(PRMasterState.ServerStatus.STARTED);
          break;
        }
      }
    }
  }
  */

  /** Sends the start page rank command, carrying the given server status, to every slave. **/
  void sendSlavesStartPageRankCmd(int serverStatus) {
    for (PageRankRemoteSlave slave : _slaves) {
      slave.sendStartPageRankCmd(serverStatus);
    }
  }

  /** slaves whose do-iteration command is held back until another slave finishes distributing **/
  TreeSet<PageRankRemoteSlave> _deferredSlaves = new TreeSet<PageRankRemoteSlave>();

  /**
   * Sends the do-iteration command to the slaves. In the distributing phase
   * only the first desiredInitialRunCount slaves are started immediately and
   * the rest are deferred; since that count currently equals the number of
   * slaves, no slave is actually deferred.
   */
  void sendSlavesDoIterationCmd() {
    if (_serverState.getServerStatus() == PRMasterState.ServerStatus.ITERATING_DISTRIBUTING) {
      int i = 0;
      int desiredInitialRunCount = _slaves.size();
      for (PageRankRemoteSlave slave : _slaves) {
        if (i++ < desiredInitialRunCount) {
          sendSlaveDoIterationCmd(slave);
        } else {
          _deferredSlaves.add(slave);
        }
      }
    } else {
      for (PageRankRemoteSlave slave : _slaves) {
        sendSlaveDoIterationCmd(slave);
      }
    }
  }

  /**
   * Sends the do-iteration command to a single slave, unless the master is
   * distributing and the slave is still in the deferred set.
   */
  void sendSlaveDoIterationCmd(PageRankRemoteSlave slave) {
    if (_serverState.getServerStatus() == PRMasterState.ServerStatus.ITERATING_DISTRIBUTING) {
      if (_deferredSlaves.contains(slave)) {
        LOG.error("Invalid doIteration Cmd on Deferred Slave!");
        return;
      }
    }
    slave.sendDoIterationCmd();
  }

  /** Sends a checkpoint command for the given txn / phase / iteration to the slave at slaveIndex. **/
  void sendSlaveCheckpointCommand(int slaveIndex, long txnId, int currentPhase, int currentIterationNumber) {
    PageRankRemoteSlave slave = _slaves.get(slaveIndex);
    slave.sendCheckpointCommand(txnId, currentPhase, currentIterationNumber);
  }

  /** Sends the end page rank command to every slave. **/
  void sendSlavesEndPageRankCmd() {
    for (PageRankRemoteSlave slave : _slaves) {
      slave.sendEndPageRankCmd();
    }
  }
}